In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy
import pandas as pd

In [2]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)

print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [3]:
def fit_data(dataset,number_of_idfvalues):
    #check the dataset is list or not using isinstance keyword
    if isinstance(dataset, list):
        #initialize a set to store unique words. Set will not have duplicate
        uniquewords = set() 
        #initialize a dict to store the unique words and index
        unique_dict = {}
        #length of dataset
        total_words_dataset = len(dataset)
        #loop through all the document in corpus
        for row in dataset:
            #loop through all the words in a document
            for word in row.split(" "):
                #add the unique words to the set 
                uniquewords.add(word)
        #conver the set to a list and sort it
        uniquewords = list(uniquewords)
        uniquewords.sort(reverse = True)
        #convert the list to dataframe
        uniquewords = pd.DataFrame(uniquewords)
        #filter the top 'n' idf features
        uniquewords = uniquewords.head(number_of_idfvalues)
        #convert the dataframe to list. Now the list will have top idf features 
        uniquewords = list(uniquewords[0])
        #Add the unique words to the dictionary
        unique_dict = {value:index for index, value in enumerate(uniquewords)}
        #call the function IDF to get idf value for each word
        IDF_values = IDF(dataset,uniquewords)
    else:
        #throw exception when the dataset is not a list
        raise ValueError('Give the dataset in list format')
    return unique_dict, IDF_values

In [4]:
def IDF(dataset, uniquewords):
    #initialize a dict which will store idf words and values
    idf_dict = {}
    #length of dataset
    N = len(dataset)
    #loop through all the uniquewords
    for word in uniquewords:
        #initialize the count variable
        count = 0
        #run each unique word against each document in corpus. 
        for document in dataset:
            if word in document.split():
                #if word exists in a document then increment the count variable
                count += 1
        #add the word and its idf value to the dict idf_dict
        idf_dict[word] = 1 + math.log((1+N)/(1+count)) 
    return idf_dict

In [5]:
def transform(dataset, unique_dict, IDF_values): 
    #sparse matrix which stores tfidf value for each word in each document of the corpus.
    sparse_matrix = csr_matrix( (len(dataset), len(unique_dict)), dtype='float64')
    #find number of words in document by using Counter and split keywords
    for row in range(0,len(dataset)):
        number_of_words_in_sentence = Counter(dataset[row].split())
        #for each word of a document, check if that exists in unique_dict
        #if exists, find ifidf_value using the formulat and add it to the sparse matrix
        for word in dataset[row].split():
            if word in list(unique_dict.keys()): 
                #formula to find tfidf values
                tfidf_value = (number_of_words_in_sentence[word]/len(dataset[row].split())) * IDF_values[word]
                sparse_matrix[row, unique_dict[word]] = tfidf_value
    #peform l2 normalization on the sparse matrix
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output

In [6]:
unique_dict, IDF_values = fit_data(corpus, 50)
print(unique_dict)
print(IDF_values)

{'zombiez': 0, 'zombie': 1, 'zillion': 2, 'z': 3, 'yun': 4, 'youtube': 5, 'youthful': 6, 'younger': 7, 'young': 8, 'yet': 9, 'yes': 10, 'yelps': 11, 'years': 12, 'year': 13, 'yeah': 14, 'yawn': 15, 'yardley': 16, 'x': 17, 'wrote': 18, 'wrong': 19, 'written': 20, 'writing': 21, 'writers': 22, 'writer': 23, 'write': 24, 'wrap': 25, 'wow': 26, 'woven': 27, 'wouldnt': 28, 'would': 29, 'worthy': 30, 'worthwhile': 31, 'worthless': 32, 'worth': 33, 'worst': 34, 'worse': 35, 'worry': 36, 'world': 37, 'works': 38, 'working': 39, 'worked': 40, 'work': 41, 'words': 42, 'word': 43, 'wooden': 44, 'woo': 45, 'wont': 46, 'wong': 47, 'wonderfully': 48, 'wonderful': 49}
{'zombiez': 6.922918004572872, 'zombie': 6.517452896464707, 'zillion': 6.922918004572872, 'z': 6.922918004572872, 'yun': 6.922918004572872, 'youtube': 6.922918004572872, 'youthful': 6.922918004572872, 'younger': 6.922918004572872, 'young': 6.006627272698717, 'yet': 5.824305715904762, 'yes': 6.229770824012927, 'yelps': 6.922918004572872,

In [7]:
matrix=transform(corpus,unique_dict,IDF_values)
print(matrix.shape) 
print(matrix[0].toarray())

(746, 50)
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]


  self._set_intXint(row, col, x.flat[0])
