# IDF Implementation 

Calculating the IDF score of each word in a given corpus/dataset and comparing it with the IDF scores obtained from sklearn.

In [89]:
import pandas as pd
from collections import Counter
import math
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.preprocessing import normalize
import numpy

In [90]:
# Toy corpus: four short documents used to check the hand-written IDF code.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [91]:

def fit(corpus):
    '''
    Build the vocabulary and the smoothed IDF table for a list of documents.

    Each document is tokenised by splitting on single spaces. Only tokens of
    two or more characters enter the vocabulary. For a word that appears in
    df of the documents the score is 1 + ln((1 + len(corpus)) / (1 + df)).

    Returns a (vocab, IDF) tuple: vocab maps each word to its position in the
    alphabetically sorted vocabulary, IDF maps each word to its score.
    If corpus is not a list, a message is printed and None is returned.
    '''
    if not isinstance(corpus, list):
        print('Wrong Data type passed')
        return None

    doc_freq = Counter()  # word -> number of documents that contain it
    kept_words = set()    # unique words with at least two characters
    for document in corpus:
        tokens = document.split(' ')
        # A set counts each word at most once per document.
        doc_freq.update(set(tokens))
        kept_words.update(token for token in tokens if len(token) >= 2)

    vocab = {word: index for index, word in enumerate(sorted(kept_words))}

    # The "+1" terms smooth the ratio so it is never a division by zero.
    smoothed_total = 1 + len(corpus)
    IDF = {
        word: 1 + math.log(smoothed_total / (1 + doc_freq[word]))
        for word in vocab
    }
    return vocab, IDF
   



In [92]:
def transform(data,IDF,vocab):
    '''
    Turn a list of documents into an L2-normalised TF-IDF matrix.

    data  : list of documents; each is tokenised by splitting on single spaces.
    IDF   : dict mapping word -> IDF score (as returned by fit).
    vocab : dict mapping word -> column index (as returned by fit).

    Words that are not keys of IDF are ignored. The raw value stored for a
    word is (count of the word in the document / number of tokens in the
    document) * IDF[word].

    Returns the result of normalize(..., norm='l2') applied to a CSR matrix
    of shape (len(data), len(IDF)). If data is not a list, a message is
    printed and None is returned.
    '''
    if not isinstance(data, list):
        print('Wrong Data type passed')
        return None

    rows = []    # row (document) index of every stored value
    column = []  # column (vocabulary) index of every stored value
    tfidf = []   # the tf * idf values themselves
    for idx, row in enumerate(tqdm(data)):
        tokens = row.split(' ')
        # Term frequency is relative to the number of tokens in the document.
        # The earlier code divided by len(row), i.e. the character count.
        # The per-row divisor cancels under the L2 normalisation below, so the
        # returned matrix is unchanged up to floating-point rounding.
        n_tokens = len(tokens)
        for word, freq in Counter(tokens).items():
            if word not in IDF:
                # Out-of-vocabulary word: contributes nothing.
                continue
            rows.append(idx)
            column.append(vocab[word])
            tfidf.append((freq / n_tokens) * IDF[word])

    csr_mat = csr_matrix((tfidf, (rows, column)), shape=(len(data), len(IDF)))
    return normalize(csr_mat, norm='l2')
        



In [94]:
# Learn the vocabulary and IDF table, then show the dense TF-IDF matrix.
vocab, IDF = fit(corpus)

tfidf_matrix = transform(corpus, IDF, vocab)
print(tfidf_matrix.toarray())

100%|██████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 3906.22it/s]

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





In [97]:
# IDF scores in the dict's insertion order.
print([*IDF.values()])

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [98]:
# Iterating a dict yields its keys, i.e. the vocabulary words.
print(list(vocab))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


### Comparing with TfidfVectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit scikit-learn's vectorizer on the same corpus, then vectorise that corpus.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [25]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps the printed output a plain Python list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [26]:
# IDF vector held by the fitted scikit-learn vectorizer.
skl_idf = vectorizer.idf_
print(skl_idf)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [42]:
# Dense view of scikit-learn's matrix, for comparison with the output above.
skl_dense = skl_output.toarray()
print(skl_dense)

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


# Task 2 - Top 50 IDF Values

In [99]:
# Load the provided 'cleaned_strings' pickle file; the loaded corpus is
# expected to be a list.
# NOTE: pickle can execute arbitrary code while loading, so only open a
# file that comes from a trusted source.
import pickle

with open('cleaned_strings', 'rb') as pickle_file:
    corpus = pickle.load(pickle_file)

# Report how many documents were loaded.
print("Number of documents in corpus = ", len(corpus))

Number of documents in corpus =  746


In [100]:
import pandas as pd
from collections import Counter
import math
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.preprocessing import normalize
import numpy

In [104]:
def fit(corpus, top_n=50):
    '''
    Build the vocabulary and IDF table restricted to the highest-IDF words.

    corpus : list of documents; each is tokenised by splitting on single
             spaces, and only tokens of two or more characters are kept.
    top_n  : how many of the highest-IDF words to keep (default 50, the
             value that used to be hard-coded). Must be non-negative.

    For a word that appears in df of the documents the score is
    1 + ln((1 + len(corpus)) / (1 + df)). Words with equal scores are ranked
    alphabetically, because the sort is stable over the sorted vocabulary.

    Returns a (top_vocab, top_idf) tuple: top_idf maps each selected word to
    its score in descending score order, and top_vocab maps each selected
    word to its index among the alphabetically sorted selected words.
    If corpus is not a list, a message is printed and None is returned.
    Raises ValueError if top_n is negative.
    '''
    if not isinstance(corpus, list):
        # Wrong container type: report it and return None.
        print('Wrong Data type passed')
        return None
    if top_n < 0:
        # A negative slice bound would silently drop the lowest-ranked words.
        raise ValueError('top_n must be non-negative')

    doc_freq = Counter()  # word -> number of documents that contain it
    words = set()         # unique words with at least two characters
    for sentence in corpus:
        tokens = sentence.split(' ')
        # A set counts each word at most once per document.
        doc_freq.update(set(tokens))
        words.update(token for token in tokens if len(token) >= 2)

    # Smoothed document count; constant, so computed once outside the loop.
    n_docs = 1 + len(corpus)
    IDF = {
        word: 1 + math.log(n_docs / (1 + doc_freq[word]))
        for word in sorted(words)
    }

    # Rank by IDF, highest first, and keep the first top_n entries.
    ranked = sorted(IDF.items(), key=lambda item: item[1], reverse=True)
    top_idf = dict(ranked[:top_n])

    # Column indices follow the alphabetical order of the selected words.
    top_vocab = {word: index for index, word in enumerate(sorted(top_idf))}
    return top_vocab, top_idf
   

In [105]:
def transform(data,IDF,vocab):
    '''
    Turn a list of documents into an L2-normalised TF-IDF matrix over vocab.

    data  : list of documents; each is tokenised by splitting on single spaces.
    IDF   : dict mapping word -> IDF score (as returned by fit).
    vocab : dict mapping word -> column index (as returned by fit).

    Words that are not keys of vocab are ignored. The raw value stored for a
    word is (count of the word in the document / number of tokens in the
    document) * IDF[word].

    Returns the result of normalize(..., norm='l2') applied to a CSR matrix
    of shape (len(data), len(vocab)). If data is not a list, a message is
    printed and None is returned.
    '''
    if not isinstance(data, list):
        print('Wrong Data type passed')
        return None

    rows = []    # row (document) index of every stored value
    column = []  # column (vocabulary) index of every stored value
    tfidf = []   # the tf * idf values themselves
    for idx, row in enumerate(tqdm(data)):
        tokens = row.split(' ')
        # Term frequency is relative to the number of tokens in the document.
        # The earlier code divided by len(row), i.e. the character count.
        # The per-row divisor cancels under the L2 normalisation below, so the
        # returned matrix is unchanged up to floating-point rounding.
        n_tokens = len(tokens)
        for word, freq in Counter(tokens).items():
            if word not in vocab:
                # Word is outside the selected vocabulary: skip it.
                continue
            rows.append(idx)
            column.append(vocab[word])
            tfidf.append((freq / n_tokens) * IDF[word])

    csr_mat = csr_matrix((tfidf, (rows, column)), shape=(len(data), len(vocab)))
    return normalize(csr_mat, norm='l2')

In [106]:
# Fit on the loaded corpus (fit keeps only the 50 highest-IDF words), then
# print the sparse TF-IDF matrix built over that reduced vocabulary.
top_vocab, top_IDF = fit(corpus)
top_tfidf = transform(corpus, top_IDF, top_vocab)
print(top_tfidf)

100%|█████████████████████████████████████████████████████████████████████████████| 746/746 [00:00<00:00, 72014.33it/s]

  (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.3779644730092272
  (135, 10)	0.3779644730092272
  (135, 18)	0.3779644730092272
  (135, 20)	0.3779644730092272
  (135, 36)	0.3779644730092272
  (135, 40)	0.3779644730092272
  (135, 41)	0.3779644730092272
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865476
  (548, 32)	0.7071067811865476
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865476
  (644, 27)	0.7071067811865476
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0





In [107]:
# Scores of the selected words, in the dict's insertion order.
print([*top_IDF.values()])

[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]


###### Comparing with TfidfVectorizer  top IDF values

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit scikit-learn's vectorizer on the loaded corpus, then vectorise it.
vectorizer = TfidfVectorizer()
skl_output = vectorizer.fit(corpus).transform(corpus)

In [110]:
# Fifty largest IDF values from scikit-learn, in descending order.
skl_idf_desc = sorted(vectorizer.idf_, reverse=True)
print(skl_idf_desc[:50])


[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]
