In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np 

In [2]:
# Toy corpus: a collection of string documents, one sentence per entry.
# The same list is fed to the hand-written fit/transform below and to
# scikit-learn's TfidfVectorizer so the two results can be compared.

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

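We now define a function `Compute_IDF` that takes the corpus and the list of unique words and returns the smoothed inverse document frequency of each word, $\mathrm{idf}(t) = 1 + \ln\frac{1 + N}{1 + \mathrm{df}(t)}$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain the word $t$.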

In [3]:
def Compute_IDF(corpus, unique_words):
    """Compute the smoothed inverse document frequency of each word.

    The value for a word ``t`` is ``1 + ln((1 + N) / (1 + df(t)))`` where
    ``N`` is the number of documents and ``df(t)`` is the number of documents
    whose whitespace-separated tokens include ``t``.

    Parameters
    ----------
    corpus : list of str
        The documents; each one is tokenised with ``str.split()``.
    unique_words : iterable of str
        The words to compute an IDF value for.

    Returns
    -------
    dict
        Maps every word in ``unique_words`` to its IDF value (a float).
        Every word gets an entry, even when ``corpus`` is empty.
    """
    N = len(corpus)

    # Tokenise each document once (as a set for O(1) membership tests)
    # instead of re-splitting every sentence for every word.
    tokenized_docs = [set(sen.split()) for sen in corpus]

    idf_dict = {}
    for word in unique_words:
        count = sum(1 for tokens in tokenized_docs if word in tokens)
        # The +1 in numerator and denominator avoids a zero-division error
        # for words that occur in no document.
        idf_dict[word] = (1 + math.log((1 + N) / (count + 1)))
    return idf_dict

In [6]:
# Accepts only a list of sentences (strings).
def fit(dataset):
    """Learn the vocabulary and the IDF value of every word in ``dataset``.

    Words are the whitespace-separated tokens of each sentence; tokens
    shorter than two characters are ignored.

    Parameters
    ----------
    dataset : list of str
        The sentences to learn from.

    Returns
    -------
    tuple (vocab, idf_values) or None
        ``vocab`` maps each word to its column index (words are numbered in
        sorted order) and ``idf_values`` is the dict returned by
        ``Compute_IDF`` for those words. If ``dataset`` is not a list, a
        message is printed and ``None`` is returned.
    """
    # Check that the input is a list before doing any work.
    if not isinstance(dataset, list):
        print("you need to pass list of sentance")
        return None

    # Tokenise with split() rather than split(" ") so that the tokens are
    # exactly the ones Compute_IDF and transform see; with split(" ") a
    # tab- or newline-separated pair would become a single vocabulary entry
    # that never matches any document token.
    unique_words = sorted(
        {word for row in dataset for word in row.split() if len(word) >= 2}
    )
    vocab = {word: index for index, word in enumerate(unique_words)}
    Idf_values_unique_words = Compute_IDF(dataset, unique_words)

    return vocab, Idf_values_unique_words


In [7]:
# Learn the word -> column-index mapping and the per-word IDF values from the corpus.
Vocabulary, idf_of_vocabulary=fit(corpus)

In [8]:
# Show the learned word -> column-index mapping.
Vocabulary

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}

In [9]:
# Show the IDF value computed for each vocabulary word.
idf_of_vocabulary

{'and': 1.916290731874155,
 'document': 1.2231435513142097,
 'first': 1.5108256237659907,
 'is': 1.0,
 'one': 1.916290731874155,
 'second': 1.916290731874155,
 'the': 1.0,
 'third': 1.916290731874155,
 'this': 1.0}

In [10]:
# The vocabulary words, in column order.
Vocabulary.keys()

dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])

In [11]:
# The IDF values, in the same order as the vocabulary words above.
idf_of_vocabulary.values()

dict_values([1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0])

In [30]:
# Compute the TF-IDF values of a dataset.

def transform(dataset,vocabulary,idf_values):
    """Return the L2-normalised TF-IDF matrix of ``dataset``.

    For every sentence, the term frequency of a word is its count divided by
    the number of whitespace-separated tokens in that sentence; it is
    multiplied by the word's IDF value and stored in the word's column.
    Words that are not in ``vocabulary`` are ignored. Each row is then
    scaled to unit L2 norm.

    Parameters
    ----------
    dataset : list of str
        The sentences to transform (one matrix row per sentence).
    vocabulary : dict
        Maps each known word to its column index.
    idf_values : dict
        Maps each known word to its IDF value.

    Returns
    -------
    sparse matrix of shape (len(dataset), len(vocabulary))
        The row-normalised TF-IDF values. The matrix is also printed.
    """
    # Collect (row, column, value) triplets and build the matrix in one go;
    # assigning single elements into a csr_matrix is slow and raises a
    # SparseEfficiencyWarning.
    row_indices = []
    col_indices = []
    values = []

    for row, sentence in enumerate(dataset):
        tokens = sentence.split()
        count_of_words = Counter(tokens)  # count of every word in the sentence

        # Iterate over distinct words only: one stored entry per (row, word).
        for word, count in count_of_words.items():
            if word in vocabulary:  # direct dict lookup, no list rebuild
                row_indices.append(row)
                col_indices.append(vocabulary[word])
                values.append((count / len(tokens)) * (idf_values[word]))

    sparse_matrix = csr_matrix(
        (values, (row_indices, col_indices)),
        shape=(len(dataset), len(vocabulary)),  # rows and columns
        dtype=float,
    )

    # Normalise once and reuse the result for both the printout and the return.
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    print("normalized \n", output)
    return output

In [31]:
# Apply the hand-written transform to the corpus using the fitted vocabulary and IDF values.
transform(corpus,Vocabulary,idf_of_vocabulary)



normalized 
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [53]:
# Fit scikit-learn's TfidfVectorizer (default settings) on the same corpus
# and transform it, so its output can be set against the results above.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [54]:
# sklearn feature names, they are sorted in alphabetic order by default.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() converts the returned
# array to a plain list of str so the printed form is unchanged.

print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [55]:
# Print the IDF values held by the sklearn TfidfVectorizer after fit().
# After fitting on the corpus the vocabulary has 9 words, each with its own IDF value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
