# Custom implementation of tfidf vectorizer

Note:
- Corpus - Sample list of sentences.
- Corpus data - Cleaned strings data from [file](cleaned_strings).

## Task 1: 
Building a Tfidf vectorizer and comparing the results with Sklearn.

In [79]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

def idf(dataset, word):
    '''function to find the idf of a word'''
    count = 0 # number of documents with the word in it, initially 0
    
    # for each review in dataset and if word is in the review increment count
    for row in dataset:
        if word in row:
            count += 1
    
    # calculate idf as 1+log((1+total number of documents)/(1+number of documents with the word))
    val = 1 + math.log((1+len(dataset))/(1+count)) # value of idf 
    return val

def fit(dataset):
    unique_words = set() # set of unique words at first empty
    if isinstance(dataset, list): # check if its list type
        
        # for each review in dataset and for each word in review add word to set
        for row in dataset:
            for words in row.split():
                if len(words) < 2: # ignore small words 
                    continue
                unique_words.add(words)
                
        # change type set to list and sort
        unique_words = sorted(list(unique_words))
        
        # vocabulary of words and index {unique word : index of word}
        vocab = {i:j for j,i in enumerate(unique_words)}
        return vocab
    else:
        print("Pass a list as sentence")
        
def transform(dataset, vocab):
    rows = []
    columns = []
    values = []
    if isinstance(dataset, list): # check if its list type
        for idx, row in enumerate(tqdm(dataset)): # for each document in dataset

            word_freq = dict(Counter(row.split())) # key: word, values: frequency of word
            doc_total_freq = len(row.split()) # total number of words in document
            
            for word, freq in word_freq.items(): # for each item in word_freq dictionary
                
                if len(word) < 2: # ignore small words 
                    continue
                        
                col_index = vocab.get(word, -1) # retreving the dimension number of a word
                # if the word exists
                if col_index != -1:
                    rows.append(idx) # store index of document
                    columns.append(col_index) # store dimensions of word
                    
                    # calculate tfidf value using formula
                    values.append((freq/doc_total_freq)*idf(dataset,word)) # store tfidf value of word
        return normalize(csr_matrix((values, (rows,columns)), shape = (len(dataset),len(vocab))), norm ='l2') # final output normalized 
    else:
        print("Pass list of strings")

### 1.1 Using corpus

In [109]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

vocab = fit(corpus)
print('\nCustom feature names:\n',list(vocab.keys()))
custom_idf = [idf(corpus,word) for word in list(vocab.keys())]
print('\nCustom idf values:\n',custom_idf)
custom_tfidf = transform(corpus, vocab)
print('\nCustom tfidf output:\n',custom_tfidf)

# sparse matrix into dense matrix
print('\nDense matrix for document 0:\n',custom_tfidf[0].toarray())

100%|██████████| 4/4 [00:00<00:00, 8710.91it/s]


Custom feature names:
 ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

Custom idf values:
 [1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]

Custom tfidf output:
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149

Dense matrix for document 0:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]





In [108]:
# sklearn implementation
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

print('\nSklearn feature names:\n',vectorizer.get_feature_names())

print('\nSklearn idf values:\n',vectorizer.idf_)

print('\nSklearn output:\n',skl_output)


Sklearn feature names:
 ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

Sklearn idf values:
 [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]

Sklearn output:
   (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045


### 1.2 Using cleaned strings data

In [110]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus_data = pickle.load(f)

vocab = fit(corpus_data)
custom_tfidf_data = transform(corpus_data, vocab)
print('\nCustom output:\n',custom_tfidf_data[0])

100%|██████████| 746/746 [00:00<00:00, 1404.87it/s]


Custom output:
   (0, 53)	0.4285381075435814
  (0, 688)	0.4285381075435814
  (0, 720)	0.4285381075435814
  (0, 1545)	0.22805947499067006
  (0, 1651)	0.1625302544476425
  (0, 1653)	0.3509903637504196
  (0, 2287)	0.342724587634745
  (0, 2878)	0.3605325019883844





In [111]:
# using sklearn
vectorizer_data = TfidfVectorizer()
vectorizer_data.fit(corpus_data)
skl_output_data = vectorizer_data.transform(corpus_data)
print(skl_output_data[0])

  (0, 2878)	0.35781145622317734
  (0, 2287)	0.3377679916467555
  (0, 1653)	0.35781145622317734
  (0, 1651)	0.16192317905848022
  (0, 1545)	0.30566026894803877
  (0, 720)	0.4123943870778812
  (0, 688)	0.4123943870778812
  (0, 53)	0.4123943870778812


## Task 2:
Implementing max features functionality, i.e, limiting to 50 terms with top idf values.

In [104]:
def idf(dataset, word):
    '''function to find the idf of a word'''
    count = 0 # number of documents with the word in it, initially 0
    
    # for each review in dataset and if word is in the review increment count
    for row in dataset:
        if word in row:
            count += 1
    
    # calculate idf as 1+log((1+total number of documents)/(1+number of documents with the word))
    val = 1 + math.log((1+len(dataset))/(1+count)) # value of idf 
    return val
        
def fit_50(dataset):
    '''function to fit 50 terms with top idf scores'''
    unique_words = set() # set of unique words at first empty
    if isinstance(dataset, list): # check if its list type
        
        # for each review in dataset and for each word in review add word to set
        for row in dataset:
            for words in row.split():
                if len(words) < 2: # ignore small words 
                    continue
                unique_words.add(words)
                
        # change type set to list
        unique_words = list(unique_words)
        
        # v = {word : idf of word}
        v = {word:idf(dataset,word) for word in unique_words}
        
        # the two lines below is to check top 50 sorted words by idf 
        # sorted_v = sorted(v,key = v.get,reverse=True)[:50]
        # print(sorted_v)

        # vocabulary of top 50 words by idf and index, from v sorted by idf {unique word : index of word}
        vocab = {word:i for i,word in enumerate(sorted(v,key=v.get,reverse=True)[:50])}

        return vocab
    else:
        print("Pass a list as sentence")
        
def transform(dataset, vocab):
    rows = []
    columns = []
    values = []
    if isinstance(dataset, list): # check if its list type
        for idx, row in enumerate(tqdm(dataset)): # for each document in dataset
            word_freq = dict(Counter(row.split())) # key: word, values: frequency of word
            doc_total_freq = len(row.split()) # total number of words in document
            
            for word, freq in word_freq.items(): # for each item in word_freq dictionary
                if len(word) < 2:
                    continue
                        
                col_index = vocab.get(word, -1) # retreving the dimension number of a word
                # if the word exists
                if col_index != -1:
                    rows.append(idx) # store index of document
                    columns.append(col_index) # store dimensions of word
                    
                    # calculate tfidf value using formula
                    values.append((freq/doc_total_freq)*idf(dataset,word)) # store tfidf value of word

        return normalize(csr_matrix((values, (rows,columns)), shape = (len(dataset),len(vocab))), norm ='l2') # final output normalized 
    else:
        print("Pass list of strings")

In [106]:
import pickle
with open('cleaned_strings', 'rb') as f:
    corpus_data = pickle.load(f)

vocab_50 = fit_50(corpus_data)
print('\nWords in sorted vocab:\n',list(vocab_50.keys()))
custom_idf_data_50 = [(word,idf(corpus_data,word)) for word in list(vocab_50.keys())]
print('\nIdf values:\n',custom_idf_data_50)
custom_tfidf_data_50 = transform(corpus_data, vocab_50)
print('\nFinal tfidf output\n',custom_tfidf_data_50)
print('\nFinal tfidf output for document 421\n',custom_tfidf_data_50[421])

print('\nDense matrix for document 421\n',custom_tfidf_data_50[421].toarray())
print('\nShape: ',custom_tfidf_data_50[421].toarray().shape)

100%|██████████| 746/746 [00:00<00:00, 79091.80it/s]


Words in sorted vocab:
 ['sheer', 'versus', 'juano', 'paolo', 'behind', 'shenanigans', 'stowe', 'contract', 'sacrifice', 'reasonable', 'shallow', 'renowned', 'gay', 'painted', 'ireland', 'avoided', 'trooper', 'aesthetically', 'subplots', 'gake', 'ursula', 'reminded', 'exemplars', 'hype', 'assaulted', 'shattered', 'excuses', 'incomprehensible', 'monica', 'roller', 'rubbish', 'zombiez', 'suggests', 'regrettable', 'repair', 'defensemen', 'detailing', 'alongside', 'reporter', 'boyle', 'says', 'smoothly', 'holds', 'stuff', 'wide', 'changing', 'nerves', 'judge', 'borrowed', 'notch']

Idf values:
 [('sheer', 6.922918004572872), ('versus', 6.922918004572872), ('juano', 6.922918004572872), ('paolo', 6.922918004572872), ('behind', 6.922918004572872), ('shenanigans', 6.922918004572872), ('stowe', 6.922918004572872), ('contract', 6.922918004572872), ('sacrifice', 6.922918004572872), ('reasonable', 6.922918004572872), ('shallow', 6.922918004572872), ('renowned', 6.922918004572872), ('gay', 6.92291


