# TF-IDF IMPLEMENTATION

## SKLEARN IMPLEMENTATION

In [1]:
# Toy corpus: four short documents used to compare sklearn's TF-IDF output
# with the custom implementation below.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [2]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize

In [3]:
# Learn the vocabulary and IDF weights from the corpus (fit returns the fitted
# vectorizer itself), then encode the same documents as a sparse TF-IDF matrix.
vectorizer = TfidfVectorizer().fit(corpus)
skl_output = vectorizer.transform(corpus)

In [4]:
# Vocabulary learned by sklearn; the feature names come back sorted alphabetically.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method otherwise.
# tolist() converts the returned array to a plain list so the printed form is unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out().tolist()
else:
  feature_names = vectorizer.get_feature_names()

print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [5]:
# IDF weights learned by sklearn's TfidfVectorizer during fit.
# The fitted vocabulary holds 9 words, and idf_ holds one IDF value per word.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [6]:
# Shape of the sklearn TF-IDF matrix returned by transform: (documents, vocabulary words).

skl_output.shape

(4, 9)

In [7]:
# sklearn TF-IDF values for the first document of the corpus.
# The output is a sparse matrix, so only the non-zero (row, column) entries are printed.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [8]:
# sklearn TF-IDF values for the first document of the corpus, shown as a dense array.
# Converting the sparse row with toarray() makes the zero entries visible as well.
# The row is already L2-normalised; sklearn's TfidfVectorizer does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## CUSTOM IMPLEMENTATION

### IDF Function
parameters : corpus : list of documents; unique_words : sorted list of unique words

output : dictionary with each word as key and its IDF value as value


IDF formula: $IDF(t) = 1 + \log_{e}\frac{1 + \text{Total number of documents in collection}}{1 + \text{Number of documents with term } t \text{ in it}}$



In [9]:
def idf(corpus,unique_words):
  """Compute the smoothed inverse document frequency of each word.

  IDF(t) = 1 + ln((1 + N) / (1 + df(t))), where N is the number of documents
  in `corpus` and df(t) is the number of documents that contain `t`.

  Parameters
  ----------
  corpus : sequence of str
      Documents; each one is tokenised by splitting on a single space.
  unique_words : iterable of str
      Words for which an IDF value is required.

  Returns
  -------
  dict
      Maps every word of `unique_words` to its IDF value, in iteration order.
      Words that occur in no document (including the empty-corpus case) get
      df(t) = 0 rather than being left out of the result.
  """
  total_number_of_documents_in_collection = len(corpus)

  # Count, once, in how many documents each token appears. set() makes a token
  # count at most once per document, so the Counter holds document frequencies.
  document_frequency = Counter(
    token for sentence in corpus for token in set(sentence.split(" "))
  )

  idf_dict = {}
  for word in unique_words:
    count = document_frequency[word]  # Counter returns 0 for unseen words
    idf_dict[word]=(math.log((1+total_number_of_documents_in_collection)/(count+1)))+1
  return idf_dict

### FIT Function
parameters : corpus : list of documents

outputs : vocab : dictionary mapping each word to its column index; idf_dict : dictionary mapping each word to its IDF value

In [10]:
def fit(corpus):
  """Learn the vocabulary and IDF values of a corpus.

  Documents are tokenised by splitting on a single space, and tokens shorter
  than two characters are ignored.

  Parameters
  ----------
  corpus : list of str
      Documents to learn from.

  Returns
  -------
  (dict, dict) or None
      `vocab` maps each word to its position in the alphabetically sorted
      vocabulary; `idf_dict` is whatever `idf` returns for those words.
      When `corpus` is not a list, a message is printed and None is returned.
  """
  # Guard clause: anything other than a list is reported and rejected.
  if not isinstance(corpus, list):
    print('Please Pass the list')
    return None

  kept_tokens = {
    token
    for document in corpus
    for token in document.split(" ")
    if len(token) >= 2
  }
  ordered_words = sorted(kept_tokens)
  word_to_column = dict(zip(ordered_words, range(len(ordered_words))))
  return word_to_column, idf(corpus, ordered_words)

In [11]:
# Build the custom vocabulary (word -> column index) and the word -> IDF mapping.
vocab, idf_dict = fit(corpus)

In [12]:
# Vocabulary words and their IDF values from the custom fit, for comparison
# with the sklearn feature names and idf_ values.
print(list(vocab.keys()))
print(list(idf_dict.values()))

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


### Transform function
parameters : corpus : list of documents,
             vocab : vocabulary,
             idf_dict : dictionary of IDF values

outputs : L2-normalised sparse matrix

L2 normalisation is done with sklearn's `normalize`; documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html

In [13]:
def transform(corpus, vocab, idf_dict):
  """Encode documents as an L2-normalised sparse TF-IDF matrix.

  Each document is tokenised by splitting on a single space. For every
  distinct token that is present in `vocab`, the stored value is
  (occurrences in the document / tokens in the document) * idf_dict[token].

  Parameters
  ----------
  corpus : list of str
      Documents to encode.
  vocab : dict
      Maps each vocabulary word to its column index.
  idf_dict : dict
      Maps each vocabulary word to its IDF value; it must contain every
      vocabulary word that occurs in `corpus`.

  Returns
  -------
  sparse matrix of shape (len(corpus), len(vocab)), or None
      TF-IDF values with each row scaled to unit L2 norm. When `corpus` is
      not a list, a message is printed and None is returned.
  """
  rows = []
  columns = []
  values = []
  if isinstance(corpus, (list,)):
    for idx, string in enumerate(corpus):
      words = string.split(" ")
      no_of_terms_in_document = len(words)
      # Iterate over DISTINCT words only. csr_matrix sums entries that share the
      # same (row, column), so emitting one entry per occurrence (each already
      # carrying the full count) would weight a word seen k times by k*k.
      for word, no_of_times_word_in_string in Counter(words).items():
        col_index = vocab.get(word, -1)
        if col_index == -1:
          continue  # word is not part of the vocabulary
        tf_idf_values = (no_of_times_word_in_string / no_of_terms_in_document) * idf_dict[word]
        rows.append(idx)
        columns.append(col_index)
        values.append(tf_idf_values)
    output_matrix = csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocab)))
    output_norm_matrix = normalize(output_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output_norm_matrix
  else:
    print("Please Pass the list")


In [14]:
# TF-IDF matrix of the corpus from the custom implementation (sparse, rows L2-normalised).
csr_norm_matrix = transform(corpus, vocab, idf_dict)

In [15]:
# Shape of the normalised matrix returned by the custom transform function.
print(csr_norm_matrix.shape)

(4, 9)


### The sklearn and custom implementation matrices have the same shape, i.e. (4, 9)

In [16]:
# Non-zero TF-IDF entries of the first document from the custom implementation.
print(csr_norm_matrix[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


### For Reference

First row of the sklearn implementation's output matrix:

  (0, 8)	0.38408524091481483

  (0, 6)	0.38408524091481483

  (0, 3)	0.38408524091481483

  (0, 2)	0.5802858236844359
  
  (0, 1)	0.46979138557992045


# TASK-2

In [17]:
# Load the provided cleaned_strings pickle file.
# Here corpus is of list type.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only load pickles that come from a trusted source.
# NOTE: this rebinds `corpus`, replacing the four-document toy corpus used above.

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# Print the number of documents in the loaded corpus.
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In the `fit_2` function, the vocabulary is limited to the 50 words with the highest IDF values in the IDF dictionary.

Reference: taking the top N values in sorted order from a dictionary: https://www.geeksforgeeks.org/python-n-largest-values-in-dictionary/

In [18]:
import itertools 
from collections import OrderedDict
from operator import itemgetter
def fit_2(corpus, top_n=50):
  """Learn IDF values and a vocabulary limited to the highest-IDF words.

  Documents are tokenised by splitting on a single space, and tokens shorter
  than two characters are ignored.

  Parameters
  ----------
  corpus : list of str
      Documents to learn from.
  top_n : int, optional
      Number of highest-IDF words kept in the vocabulary (default 50).

  Returns
  -------
  (dict, dict) or None
      `vocab` maps each kept word to its column index (0 .. len(vocab) - 1,
      highest IDF first; ties keep alphabetical order because the sort is
      stable). `idf_dict` is the full result of `idf` for every unique word.
      When `corpus` is not a list, a message is printed and None is returned.
  """
  unique_words = set()
  if isinstance(corpus, (list,)):
    for row in corpus:
      for word in row.split(" "):
        if len(word) < 2:
          continue
        unique_words.add(word)
    unique_words = sorted(unique_words)
    idf_dict = idf(corpus, unique_words)
    top_items = sorted(idf_dict.items(), key = itemgetter(1), reverse = True)[:top_n]
    # The vocabulary values are used as column indices by the transform step,
    # so they must be consecutive integers, not the IDF floats themselves.
    vocab = {word: index for index, (word, _) in enumerate(top_items)}
    return vocab, idf_dict
  else:
    print('Please Pass the list')

In [19]:
# Rebuild vocab and idf_dict from the loaded corpus; fit_2 keeps only the
# 50 highest-IDF words in vocab.
vocab, idf_dict= fit_2(corpus)

In [20]:
# Number of words kept in the vocabulary.
print(len(vocab))

50


In [21]:
def transform_2(corpus, vocab, idf_dict):
  """Encode documents as an L2-normalised sparse TF-IDF matrix.

  Each document is tokenised by splitting on a single space. For every
  distinct token that is present in `vocab`, the stored value is
  (occurrences in the document / tokens in the document) * idf_dict[token].
  Tokens that are not in `vocab` are ignored.

  Parameters
  ----------
  corpus : list of str
      Documents to encode.
  vocab : dict
      Maps each vocabulary word to its column index.
  idf_dict : dict
      Maps each vocabulary word to its IDF value; it must contain every
      vocabulary word that occurs in `corpus`.

  Returns
  -------
  sparse matrix of shape (len(corpus), len(vocab)), or None
      TF-IDF values with each row scaled to unit L2 norm. When `corpus` is
      not a list, a message is printed and None is returned.
  """
  rows = []
  columns = []
  values = []
  if isinstance(corpus, (list,)):
    for idx, string in enumerate(corpus):
      words = string.split(" ")
      no_of_terms_in_document = len(words)
      # Iterate over DISTINCT words only. csr_matrix sums entries that share the
      # same (row, column), so emitting one entry per occurrence (each already
      # carrying the full count) would weight a word seen k times by k*k.
      for word, no_of_times_word_in_string in Counter(words).items():
        col_index = vocab.get(word, -1)
        if col_index == -1:
          continue  # word is not part of the vocabulary
        tf_idf_values = (no_of_times_word_in_string / no_of_terms_in_document) * idf_dict[word]
        rows.append(idx)
        columns.append(col_index)
        values.append(tf_idf_values)
    output_matrix = csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocab)))
    output_norm_matrix = normalize(output_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    return output_norm_matrix
  else:
    print("Please pass the list as argument")

In [22]:
# Encode the loaded corpus with the reduced vocabulary and print the
# resulting matrix shape: (documents, vocabulary words).
csr_norm_matrix = transform_2(corpus, vocab, idf_dict)
print(csr_norm_matrix.shape)

(746, 50)
