In [1]:
!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

Traceback (most recent call last):
  File "/usr/local/bin/jupyter-notebook", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python2.7/dist-packages/jupyter_core/application.py", line 267, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 657, in launch_instance
    app.initialize(argv)
  File "</usr/local/lib/python2.7/dist-packages/decorator.pyc:decorator-gen-7>", line 2, in initialize
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 87, in catch_config_error
    return method(app, *args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/notebook/notebookapp.py", line 1368, in initialize
    self.init_webapp()
  File "/usr/local/lib/python2.7/dist-packages/notebook/notebookapp.py", line 1188, in init_webapp
    self.http_server.listen(port, self.ip)
  File "/usr/local/lib/python2.7/dist-packages/torna

### Corpus

In [2]:
## SkLearn# Collection of string documents

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [4]:
# sklearn feature names, they are sorted in alphabetic order by default.

print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']




In [5]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [6]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [7]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [8]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [9]:
# Write your code here.
# Make sure its well documented and readble with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy 

In [10]:
# Finding Feature names of the vector 

feature_names = []
for sentence in corpus:
  for word in sentence.split():
    if len(word)>=2:
      feature_names.append(word)

feature_names = list(sorted(list(set(feature_names))))


# Function to return the frequency of a word in a sentence. 
def freq_tf(word, sentence):
  freq = 0
  for i in sentence.split():
    if i==word:
      freq+=1
  return freq

# Function to return the Term frequency value 
def tf(corpus, feature_names):
  tf = []
  for sentence in corpus:
    tf_vector = []
    denominator = len(list(sentence.split()))
    for i in feature_names:
# TF = No of occurences of a word in a document/ Total no of words in the document.
      tf_vector.append(freq_tf(i,sentence)/denominator)
    tf.append(tf_vector)
  return tf

# function to return a dictionary containing the keys as the features of the vector and their corresponding value pairs are the IDF value. 
def idf(corpus, feature_names):
  idf = {}
  No_docs = len(corpus)
  for word in feature_names:
    count = 0
    for sentence in corpus:
      if word in sentence.split():
        count+=1
      idf[word] = 1+math.log((1+No_docs)/(1+count))

  return idf

# Function to return vocabulary(dictionary in which keys are feature names and values are the postion of the feature name in the vector.) and IDF values in a dictionary. 
def fit(corpus,feature_names):
  vocab = {j:i for i,j in enumerate(feature_names)}
   
  idf_dict = idf(corpus, feature_names)

  return vocab, idf_dict


# Returns a normalized sparse matrix containing the TFIDF values
def transform(corpus, vocab, idf_dict, tf):
  
  sparse_matrix = csr_matrix((len(corpus), len(vocab)), dtype= numpy.float64)
  for i in range(0,len(corpus)):
    for j in corpus[i].split():
      if j in vocab.keys():
        sparse_matrix[i, vocab[j]] = tf[i][vocab[j]] * idf_dict[j]
  print("Normalized Spare Matrix\n", normalize(sparse_matrix, norm = 'l2', axis = 1, copy = True, return_norm = False))
  output = normalize(sparse_matrix, norm = 'l2', axis = 1, copy = True, return_norm = False)
  return output


  
  

vocab, idf_dict = fit(corpus, feature_names)
idf_l = idf(corpus, feature_names)
tf_l = tf(corpus, feature_names)

print(vocab, idf_dict)

transformed = transform(corpus, vocab, idf_dict, tf_l )

print(transformed.shape)






{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8} {'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}
Normalized Spare Matrix
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149
(4, 9)


  self._set_intXint(row, col, x.flat[0])


# Compare with TfidfVectorizer


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()

tfidf_vector = tf_idf_vectorizer.fit_transform(corpus)
print(tfidf_vector)

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


## Observations:-
The sparse matrices obtained from our custom fuction and tfidfvectorizer's fit_transform method are the same.

In [12]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type

import pickle

from google.colab import drive
import os

# File handling for a pickle file from a google colab i-python notebook
drive.mount('/content/drive')
PATH = '/content/drive/My Drive/Assignment 3 tfidf/3_CountVectorizer'

file_name = 'cleaned_strings'
path_to_embed = os.path.join(PATH,file_name)

corpus = pickle.load(open(path_to_embed, "rb"))

#with open('cleaned_strings', 'rb') as f:
#    corpus = pickle.load(f)
    
#printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Mounted at /content/drive
Number of documents in corpus =  746


In [13]:
# Write your code here.
# Try not to hardcode any values.
# Make sure its well documented and readble with appropriate comments.

In [14]:
print(type(corpus))

<class 'list'>


In [20]:
#Finding the feature names of the tfidf vector. 
feature_names = []
for sentence in corpus:
  for word in sentence.split():
    if len(word)>=2:
      feature_names.append(word)

feature_names = list(sorted(list(set(feature_names))))




print(feature_names)




In [21]:
#Function to return the vocabulary and idf values of the top 50 feature names. 
def fit_top_50(corpus, feature_names):
    vocab , idf_dict = fit(corpus, feature_names)

    count = Counter(idf_dict)
    top_50 = count.most_common(50)
    top_50 = dict(top_50)
    
    vocab_50 = {j:i for i,j in enumerate(top_50.keys())}

    return vocab_50, top_50 



        
    
 



In [22]:
vocab_50 , idf_top_50 = fit_top_50(corpus, feature_names)

tf_l3 = tf(corpus, vocab_50.keys())




In [18]:
transformed_50 = transform(corpus, vocab_50, idf_top_50, tf_l3)

print(transformed_50.shape)

Normalized Spare Matrix
   (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.37796447300922725
  (135, 10)	0.37796447300922725
  (135, 18)	0.37796447300922725
  (135, 20)	0.37796447300922725
  (135, 36)	0.37796447300922725
  (135, 40)	0.37796447300922725
  (135, 41)	0.37796447300922725
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865475
  (548, 32)	0.7071067811865475
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865475
  (644, 27)	0.7071067811865475
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0
(746, 50)


  self._set_intXint(row, col, x.flat[0])


# Compare with TfidfVectorizer()

In [19]:
"""from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()"""

tfidf_vector = tf_idf_vectorizer.fit_transform(corpus)
print(tfidf_vector[:50])

  (0, 1545)	0.3056602689480387
  (0, 2878)	0.3578114562231773
  (0, 720)	0.41239438707788106
  (0, 688)	0.41239438707788106
  (0, 1651)	0.1619231790584802
  (0, 53)	0.41239438707788106
  (0, 1653)	0.3578114562231773
  (0, 2287)	0.33776799164675547
  (1, 2764)	0.3766212179754885
  (1, 1132)	0.3276587847617171
  (1, 1676)	0.4000516539584004
  (1, 149)	0.3365666231014132
  (1, 374)	0.23983327446209926
  (1, 966)	0.3766212179754885
  (1, 1511)	0.34710235966502895
  (1, 2446)	0.3599970590843251
  (1, 1712)	0.16744040526541595
  (2, 853)	0.28282863381171675
  (2, 1704)	0.2453945264455564
  (2, 64)	0.21707671739759152
  (2, 1482)	0.22619301571578695
  (2, 1873)	0.17804028847989228
  (2, 1889)	0.23164830224699004
  (2, 20)	0.162536633636425
  (2, 2085)	0.2379459683681101
  :	:
  (47, 172)	0.4195903237195789
  (47, 291)	0.45527382447788844
  (47, 2829)	0.4351779190007822
  (47, 1222)	0.4195903237195789
  (47, 534)	0.3071637415418937
  (47, 1132)	0.39608620264180894
  (48, 253)	0.528284001758813

## Observations:-
The values of our own implimentation of the tfidf vectorizer yieds different results from fit_transform() method from the tfidfvectorizer() 