In [60]:
!pip install contractions
import nltk
from nltk.corpus import brown
import contractions
import numpy as np
import math
from scipy.spatial import distance

# Fetch the NLTK data used below: the Brown corpus (source sentences) and
# WordNet (used by the WordNetLemmatizer).
nltk.download('brown')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
#fetching the sentences of the Brown corpus file 'cn12'; each sentence
#(a list of tokens) is treated as one document in the rest of the notebook
sents = nltk.Text(brown.sents('cn12'))
print(sents[0])    # first sentence, as a list of tokens
print(len(sents))  # number of sentences (documents)

['When', 'several', 'minutes', 'had', 'passed', 'and', 'Curt', "hadn't", 'emerged', 'from', 'the', 'livery', 'stable', ',', 'Brenner', 'reentered', 'the', 'hotel', 'and', 'faced', 'Summers', 'across', 'the', 'counter', '.']
186


In [39]:
# Shared normalisers used by the pre-processing cell below.
ps = nltk.PorterStemmer()                       # Porter stemmer
lemmatizer = nltk.stem.WordNetLemmatizer()      # WordNet-based lemmatizer


In [40]:
def removeContraction(sents_list):
  """Expand contractions in tokenised sentences and lower-case every token.

  Each token is passed through ``contractions.fix``; the result is split on
  single spaces, so an expanded contraction contributes several tokens
  (e.g. "hadn't" -> "had", "not").

  Args:
    sents_list: iterable of sentences, each an iterable of string tokens.

  Returns:
    A new list of sentences, each a list of lower-cased tokens.
  """
  expanded_sents = []
  for sentence in sents_list:
    expanded_sents.append([
        piece.lower()
        for token in sentence
        for piece in contractions.fix(token).split(' ')
    ])
  return expanded_sents



In [41]:
#expanding contractions, e.g. I'll -> I will (removeContraction also lower-cases the tokens)
sents = removeContraction(sents)

#pre-processing: stem, then lemmatize as a verb, then keep alphabetic tokens only
modified_sents = []
for sentence in sents:
  # e.g. several -> sever (stemming), running -> run (lemmatization)
  normalised = [lemmatizer.lemmatize(ps.stem(token), pos="v") for token in sentence]
  modified_sents.append([token for token in normalised if token.isalpha()])

print(modified_sents[0])

['when', 'sever', 'minut', 'have', 'pass', 'and', 'curt', 'have', 'not', 'emerg', 'from', 'the', 'liveri', 'stabl', 'brenner', 'reenter', 'the', 'hotel', 'and', 'face', 'summer', 'across', 'the', 'counter']


In [42]:
#creating vocabulary 
#creating vocabulary: unique tokens in order of first appearance
# dict.fromkeys de-duplicates with a hash lookup per token and keeps insertion
# order, replacing the quadratic `word not in vocab` scan over a list.
vocab = list(dict.fromkeys(word for sent in modified_sents for word in sent))

print(len(vocab))

561


In [None]:
#creating map of word and position in vocab
# word -> column index, following the order of vocab
posmap = {term: position for position, term in enumerate(vocab)}

#print(posmap)

In [44]:
#building the term-count matrix: one row per sentence, one column per vocabulary word
mat = np.zeros((len(modified_sents), len(vocab)))
print(mat.shape)

for doc_idx, tokens in enumerate(modified_sents):
  for token in tokens:
    # every occurrence of a token adds 1 to its column in this sentence's row
    mat[doc_idx, posmap[token]] += 1


(186, 561)


In [45]:
#calculating document frequency: number of rows (sentences) with a non-zero entry per column
# cast to float so the array has the same dtype as one created with np.zeros
doc_freq = np.count_nonzero(mat, axis=0).astype(float)

In [46]:
#calculating inverse document frequency: idf = log10(N / df)
# np.log10 handles the whole vector at once, and a dedicated base-10 log is
# more accurate than math.log(x, 10), which divides two natural logarithms.
# df is never 0 here: every vocabulary word comes from at least one sentence.
idf = np.log10(len(sents) / doc_freq)
print(idf.shape)

(561,)


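Weighted TF-IDF: $w_{t,d} = (1 + \log_{10} \mathrm{tf}_{t,d}) \cdot \log_{10}(N / \mathrm{df}_t)$, where $\mathrm{tf}_{t,d}$ is the count of term $t$ in sentence $d$, $N$ is the number of sentences and $\mathrm{df}_t$ is the number of sentences containing $t$. Entries with $\mathrm{tf}_{t,d} = 0$ stay 0.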

In [47]:
#calculating weighted tf-idf for every non-zero count
# NOTE: mat is overwritten in place, so running this cell a second time would
# re-weight values that are already weighted; rebuild mat from the counts first.
for row, col in zip(*np.nonzero(mat)):
  mat[row, col] = (1 + math.log(mat[row, col], 10)) * idf[col]

In [59]:
#defining query (terms are written in the same stemmed, lower-case form as the vocabulary)
query = ['give','messag','to','dian','molinari']

#defining query vector: raw term counts over the vocabulary
query_vector = np.zeros(len(vocab))
for word in query:
  word_pos = posmap.get(word)
  if word_pos is None:
    # out-of-vocabulary terms have no column; skip them instead of raising KeyError
    continue
  query_vector[word_pos] += 1


In [None]:
#finding the row of mat with the minimum cosine distance to the query vector
# 1.1 is the starting threshold: only rows with a distance below it can be selected
min_distance, min_index = 1.1, 0
for index, doc_vector in enumerate(mat):
  d = distance.cosine(query_vector, doc_vector)
  if d < min_distance:
    min_distance, min_index = d, index


In [70]:
#printing the pre-processed sentence closest to the query, followed by the query itself
print(modified_sents[min_index])
print(query)

['i', 'just', 'want', 'you', 'to', 'take', 'a', 'messag', 'to', 'dian', 'molinari']
['give', 'messag', 'to', 'dian', 'molinari']
