In [95]:
!pip install contractions
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import contractions
import numpy as np
from gensim.models import LsiModel
from gensim import models
from gensim import corpora
from gensim.similarities import MatrixSimilarity

# Fetch the NLTK resources this notebook relies on: the Brown corpus (source of
# the documents), WordNet (needed by WordNetLemmatizer) and the stop-word lists.
nltk.download('brown')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# Load file 'cn12' of the Brown corpus. Each sentence is a list of string tokens
# and is treated as one "document" for the retrieval experiment below.
sents = nltk.Text(brown.sents('cn12'))
print(sents[0])
print("No. of Documents: ",len(sents))

['When', 'several', 'minutes', 'had', 'passed', 'and', 'Curt', "hadn't", 'emerged', 'from', 'the', 'livery', 'stable', ',', 'Brenner', 'reentered', 'the', 'hotel', 'and', 'faced', 'Summers', 'across', 'the', 'counter', '.']
No. of Documents:  186


In [56]:
# Shared text-normalisation helpers used by the preprocessing cells below.
ps = nltk.PorterStemmer()  # Porter stemmer
lemmatizer = nltk.stem.WordNetLemmatizer()  # WordNet-based lemmatizer
stop_words = set(stopwords.words('english'))  # a set, for fast membership tests

In [57]:
# Expand contractions, lowercase, and remove stop words.
def removeContraction(sents_list):
  """Expand contractions in every token, lowercase it and drop stop words.

  Relies on the notebook-level `contractions` module and `stop_words` set.

  Args:
    sents_list: iterable of sentences, each an iterable of string tokens.

  Returns:
    A list of token lists (all lowercase), one per input sentence that still
    has at least one token after filtering; sentences that end up empty are
    omitted, so the result may be shorter than the input.
  """
  new_sents = []
  for s in sents_list:
    words = []
    for word in s:
      # An expanded contraction (e.g. "I'll" -> "I will") holds several words.
      # split() with no separator also discards empty / whitespace-only
      # pieces, so '' can never leak into the output (split(' ') kept them).
      for w in contractions.fix(word).split():
        lowered = w.lower()
        if lowered not in stop_words:
          words.append(lowered)
    if words:
      new_sents.append(words)

  return new_sents

In [58]:
# Expand contractions (e.g. I'll -> I will), lowercase and drop stop words.
sents = removeContraction(sents)

# Normalise every remaining token:
#   1. Porter stemming            (e.g. several -> sever)
#   2. verb lemmatization of the stem (e.g. running -> run)
#   3. keep only purely alphabetic results
# Sentences left without any token are discarded.
# NOTE(review): the lemmatizer is applied to the already-stemmed form, in that order.
modified_sents = []
for sent in sents:
  normalised = (lemmatizer.lemmatize(ps.stem(word), pos="v") for word in sent)
  words = [token for token in normalised if token.isalpha()]
  if words:
    modified_sents.append(words)

print(modified_sents[0])
print(len(modified_sents))

['sever', 'minut', 'pass', 'curt', 'emerg', 'liveri', 'stabl', 'brenner', 'reenter', 'hotel', 'face', 'summer', 'across', 'counter']
180


In [134]:
# Build the gensim Dictionary (the vocabulary) from the preprocessed documents;
# it is used below to convert token lists into bag-of-words vectors.
vocab = corpora.Dictionary(modified_sents)
print(vocab)

Dictionary(480 unique tokens: ['across', 'brenner', 'counter', 'curt', 'emerg']...)


In [160]:
# Bag-of-words (raw term frequency) vector for every document.
doc_term_matrix = list(map(vocab.doc2bow, modified_sents))

# Re-weight the term frequencies with TF-IDF.
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

# Latent Semantic Indexing: project the TF-IDF vectors (480-term vocabulary)
# down to a 50-topic space.
model = LsiModel(
    corpus_tfidf,
    id2word=vocab,
    num_topics=50,
)

In [161]:
# NOTE(review): the query is written with already-stemmed tokens ("messag", "dian"),
# presumably to match the stemmed vocabulary. It is only lowercased and
# whitespace-split here; it does not go through the stop-word removal, stemming
# or lemmatization applied to the corpus.
query = "give messag dian molinari"
query_vector = vocab.doc2bow(query.lower().split()) #converting query into a bag-of-words vector

In [162]:
# Project the bag-of-words query into the LSI topic space.
query_vector_ld = model[query_vector]

# Similarity index over the LSI representation of every document.
index = MatrixSimilarity(model[corpus_tfidf])

# Score each document against the query, then keep the ten best
# (document position, similarity) pairs, highest similarity first.
sim_doc = index[query_vector_ld]
sim_doc = sorted(enumerate(sim_doc), key=lambda pair: pair[1], reverse=True)[:10]

In [163]:
# Show the top-ranked documents: similarity score followed by the document's tokens.
for doc_id, score in sim_doc:
  print(score, modified_sents[doc_id])

0.8035782 ['want', 'take', 'messag', 'dian', 'molinari']
0.7777617 ['long', 'ride', 'four', 'take', 'must', 'give', 'good', 'appetit']
0.65213066 ['appar', 'sens', 'realiz', 'give', 'advantag', 'jess', 'becam', 'bold']
0.63398355 ['give', 'curt', 'time', 'stagger', 'feet']
0.4056505 ['find', 'weak', 'jess', 'weak', 'smart', 'enough', 'take', 'advantag']
0.4012061 ['lurch', 'drunkenli', 'feet', 'lower', 'head', 'take', 'one', 'step', 'away', 'wall']
0.34306452 ['somebodi', 'town', 'must', 'still', 'backbon']
0.33088678 ['want', 'brenner']
0.31547165 ['curt', 'want', 'get', 'jess', 'alon', 'without', 'interfer', 'anyon', 'even', 'spineless', 'person', 'store', 'owner']
0.27620968 ['want', 'hear', 'say']
