<a href="https://colab.research.google.com/github/Ramaseshanr/anlp/blob/master/ScottDeerwesterLSI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Set up the corpus**

In [17]:
from gensim import corpora
documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
print(documents)


['Human machine interface for lab abc computer applications', 'A survey of user opinion of computer system response time', 'The EPS user interface management system', 'System and human system engineering testing of EPS', 'Relation of user perceived response time to error measurement', 'The generation of random binary unordered trees', 'The intersection graph of paths in trees', 'Graph minors IV Widths of trees and well quasi ordering', 'Graph minors A survey']


**2. Tokenize the documents**

In [18]:
#Tokeninze
from pprint import pprint
stoplist = set('for a of the and to in'.split())
bow = [
     [word for word in document.lower().split() if word not in stoplist]
     for document in documents
]
pprint(bow)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [0]:
texts = [
  [word for word in document.lower().split() if word not in stoplist]
    for document in documents]

**Save the dictionary for persistence**

In [20]:
from gensim import corpora
dictionary = corpora.Dictionary(texts)
dictionary.save('/home/deerwester.dict')  # store the dictionary, for future reference
pprint(dictionary.token2id)


{'abc': 0,
 'applications': 1,
 'binary': 21,
 'computer': 2,
 'engineering': 15,
 'eps': 13,
 'error': 17,
 'generation': 22,
 'graph': 26,
 'human': 3,
 'interface': 4,
 'intersection': 27,
 'iv': 29,
 'lab': 5,
 'machine': 6,
 'management': 14,
 'measurement': 18,
 'minors': 30,
 'opinion': 7,
 'ordering': 31,
 'paths': 28,
 'perceived': 19,
 'quasi': 32,
 'random': 23,
 'relation': 20,
 'response': 8,
 'survey': 9,
 'system': 10,
 'testing': 16,
 'time': 11,
 'trees': 24,
 'unordered': 25,
 'user': 12,
 'well': 33,
 'widths': 34}


**Create TFIDF for term-Document Matrix**

In [21]:
from gensim import models
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
print(tfidf)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
  print(doc)

TfidfModel(num_docs=9, num_nnz=51)
[(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.2944198962221451), (3, 0.2944198962221451), (4, 0.2944198962221451), (5, 0.4301019571350565), (6, 0.4301019571350565)]
[(2, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.3726494271826947), (9, 0.3726494271826947), (10, 0.27219160459794917), (11, 0.3726494271826947), (12, 0.27219160459794917)]
[(4, 0.438482464916089), (10, 0.32027755044706185), (12, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(3, 0.3449874408519962), (10, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(8, 0.30055933182961736), (11, 0.30055933182961736), (12, 0.21953536176370683), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.24253562503633297), (25, 0.48507125007266594)]
[(24, 0.316227766016837

In [0]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

In [23]:
corpus_tfidf = tfidf[corpus]
corpus_lsi = lsi[corpus_tfidf]
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
  print(doc)

[(0, 0.3405711798684177), (1, -0.20602251622679751)]
[(0, 0.6933040002171562), (1, 0.0072327583903847095)]
[(0, 0.5902607670389705), (1, -0.3526046949085597)]
[(0, 0.5214901821825111), (1, -0.3388797615405557)]
[(0, 0.3953319317635446), (1, -0.05919285336660332)]
[(0, 0.036353173528494105), (1, 0.18146550208818904)]
[(0, 0.1470901232877918), (1, 0.49432948127822274)]
[(0, 0.21407117317565655), (1, 0.6406456664453936)]
[(0, 0.40066568318171086), (1, 0.6413108299093987)]


In [24]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(5)

[(0,
  '0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + 0.236*"management" + 0.236*"opinion" + 0.235*"time" + 0.235*"response" + 0.224*"interface" + 0.224*"computer"'),
 (1,
  '0.421*"minors" + 0.420*"graph" + 0.293*"survey" + 0.239*"trees" + 0.226*"paths" + 0.226*"intersection" + -0.204*"system" + -0.196*"eps" + 0.189*"quasi" + 0.189*"ordering"'),
 (2,
  '-0.318*"time" + -0.318*"response" + -0.261*"measurement" + -0.261*"relation" + -0.261*"error" + -0.261*"perceived" + 0.248*"eps" + -0.203*"opinion" + 0.195*"human" + 0.190*"engineering"'),
 (3,
  '0.416*"generation" + 0.416*"random" + 0.416*"unordered" + 0.416*"binary" + 0.256*"trees" + -0.225*"minors" + -0.177*"survey" + 0.161*"intersection" + 0.161*"paths" + 0.119*"perceived"'),
 (4,
  '0.398*"lab" + 0.398*"machine" + 0.398*"applications" + 0.398*"abc" + 0.301*"computer" + -0.242*"system" + -0.237*"eps" + -0.180*"testing" + -0.180*"engineering" + -0.166*"management"')]

In [25]:
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
  print(doc)

[(0, 0.3405711798684186), (1, -0.20602251622679582), (2, 0.25163229326121866), (3, -0.06646636031661206), (4, 0.8634175502633779)]
[(0, 0.6933040002171568), (1, 0.007232758390389324), (2, -0.42828031011610407), (3, 0.009221456247178576), (4, 0.07245540476224807)]
[(0, 0.5902607670389728), (1, -0.35260469490855734), (2, 0.30883209258107525), (3, 0.015857166285259898), (4, -0.24165669441507642)]
[(0, 0.521490182182514), (1, -0.33887976154055405), (2, 0.4328304015902543), (3, -0.03897722688933945), (4, -0.33361514099931716)]
[(0, 0.39533193176354525), (1, -0.05919285336659884), (2, -0.6817088379096496), (3, 0.2755580402790444), (4, -0.053930814779539246)]
[(0, 0.03635317352849356), (1, 0.18146550208818948), (2, 0.20409484571950304), (3, 0.8684447611727191), (4, 0.08100886500517024)]
[(0, 0.14709012328778936), (1, 0.4943294812782232), (2, 0.2520741552399367), (3, 0.25758865011153953), (4, -0.04099410336036623)]
[(0, 0.21407117317565316), (1, 0.640645666445394), (2, 0.212543956272626), (3, 

Query the corpus

In [27]:
from gensim import similarities
doc = "relation  emgineering"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi] 
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for index, similarity in sims:
  print(similarity,documents[index])


0.98137224 Relation of user perceived response time to error measurement
0.71539164 A survey of user opinion of computer system response time
0.1540112 The generation of random binary unordered trees
0.04207113 The EPS user interface management system
0.013246909 Graph minors A survey
-0.13690966 The intersection graph of paths in trees
-0.14376822 System and human system engineering testing of EPS
-0.17135456 Human machine interface for lab abc computer applications
-0.24740018 Graph minors IV Widths of trees and well quasi ordering


  if np.issubdtype(vec.dtype, np.int):
