In [51]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [52]:
from collections import defaultdict
from gensim import corpora

documents = [
    "This table illustrates the student’s feedback about the different aspects of McGill University over a period that spans ten years. Over the period of evaluation, the student’s service received a trending up-rating that started at 54% in 2005 and reached 95% by 2015, while the range of modules offered saw a declining rating that moved from 39% in 2005 down to 25% in 2015. In the meantime, sports and social facilities received a constant rating at 65%, and with almost the same trend the library resources received a rating that fluctuated around 87%. Finally, the teaching quality received 72% rating in 2010 down 2% from its rating in 2005, but by 2015 it went back up again with 78%. Overall, the aspects of services delivered by McGill University received different rating students with students services receiving the best up-word rating and range of modules offered received down-word rating, while the other aspects had a minor or no change in rating.",
    "The table given above illustrates the results that were obtained from a survey that was conducted in McGill University about different aspects of their services, during three different periods of time (2005, 2010 and 2015 respectively). It can be noticed that, in terms of teaching quality, sports and social facilities and library resources, the general satisfaction rate of individuals who had answered the survey had remained at a close constant between the years 2005 and 2015. For instance, when it comes to teaching quality, most individuals hold the belief that its quality has been the same since 2005 as it can be seen that it has only increased by 4% from 2005 and 2015. Subsequently, in terms of sport and social facilities, there has been no influx in results, not even to a small degree. In fact, the percentage has been at an astonishing constant in all three periods of time. (65%) On the other hand, however, aspects like student services have been on an overwhelmingly upward streak. This can be evidenced by the rise in percentage from 54% in 2005 to a mouth-gaping 95% in 2015. These dramatic changes can be seen elsewhere, but at a downward degree, when it comes to the range of modules offered. It can be noticed that the rating, when it comes to this particular aspect, has gone down by 14 points. To sum up, although, some services of McGill University have been on a downward spiral or a streak of consistency, the table depicts statistics that, for the most part, have been improving every 5 years.",
    "The chart illustrates percentages that have been taken from students from McGill University in three different years. Overall, library resources had the highest percentage of good rates in 2005 and 2010. However, student services overtook library resources in 2015 due to its significant increase over the given years. An analysis of the data shows that student services with 95% had the largest good rates among all other aspects overtaking both library resources and teaching quality that were on top on scale in 2005 with 86% and 74% respectively. Student services percentages rocketed up over the years, transferring from the second-lowest aspect with 54% in 2005 after the range of modules offered with a low of 39%, to be the top one in 2015, jumping approximately 41%. Along with teaching quality and library resources that also had a slight growth standing for 4% and 1% respectively. On the contrary of the range of modules offered in the university plunging from 39% to 25%. Student services at McGill University experienced a dramatic improvement from 2005 to 2015."
]

# remove common words and tokenize
stop_list = set('for a of the and to in'.split())

texts = [
    [word for word in document.lower().split() if word not in stop_list]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

2021-02-20 14:02:32,644 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-20 14:02:32,647 : INFO : built Dictionary(86 unique tokens: ['2005', '2010', '2015', '2015,', '2015.']...) from 3 documents (total 322 corpus positions)


In [53]:
from gensim import models

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

2021-02-20 14:02:32,996 : INFO : using serial LSI version on this node
2021-02-20 14:02:32,998 : INFO : updating model with new documents
2021-02-20 14:02:33,001 : INFO : preparing a new chunk of documents
2021-02-20 14:02:33,002 : INFO : using 100 extra samples and 2 power iterations
2021-02-20 14:02:33,004 : INFO : 1st phase: constructing (86, 102) action matrix
2021-02-20 14:02:33,008 : INFO : orthonormalizing (86, 102) action matrix
2021-02-20 14:02:33,014 : INFO : 2nd phase: running dense svd on (86, 3) matrix
2021-02-20 14:02:33,015 : INFO : computing the final decomposition
2021-02-20 14:02:33,017 : INFO : keeping 2 factors (discarding 13.302% of energy spectrum)
2021-02-20 14:02:33,019 : INFO : processed documents up to #3
2021-02-20 14:02:33,022 : INFO : topic #0(23.783): -0.315*"that" + -0.247*"2005" + -0.238*"from" + -0.237*"it" + -0.209*"been" + -0.185*"services" + -0.178*"be" + -0.163*"university" + -0.162*"with" + -0.158*"on"
2021-02-20 14:02:33,025 : INFO : topic #1(13.7

In [54]:
doc = "Overall, the most striking set of statistics relate to approval for student services. There was a sharp increase in the number of students giving these services a good rating, particularly in the first five years: from 54 percent in 2005, to 81 percent in 2010, and 95 percent in 2015. There was also an overall improvement in ratings for teaching quality, though the increase was relatively small (74 percent in 2005 rising to 78 percent in 2015) and there was a decline in the interim (72 percent in 2010). There was also a fluctuation in attitudes to library resources, rising from 86 percent to 88 percent in the first five years and then falling by one percent in 2015. Good ratings for the university’s sports and social facilities were identical throughout, at 65 percent. Finally, there were poor ratings at the beginning of the period for the range of modules offered (39 percent in 2005) and they got worse, falling steadily to 31 percent in 2010 and 25 percent in 2015."
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)

[(0, -3.247795517929719), (1, 0.7672402228696898)]


In [55]:
from gensim import similarities

index = similarities.MatrixSimilarity(lsi[corpus])

2021-02-20 14:02:33,799 : INFO : creating matrix with 3 documents and 2 features


In [56]:
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

2021-02-20 14:02:34,227 : INFO : saving MatrixSimilarity object under /tmp/deerwester.index, separately None


FileNotFoundError: [Errno 2] No such file or directory: '/tmp/deerwester.index'

In [57]:
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 0.9182052), (1, 0.7571627), (2, 0.9602063)]


In [58]:
sims = sorted(enumerate(sims), key=lambda item:-item[1])

for doc_position, doc_score in sims:
    print(doc_score, documents[doc_position])

0.9602063 The chart illustrates percentages that have been taken from students from McGill University in three different years. Overall, library resources had the highest percentage of good rates in 2005 and 2010. However, student services overtook library resources in 2015 due to its significant increase over the given years. An analysis of the data shows that student services with 95% had the largest good rates among all other aspects overtaking both library resources and teaching quality that were on top on scale in 2005 with 86% and 74% respectively. Student services percentages rocketed up over the years, transferring from the second-lowest aspect with 54% in 2005 after the range of modules offered with a low of 39%, to be the top one in 2015, jumping approximately 41%. Along with teaching quality and library resources that also had a slight growth standing for 4% and 1% respectively. On the contrary of the range of modules offered in the university plunging from 39% to 25%. Stude