# Core Concepts

In [0]:
from pprint import pprint
from collections import defaultdict
from gensim import corpora, models, similarities

#### **Document**

In [0]:
# A document is a single piece of text, held as a plain Python string (str).
document = "Human machine interface for lab abc computer applications"

#### **Corpus**

In [0]:
# A corpus is a collection of documents; here it is a list of nine short titles (str).
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [4]:
# Very common words that should be ignored when tokenising
stopwords = set('for a of the and to in'.split())

# Tokenise each document: lowercase it, split on whitespace, and discard stopwords
texts = [
    [token for token in raw_document.lower().split() if token not in stopwords]
    for raw_document in text_corpus
]
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [5]:
# Tally how often each token occurs across the whole corpus
frequency = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once in the corpus
processed_corpus = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [6]:
# Build a gensim Dictionary from the filtered corpus; it maps each distinct token to an integer ID
dictionary = corpora.Dictionary(processed_corpus)
# Printing the dictionary shows a short summary: the number of unique tokens and a few samples
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


#### **Vector**

In [7]:
# Show the token -> integer ID mapping held by the dictionary
pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [8]:
# Vectorise a new document that is not part of the corpus
new_doc = "Human computer interaction"
# doc2bow yields (token ID, count) pairs; the printed result has no entry for
# "interaction", a word that does not appear in the dictionary's token2id mapping
new_vec = dictionary.doc2bow(new_doc.lower().split())
pprint(new_vec)

[(0, 1), (1, 1)]


In [9]:
# Bag-of-words representation: convert every document of the processed corpus
# into a list of (token ID, count) pairs
bow_corpus = [ dictionary.doc2bow(text) for text in processed_corpus ]
pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


#### **Model**

> *tf-idf*: The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus.



In [10]:
# Fit a tf-idf model on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)

# Vectorise the string "system minors" and show it before and after tf-idf weighting
words = "system minors".lower().split()
words_bow = dictionary.doc2bow(words)
print(words_bow)
print(tfidf[words_bow])

[(5, 1), (11, 1)]
[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [0]:
# Build a similarity index over the tf-idf-weighted corpus.
# num_features is taken from the dictionary size instead of a hard-coded 12, so the
# index stays consistent if the corpus or the token filtering above is changed.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [0]:
# Create a query document, tokenised the same way as the corpus (lowercased, then
# split on whitespace) so that its tokens can match the lowercase dictionary entries
query_document = "system engineering".lower().split()
query_bow = dictionary.doc2bow(query_document)

# Query the similarity of the tf-idf-weighted query against every document in the index
sims = index[tfidf[query_bow]]

In [13]:
# List the corpus documents from most to least similar, with each score shown as a percentage
for document_number, score in sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True):
    print(document_number, round(score * 100, 2))

3 71.85
2 41.71
1 32.45
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


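Document 3 has a similarity score of about 72%, document 2 about 42%, and document 1 about 32%. The remaining documents score 0% because they do not contain the word "system", the only query term present in the dictionary. Document numbers are zero-based positions in the corpus.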