In [1]:
import gensim

In [2]:
# Toy corpus: five short English texts, one string per document
raw_documents = [
    "I love tacos.",
    "She ran with the chicken.",
    "I don't choose to take a nap. The nap chooses me.",
    "That man is nice as pie with ice cream.",
    "This pizza is an affront to nature.",
]

In [3]:
# We are going to need to tokenize, so let's use NLTK
from nltk.tokenize import word_tokenize

In [4]:
def get_tokens(text):
    """Split a raw text string into a list of tokens.

    Thin wrapper around NLTK's ``word_tokenize``: the token list is returned
    exactly as the tokenizer produced it (no lower-casing, no filtering), so
    punctuation marks come back as tokens of their own.
    """
    return word_tokenize(text)

In [5]:
# A Gensim document is a list of tokens, so tokenize every raw document.
# Case is left untouched here; the tokens could optionally be lower-cased.
gen_docs = list(map(get_tokens, raw_documents))
print(gen_docs)

[['I', 'love', 'tacos', '.'], ['She', 'ran', 'with', 'the', 'chicken', '.'], ['I', 'do', "n't", 'choose', 'to', 'take', 'a', 'nap', '.', 'The', 'nap', 'chooses', 'me', '.'], ['That', 'man', 'is', 'nice', 'as', 'pie', 'with', 'ice', 'cream', '.'], ['This', 'pizza', 'is', 'an', 'affront', 'to', 'nature', '.']]


In [6]:
# Create a dictionary from the list of tokenized documents
# A dictionary maps every unique token (word or punctuation mark) to an integer id
# Tokens are case-sensitive: 'the' and 'The' get separate ids
dictionary = gensim.corpora.Dictionary(gen_docs)
num_words = len(dictionary)
print("Num words in dictionary: {}".format(num_words))
# List every (id, token) pair held by the dictionary
for idx, word in dictionary.items():
    print(idx, word)

Num words in dictionary: 32
0 .
1 I
2 love
3 tacos
4 She
5 chicken
6 ran
7 the
8 with
9 The
10 a
11 choose
12 chooses
13 do
14 me
15 n't
16 nap
17 take
18 to
19 That
20 as
21 cream
22 ice
23 is
24 man
25 nice
26 pie
27 This
28 affront
29 an
30 nature
31 pizza


In [7]:
# Look up the token for id 7 in two ways: by indexing the dictionary itself
# and through its id2token mapping; both print the same token
# NOTE(review): id2token looks like it is filled lazily by gensim, so the
# direct access below may rely on the dictionary having been indexed first - confirm
print(dictionary[7])
print(dictionary.id2token[7])

the
the


In [8]:
# Reverse lookup: token2id maps a token string to its integer id
print(dictionary.token2id['ran'])

6


In [9]:
# Create bag of words
# A bag of words is a list of (token id, count) pairs; the counts are the
# term frequency (tf) part of tf-idf
# Called a "bag of words" because order is lost
# Note that "!" is not in the dictionary, so it is dropped from the result
bow_doc = dictionary.doc2bow(['I', 'love', 'love', 'love', 'tacos', '!'])
print(bow_doc)

[(1, 1), (2, 3), (3, 1)]


In [10]:
# Build the corpus: one bag-of-words vector per tokenized document,
# in the same order as gen_docs
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 2), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1)], [(0, 1), (8, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(0, 1), (18, 1), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]


In [11]:
# Create tf-idf model from corpus
# num_docs is the number of documents in the corpus
# num_nnz is the number of non-zero entries in the bag-of-words corpus,
# i.e. the (token id, count) pairs summed over all documents
# (4 + 6 + 12 + 10 + 8 = 40 here), not the total number of tokens
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

TfidfModel(num_docs=5, num_nnz=40)


In [12]:
# Show document in text form, bag of words, and tf-idf
# Ids: 0 is '.', 1 is 'I', 2 is 'love', 3 is 'tacos'
# Value for 'I' is lower because it occurs in two of the five documents,
# while 'love' and 'tacos' each occur in only one.
# Value for '.' is 0 because it occurs in all documents and log_2(5/5) = 0,
# so it is left out of the tf-idf vector entirely.
# Vectors are normalized to unit length (the squares of the weights sum to 1)
print(gen_docs[0])
print(corpus[0])
# Transform just this one document, the same way single bags of words are
# transformed elsewhere in the notebook, instead of indexing a transformed corpus
print(tf_idf[corpus[0]])

['I', 'love', 'tacos', '.']
[(0, 1), (1, 1), (2, 1), (3, 1)]
[(1, 0.37344696513776354), (2, 0.6559486886294514), (3, 0.6559486886294514)]


In [13]:
# Show bag of words and tf-idf for new document
# Note it is similar to the first corpus document: 'I' and 'love' get the
# same weights, and 'pizza' (id 31) takes the place of 'tacos' (id 3)
bow = dictionary.doc2bow(['I', 'love', 'pizza', '.'])
print(bow)
print(tf_idf[bow])

[(0, 1), (1, 1), (2, 1), (31, 1)]
[(1, 0.37344696513776354), (2, 0.6559486886294514), (31, 0.6559486886294514)]


In [14]:
# Create similarity measure object in tf-idf space
# First arg is the path prefix under which the index is stored on disk
# https://radimrehurek.com/gensim/similarities/docsim.html
import os
# Make sure the target directory exists; exist_ok keeps re-runs from failing
os.makedirs('output', exist_ok=True)

output_obj = os.path.join('output', 'similarity')
# Index the tf-idf version of the corpus; num_features is set to the dictionary size
sims = gensim.similarities.Similarity(output_obj, tf_idf[corpus],
                                     num_features=len(dictionary))
print(sims)

Similarity index with 5 documents in 0 shards (stored under output/similarity)


In [15]:
# Create query document and convert to tf-idf
# Tokenize the query with the same helper that tokenized the corpus so its
# tokens line up with the dictionary entries (a plain str.split() would keep
# punctuation glued to words and miss them)
query_doc = get_tokens("chicken with taco love")
print(query_doc)
# 'taco' is not in the dictionary (only 'tacos' is), so doc2bow drops it
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)
# Weight the query with the tf-idf model built from the corpus
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

['chicken', 'with', 'taco', 'love']
[(2, 1), (5, 1), (8, 1)]
[(2, 0.6559486886294514), (5, 0.6559486886294514), (8, 0.37344696513776354)]


In [16]:
# Similarity score of the query against each of the five indexed documents,
# in corpus order; documents sharing no weighted token with the query score 0
sims[query_doc_tf_idf]

array([0.4302687 , 0.41768694, 0.        , 0.07687882, 0.        ],
      dtype=float32)