# Gensim Tutorial – A Complete Beginners Guide
https://www.machinelearningplus.com/nlp/gensim-tutorial/

## How to create a dictionary from a list of sentences?

In [None]:
import gensim
from gensim import corpora
from pprint import pprint

documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

# Tokenize(split) the sentences into words
texts = [[text for text in doc.split()] for doc in documents]

# Create dictionary
dictionary = corpora.Dictionary(texts)

# Get information about the dictionary
print(dictionary)

In [None]:
print(dictionary.token2id)

In [None]:
# Add new document into an existing dictionary
texts_2 = [[text for text in doc.split()] for doc in documents_2]

dictionary.add_documents(texts_2)

print(dictionary)

print(dictionary.token2id)

## Create gensim dictionary form a single file

In [17]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

dictionary = corpora.Dictionary(simple_preprocess(line, deacc=True) for line in open('sample.txt', encoding='utf-8'))

# Token to Id map
dictionary.token2id

2019-03-19 20:14:56,935 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-19 20:14:56,938 : INFO : built Dictionary(93 unique tokens: ['enjoy', 'capabilities', 'to', 'and', 'technology']...) from 11 documents (total 158 corpus positions)


{'according': 35,
 'and': 22,
 'appointment': 23,
 'army': 0,
 'as': 43,
 'at': 24,
 'better': 85,
 'by': 36,
 'capabilities': 86,
 'ceremony': 25,
 'china': 1,
 'chinese': 2,
 'civilian': 75,
 'combat': 87,
 'companies': 14,
 'conduct': 48,
 'contribute': 88,
 'could': 89,
 'counterparts': 66,
 'daily': 15,
 'deepening': 76,
 'defense': 37,
 'design': 49,
 'development': 77,
 'enhancement': 90,
 'enjoy': 67,
 'experts': 26,
 'fellow': 44,
 'fields': 50,
 'firms': 68,
 'five': 58,
 'for': 59,
 'force': 3,
 'founding': 27,
 'from': 16,
 'hao': 28,
 'his': 45,
 'honored': 46,
 'in': 78,
 'innovation': 91,
 'integration': 79,
 'into': 51,
 'launching': 60,
 'letters': 29,
 'liberation': 4,
 'like': 52,
 'make': 92,
 'marks': 80,
 'members': 53,
 'military': 81,
 'missile': 61,
 'missiles': 62,
 'national': 38,
 'network': 63,
 'new': 82,
 'of': 5,
 'on': 17,
 'other': 30,
 'overall': 54,
 'owned': 69,
 'panel': 39,
 'people': 6,
 'pla': 18,
 'private': 19,
 'published': 40,
 'received': 3

In [18]:
# Create gensim dictionary form a multiple files
class ReadTxtFiles(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname), encoding='latin'):
                yield simple_preprocess(line)

path_to_text_directory = "lsa_sports_food_docs"

dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))

# Token to Id map
dictionary.token2id

2019-03-19 20:14:58,195 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-19 20:14:58,321 : INFO : built Dictionary(525 unique tokens: ['invented', 'pan', 'commercially', 'prepared', 'once']...) from 19 documents (total 1204 corpus positions)


{'accompanying': 355,
 'according': 42,
 'achaya': 43,
 'across': 158,
 'activity': 159,
 'ad': 44,
 'added': 356,
 'advances': 509,
 'advantage': 281,
 'after': 401,
 'ago': 357,
 'aid': 470,
 'all': 0,
 'allow': 282,
 'along': 1,
 'already': 45,
 'also': 79,
 'alters': 283,
 'although': 160,
 'america': 254,
 'amongst': 145,
 'an': 203,
 'ancient': 46,
 'and': 2,
 'another': 146,
 'any': 402,
 'are': 3,
 'areas': 204,
 'around': 47,
 'as': 4,
 'association': 80,
 'associazione': 235,
 'at': 284,
 'attempts': 403,
 'available': 471,
 'back': 285,
 'badminton': 161,
 'baked': 205,
 'baking': 447,
 'ball': 286,
 'baseball': 504,
 'bases': 510,
 'bat': 404,
 'batsmen': 405,
 'batter': 5,
 'batting': 406,
 'be': 6,
 'beach': 162,
 'beans': 7,
 'became': 81,
 'because': 82,
 'become': 206,
 'been': 358,
 'between': 407,
 'birthplace': 48,
 'black': 111,
 'body': 112,
 'boiling': 359,
 'both': 485,
 'bounce': 287,
 'bounces': 288,
 'breakfast': 113,
 'breaks': 114,
 'broad': 448,
 'but': 32

## How to create a bag of words corpus in gensim?

In [None]:
# List with 2 sentences
my_docs = ["Who let the dogs out?",
           "Who? Who? Who? Who?"]

# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in my_docs]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
pprint(mycorpus)

In [None]:
word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]
pprint(word_counts)

## How to create a bag of words corpus from a text file?

In [None]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import nltk
nltk.download('stopwords')  # run once
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

class BoWCorpus(object):
    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary

    def __iter__(self):
        global mydict  # OPTIONAL, only if updating the source dictionary.
        for line in smart_open(self.filepath, encoding='latin'):
            # tokenize
            tokenized_list = simple_preprocess(line, deacc=True)

            # create bag of words
            bow = self.dictionary.doc2bow(tokenized_list, allow_update=True)

            # update the source dictionary (OPTIONAL)
            mydict.merge_with(self.dictionary)

            # lazy return the BoW
            yield bow


# Create the Dictionary
mydict = corpora.Dictionary()

# Create the Corpus
bow_corpus = BoWCorpus('sample.txt', dictionary=mydict)  # memory friendly

# Print the token_id and count for each line.
for line in bow_corpus:
    print(line)

## How to save a gensim dictionary and corpus to disk and load them back?

In [None]:
# Save the Dict and Corpus
mydict.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

In [None]:
# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')

corpus = corpora.MmCorpus('bow_corpus.mm')
for line in corpus:
    print(line)

## How to create the TFIDF matrix (corpus) in gensim?

In [None]:
from gensim import models
import numpy as np

documents = ["This is the first line",
             "This is the second sentence",
             "This third document"]

# Create the Dictionary and Corpus
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]

# Show the Word Weights in Corpus
for doc in corpus:
    print([[mydict[id], freq] for id, freq in doc])

In [None]:
# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
for doc in tfidf[corpus]:
    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

## How to use gensim downloader API to load datasets?

In [None]:
import gensim.downloader as api

# Get information about the model or dataset
api.info('glove-wiki-gigaword-50')

In [None]:
# Download
w2v_model = api.load("glove-wiki-gigaword-50")
w2v_model.most_similar('blue')

## How to create bigrams and trigrams using Phraser models?

In [None]:
dataset = api.load("text8")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Build the bigram models
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Construct bigram
print(bigram[dataset[0]])

In [None]:
# Build the trigram models
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Construct trigram
print(trigram[bigram[dataset[0]]])

## How to create Topic Models with LDA?

In [4]:
# Step 0: Import packages and stopwords
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
from nltk.corpus import stopwords
from gensim import corpora
import re
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']

In [2]:
# Step 1: Import the dataset and get the text and real topic of each news article
dataset = api.load("text8")
print(dataset)
#data = [d for d in dataset]

<text8.Dataset object at 0x7f7b817592e8>


In [5]:
# Step 3: Create the Inputs of LDA model: Dictionary and Corpus
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]
#print(corpus)

NameError: name 'data_processed' is not defined

In [None]:
# Step 4: Train the LDA model in Multicore
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=7,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1)

In [None]:
# Step 4': Train the LDA model in Monocore, the results are not the same as multicore
lda_model = LdaModel(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=7,
                         passes=10,
                         chunksize=1000,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1)

## How to interpret the LDA Topic Model’s output?

In [None]:
# Reference: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/topic_methods.ipynb
for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])   # [(Word, [Topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
    print("------------------------------------------------------\n")

## How to create a LSI topic model using gensim?

In [None]:
from gensim.models import LsiModel

# Build the LSI Model
lsi_model = LsiModel(corpus=corpus, id2word=dct, num_topics=7, decay=0.5)

# View Topics
pprint(lsi_model.print_topics(-1))

## How to train Word2Vec model using gensim?

In [None]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. Defaults result vector size = 100
model = Word2Vec(data_part1, min_count = 0, workers=cpu_count())

In [None]:
# Get the word vector for given word
model['topic']

In [None]:
model.most_similar('topic')

In [None]:
# Save and Load Model
model.save('newmodel')
model = Word2Vec.load('newmodel')

## How to update an existing Word2Vec model with new data?

In [None]:
# Update the model with new data.
model.build_vocab(data_part2, update=True)
model.train(data_part2, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
model['topic']

## How to extract word vectors using pre-trained Word2Vec and FastText models?

In [2]:
import gensim.downloader as api

In [6]:
# Download the models
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

2019-03-19 19:27:07,339 : INFO : loading projection weights from /home/cuda_user/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz
2019-03-19 19:31:19,555 : INFO : loaded (999999, 300) matrix from /home/cuda_user/gensim-data/fasttext-wiki-news-subwords-300/fasttext-wiki-news-subwords-300.gz


In [4]:
word2vec_model300 = api.load('word2vec-google-news-300')

In [3]:
glove_model300 = api.load('glove-wiki-gigaword-300')

In [7]:
# Try fasttext_model300 word embeddings
fasttext_model300.most_similar('support')

2019-03-19 19:31:25,547 : INFO : precomputing L2-norms of word weight vectors


[('suppport', 0.8470075726509094),
 ('supporting', 0.837419331073761),
 ('supports', 0.8059129118919373),
 ('suport', 0.7670574188232422),
 ('supportin', 0.7664282321929932),
 ('supported', 0.7623896598815918),
 ('non-support', 0.7357579469680786),
 ('suppor', 0.732284665107727),
 ('suppports', 0.7290899157524109),
 ('suppporting', 0.7196788191795349)]

In [8]:
# Try word2vec_model300 word embeddings
word2vec_model300.most_similar('support')

2019-03-19 19:31:28,170 : INFO : precomputing L2-norms of word weight vectors


[('supporting', 0.6251285076141357),
 ('suport', 0.6071150302886963),
 ('suppport', 0.6053199768066406),
 ('Support', 0.6044273376464844),
 ('supported', 0.6009396910667419),
 ('backing', 0.6007589101791382),
 ('supports', 0.5269277095794678),
 ('assistance', 0.5207138061523438),
 ('sup_port', 0.5192490220069885),
 ('supportive', 0.5110025405883789)]

In [9]:
# Try glove_model300 word embeddings
glove_model300.most_similar('support')

2019-03-19 19:31:37,004 : INFO : precomputing L2-norms of word weight vectors


[('supported', 0.740031898021698),
 ('supporting', 0.6803102493286133),
 ('backing', 0.6659233570098877),
 ('supports', 0.6377385258674622),
 ('provide', 0.6045100092887878),
 ('assistance', 0.587337076663971),
 ('efforts', 0.5793647766113281),
 ('providing', 0.561307430267334),
 ('strong', 0.5610021352767944),
 ('help', 0.5547006130218506)]

In [10]:
# Evaluate which one performs better

# Word2ec_accuracy
word2vec_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]

# fasttext_accuracy
fasttext_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]

# GloVe accuracy
glove_model300.evaluate_word_analogies(analogies="questions-words.txt")[0]

2019-03-19 19:32:01,340 : INFO : Evaluating word analogies for top 300000 words in the model on questions-words.txt
2019-03-19 19:32:08,392 : INFO : capital-common-countries: 83.2% (421/506)
2019-03-19 19:33:00,714 : INFO : capital-world: 81.3% (3552/4368)
2019-03-19 19:33:10,395 : INFO : currency: 28.5% (230/808)
2019-03-19 19:33:41,292 : INFO : city-in-state: 72.1% (1779/2467)
2019-03-19 19:33:47,886 : INFO : family: 86.2% (436/506)
2019-03-19 19:34:00,192 : INFO : gram1-adjective-to-adverb: 29.2% (290/992)
2019-03-19 19:34:10,390 : INFO : gram2-opposite: 43.5% (353/812)
2019-03-19 19:34:26,561 : INFO : gram3-comparative: 91.3% (1216/1332)
2019-03-19 19:34:40,330 : INFO : gram4-superlative: 88.0% (987/1122)
2019-03-19 19:34:53,059 : INFO : gram5-present-participle: 78.5% (829/1056)
2019-03-19 19:35:12,952 : INFO : gram6-nationality-adjective: 90.2% (1442/1599)
2019-03-19 19:35:31,531 : INFO : gram7-past-tense: 65.4% (1020/1560)
2019-03-19 19:35:47,349 : INFO : gram8-plural: 87.0% (11

0.7195422354510931

## How to create document vectors using Doc2Vec?

In [11]:
import gensim
import gensim.downloader as api

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

In [12]:
# Create the tagged document needed for Doc2Vec
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

train_data = list(create_tagged_document(data))

print(train_data[:1])

[TaggedDocument(words=['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers'

In [13]:
# Init the Doc2Vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

# Build the Volabulary
model.build_vocab(train_data)

# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

2019-03-19 19:47:21,280 : INFO : collecting all words and their counts
2019-03-19 19:47:21,282 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-03-19 19:47:24,860 : INFO : collected 253854 word types and 1701 unique tags from a corpus of 1701 examples and 17005207 words
2019-03-19 19:47:24,863 : INFO : Loading a fresh vocabulary
2019-03-19 19:47:25,481 : INFO : effective_min_count=2 retains 135335 unique words (53% of original 253854, drops 118519)
2019-03-19 19:47:25,483 : INFO : effective_min_count=2 leaves 16886688 word corpus (99% of original 17005207, drops 118519)
2019-03-19 19:47:25,981 : INFO : deleting the raw counts dictionary of 253854 items
2019-03-19 19:47:25,998 : INFO : sample=0.001 downsamples 37 most-common words
2019-03-19 19:47:25,999 : INFO : downsampling leaves estimated 12689806 word corpus (75.1% of prior 16886688)
2019-03-19 19:47:26,537 : INFO : estimated required memory for 135335 words and 50 dimensions: 122141700 bytes
201

2019-03-19 19:48:19,786 : INFO : EPOCH - 4 : training on 17005207 raw words (12690368 effective words) took 12.7s, 995968 effective words/s
2019-03-19 19:48:20,808 : INFO : EPOCH 5 - PROGRESS: at 7.94% examples, 996973 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:21,811 : INFO : EPOCH 5 - PROGRESS: at 15.87% examples, 998766 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:22,814 : INFO : EPOCH 5 - PROGRESS: at 23.93% examples, 1006582 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:23,815 : INFO : EPOCH 5 - PROGRESS: at 31.10% examples, 984593 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:48:24,819 : INFO : EPOCH 5 - PROGRESS: at 38.62% examples, 979220 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:25,823 : INFO : EPOCH 5 - PROGRESS: at 46.27% examples, 977635 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:26,827 : INFO : EPOCH 5 - PROGRESS: at 54.20% examples, 982350 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:48:27,836 : INFO : EPOCH 5 - PROGRESS: at 62.26% exampl

2019-03-19 19:49:21,092 : INFO : EPOCH 9 - PROGRESS: at 78.54% examples, 990285 words/s, in_qsize 5, out_qsize 1
2019-03-19 19:49:22,092 : INFO : EPOCH 9 - PROGRESS: at 86.65% examples, 993751 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:49:23,094 : INFO : EPOCH 9 - PROGRESS: at 94.65% examples, 994896 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:49:23,752 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-19 19:49:23,758 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-19 19:49:23,760 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-19 19:49:23,761 : INFO : EPOCH - 9 : training on 17005207 raw words (12691713 effective words) took 12.7s, 995792 effective words/s
2019-03-19 19:49:24,786 : INFO : EPOCH 10 - PROGRESS: at 7.88% examples, 980873 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:49:25,791 : INFO : EPOCH 10 - PROGRESS: at 15.70% examples, 981957 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:49:26,7

2019-03-19 19:50:18,494 : INFO : EPOCH 14 - PROGRESS: at 31.45% examples, 993899 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:19,498 : INFO : EPOCH 14 - PROGRESS: at 38.74% examples, 980818 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:20,512 : INFO : EPOCH 14 - PROGRESS: at 46.91% examples, 988646 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:21,516 : INFO : EPOCH 14 - PROGRESS: at 55.09% examples, 995955 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:50:22,516 : INFO : EPOCH 14 - PROGRESS: at 63.08% examples, 998517 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:23,520 : INFO : EPOCH 14 - PROGRESS: at 71.08% examples, 1000199 words/s, in_qsize 4, out_qsize 1
2019-03-19 19:50:24,529 : INFO : EPOCH 14 - PROGRESS: at 79.01% examples, 998401 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:25,531 : INFO : EPOCH 14 - PROGRESS: at 86.71% examples, 996272 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:50:26,531 : INFO : EPOCH 14 - PROGRESS: at 94.77% examples, 997980 words/

2019-03-19 19:51:17,920 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-19 19:51:17,923 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-19 19:51:17,924 : INFO : EPOCH - 18 : training on 17005207 raw words (12691901 effective words) took 12.8s, 989847 effective words/s
2019-03-19 19:51:18,944 : INFO : EPOCH 19 - PROGRESS: at 7.47% examples, 931600 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:51:19,952 : INFO : EPOCH 19 - PROGRESS: at 14.40% examples, 901325 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:51:20,953 : INFO : EPOCH 19 - PROGRESS: at 21.87% examples, 915821 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:51:21,967 : INFO : EPOCH 19 - PROGRESS: at 29.75% examples, 936468 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:51:22,975 : INFO : EPOCH 19 - PROGRESS: at 37.51% examples, 946052 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:51:23,982 : INFO : EPOCH 19 - PROGRESS: at 45.15% examples, 949476 words/s, in_qsize 5, out_qs

2019-03-19 19:52:17,324 : INFO : EPOCH 23 - PROGRESS: at 63.02% examples, 993384 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:52:18,331 : INFO : EPOCH 23 - PROGRESS: at 71.13% examples, 996956 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:52:19,333 : INFO : EPOCH 23 - PROGRESS: at 79.54% examples, 1002072 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:52:20,336 : INFO : EPOCH 23 - PROGRESS: at 87.65% examples, 1004142 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:52:21,348 : INFO : EPOCH 23 - PROGRESS: at 95.88% examples, 1006404 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:52:21,840 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-19 19:52:21,850 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-19 19:52:21,853 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-19 19:52:21,854 : INFO : EPOCH - 23 : training on 17005207 raw words (12692307 effective words) took 12.6s, 1006950 effective words/s
2019-03-19 1

2019-03-19 19:53:14,702 : INFO : EPOCH 28 - PROGRESS: at 15.70% examples, 988134 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:15,715 : INFO : EPOCH 28 - PROGRESS: at 23.81% examples, 998977 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:16,723 : INFO : EPOCH 28 - PROGRESS: at 31.57% examples, 995769 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:17,727 : INFO : EPOCH 28 - PROGRESS: at 39.21% examples, 990986 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:18,732 : INFO : EPOCH 28 - PROGRESS: at 47.03% examples, 991172 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:53:19,733 : INFO : EPOCH 28 - PROGRESS: at 55.09% examples, 996294 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:20,739 : INFO : EPOCH 28 - PROGRESS: at 63.14% examples, 999062 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:21,751 : INFO : EPOCH 28 - PROGRESS: at 71.43% examples, 1003923 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:53:22,752 : INFO : EPOCH 28 - PROGRESS: at 79.42% examples, 1003221 words

2019-03-19 19:54:15,381 : INFO : EPOCH 32 - PROGRESS: at 94.47% examples, 995522 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:54:16,139 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-19 19:54:16,154 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-19 19:54:16,156 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-19 19:54:16,157 : INFO : EPOCH - 32 : training on 17005207 raw words (12689806 effective words) took 12.8s, 989638 effective words/s
2019-03-19 19:54:17,164 : INFO : EPOCH 33 - PROGRESS: at 8.05% examples, 1013467 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:54:18,165 : INFO : EPOCH 33 - PROGRESS: at 15.93% examples, 1004301 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:54:19,168 : INFO : EPOCH 33 - PROGRESS: at 23.57% examples, 992620 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:54:20,173 : INFO : EPOCH 33 - PROGRESS: at 31.10% examples, 984225 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:5

2019-03-19 19:55:13,240 : INFO : EPOCH 37 - PROGRESS: at 47.09% examples, 989448 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:55:14,247 : INFO : EPOCH 37 - PROGRESS: at 54.85% examples, 988699 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:55:15,252 : INFO : EPOCH 37 - PROGRESS: at 62.61% examples, 987860 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:55:16,254 : INFO : EPOCH 37 - PROGRESS: at 70.55% examples, 989945 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:55:17,266 : INFO : EPOCH 37 - PROGRESS: at 78.72% examples, 991978 words/s, in_qsize 5, out_qsize 0
2019-03-19 19:55:18,273 : INFO : EPOCH 37 - PROGRESS: at 86.89% examples, 995087 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:55:19,275 : INFO : EPOCH 37 - PROGRESS: at 95.06% examples, 998289 words/s, in_qsize 6, out_qsize 0
2019-03-19 19:55:19,859 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-19 19:55:19,861 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-19 19:55:19,

In [14]:
print(model.infer_vector(['australian', 'captain', 'elected', 'to', 'bowl']))

[ 0.06032689  0.18639418 -0.39468604 -0.3626974  -0.09318312  0.04502558
  0.14077036 -0.18113542  0.5441968  -0.0596909   0.17224409 -0.0774765
  0.25857627  0.15406573 -0.08637781  0.34137353  0.04922651  0.21410146
  0.10973113  0.09114561 -0.17765002 -0.54862624 -0.04218041 -0.40009996
 -0.3396933  -0.27378172  0.10261839  0.42458376 -0.05328482  0.13288994
  0.28360254  0.16502663  0.47124362 -0.1931318  -0.04624787  0.12779057
 -0.28575918 -0.42581347  0.14722782  0.18899105 -0.35213348 -0.41971874
 -0.06345312  0.62705654 -0.43539104  0.69622445 -0.06633121  0.2987238
 -0.25494263  0.15704694]


## How to compute similarity metrics like cosine similarity and soft cosine similarity?

In [19]:
from gensim.matutils import softcossim
from gensim import corpora

sent_1 = 'Sachin is a cricket player and a opening batsman'.split()
sent_2 = 'Dhoni is a cricket player too He is a batsman and keeper'.split()
sent_3 = 'Anand is a chess player'.split()

# Prepare the similarity matrix
similarity_matrix = fasttext_model300.similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

# Prepare a dictionary and a corpus.
documents = [sent_1, sent_2, sent_3]
dictionary = corpora.Dictionary(documents)

# Convert the sentences into bag-of-words vectors.
sent_1 = dictionary.doc2bow(sent_1)
sent_2 = dictionary.doc2bow(sent_2)
sent_3 = dictionary.doc2bow(sent_3)

  if __name__ == '__main__':
2019-03-19 20:15:20,242 : INFO : constructing a sparse term similarity matrix using <gensim.models.keyedvectors.WordEmbeddingSimilarityIndex object at 0x7f8e2447d278>
2019-03-19 20:15:20,244 : INFO : iterating over columns in dictionary order
2019-03-19 20:15:20,249 : INFO : PROGRESS: at 0.19% columns (1 / 525, 0.190476% density, 0.190476% projected density)
2019-03-19 20:15:46,268 : INFO : constructed a sparse term similarity matrix with 1.316644% density
2019-03-19 20:15:46,278 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-19 20:15:46,280 : INFO : built Dictionary(14 unique tokens: ['He', 'batsman', 'too', 'chess', 'is']...) from 3 documents (total 26 corpus positions)


In [20]:
# Compute soft cosine similarity
print(softcossim(sent_1, sent_2, similarity_matrix))
print(softcossim(sent_1, sent_3, similarity_matrix))
print(softcossim(sent_2, sent_3, similarity_matrix))

0.788882958139703
0.5051107137297167
0.5620230844057823


  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [21]:
# Which word from the given list doesn't go with the others?
print(fasttext_model300.doesnt_match(['india', 'australia', 'pakistan', 'china', 'beetroot']))

beetroot


In [22]:
# Compute cosine distance between two words.
print(fasttext_model300.distance('king', 'queen'))

0.22957539558410645


In [23]:
# Compute cosine distances from given word or vector to all words in `other_words`.
print(fasttext_model300.distances('king', ['queen', 'man', 'woman']))

[0.22957546 0.465837   0.547001  ]


In [24]:
# Compute cosine similarities
# Note: Queen + Man is very similar to King.
print(fasttext_model300.cosine_similarities(fasttext_model300['king'], 
                                            vectors_all=(fasttext_model300['queen'], 
                                                        fasttext_model300['man'], 
                                                        fasttext_model300['woman'],
                                                        fasttext_model300['queen'] + fasttext_model300['man'])))

[0.77042454 0.534163   0.45299897 0.76572543]


In [25]:
# Get the words closer to w1 than w2
print(glove_model300.words_closer_than(w1='king', w2='kingdom'))

['prince', 'queen', 'monarch']


In [26]:
# Find the top-N most similar words.
print(fasttext_model300.most_similar(positive='king', negative=None, topn=5, restrict_vocab=None, indexer=None))

[('king-', 0.7838029265403748), ('boy-king', 0.7704817652702332), ('queen', 0.7704246044158936), ('prince', 0.7700966596603394), ('kings', 0.7668929696083069)]


In [33]:
# Find the top-N most similar words, using the multiplicative combination objective,
print(glove_model300.most_similar_cosmul(positive='king', negative=None, topn=5))

[('mesoamerican', 1.3182628154754639), ('thrombin', 1.3123780488967896), ('luther', 1.3051434755325317), ('snedden', 1.2918351888656616), ('jerod', 1.2863011360168457)]


In [41]:
# Find the top-N most similar words, using the multiplicative combination objective,
print(glove_model300.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=5))

[('queen', 0.9199351072311401), ('princess', 0.8403170108795166), ('throne', 0.8287888765335083), ('monarch', 0.8201609253883362), ('elizabeth', 0.8025429248809814)]


In [45]:
print(glove_model300.most_similar_cosmul(positive=['doctor', 'woman'], negative=['man'], topn=5))

[('nurse', 0.929076075553894), ('physician', 0.9234046936035156), ('doctors', 0.9137151837348938), ('pregnant', 0.8704767227172852), ('dentist', 0.8677982687950134)]


## How to summarize text documents?

In [37]:
from gensim.summarization import summarize, keywords
from pprint import pprint

text = " ".join((line for line in smart_open('sample.txt', encoding='utf-8')))

# Summarize the paragraph
pprint(summarize(text, word_count=20))

2019-03-19 20:36:20,204 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-19 20:36:20,206 : INFO : built Dictionary(70 unique tokens: ['integr', 'liber', 'state', 'counterpart', 'armi']...) from 11 documents (total 102 corpus positions)
2019-03-19 20:36:20,208 : INFO : Building graph
2019-03-19 20:36:20,209 : INFO : Filling graph
2019-03-19 20:36:20,211 : INFO : Removing unreachable nodes of graph
2019-03-19 20:36:20,213 : INFO : Pagerank graph
2019-03-19 20:36:20,217 : INFO : Sorting pagerank scores


('the PLA Rocket Force national defense science and technology experts panel, '
 'according to a report published by the')


In [38]:
# Important keywords from the paragraph
print(keywords(text))

experts
force
zhang
pla
rocket
missiles missile


In [47]:
# "boy" is to "father" as "girl" is to ...?
glove_model300.most_similar(['girl', 'father'], ['boy'], topn=3)
more_examples = ["he his she", "big bigger bad", "going went being"]
for example in more_examples:
    a, b, x = example.split()
    predicted = glove_model300.most_similar([x, b], [a])[0][0]
    print("'%s' is to '%s' as '%s' is to '%s'" % (a, b, x, predicted))

'he' is to 'his' as 'she' is to 'her'
'big' is to 'bigger' as 'bad' is to 'worse'
'going' is to 'went' as 'being' is to 'subsequently'


In [46]:
# which word doesn't go with the others?
glove_model300.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'