In [6]:
# Example 

t_corpus = [
               "A survey of user opinion of computer system response time",
               "Relation of user perceived response time to error measurement",
               "The generation of random binary unordered trees",
               "The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
          ]

# No of documents : 5

In [13]:
# Stop words
stoplist = set('for a of the and to in'.split(' '))

In [14]:
stoplist

{'a', 'and', 'for', 'in', 'of', 'the', 'to'}

In [17]:
# Remove stop words and Split 
processed_corpus = [[word for word in document.lower().split() if word not in stoplist] for document in t_corpus]

In [19]:
processed_corpus

[['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering']]

In [22]:
# Converting corpus into list of vectors

from gensim import corpora
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary(25 unique tokens: ['computer', 'opinion', 'response', 'survey', 'system']...)


In [26]:
# Word to id

print(dictionary.token2id)

{'computer': 0, 'opinion': 1, 'response': 2, 'survey': 3, 'system': 4, 'time': 5, 'user': 6, 'error': 7, 'measurement': 8, 'perceived': 9, 'relation': 10, 'binary': 11, 'generation': 12, 'random': 13, 'trees': 14, 'unordered': 15, 'graph': 16, 'intersection': 17, 'paths': 18, 'iv': 19, 'minors': 20, 'ordering': 21, 'quasi': 22, 'well': 23, 'widths': 24}


In [32]:
# bag-of-word representation for a document 

BoW_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

print(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(2, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 1), (15, 1)], [(14, 1), (16, 1), (17, 1), (18, 1)], [(14, 1), (16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [111]:
dictionary.doc2bow(['The', 'Saudis', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], allow_update=True)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)]

In [35]:
from gensim import models

tfidf = models.TfidfModel(BoW_corpus)

words = "trees graph".lower().split()

print(tfidf[dictionary.doc2bow(words)])

[(14, 0.4869354917707381), (16, 0.8734379353188121)]


In [84]:
# Show the word to id map

print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [85]:
# If you get new documents in the future.it is also possible to update an existing dictionary to include the new words.

documents_2 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

texts_2 = [[text for text in doc.split()] for doc in documents_2]

dictionary.add_documents(texts_2)

In [88]:
# Show the word to id map

print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'graph': 33, 'in': 34, 'intersection': 35, 'paths': 36, 'trees': 37, 'Graph': 38, 'IV': 39, 'Widths': 40, 'and': 41, 'minors': 42, 'ordering': 43, 'quasi': 44, 'well': 45, 'A': 46, 'survey': 47}


In [100]:
dictionary[0]

'Saudis'

In [101]:
dictionary[4]

'are'

In [96]:
# Create a bag of words corpus in gensim

#  Bag of Words : It contains the word id and its frequeny in each document

'Saudis'

In [115]:
bow_corpus = [dictionary.doc2bow(x, allow_update=True) for x in texts]

bow_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)], [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(7, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(23, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]

In [116]:
dictionary.doc2bow(['The', 'Saudis', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], allow_update=True)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)]

In [118]:
# Save the Dictionary and Corpus

dictionary.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

In [120]:
# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')

corpus = corpora.MmCorpus('bow_corpus.mm')
for line in corpus:
    print(line)

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 2.0), (8, 1.0)]
[(9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0)]
[(7, 2.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0)]
[(23, 2.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0), (31, 1.0), (32, 1.0)]


In [121]:
# create bigrams and trigrams using Phraser models?

documents_3 = ["The intersection graph of paths in trees",
               "Graph minors IV Widths of trees and well quasi ordering",
               "Graph minors A survey"]

texts_3 = [[text for text in doc.split()] for doc in documents_3]

In [127]:
from gensim import corpora

dct = corpora.Dictionary(texts_3)

corpus = [dct.doc2bow(line) for line in texts_3]

corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(7, 1), (11, 1), (15, 1), (16, 1)]]

In [135]:
bigram = gensim.models.phrases.Phrases("The intersection graph of paths in trees", min_count=3, threshold=10)

print(bigram['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'])

['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time']


In [139]:
# Tutorials

In [None]:
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

In [143]:
# Preprocess the Dataset

import gensim
import os
from gensim.utils import simple_preprocess

# List of Tokens
tokenized  = [doc.split() for doc in documents]

tokenized

[['The', 'Saudis', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], ['Saudi', 'journalist', 'Jamal', "Khashoggi's", 'death', 'was', 'the', 'result', 'of', 'an'], ['interrogation', 'that', 'went', 'wrong,', 'one', 'that', 'was', 'intended', 'to', 'lead'], ['to', 'his', 'abduction', 'from', 'Turkey,', 'according', 'to', 'two', 'sources.']]

In [146]:
# Create a Dictionary

from gensim import corpora

# storing the extracted tokens into the dictionary
my_dictionary = corpora.Dictionary(tokenized)

print(my_dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [148]:
# convertig to a bag of word corpus

BOW_corpus = [my_dictionary.doc2bow(doc, allow_update = True) for doc in tokenized]

print(BOW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1)], [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(7, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(23, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]]


In [151]:
#  Create a TFIDF matrix in Gensim
import numpy as np

word_weight =[]
for doc in BoW_corpus:
    for id, freq in doc:
        word_weight.append([my_dictionary[id], freq])

print(word_weight)

[['Saudis', 1], ['The', 1], ['a', 1], ['acknowledge', 1], ['a', 1], ['acknowledge', 1], ['are', 2], ['Saudis', 2], ['acknowledge', 3], ['preparing', 2], ['report', 1], ['that', 2], ['will', 1]]


In [153]:
from gensim import models

# create TF-IDF model
tfIdf = models.TfidfModel(BoW_corpus, smartirs ='ntc')

# TF-IDF Word Weight
weight_tfidf =[]
for doc in tfIdf[BoW_corpus]:
    for id, freq in doc:
        weight_tfidf.append([my_dictionary[id], np.around(freq, decimals = 3)])
        
print(weight_tfidf)      

[['Saudis', 0.403], ['The', 0.805], ['a', 0.403], ['acknowledge', 0.167], ['a', 0.241], ['acknowledge', 0.1], ['are', 0.965], ['Saudis', 0.296], ['acknowledge', 0.184], ['preparing', 0.593], ['report', 0.296], ['that', 0.593], ['will', 0.296]]


In [3]:
import gensim.downloader as api
from multiprocessing import cpu_count
from gensim.models.word2vec import Word2Vec

# load the text8 dataset
dataset = api.load("text8")

data = []
for word in dataset:
    data.append(word)

In [4]:
len(data)

1701

In [5]:
# Word2Vec Implementation and Understanding

In [6]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [7]:
# Building the Word2Vec 

model = Word2Vec(sentences=common_texts, max_vocab_size=100, window=5, min_count=1, workers=4)

In [8]:
# model.save("word2vec.model")
# model = Word2Vec.load("word2vec.model")

In [9]:
# If you save the model you can continue training it later:
model.train([["hello", "world"]], total_examples=1, epochs=1)

(0, 2)

In [10]:
model.wv.vocab

{'human': <gensim.models.keyedvectors.Vocab at 0x22f8711bc08>,
 'interface': <gensim.models.keyedvectors.Vocab at 0x22fcf783648>,
 'computer': <gensim.models.keyedvectors.Vocab at 0x22fcf783548>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x22fcf783708>,
 'user': <gensim.models.keyedvectors.Vocab at 0x22fcf7830c8>,
 'system': <gensim.models.keyedvectors.Vocab at 0x22fcf783848>,
 'response': <gensim.models.keyedvectors.Vocab at 0x22fcf783748>,
 'time': <gensim.models.keyedvectors.Vocab at 0x22fcf7836c8>,
 'eps': <gensim.models.keyedvectors.Vocab at 0x22fcf783788>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x22fcf7837c8>,
 'graph': <gensim.models.keyedvectors.Vocab at 0x22fcf783688>,
 'minors': <gensim.models.keyedvectors.Vocab at 0x22fcf783808>}

In [11]:
# Bulding the vector : Numpy array for the word 

model.wv['computer']  # get numpy vector of a word

array([-4.2501707e-03, -2.7321007e-03, -7.6862931e-04,  9.1923954e-05,
       -2.8173858e-03,  7.0831645e-04, -2.3521099e-03, -3.1246059e-03,
        3.3323485e-03,  3.1188789e-03, -4.4940435e-04, -4.7564129e-03,
        1.8066826e-03, -2.6453191e-03, -3.0376876e-03, -4.4731266e-04,
       -4.3843221e-03, -4.5795701e-03,  4.5116562e-03, -3.9353822e-03,
       -4.2268410e-03,  3.0567494e-04,  2.5643543e-03,  4.3433835e-03,
        3.7493519e-03, -1.6155867e-03,  3.4571714e-03,  4.6889563e-03,
        3.3529974e-03,  2.4794789e-03, -1.0330174e-03, -3.0484376e-03,
        3.4154563e-03, -1.7702365e-03,  8.1581093e-04, -3.7170528e-03,
        3.4322981e-03, -3.2531032e-03,  3.4183101e-03, -3.3949781e-03,
        2.5810129e-03,  2.7383575e-03,  3.1628022e-03,  1.5606054e-03,
       -3.0438297e-03, -4.3455143e-03,  2.7804571e-04, -4.1938298e-03,
       -3.1540147e-03,  4.8645400e-03, -1.1250745e-03,  2.7935701e-05,
       -2.4871968e-03, -7.7383523e-04,  1.0868582e-03, -1.8671358e-03,
      

In [12]:
model.wv["Tv"]

# word 'Tv' not in vocabulary . it will through error 

KeyError: "word 'Tv' not in vocabulary"

In [13]:
model.wv['computer','human']

array([[-4.2501707e-03, -2.7321007e-03, -7.6862931e-04,  9.1923954e-05,
        -2.8173858e-03,  7.0831645e-04, -2.3521099e-03, -3.1246059e-03,
         3.3323485e-03,  3.1188789e-03, -4.4940435e-04, -4.7564129e-03,
         1.8066826e-03, -2.6453191e-03, -3.0376876e-03, -4.4731266e-04,
        -4.3843221e-03, -4.5795701e-03,  4.5116562e-03, -3.9353822e-03,
        -4.2268410e-03,  3.0567494e-04,  2.5643543e-03,  4.3433835e-03,
         3.7493519e-03, -1.6155867e-03,  3.4571714e-03,  4.6889563e-03,
         3.3529974e-03,  2.4794789e-03, -1.0330174e-03, -3.0484376e-03,
         3.4154563e-03, -1.7702365e-03,  8.1581093e-04, -3.7170528e-03,
         3.4322981e-03, -3.2531032e-03,  3.4183101e-03, -3.3949781e-03,
         2.5810129e-03,  2.7383575e-03,  3.1628022e-03,  1.5606054e-03,
        -3.0438297e-03, -4.3455143e-03,  2.7804571e-04, -4.1938298e-03,
        -3.1540147e-03,  4.8645400e-03, -1.1250745e-03,  2.7935701e-05,
        -2.4871968e-03, -7.7383523e-04,  1.0868582e-03, -1.86713

In [14]:
# get other similar words

sims = model.wv.most_similar('computer', topn=10)  

sims

[('eps', 0.22728891670703888),
 ('system', 0.14085102081298828),
 ('graph', 0.06629200279712677),
 ('survey', 0.06482338905334473),
 ('trees', 0.030946407467126846),
 ('response', 0.008782591670751572),
 ('minors', 0.002620246261358261),
 ('interface', -0.023094624280929565),
 ('user', -0.04825381562113762),
 ('human', -0.04944903403520584)]