In [1]:
import gensim
import logging
import os
import nltk.data
import string
%matplotlib inline

# Timestamped INFO-level logging, so progress messages emitted through the
# `logging` module show up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

print("PACKAGES LOADED")

PACKAGES LOADED


In [2]:
# The following class defines a Python generator which parses all files (recursively) in a given directory
# and yields their sentences one at a time, each as a list of tokens (thus saving loads of memory).
class SentGen(object):
    """Stream of tokenised sentences read from every file under a directory.

    Nothing is loaded up front: each call to ``__iter__`` walks the directory
    tree again and yields one sentence at a time, so memory use does not grow
    with the size of the corpus.
    """

    def __init__(self, dirname):
        """Store the root directory; no file is read until iteration starts.

        Args:
            dirname: Root directory whose files (including those in
                sub-directories) make up the corpus.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield each sentence as a list of whitespace-separated tokens."""
        # os.walk descends into sub-directories; the directory list is not needed.
        for path, _dirs, files in os.walk(self.dirname):
            for fname in files:
                # os.path.join builds the path with the platform's separator
                # instead of a hand-written '/'.
                for sentence in get_sentences(os.path.join(path, fname)):
                    yield sentence.split()

def get_sentences(fname):
    """Yield the sentences of a UTF-8 text file with ASCII punctuation removed.

    Args:
        fname: Path of the text file to read.

    Yields:
        str: Each sentence returned by ``nltk.sent_tokenize`` for the file's
        content, with every character in ``string.punctuation`` deleted.
    """
    # `with` closes the file even if reading or decoding raises.
    with open(fname, 'r', encoding="utf-8") as fp:
        data = fp.read()
    # Translation table that deletes every punctuation character.
    trans_table = str.maketrans('', '', string.punctuation)
    # sent_tokenize obtains its own sentence tokenizer, so the separate
    # nltk.data.load(...) call (whose result was never used) is dropped.
    for sent in nltk.sent_tokenize(data):
        yield sent.translate(trans_table)

In [3]:
# Create an untrained Word2Vec model (no corpus is passed to the constructor):
# one training pass, and a minimum word count of 5.
# NOTE(review): `iter` is the keyword accepted by the gensim release this
# notebook was run with; newer releases call it `epochs` — confirm before upgrading.
model = gensim.models.Word2Vec(min_count=5, iter=1)

In [4]:
# Number of training passes the model is configured for. `epochs` is the
# supported attribute; `iter` is a deprecated alias that emits a warning.
model.epochs


  """Entry point for launching an IPython kernel.


1

In [5]:
# First pass over the corpus: collect word counts and build the vocabulary,
# logging progress every 200000 sentences.
model.build_vocab(
    SentGen('aclImdb'),
    progress_per=200000,
)

2020-02-08 12:13:03,420 : INFO : collecting all words and their counts
2020-02-08 12:13:03,437 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-02-08 12:13:21,659 : INFO : PROGRESS: at sentence #200000, processed 4235964 words, keeping 117989 word types
2020-02-08 12:13:39,825 : INFO : PROGRESS: at sentence #400000, processed 8470628 words, keeping 177074 word types
2020-02-08 12:13:58,857 : INFO : PROGRESS: at sentence #600000, processed 12910549 words, keeping 229396 word types
2020-02-08 12:16:08,387 : INFO : PROGRESS: at sentence #800000, processed 17357138 words, keeping 275456 word types
2020-02-08 12:18:47,015 : INFO : PROGRESS: at sentence #1000000, processed 21567523 words, keeping 315310 word types
2020-02-08 12:19:54,228 : INFO : collected 329953 word types from a corpus of 23197079 raw words and 1074524 sentences
2020-02-08 12:19:54,230 : INFO : Loading a fresh vocabulary
2020-02-08 12:19:54,609 : INFO : effective_min_count=5 retains 75839 uni

In [6]:
# Second pass over the corpus: train the word vectors. A new SentGen is
# created so the directory is walked again from the start.
model.train(
    SentGen('aclImdb'),
    total_examples=model.corpus_count,
    epochs=model.epochs,  # `model.iter` is a deprecated alias of `epochs`
    report_delay=8.0,
)

  """Entry point for launching an IPython kernel.
2020-02-08 12:22:19,099 : INFO : training model with 3 workers on 75839 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-02-08 12:22:21,155 : INFO : EPOCH 1 - PROGRESS: at 0.04% examples, 3822 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:22:29,441 : INFO : EPOCH 1 - PROGRESS: at 1.21% examples, 19995 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:22:37,506 : INFO : EPOCH 1 - PROGRESS: at 3.41% examples, 31519 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:22:45,665 : INFO : EPOCH 1 - PROGRESS: at 5.30% examples, 33898 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:22:53,683 : INFO : EPOCH 1 - PROGRESS: at 6.87% examples, 33720 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:23:01,830 : INFO : EPOCH 1 - PROGRESS: at 8.80% examples, 34967 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:23:10,133 : INFO : EPOCH 1 - PROGRESS: at 9.71% examples, 32259 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:23:18,421 

2020-02-08 12:31:54,192 : INFO : EPOCH 1 - PROGRESS: at 74.12% examples, 22829 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:02,410 : INFO : EPOCH 1 - PROGRESS: at 75.13% examples, 22807 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:10,585 : INFO : EPOCH 1 - PROGRESS: at 76.17% examples, 22800 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:18,599 : INFO : EPOCH 1 - PROGRESS: at 80.30% examples, 23675 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:26,622 : INFO : EPOCH 1 - PROGRESS: at 85.52% examples, 24841 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:34,669 : INFO : EPOCH 1 - PROGRESS: at 90.96% examples, 26036 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:42,676 : INFO : EPOCH 1 - PROGRESS: at 95.60% examples, 27003 words/s, in_qsize 0, out_qsize 0
2020-02-08 12:32:49,188 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-02-08 12:32:49,190 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-02-08 12:32:49,209 : INFO : w

(17631244, 23197079)

In [7]:
# Write the trained model to the relative path 'aclImdb.model'.
model.save('aclImdb.model')

2020-02-08 12:43:04,729 : INFO : saving Word2Vec object under aclImdb.model, separately None
2020-02-08 12:43:04,731 : INFO : not storing attribute vectors_norm
2020-02-08 12:43:04,734 : INFO : not storing attribute cum_table
2020-02-08 12:43:06,954 : INFO : saved aclImdb.model


In [8]:
# Size of the model's vocabulary.
len(model.wv.vocab)

75839

In [9]:
# Membership test: is the token 'women' part of the vocabulary?
'women' in model.wv.vocab

True

In [10]:
# Vocabulary tokens in sorted order (iterating the vocab mapping yields its keys).
words = sorted(model.wv.vocab)
print("Number of words:", len(words))

Number of words: 75839


In [11]:
# Save words to file: words.txt (one word per line).
# `with` closes the file even if a write fails part-way through.
with open("words.txt", "w", encoding="utf-8") as fp:
    for word in words:
        fp.write(word + '\n')

In [12]:
# Show a 50-word slice of the sorted vocabulary.
print(words[1500:1550])

['Afterschool', 'Aftershocks', 'Afterward', 'Afterwards', 'Afterwords', 'Aga', 'Agador', 'Again', 'Against', 'Agamemnon', 'Agar', 'Agars', 'Agashe', 'Agatha', 'Age', 'Aged', 'Agency', 'Agenda', 'Agent', 'Agents', 'Ages', 'Agey', 'Aggie', 'Agi', 'Aging', 'Agnes', 'Agnew', 'Agnieszka', 'Agnihotri', 'Agnus', 'Agnès', 'Ago', 'Agostino', 'Agrabah', 'Agrade', 'Agree', 'Agreed', 'Agreement', 'Agren', 'Agrippina', 'Agro', 'Aguilar', 'Aguirre', 'Agustin', 'Agutter', 'Agutters', 'Ah', 'Aha', 'Ahab', 'Ahead']


In [17]:
# Similarity score between the vectors of two words. Queried through
# `model.wv` because `similarity` on the model itself is deprecated.
model.wv.similarity('woman', 'man')


  """Entry point for launching an IPython kernel.


0.8743907

In [18]:
# Queried through `model.wv`; `similarity` on the model itself is deprecated.
model.wv.similarity('paris', 'train')  # low similarity


  """Entry point for launching an IPython kernel.


0.26157176

In [15]:
def get_unmatching_word(words):
    """Return the word that fits least with the others.

    Every word must be in the model's vocabulary. If one is not, the first
    such word is reported on stdout and ``None`` is returned instead.

    Args:
        words: Sequence of words to compare.

    Returns:
        The result of ``model.wv.doesnt_match(words)``, or ``None`` when a
        word is missing from the vocabulary.
    """
    vocabulary = model.wv.vocab
    unknown = [candidate for candidate in words if candidate not in vocabulary]
    if unknown:
        # Only the first out-of-vocabulary word is reported.
        print("Word is not in vocabulary:", unknown[0])
        return None
    return model.wv.doesnt_match(words)

In [16]:
# Ask which of these four words is the odd one out.
get_unmatching_word(['breakfast', 'cereal', 'dinner', 'lunch'])

2020-02-08 12:44:46,745 : INFO : precomputing L2-norms of word weight vectors
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cereal'

In [19]:
# The word 'woman' comes out 6th as the most similar
# NOTE(review): this query adds 'king' and 'man' and subtracts 'queen'; if the
# intent is the analogy whose answer is 'woman', the usual form is
# positive=['queen', 'man'], negative=['king'] — confirm intent.
# Queried through `model.wv`; `most_similar` on the model itself is deprecated.
model.wv.most_similar(positive=['king', 'man'], negative=['queen'], topn=6)

  


[('person', 0.7372632622718811),
 ('guy', 0.6792922019958496),
 ('soldier', 0.6550565361976624),
 ('killer', 0.6520779132843018),
 ('boy', 0.6508388519287109),
 ('woman', 0.638922929763794)]

In [20]:
# Ten nearest words to the vector 'low' + 'lower' - 'high'.
# Queried through `model.wv`; `most_similar` on the model itself is deprecated.
model.wv.most_similar(positive=['low', 'lower'], negative=['high'], topn=10)

  """Entry point for launching an IPython kernel.


[('higher', 0.7832080125808716),
 ('greater', 0.6772675514221191),
 ('funnier', 0.6765692234039307),
 ('More', 0.6740158796310425),
 ('bigger', 0.6664255857467651),
 ('cheaper', 0.6573895215988159),
 ('dumber', 0.6536562442779541),
 ('scarier', 0.6463117599487305),
 ('uglier', 0.6443842053413391),
 ('quicker', 0.6440134644508362)]

In [21]:
# Print the learned vector for 'king'. Vectors are looked up on `model.wv`;
# indexing the model object directly is deprecated.
print(model.wv['king'])

[ 0.361104    0.190657    0.14281137 -0.21910924 -0.15460844  0.1962104
  0.12719002 -0.14337867 -0.46280435  0.448402   -0.03699146  0.31851628
 -0.29610476 -0.41346842  0.31622934  0.1828613   0.55195045 -0.3636207
  0.25049934 -0.11199825 -0.0961617   0.03697551 -0.05407988 -0.7127565
  0.37536404  0.04514764 -0.19552095 -0.09615923  0.04099998 -0.16848631
 -0.31574374  0.32440543  0.33974066  0.30078834 -0.53485286 -0.00327764
  0.47179636  0.4307772   0.35343587  0.34145278  0.87021303 -0.40276602
  0.07428315  0.03480908  0.06824161 -0.11767775 -0.3321068  -0.04795282
  0.23878944 -0.14243603  0.09008274 -0.12202803 -0.40510413 -0.27023304
 -0.18543649 -0.06488868 -0.18493608  0.05292498  0.76916003 -0.04596116
  0.13890158  0.19652385 -0.25257763  0.09267151 -0.47033966 -0.06862667
  0.10347904 -0.07482861 -0.4453664   0.10569251 -0.32196185 -0.12069732
  0.09291036 -0.1328929   0.5519674  -0.29313296 -0.32023013  0.6032624
 -0.18750583  0.6700869  -0.14192538 -0.3616553   0.304

  """Entry point for launching an IPython kernel.
