In [1]:
import gensim
print(f'gensim: {gensim.__version__}')



gensim: 3.4.0


Let's download some pre-trained GLove embeddings: 

In [2]:
!conda install -y tqdm

Solving environment: ...working... done

# All requested packages already installed.



In [3]:
from tqdm import tqdm
class TqdmUpTo(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None: self.total = tsize
        self.update(b * bsize - self.n)

def get_data(url, filename):
    """
    Download data if the filename does not exist already
    Uses Tqdm to show download progress
    """
    import os
    from urllib.request import urlretrieve
    
    if not os.path.exists(filename):

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urlretrieve(url, filename, reporthook=t.update_to)

In [4]:
embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
get_data(embedding_url, 'data/glove.6B.zip')

In [5]:
# # We need to run this only once, can unzip manually unzip to the data directory too
!unzip data/glove.6B.zip 
!mv glove.6B.300d.txt data/glove.6B.300d.txt 
!mv glove.6B.200d.txt data/glove.6B.200d.txt 
!mv glove.6B.100d.txt data/glove.6B.100d.txt 
!mv glove.6B.50d.txt data/glove.6B.50d.txt 

Archive:  data/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [6]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'data/glove.6B.300d.txt'

word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'

In [7]:
import os
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

### KeyedVectors API

In [8]:
from gensim.models import KeyedVectors
filename = word2vec_output_file 

In [9]:
%%time
# load the Stanford GloVe model from file, this is Disk I/O and can be slow
pretrained_w2v_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
# binary=False format for human readable text (.txt) files, and binary=True for .bin files 

Wall time: 1min 24s


In [10]:
# calculate: (king - man) + woman = ?
result = pretrained_w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

  


[('queen', 0.6713277101516724)]


In [11]:
# calculate: (india - canada) +  = ?
result = pretrained_w2v_model.most_similar(positive=['quora', 'facebook'], negative=['linkedin'], topn=1)
print(result)

[('twitter', 0.37966805696487427)]


In [12]:
pretrained_w2v_model.most_similar('india')

[('indian', 0.7355823516845703),
 ('pakistan', 0.7285579442977905),
 ('delhi', 0.6846905946731567),
 ('bangladesh', 0.620319128036499),
 ('lanka', 0.609517514705658),
 ('sri', 0.6011613607406616),
 ('kashmir', 0.5746493935585022),
 ('nepal', 0.5421023964881897),
 ('pradesh', 0.5405810475349426),
 ('maharashtra', 0.518537700176239)]

#### What is missing in both word2vec and GloVe? 

In [13]:
try:
    pretrained_w2v_model.wv.most_similar('nirant')
except Exception as e:
    print(e)

  


"word 'nirant' not in vocabulary"


### How to handle OOV words? 

In [14]:
ted_dataset = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
get_data(ted_dataset, "data/ted_en.zip")

In [15]:
import zipfile
import lxml.etree
with zipfile.ZipFile('data/ted_en.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()'))

In [16]:
input_text[:500]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A"

In [17]:
import re
# remove parenthesis 
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# store as list of sentences
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# store as list of lists of words
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

In [18]:
print(sentences_ted[:2])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]


In [19]:
import json
with open('ted_clean_sentences.json', 'w') as fp:
    json.dump(sentences_ted, fp)

In [20]:
with open('ted_clean_sentences.json', 'r') as fp:
    sentences_ted = json.load(fp)

In [21]:
print(sentences_ted[:2])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]


### Train FastText Embedddings

In [22]:
from gensim.models.fasttext import FastText

In [23]:
%%time
fasttext_ted_model = FastText(sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)
# sg = 1 denotes skipgram, else CBOW is used

Wall time: 5.48 s


In [24]:
fasttext_ted_model.wv.most_similar("india")

[('indians', 0.5911639928817749),
 ('indian', 0.5406097769737244),
 ('indiana', 0.4898717999458313),
 ('indicated', 0.44004374742507935),
 ('indicate', 0.4042605757713318),
 ('internal', 0.39166826009750366),
 ('interior', 0.3871103823184967),
 ('byproducts', 0.37529298663139343),
 ('princesses', 0.37265270948410034),
 ('indications', 0.369659960269928)]

### Train word2vec Embeddings

In [25]:
from gensim.models.word2vec import Word2Vec

In [26]:
%%time
word2vec_ted_model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)

Wall time: 1.44 s


In [27]:
word2vec_ted_model.wv.most_similar("india")

[('bordered', 0.41709238290786743),
 ('hovering', 0.4083016514778137),
 ('almost', 0.3865964710712433),
 ('sad', 0.3704090118408203),
 ('supporters', 0.3616541624069214),
 ('spite', 0.3598758280277252),
 ('wrinkles', 0.3590206205844879),
 ('guaranteed', 0.3535975515842438),
 ('hd', 0.3512127995491028),
 ('assistant', 0.346971333026886)]

## fastText or word2vec? 

# Document Embeddings

In [28]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from pprint import pprint
import multiprocessing

In [29]:
import zipfile
import lxml.etree
with zipfile.ZipFile('data/ted_en.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    
talks = doc.xpath('//content/text()')

In [30]:
def read_corpus(talks, tokens_only=False):
    for i, line in enumerate(talks):
        if tokens_only:
            yield gensim.utils.simple_preprocess(line)
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [31]:
read_corpus(talks)

<generator object read_corpus at 0x00000218240FE990>

In [32]:
ted_talk_docs = list(read_corpus(talks)) 

In [33]:
ted_talk_docs[0]

TaggedDocument(words=['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 'new', 'to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation', 'both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'good', 'thing', 'consider', 'facit', 'actually', 'old', 'enough', 'to', 'remember', 'them', 'facit', 'was', 'fantastic', 'company', 'they', 'were', 'born', 'deep', 'in', 'the', 'swedish', 'forest', 'and', 'they', 'made', 'the', 'best', 'mechanical', 'calculators', 'in', 'the', 'world', 'everybody', 'used', 'them', 'and', 'what', 'did', 'facit', 'do', 'when', 'the', 'electronic', 'calculator', 'came', 'along', 'they', 'continued', 'doing', 'exactly', 'the', 'same', 'in', 'six', 'months', 'they', 'went', 'from', 'maximum', 'revenue', 'and', 'they', 'were', 'gone'

In [34]:
cores = multiprocessing.cpu_count()
print(cores)

8


In [35]:
model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, epochs=5, workers=cores)

In [36]:
%time model.build_vocab(ted_talk_docs)

Wall time: 1.4 s


In [37]:
sentence_1 = 'Modern medicine has changed the way we think about healthcare, life spans and by extension career and marriage'

In [38]:
sentence_2 = 'Modern medicine is not just a boon to the rich, making the raw chemicals behind these is also pollutes the poorest neighborhoods'

In [39]:
sentence_3 = 'Modern medicine has changed the way we think about healthcare, and increased life spans, delaying weddings'

In [40]:
model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())

-0.14454556996040863

In [41]:
model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())

-0.04978240807521571

In [42]:
%time model.train(ted_talk_docs, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 6.77 s


In [43]:
model.infer_vector(sentence_1.split())

array([ 0.20152442,  0.07655947,  0.04110149, -0.09114903, -0.02466601,
        0.10063498, -0.04590227, -0.16054891, -0.23367156, -0.07714292,
       -0.32246125,  0.10532021,  0.11020374, -0.02373328, -0.06048575,
        0.06041928, -0.20840394,  0.11885054, -0.09653657,  0.02215091,
        0.01846626,  0.06881414, -0.01988592,  0.01138998,  0.06924792,
        0.11989842,  0.09510404,  0.01230403,  0.05453861,  0.05833528,
        0.22496092,  0.06185873,  0.15445319, -0.13073249,  0.1320086 ,
        0.15955518,  0.09083826, -0.262743  ,  0.07112081, -0.12404393,
       -0.07876749, -0.17020509, -0.08309909,  0.20299006, -0.07867863,
       -0.19080839, -0.00371094, -0.2119167 , -0.11631834, -0.12984131,
       -0.11451794,  0.12690201, -0.02519317,  0.23437414, -0.11313629,
        0.06674401, -0.0190409 ,  0.3384525 , -0.13124712, -0.12843844,
       -0.2605964 ,  0.22317892, -0.20078087, -0.05607577, -0.08431446,
       -0.20859231,  0.15535517,  0.0073873 , -0.11435535,  0.16

In [44]:
model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_3.split())

0.9073806748252071

In [45]:
model.docvecs.similarity_unseen_docs(model, sentence_1.split(), sentence_2.split())

0.7626341790517841

In [46]:
model.docvecs.similarity_unseen_docs(model, sentence_2.split(), sentence_3.split())

0.8026655396100536

# Model Assessment

In [47]:
ranks = []
for idx in range(len(ted_talk_docs)):
    inferred_vector = model.infer_vector(ted_talk_docs[idx].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(idx)
    ranks.append(rank)

In [48]:
import collections
collections.Counter(ranks)  # Results vary due to random seeding + very small corpus

Counter({0: 2080, 3: 1, 4: 1, 1: 1, 2: 1, 6: 1})

In [49]:
doc_slice = ' '.join(ted_talk_docs[idx].words)[:500]
print(f'Document ({idx}): «{doc_slice}»\n')
print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model}')
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
      doc_slice = ' '.join(ted_talk_docs[sims[index][0]].words)[:500]
      print(f'{label} {sims[index]}: «{doc_slice}»\n')

Document (2084): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d100,n5,mc2,s0.001,t8)
MOST (2084, 0.8938855528831482): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the priv