In [1]:
import gensim
print(f'gensim: {gensim.__version__}')

gensim: 3.8.1


Let's download some pre-trained GLove embeddings: 

In [2]:
!conda install -y tqdm

Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/sangram/anaconda3

  added / updated specs:
    - tqdm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    tqdm-4.40.2                |             py_0          53 KB
    ------------------------------------------------------------
                                           Total:          53 KB

The following packages will be UPDATED:

  tqdm                 pkgs/main/osx-64::tqdm-4.31.1-py37_1 --> pkgs/main/noarch::tqdm-4.40.2-py_0



Downloading and Extracting Packages
tqdm-4.40.2          | 53 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [3]:
from tqdm import tqdm
class TqdmUpTo(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None: self.total = tsize
        self.update(b * bsize - self.n)

def get_data(url, filename):
    """
    Download data if the filename does not exist already
    Uses Tqdm to show download progress
    """
    import os
    from urllib.request import urlretrieve
    
    if not os.path.exists(filename):

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urlretrieve(url, filename, reporthook=t.update_to)

In [4]:
embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
get_data(embedding_url, 'data/glove.6B.zip')

glove.6B.zip: 862MB [08:41, 1.65MB/s]                               


In [5]:
# # We need to run this only once, can unzip manually unzip to the data directory too
!unzip data/glove.6B.zip 
!mv glove.6B.300d.txt data/glove.6B.300d.txt 
!mv glove.6B.200d.txt data/glove.6B.200d.txt 
!mv glove.6B.100d.txt data/glove.6B.100d.txt 
!mv glove.6B.50d.txt data/glove.6B.50d.txt 

Archive:  data/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [6]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'data/glove.6B.300d.txt'

word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'

In [8]:
import os
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

### KeyedVectors API

In [9]:
from gensim.models import KeyedVectors
filename = word2vec_output_file 

In [10]:
%%time
# load the Stanford GloVe model from file, this is Disk I/O and can be slow
pretrained_w2v_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
# binary=False format for human readable text (.txt) files, and binary=True for .bin files 

CPU times: user 1min 27s, sys: 763 ms, total: 1min 28s
Wall time: 1min 28s


In [11]:
# calculate: (king - man) + woman = ?
result = pretrained_w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

  


[('queen', 0.6713277101516724)]


In [12]:
# calculate: (india - canada) +  = ?
result = pretrained_w2v_model.most_similar(positive=['quora', 'facebook'], negative=['linkedin'], topn=1)
print(result)

[('twitter', 0.37966805696487427)]


In [13]:
pretrained_w2v_model.most_similar('india')

[('indian', 0.7355823516845703),
 ('pakistan', 0.7285579442977905),
 ('delhi', 0.6846907138824463),
 ('bangladesh', 0.6203191876411438),
 ('lanka', 0.609517514705658),
 ('sri', 0.6011613607406616),
 ('kashmir', 0.5746493935585022),
 ('nepal', 0.5421023368835449),
 ('pradesh', 0.5405811071395874),
 ('maharashtra', 0.518537700176239)]

#### What is missing in both word2vec and GloVe? 

In [14]:
try:
    pretrained_w2v_model.wv.most_similar('sangram')
except Exception as e:
    print(e)

"word 'nirant' not in vocabulary"


  


### How to handle OOV words? 

In [15]:
ted_dataset = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
get_data(ted_dataset, "data/ted_en.zip")

ted_en-20160408.zip&filename=ted_en-20160408.zip: 16.0MB [00:20, 794kB/s] 


In [16]:
import zipfile
import lxml.etree
with zipfile.ZipFile('data/ted_en.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()'))

In [17]:
input_text[:500]

"Here are two reasons companies fail: they only do more of the same, or they only do what's new.\nTo me the real, real solution to quality growth is figuring out the balance between two activities: exploration and exploitation. Both are necessary, but it can be too much of a good thing.\nConsider Facit. I'm actually old enough to remember them. Facit was a fantastic company. They were born deep in the Swedish forest, and they made the best mechanical calculators in the world. Everybody used them. A"

In [18]:
import re
# remove parenthesis 
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

# store as list of sentences
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

# store as list of lists of words
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)

In [19]:
print(sentences_ted[:2])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]


In [20]:
import json
with open('ted_clean_sentences.json', 'w') as fp:
    json.dump(sentences_ted, fp)

In [21]:
with open('ted_clean_sentences.json', 'r') as fp:
    sentences_ted = json.load(fp)

In [22]:
print(sentences_ted[:2])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]


### Train FastText Embedddings

In [23]:
from gensim.models.fasttext import FastText

In [24]:
%%time
fasttext_ted_model = FastText(sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)
# sg = 1 denotes skipgram, else CBOW is used

CPU times: user 12.1 s, sys: 1.97 s, total: 14.1 s
Wall time: 14.1 s


In [25]:
fasttext_ted_model.wv.most_similar("india")

[('indian', 0.6187378764152527),
 ('indiana', 0.5800899267196655),
 ('indians', 0.5743587017059326),
 ('indignant', 0.4656485915184021),
 ('indirect', 0.43688109517097473),
 ('inventors', 0.41344690322875977),
 ('individualized', 0.412820041179657),
 ('indigo', 0.3998013138771057),
 ('something', 0.3908068537712097),
 ('ethical', 0.3907659351825714)]

### Train word2vec Embeddings

In [26]:
from gensim.models.word2vec import Word2Vec

In [27]:
%%time
word2vec_ted_model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=-1, sg=1)

CPU times: user 4.82 s, sys: 48.5 ms, total: 4.87 s
Wall time: 3.96 s


In [28]:
word2vec_ted_model.wv.most_similar("india")

[('trolley', 0.4043310880661011),
 ('paramount', 0.4010774791240692),
 ('heinrich', 0.37615031003952026),
 ('amber', 0.3754728436470032),
 ('champion', 0.36570775508880615),
 ('murderer', 0.36432310938835144),
 ('bites', 0.3610832691192627),
 ('cloak', 0.3566766381263733),
 ('positively', 0.3540581464767456),
 ('estimated', 0.3401758372783661)]

## fastText or word2vec? 

# Document Embeddings

In [29]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from pprint import pprint
import multiprocessing

In [30]:
import zipfile
import lxml.etree
with zipfile.ZipFile('data/ted_en.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
    
talks = doc.xpath('//content/text()')

In [31]:
def read_corpus(talks, tokens_only=False):
    for i, line in enumerate(talks):
        if tokens_only:
            yield gensim.utils.simple_preprocess(line)
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [32]:
read_corpus(talks)

<generator object read_corpus at 0x7fe27a0c1e58>

In [33]:
ted_talk_docs = list(read_corpus(talks)) 

In [34]:
ted_talk_docs[0]

TaggedDocument(words=['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 'new', 'to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation', 'both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'good', 'thing', 'consider', 'facit', 'actually', 'old', 'enough', 'to', 'remember', 'them', 'facit', 'was', 'fantastic', 'company', 'they', 'were', 'born', 'deep', 'in', 'the', 'swedish', 'forest', 'and', 'they', 'made', 'the', 'best', 'mechanical', 'calculators', 'in', 'the', 'world', 'everybody', 'used', 'them', 'and', 'what', 'did', 'facit', 'do', 'when', 'the', 'electronic', 'calculator', 'came', 'along', 'they', 'continued', 'doing', 'exactly', 'the', 'same', 'in', 'six', 'months', 'they', 'went', 'from', 'maximum', 'revenue', 'and', 'they', 'were', 'gone'

In [35]:
cores = multiprocessing.cpu_count()
print(cores)

12


In [36]:
model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, epochs=5, workers=cores)

In [37]:
%time model.build_vocab(ted_talk_docs)

CPU times: user 6.22 s, sys: 88.5 ms, total: 6.31 s
Wall time: 6.32 s


In [38]:
sentence_a = 'Modern medicine has changed the way we think about healthcare, life spans and by extension career and marriage'

In [39]:
sentence_b = 'Modern medicine is not just a boon to the rich, making the raw chemicals behind these is also pollutes the poorest neighborhoods'

In [40]:
sentence_c = 'Modern medicine has changed the way we think about healthcare, and increased life spans, delaying weddings'

In [41]:
model.docvecs.similarity_unseen_docs(model, sentence_a.split(), sentence_c.split())

0.08022483

In [42]:
model.docvecs.similarity_unseen_docs(model, sentence_a.split(), sentence_b.split())

-0.062307782

In [43]:
%time model.train(ted_talk_docs, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 21.8 s, sys: 167 ms, total: 22 s
Wall time: 5.75 s


In [44]:
model.infer_vector(sentence_a.split())

array([ 0.03525059, -0.1302001 , -0.10057113, -0.06664311,  0.01500211,
        0.04596484,  0.02954374,  0.10806137,  0.08382607,  0.15114088,
        0.06105766,  0.00202065, -0.05098704, -0.0542175 ,  0.03858076,
        0.01167453, -0.0638482 , -0.05530548,  0.14384449,  0.06772038,
        0.04906154, -0.0785329 , -0.03195518, -0.12691094, -0.01219349,
       -0.01389712, -0.14068377, -0.08287106, -0.02886995, -0.12752861,
       -0.01733368,  0.05183846, -0.09669829,  0.06306656, -0.0680143 ,
        0.00067281,  0.08220522,  0.02236368,  0.0470303 ,  0.07940231,
       -0.03391537, -0.20405623,  0.02754878,  0.08919586, -0.00795464,
        0.06342661, -0.01824609,  0.15627132, -0.09400826, -0.01535255,
        0.04404864, -0.06645317, -0.02399574, -0.18312019, -0.0006153 ,
       -0.03808543, -0.13044794,  0.091874  ,  0.00850958,  0.05326273,
        0.03017448,  0.15139206,  0.06555739,  0.00453148, -0.2491518 ,
        0.16126929,  0.27457458,  0.14156994,  0.13067663, -0.04

In [45]:
model.docvecs.similarity_unseen_docs(model, sentence_a.split(), sentence_c.split())

0.94174534

In [46]:
model.docvecs.similarity_unseen_docs(model, sentence_a.split(), sentence_c.split())

0.9575233

In [47]:
model.docvecs.similarity_unseen_docs(model, sentence_b.split(), sentence_c.split())

0.8728296

# Model Assessment

In [48]:
ranks = []
for idx in range(len(ted_talk_docs)):
    inferred_vector = model.infer_vector(ted_talk_docs[idx].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(idx)
    ranks.append(rank)

In [49]:
import collections
collections.Counter(ranks)  # Results vary due to random seeding + very small corpus

Counter({0: 2081, 1: 1, 8: 1, 4: 1, 2: 1})

In [50]:
doc_slice = ' '.join(ted_talk_docs[idx].words)[:500]
print(f'Document ({idx}): «{doc_slice}»\n')
print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model}')
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
      doc_slice = ' '.join(ted_talk_docs[sims[index][0]].words)[:500]
      print(f'{label} {sims[index]}: «{doc_slice}»\n')

Document (2084): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the private sector really don feel as though they re in danger the reason why here today in part is because of dog an abandoned puppy»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow,d100,n5,mc2,s0.001,t12)
MOST (2084, 0.9372626543045044): «if you re here today and very happy that you are you ve all heard about how sustainable development will save us from ourselves however when we re not at ted we are often told that real sustainability policy agenda is just not feasible especially in large urban areas like new york city and that because most people with decision making powers in both the public and the pri