In [54]:
import os
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import common_texts
from gensim.models.fasttext import FastText

import numpy as np
import pandas as pd

In [2]:
from tqdm import tqdm
class TqdmUpTo(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None: self.total = tsize
        self.update(b * bsize - self.n)

def get_data(url, filename):
    """
    Download data if the filename does not exist already
    Uses Tqdm to show download progress
    """
    import os
    from urllib.request import urlretrieve
    
    if not os.path.exists(filename):

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urlretrieve(url, filename, reporthook=t.update_to)

In [3]:
embedding_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
get_data(embedding_url, 'data/glove.6B.zip')

glove.6B.zip: 862MB [07:19, 1.96MB/s]                               


In [4]:
!unzip data/glove.6B.zip 
!mv glove.6B.300d.txt data/glove.6B.300d.txt 
!mv glove.6B.200d.txt data/glove.6B.200d.txt 
!mv glove.6B.100d.txt data/glove.6B.100d.txt 
!mv glove.6B.50d.txt data/glove.6B.50d.txt

/usr/bin/sh: 1: unzip: not found
mv: cannot stat 'glove.6B.300d.txt': No such file or directory
mv: cannot stat 'glove.6B.200d.txt': No such file or directory
mv: cannot stat 'glove.6B.100d.txt': No such file or directory
mv: cannot stat 'glove.6B.50d.txt': No such file or directory


In [3]:
glove_input_file = 'data/glove.6B.300d.txt'
word2vec_output_file = 'data/glove.6B.300d.word2vec.txt'

In [4]:
if not os.path.exists(word2vec_output_file):
    glove2word2vec(glove_input_file, word2vec_output_file)

In [5]:
filename = word2vec_output_file 

In [6]:
%%time
# load the Stanford GloVe model from file, this is Disk I/O and can be slow
pretrained_w2v_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
# binary=False format for human readable text (.txt) files, and binary=True for .bin files 

CPU times: user 2min 8s, sys: 5.36 s, total: 2min 13s
Wall time: 2min 46s


In [7]:
# calculate: (king - man) + woman = ?
result = pretrained_w2v_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.6713276505470276)]


In [14]:
!cd .. && cd utils && cd data && cd processed_data && ls

dev.csv  train.csv


In [20]:
df = pd.read_csv("../utils/data/processed_data/train.csv", names = ["story_id", "obs1", "obs2", "hyp1", "hyp2", "label"])

In [23]:
obs1 = df["obs1"].tolist()
obs2 = df["obs2"].tolist()
hyp1 = df["hyp1"].tolist()
hyp2 = df["hyp2"].tolist()

In [25]:
all_text = obs1 + obs2 + hyp1 + hyp2

In [33]:
processed_text = []
for text in all_text:
    processed_text.append(text.split(" "))

In [45]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_text)]

# Training a Doc2Vec model

In [46]:
doc2vec_base_model = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, epochs=5, workers=-1)

In [48]:
%%time

doc2vec_base_model.build_vocab(documents)

CPU times: user 2.94 s, sys: 234 ms, total: 3.17 s
Wall time: 3.17 s


In [49]:
doc2vec_base_model.infer_vector(processed_text[0])

array([-0.00290196, -0.00148584, -0.00139852,  0.00190343, -0.00373284,
       -0.00137311, -0.00450431, -0.00383791,  0.00236699, -0.0031576 ,
        0.00148719, -0.00291967,  0.00300254, -0.00267439, -0.00274993,
       -0.00127494,  0.00227505, -0.00320041, -0.00394575, -0.00174675,
        0.00142009, -0.00391585,  0.00038249, -0.00446946, -0.00064276,
        0.00168029,  0.00441429, -0.00157894,  0.00166953,  0.00288979,
        0.00407829,  0.0039332 , -0.00069688,  0.00191259,  0.00029152,
       -0.0048022 , -0.00035559,  0.00424788,  0.00351371,  0.00401065,
       -0.00236547, -0.00473242,  0.00312831,  0.00463834,  0.00300489,
       -0.00406328, -0.00181197,  0.00452758, -0.00156318, -0.00124377,
       -0.00049196,  0.00382311, -0.00321055,  0.00011194, -0.00491443,
        0.00071475, -0.00222727, -0.00247447, -0.00074043,  0.00462786,
       -0.00456153, -0.00444072, -0.00034415, -0.00464947, -0.00204079,
       -0.00424233, -0.00282344,  0.0011066 ,  0.00480799, -0.00

# Training a FastText model

In [52]:
fasttext_base_model = FastText(processed_text, vector_size=100, window=5, min_count=5, workers=-1, sg=1)

In [53]:
fasttext_base_model.wv["Chad"]

array([-2.2939106e-03,  1.1612010e-03,  2.2233955e-03,  1.1669899e-03,
       -1.0534826e-03,  2.4040132e-04, -2.5102382e-03, -2.0270837e-04,
        1.2936351e-04, -2.9297101e-03,  3.1619554e-04,  5.8306177e-04,
       -1.9637542e-03,  5.0960947e-03,  2.5835370e-03,  3.3056847e-04,
        2.1819957e-04,  3.4515810e-04, -1.6507604e-03,  4.5414671e-04,
       -1.5155672e-03,  2.6436618e-03, -7.6703227e-04,  9.7881549e-04,
        4.0063684e-04,  2.2443405e-03,  3.4055440e-03, -1.7007887e-03,
        3.8962893e-03, -1.3323580e-03,  2.7593544e-03, -2.6234465e-03,
        4.3789347e-05,  3.1632686e-03, -2.1066098e-03, -8.1642630e-04,
        7.8480656e-04,  7.1367639e-04,  1.4462688e-03,  1.1740879e-03,
        6.5752474e-04, -9.9569187e-04,  1.9538745e-03,  2.1816550e-03,
       -2.6893481e-03,  2.9551669e-05,  9.0989994e-04,  2.0642153e-03,
        2.8098500e-03,  3.2143644e-03,  1.0430771e-03, -1.7371923e-03,
       -9.3971752e-04, -4.6474437e-04,  1.3880846e-04, -1.3102989e-05,
      

In [55]:
def get_sentence_vector(model, text):
    vectors = []
    for token in text:
        vec = model.wv[token]
        norm = np.linalg.norm(vec)
        if norm > 0:
            vec /= norm
        vectors.append(vec)
    return np.mean(vectors, axis=0)

In [56]:
get_sentence_vector(fasttext_base_model, processed_text[0])

ValueError: output array is read-only