## 0. Imports

In [1]:
import pandas as pd
import gensim
import spacy
from tqdm import tqdm



In [2]:
tqdm.pandas(desc="Progress")

In [3]:
nlp_en = spacy.load("en_core_web_md")

## 1. Train word embeddings

#### 1.1 Get data

In [4]:
pd_data = pd.read_csv("Reviews.csv")

In [17]:
pd_data.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,tokens
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,"[I, have, bought, several, of, the, Vitality, ..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,"[Product, arrived, labeled, as, Jumbo, Salted,..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,"[This, is, a, confection, that, has, been, aro..."


#### 1.2. Process data

In [6]:
def get_tokens(sentence):
    return [x.text for x in nlp_en(sentence)]

In [7]:
pd_data["tokens"] = pd_data["Text"].progress_apply(get_tokens)

Progress: 100%|██████████████████████████████████████████████████████████████| 568454/568454 [4:14:40<00:00, 37.20it/s]


In [11]:
pd_data.to_pickle("pd_data_tokenized.pickle")

In [2]:
pd_data = pd.read_pickle("pd_data_tokenized.pickle")

#### 1.3. Train word embeddings using word2vec

In [8]:
model_w2v = gensim.models.Word2Vec(pd_data["tokens"].tolist(), min_count=5, window = 9, size = 100)

#### 1.4. Train word embeddings using fasttext

In [9]:
model_ft = gensim.models.FastText(pd_data["tokens"].tolist(), min_count=5, window = 9, size = 100)

#### 1.5. Persistence

In [None]:
model_w2v.save("model_w2v.model")
model_ft.save("model_ft.model")

In [2]:
model_w2v = gensim.models.Word2Vec.load("model_w2v.model")
model_ft = gensim.models.FastText.load("model_ft.model")

#### 1.6. Similarity

In [7]:
model_w2v.most_similar("salmon", topn=5)

  if __name__ == '__main__':


[('fish', 0.8536328077316284),
 ('tuna', 0.7662709951400757),
 ('chicken', 0.7630202174186707),
 ('seafood', 0.7627329230308533),
 ('turkey', 0.7592297792434692)]

In [8]:
model_w2v.most_similar(positive=['cheese'], topn=5)

  if __name__ == '__main__':


[('cheddar', 0.7746697068214417),
 ('mozzarella', 0.7572810649871826),
 ('parmesan', 0.7331867218017578),
 ('chedder', 0.7296013236045837),
 ('mayo', 0.7252874374389648)]

#### 1.7. Correlation

In [1]:
model_w2v.most_similar(positive=['pea', 'salsa'], negative=['tomato'], topn=3)

NameError: name 'model_w2v' is not defined

In [14]:
model_w2v.most_similar(positive=['lemon', 'water'], topn=3)

  if __name__ == '__main__':


[('tequila', 0.7341920137405396),
 ('lemonade', 0.7284362316131592),
 ('juice', 0.7281173467636108)]

In [15]:
model_w2v.most_similar(positive=['salami', 'crust'], topn=3)

  if __name__ == '__main__':


[('bread', 0.7283815145492554),
 ('pizza', 0.7018527388572693),
 ('dough', 0.6836484670639038)]

In [16]:
model_w2v.most_similar(positive=['beef', 'bun'], topn=3)

  if __name__ == '__main__':


[('hamburger', 0.814429521560669),
 ('ham', 0.795830488204956),
 ('sausage', 0.7887133359909058)]

## 2. Visualise them

In [33]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

In [27]:
%%time
model_w2v = gensim.models.Word2Vec(pd_data["tokens"].tolist(), min_count=500, window = 9, size = 100)

Wall time: 2min 7s


In [29]:
tokens = []
labels = []

for x in model_w2v.wv.vocab:
    tokens.append(model_w2v[x])
    labels.append(x)



In [30]:
%%time
tsne_model = TSNE(n_components=2, random_state=11)
fitted = tsne_model.fit_transform(tokens)

Wall time: 2min 47s


In [34]:
output_file("plot.html")
            
p = figure(plot_width=1000, plot_height=1000)

lst = list(model_w2v.wv.vocab)



p.circle(fitted[:, 0], fitted[:, 1], size=2, color="navy", alpha=0.5)

texts = lst


source = ColumnDataSource(data=dict(x=fitted[:, 0], y=fitted[:, 1], text=texts))

labels = LabelSet(x='x', y='y', text='text',
         x_offset=5, y_offset=5, source=source)
p.add_layout(labels)



show(p)