### Training Word2Vec on text8

#### 1. Importing the Libraries

In [56]:
import matplotlib.pyplot as plt
# Import api's through which we can download different type of models
import gensim.downloader as api

from gensim.models import Word2Vec
from sklearn.decomposition import PCA

import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from matplotlib import pyplot as plt

#### 2. Loading the Corpus and Training the Model

In [57]:
corpus = api.load('text8')
corpus = [word for word in corpus]

corpora = ''
for corp in corpus:
    corpora+= ' '.join(corp)
    
print('Corpus size :',len(corpora.split(' ')))
print('Vocab size  :',len(set(corpora.split(' '))))

del corpora
# Take the words which appear at least 2000 times in text corpus
model = Word2Vec(corpus, min_count = 2000, vector_size=300,window=3, sg=0, epochs=5)

Corpus size : 17003507
Vocab size  : 255310


In [58]:
model.wv.most_similar('first',topn=10)

[('last', 0.4780724346637726),
 ('second', 0.4321524500846863),
 ('third', 0.4040554463863373),
 ('final', 0.361699640750885),
 ('next', 0.3391777276992798),
 ('later', 0.33212709426879883),
 ('originally', 0.31756410002708435),
 ('after', 0.2876306474208832),
 ('during', 0.28711268305778503),
 ('title', 0.27928686141967773)]

In [59]:
# Converting a 300 dimension to a 2D matrix
model.wv['the'].shape

(300,)

#### Reducing the Dimensions

In [61]:
# Using PCA for Dimensionality Reduction
X     = [model.wv[word] for word in model.wv.index_to_key]
words = list(model.wv.index_to_key)


print(len(X)) # word vector of all words
print(len(words)) # No of words

print("Dimensions of each Vector | Before :",len(X[0]))

pca    = PCA(n_components=2)
result = pca.fit_transform(X)

print("Dimensions of each Vector | After :",len(result[0]))

data = {'X':result[:,0],'y':result[:,1], 'label':words}
df = pd.DataFrame(data)

871
871
Dimensions of each Vector | Before : 300
Dimensions of each Vector | After : 2


### Visualising Word Vectors

In [62]:
df.head()

Unnamed: 0,X,y,label
0,0.511195,-0.262493,the
1,0.326286,-0.951887,of
2,0.207591,-0.784186,and
3,6.313051,0.261817,one
4,1.660067,-0.28408,in


In [64]:
scatter = go.Scatter(
    x=df['X'], 
    y=df['y'], 
    mode='markers+text', 
    text=df['label'], 
    textposition='top center', 
    marker=dict(size=4, color='rgba(157,0,0,0.8)')
)

layout = go.Layout(
    title='Word Vectors Visualisation',
    xaxis=dict(title='Dimension-1'),
    yaxis=dict(title='Dimension-2'),
    hovermode='closest'
)

fig = go.Figure(data=[scatter], layout=layout)

pio.show(fig)
