In [1]:
import gensim 
import numpy 
import os

In [2]:
from nltk import sent_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
# Build the training corpus: one list of cleaned tokens per sentence.
#   1. Loop through all files in the "data" directory.
#   2. Open each file and read its full content.
#   3. Split the content into sentences using NLTK.
#   4. Loop through each sentence and apply simple preprocessing to it.
#   5. Remove stopwords and store the sentence as a list of words.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))
story=[]
# os.listdir returns entries in arbitrary order; sorting keeps the sentence
# order (and therefore the training input) identical from run to run.
for filename in sorted(os.listdir("data")):
    path=os.path.join('data',filename)
    if not os.path.isfile(path):
        continue  # skip sub-directories such as .ipynb_checkpoints; open() fails on them
    # `with` closes the file handle as soon as the text is read, even on error.
    # NOTE(review): the platform default encoding is used, as before - pass
    # encoding=... explicitly once the encoding of the data files is confirmed.
    with open(path) as f:
        corpus=f.read()
    raw_sent=sent_tokenize(corpus)
    for sent in raw_sent:
        tokens=simple_preprocess(sent)  # tokenize, lowercase and clean each sentence
        filtered_tokens=[word for word in tokens if word not in stop_words]
        story.append(filtered_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# story

In [4]:
# Number of tokenized sentences collected into `story`
len(story)

145020

In [5]:
# Configure the Word2Vec model (vocabulary building and training happen in later cells)
model = gensim.models.Word2Vec(
    vector_size=100,  # each word is represented by a 100-dimensional vector
    window=10,        # context reaches at most 10 words on either side of the current word
    min_count=2,      # words that occur fewer than 2 times in the corpus are ignored
)

In [6]:
# Scan the tokenized sentences once to build the model's vocabulary
model.build_vocab(corpus_iterable=story)

In [7]:
# Train the Word2Vec model
#   total_examples - number of sentences in the whole corpus
#   epochs         - passes over the corpus; 10 here instead of the default of 5
model.train(
    corpus_iterable=story,
    total_examples=model.corpus_count,
    epochs=10,
)

(8792986, 9158780)

In [8]:
# Query the trained vectors: the 10 words closest to 'daenerys'
model.wv.most_similar(positive=['daenerys'], topn=10)

[('stormborn', 0.7561278343200684),
 ('viserys', 0.7058680653572083),
 ('targaryen', 0.7032095193862915),
 ('dragons', 0.6567473411560059),
 ('khaleesi', 0.6381340622901917),
 ('unburnt', 0.6364657878875732),
 ('unworthy', 0.6279802322387695),
 ('xhoan', 0.623015820980072),
 ('daxos', 0.6149370074272156),
 ('xaro', 0.5990347862243652)]

In [9]:
# Ask the model which of these names fits least with the others
candidates = ['cersei', 'jaime', 'bronn', 'tyrion']
model.wv.doesnt_match(candidates)

'bronn'

In [10]:
# Raw (un-normalised) embedding learned for a single word
model.wv.get_vector('jon')

array([-2.957254  ,  0.72497284,  0.6341044 , -1.4184879 ,  2.1713216 ,
        0.95460504,  1.1813219 ,  2.029756  ,  1.2839499 , -1.1807473 ,
       -0.05563115, -2.513837  , -1.4205059 , -0.9757407 , -0.8380898 ,
       -0.7749819 ,  1.0170084 , -0.3115102 ,  1.6024581 , -0.62404734,
       -1.0574028 ,  0.5266758 , -1.4273187 ,  0.7921439 ,  1.0898852 ,
        2.1377928 , -1.9508506 ,  0.15065914,  2.1389174 ,  0.8328787 ,
       -1.7771014 ,  1.0417929 , -0.5806373 ,  1.2954824 , -0.74022275,
       -1.9346622 , -0.69096404, -0.5714575 ,  1.588735  , -0.43385276,
        0.05179471,  1.8972554 , -0.9075457 , -0.409522  , -1.9359645 ,
        0.5291797 ,  2.0669353 , -1.5781862 , -2.8161538 ,  2.9925497 ,
        1.1550667 , -2.0017517 ,  0.33409363, -2.7196743 ,  2.1051776 ,
       -0.01094539, -1.629925  ,  1.6222441 ,  2.4786959 ,  4.027443  ,
       -1.8333949 , -1.1570858 ,  1.6455967 , -2.641025  ,  1.8101553 ,
        0.29122937,  0.45606643,  1.0684153 ,  0.45955974,  1.09

In [11]:
# Dimensionality of a single word vector (matches vector_size)
model.wv.get_vector('jon').shape

(100,)

In [12]:
# Cosine similarity between the vectors of two words
model.wv.similarity(w1='arya', w2='sansa')

0.6651052

In [13]:
# Cosine similarity between the vectors of two words
model.wv.similarity(w1='cersei', w2='sansa')

0.6212648

In [14]:
# Cosine similarity between the vectors of two words
model.wv.similarity(w1='tywin', w2='sansa')

0.21467058

In [15]:
# Vector representation of all the words
# model.wv.get_normed_vectors()

In [16]:
# Size of the learned vocabulary: one row per unique word, one column per vector dimension
normed_vectors = model.wv.get_normed_vectors()
normed_vectors.shape

(17310, 100)

In [17]:
# Vocabulary keys (the words) as a list ordered by their index in the model
y=model.wv.index_to_key
# y

In [18]:
# Apply dimensionality reduction using PCA
# so that we can have a visual representation in 3D
from sklearn.decomposition import PCA
# random_state pins the result when scikit-learn selects a randomized SVD solver,
# so the projected coordinates are reproducible across runs
pca=PCA(n_components=3,random_state=42)
x=pca.fit_transform(model.wv.get_normed_vectors())
x.shape # Dimensionality reduced from 100 to 3

(17310, 3)

In [19]:
import plotly.express as px

# Interactive 3D scatter of the first 100 words, one colour per word
n_points = 100
fig = px.scatter_3d(
    x[:n_points],        # PCA coordinates; columns 0, 1 and 2 are the three components
    x=0,
    y=1,
    z=2,
    color=y[:n_points],  # word for each plotted row
)
fig.show()