In [2]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
import pandas as pd
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from sklearn.decomposition import PCA
import plotly.express as px

In [3]:
# Build the training corpus: one list of lowercase tokens per sentence,
# gathered from every regular file directly inside the `data` directory.
story = []
# sorted() makes the sentence order deterministic across machines and runs;
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
for filename in sorted(os.listdir('data')):
    file_path = os.path.join('data', filename)
    # Skip sub-directories (e.g. .ipynb_checkpoints); open() would raise on them.
    if not os.path.isfile(file_path):
        continue
    # errors='replace' keeps going on undecodable bytes instead of raising.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        corpus = f.read()
    # Tokenize after the file handle is closed; only the text is needed here.
    raw_sent = sent_tokenize(corpus)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))


In [4]:
# Number of tokenized sentences collected into `story` by the loading cell.
len(story)

145020

In [5]:
# Create an untrained Word2Vec model; the vocabulary is built and the model is
# trained in the following cells. Every hyperparameter not listed here keeps
# gensim's default.
#   window=10   -> context window size passed to Word2Vec
#   min_count=2 -> minimum word frequency passed to Word2Vec
model = Word2Vec(window=10, min_count=2)

In [6]:
# Build the model's vocabulary from the tokenized sentences in `story`.
model.build_vocab(story)

In [7]:
# Train on the same sentences the vocabulary was built from, reusing the
# sentence count and epoch count already stored on the model.
model.train(
    story,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(6570860, 8628145)

In [8]:
# Words whose vectors are most similar to 'daenerys', as (word, score) pairs.
model.wv.most_similar('daenerys')

[('stormborn', 0.8421695232391357),
 ('targaryen', 0.7608627080917358),
 ('princess', 0.7545852661132812),
 ('unburnt', 0.7275064587593079),
 ('queen', 0.7180227041244507),
 ('myrcella', 0.7177507877349854),
 ('elia', 0.6964242458343506),
 ('margaery', 0.6850305199623108),
 ('dorne', 0.6773446798324585),
 ('martell', 0.6683799624443054)]

In [9]:
# Odd-one-out among the Stark children plus Jon.
# 'rickon' was previously misspelled as 'rikon'; gensim leaves words that are
# missing from the vocabulary out of the comparison, so the misspelled name
# was most likely not taking part in the result.
model.wv.doesnt_match(['jon', 'rickon', 'robb', 'arya', 'sansa', 'bran'])

'jon'

In [10]:
# Odd-one-out among the Lannister siblings plus Bronn.
# 'cersei' was previously misspelled as 'cersi'; gensim leaves words that are
# missing from the vocabulary out of the comparison, so the misspelled name
# was most likely not taking part in the result.
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [11]:
# Learned embedding vector for the word 'king'.
model.wv['king']

array([ 1.7821164e+00,  1.4644153e+00,  1.5677588e+00,  2.0458815e+00,
       -2.5858350e+00,  7.4167323e-01,  7.7090645e-01,  2.9579082e-01,
       -2.5191467e+00, -1.4541456e+00, -6.8955427e-01, -5.1413733e-01,
       -1.6042132e+00,  3.3266952e+00, -3.5379035e+00, -1.3351809e+00,
       -2.1522398e+00,  2.1602900e+00, -6.8097287e-01, -1.0119178e-01,
       -2.0269135e-03,  2.4828408e+00,  1.9622446e+00, -2.1499498e+00,
       -2.7329674e+00,  3.0356520e-01, -2.5584035e+00, -8.3221108e-01,
       -1.3036914e-01,  2.6771367e+00, -2.4211566e+00,  1.1501516e+00,
        9.0823430e-01,  8.1909442e-01,  1.1681321e+00, -2.1508844e+00,
       -4.3534780e+00, -1.2290065e+00,  2.4437513e+00, -1.7679212e+00,
        1.8103527e-01,  3.0383747e+00,  2.5277705e+00, -1.0984271e+00,
       -1.0012864e+00, -1.2618830e+00,  4.7769007e-01, -1.6676835e+00,
        3.7382331e+00, -2.9800334e+00, -2.5975816e+00, -9.4789904e-01,
       -6.2458938e-01, -3.8747714e+00,  4.2885189e+00, -1.1942421e+00,
      

In [12]:
# Similarity score between the vectors for 'arya' and 'sansa'.
model.wv.similarity('arya', 'sansa')

0.8274951

In [13]:
# Similarity score between the vectors for 'tywin' and 'sansa', for contrast
# with the 'arya'/'sansa' pair checked in the previous cell.
model.wv.similarity('tywin', 'sansa')

0.21869513

In [14]:
# Normalized copies of all word vectors, one row per vocabulary word.
model.wv.get_normed_vectors()

array([[ 0.01029525, -0.09266075,  0.02818765, ...,  0.01557963,
        -0.13049269,  0.07486587],
       [-0.20410103, -0.05639393,  0.00227876, ...,  0.03515319,
        -0.09190443,  0.25875264],
       [ 0.16004951, -0.08909918, -0.14313614, ..., -0.01888517,
         0.1496589 , -0.11547495],
       ...,
       [-0.10652535,  0.0888118 , -0.02029969, ..., -0.08736303,
        -0.03389684,  0.03487512],
       [ 0.05030248,  0.04717222,  0.00201907, ..., -0.00288412,
        -0.00290709, -0.06847835],
       [ 0.02707318,  0.09469596, -0.01177805, ..., -0.01780103,
        -0.05049935, -0.01768755]], dtype=float32)

In [15]:
# Shape of the normalized vector matrix: (number of words, vector dimensions).
model.wv.get_normed_vectors().shape

(17453, 100)

In [16]:
# Vocabulary words in index order; used later as per-point labels for the plot.
y = model.wv.index_to_key

In [17]:
# Display the vocabulary list.
# NOTE(review): this prints the entire vocabulary; a slice such as y[:20]
# would keep the notebook output short.
y

['the',
 'and',
 'to',
 'of',
 'he',
 'his',
 'was',
 'you',
 'her',
 'in',
 'it',
 'had',
 'that',
 'she',
 'as',
 'with',
 'him',
 'not',
 'but',
 'for',
 'they',
 'is',
 'at',
 'on',
 'said',
 'my',
 'have',
 'be',
 'lord',
 'them',
 'no',
 'from',
 'would',
 'were',
 'me',
 'your',
 'one',
 'all',
 'when',
 'will',
 'ser',
 'if',
 'so',
 'their',
 'we',
 'could',
 'are',
 'man',
 'there',
 'this',
 'up',
 'been',
 'what',
 'did',
 'by',
 'king',
 'do',
 'men',
 'back',
 'out',
 'more',
 'or',
 'who',
 'down',
 'well',
 'than',
 'only',
 'like',
 'jon',
 'some',
 'father',
 'old',
 'hand',
 'even',
 'too',
 'before',
 'tyrion',
 'never',
 'an',
 'off',
 'know',
 'see',
 'into',
 'made',
 'now',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'then',
 'how',
 'long',
 'has',
 'can',
 'might',
 'us',
 'come',
 'where',
 'here',
 'through',
 'still',
 'face',
 'head',
 'red',
 'll',
 'way',
 'boy',
 'page',
 'must',
 'once',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 

In [18]:
# Reduce to 3 components so the vectors can be drawn in a 3-D scatter plot.
pca = PCA(n_components =3)

In [19]:
# Fit PCA on the normalized word vectors and project them to 3 dimensions.
# X keeps one row per vocabulary word, in the same order as the input matrix.
X = pca.fit_transform(model.wv.get_normed_vectors())

In [20]:
# Peek at the first five projected vectors.
X[:5]

array([[-0.13387172, -0.5815795 ,  0.04496148],
       [-0.16438915, -0.33217555, -0.05691348],
       [ 0.2983716 , -0.5628532 , -0.22535056],
       [-0.02564286, -0.3523813 ,  0.08569622],
       [ 0.1124983 , -0.5594691 , -0.28072166]], dtype=float32)

In [21]:
# Shape of the projection: (number of words, 3 principal components).
X.shape

(17453, 3)

In [22]:
# 3-D scatter of the first vocabulary entries in PCA space.
# X was computed from the model's vector matrix and y is model.wv.index_to_key,
# so slicing both with the same bound keeps each point paired with its word.
n_words_plotted = 100  # single place to change how many words are drawn
fig = px.scatter_3d(
    X[:n_words_plotted],
    x=0,  # integer arguments select columns of the array: PC1, PC2, PC3
    y=1,
    z=2,
    color=y[:n_words_plotted],  # one colour/legend entry per word
    title=f'PCA projection of the first {n_words_plotted} word vectors',
)
fig.show()