Static Embedding - training a custom Word2Vec model

In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk
import os
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [2]:
# Build the training corpus: one list of tokens per sentence, gathered from
# every regular file in the "data" directory.
story=[]
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the sentence sequence fed to the model the same on every run and machine.
for filename in sorted(os.listdir("data")):
  file_path=os.path.join("data",filename)
  # Skip sub-directories (e.g. ".ipynb_checkpoints"); open() would raise on them.
  if not os.path.isfile(file_path):
    continue
  # NOTE(review): 'unicode_escape' also interprets backslash sequences found in
  # the text; confirm this codec is intended rather than e.g. 'latin-1'.
  with open(file_path,encoding='unicode_escape') as f:
    corpus=f.read()
  # Split the file into sentences, then turn each sentence into a token list.
  raw_sent=sent_tokenize(corpus)
  for sent in raw_sent:
    story.append(simple_preprocess(sent))

In [3]:
# Number of tokenized sentences collected into `story`.
len(story)

145020

In [4]:
from gensim.models import Word2Vec

In [5]:
# Create an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary is built and training is run explicitly in later cells.
#   window=10       - context window size around each target word
#   min_count=5     - words occurring fewer than 5 times are left out of the vocabulary
#   vector_size=150 - dimensionality of each word vector
model=Word2Vec(window=10,min_count=5,vector_size=150)

In [6]:
# Scan the tokenized sentences once to build the model's vocabulary.
model.build_vocab(story)

In [7]:
# Sentence count recorded by build_vocab; it should equal len(story).
model.corpus_count

145020

In [8]:
# Epoch setting stored on the model (not set in the constructor call, so this is the library default).
model.epochs

5

In [9]:
# Total number of word tokens counted in the corpus during the vocabulary scan.
model.corpus_total_words

1725638

In [10]:
# Train on the whole corpus. The epoch count is read from the model instead of
# repeating a hard-coded 5, so the model's own setting stays the single source
# of truth if it is ever changed in the constructor.
model.train(story,total_examples=model.corpus_count, epochs=model.epochs)

(6484551, 8628190)

In [11]:
# Look up the learned embedding vector for the word "king".
model.wv["king"]

array([ 1.8213556 ,  2.3473508 , -1.1468114 , -0.61086005, -1.6954284 ,
       -2.3783681 , -1.6260967 , -1.4870921 , -1.6656766 ,  1.8992511 ,
        0.5031965 ,  1.1017445 , -0.6479549 , -2.1593692 , -4.1195035 ,
        0.40523058,  1.2553421 ,  2.5821877 , -2.323379  ,  1.3162984 ,
       -1.7927756 ,  0.99659103, -1.3757925 , -1.4952253 , -0.08691862,
       -0.5666266 ,  0.8569716 , -3.0669935 , -0.7740389 ,  1.0756698 ,
        3.3497486 ,  0.54998547, -0.43681848, -3.8005521 ,  0.9174523 ,
       -2.0670311 ,  1.1737636 , -0.4134522 ,  1.5600294 ,  0.45597813,
        0.69423974,  1.5622146 ,  0.22658639,  0.93126756,  0.72388846,
        0.95135945,  1.6668042 , -0.30490914,  1.4379836 , -2.0626814 ,
       -1.1989062 , -0.85838187,  1.6056644 , -0.5906851 , -0.34186715,
       -0.29208165,  0.6738932 ,  1.6585488 ,  2.5267215 ,  2.088576  ,
        1.0261335 , -1.5229226 ,  0.23849832,  0.6292847 , -0.6451711 ,
       -0.50820154,  1.0639036 ,  0.8404291 , -0.8381949 , -0.12

In [12]:
# Words whose vectors are closest to "king", each paired with a similarity score.
model.wv.most_similar("king")

[('realm', 0.6815700531005859),
 ('baratheon', 0.6807931065559387),
 ('prince', 0.6512925028800964),
 ('usurper', 0.619055449962616),
 ('council', 0.6083788275718689),
 ('conqueror', 0.6065840125083923),
 ('tourney', 0.5901256203651428),
 ('throne', 0.5857936143875122),
 ('battle', 0.576046347618103),
 ('victory', 0.5618283152580261)]

In [13]:
# Odd-one-out query over six character names.
# The name is spelled "rickon"; a misspelt token is presumably absent from the
# vocabulary and would be left out of the comparison, so the query would
# effectively run on five names only - verify with `"rickon" in model.wv.key_to_index`.
model.wv.doesnt_match(['jon','rickon','robb','arya','sansa','bran'])

'jon'

In [14]:
# NOTE(review): this repeats the "king" vector lookup from an earlier cell
# and yields the same array; the cell can be dropped.
model.wv["king"]

array([ 1.8213556 ,  2.3473508 , -1.1468114 , -0.61086005, -1.6954284 ,
       -2.3783681 , -1.6260967 , -1.4870921 , -1.6656766 ,  1.8992511 ,
        0.5031965 ,  1.1017445 , -0.6479549 , -2.1593692 , -4.1195035 ,
        0.40523058,  1.2553421 ,  2.5821877 , -2.323379  ,  1.3162984 ,
       -1.7927756 ,  0.99659103, -1.3757925 , -1.4952253 , -0.08691862,
       -0.5666266 ,  0.8569716 , -3.0669935 , -0.7740389 ,  1.0756698 ,
        3.3497486 ,  0.54998547, -0.43681848, -3.8005521 ,  0.9174523 ,
       -2.0670311 ,  1.1737636 , -0.4134522 ,  1.5600294 ,  0.45597813,
        0.69423974,  1.5622146 ,  0.22658639,  0.93126756,  0.72388846,
        0.95135945,  1.6668042 , -0.30490914,  1.4379836 , -2.0626814 ,
       -1.1989062 , -0.85838187,  1.6056644 , -0.5906851 , -0.34186715,
       -0.29208165,  0.6738932 ,  1.6585488 ,  2.5267215 ,  2.088576  ,
        1.0261335 , -1.5229226 ,  0.23849832,  0.6292847 , -0.6451711 ,
       -0.50820154,  1.0639036 ,  0.8404291 , -0.8381949 , -0.12

In [15]:
# Matrix of all word vectors scaled to unit length, one row per vocabulary word.
vectors=model.wv.get_normed_vectors()

In [16]:
# Vocabulary words in index order; used later as the colour labels for the plot.
# presumably position i here corresponds to row i of `vectors` - TODO confirm
y=model.wv.index_to_key

In [17]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): model.save() appears to write gensim's own format, not the
# word2vec C binary format that a ".bin" name may suggest - confirm the
# intended consumer loads it with Word2Vec.load().
model.save("word2vec.bin")

In [29]:
import gzip

In [32]:
import gzip
import shutil

def compress_file(input_file, output_file):
    """Write a gzip-compressed copy of ``input_file`` to ``output_file``.

    Both arguments are filesystem paths. The source is opened in binary mode
    and streamed into the gzip archive. An existing ``output_file`` is
    overwritten. Returns ``None``.
    """
    with open(input_file, 'rb') as source, gzip.open(output_file, 'wb') as target:
        shutil.copyfileobj(source, target)

# Source and destination paths for the gzip step; edit these two literals to
# compress a different file.
input_file, output_file = 'word2vec.bin', 'word2vec.bin.gz'

compress_file(input_file=input_file, output_file=output_file)

In [19]:
from sklearn.decomposition import PCA
# Reduce the unit-length word vectors to three principal components so they
# can be drawn in a 3-D scatter plot.
# random_state pins the solver's random number generator (used when a
# randomized SVD solver is selected) so the projection is repeatable.
pca = PCA(n_components=3, random_state=42)
X=pca.fit_transform(vectors)

In [20]:
# Shape of the PCA projection: one row per word vector, three components per row.
X.shape

(11760, 3)

In [21]:
import plotly.express as px
# Plot only a 50-word window of the vocabulary so the legend stays readable.
# The same slice is applied to the coordinates and to the labels to keep them aligned.
word_slice = slice(200, 250)
fig = px.scatter_3d(X[word_slice],x=0,y=1,z=2, color=y[word_slice],
                    title="Word2Vec embeddings projected to 3 PCA components (vocabulary entries 200-249)")
fig.show()