<i><u>Dataset</u>: <link>https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books</link></i>

In [1]:
# Importing necessary libraries
import os
import spacy
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

# Ensure NLTK stopwords are available
nltk.download('stopwords', quiet=True)
stop_words = set[str](stopwords.words('english'))

# Fallback to a blank English pipeline if the model `en_core_web_sm` is missing
try:
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
except OSError: # [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.
    nlp = spacy.blank("en")
    # Add the sentencizer to the blank pipeline so doc.sents works
    nlp.add_pipe('sentencizer')

# Allow processing of longer texts -> ERROR: ValueError: [E088] Text of length 1607894 exceeds maximum of 1000000. 
nlp.max_length = 5_000_000  # raise limit to ~5M characters

story = []
for filename in os.listdir('data'):
    with open(os.path.join('data', filename), encoding='utf-8', errors='ignore') as f: # `errors='ignore'` some characters are not encodable 
        corpus = f.read()

    # Turn the raw text into a spaCy Doc object so we can easily split it into sentences/words
    doc = nlp(corpus)
    
    # Splitting the sentences into sentences
    for sent in doc.sents: # We can also split it into words
        tokens = simple_preprocess(sent.text)
        # Remove stopwords
        filtered_tokens = [word for word in tokens if word not in stop_words]
        story.append(filtered_tokens)

In [2]:
# 2nd and 3rd sentence - The document is tokenized into sentences and inside that the sentences are tokenized into words
story[2:4]

[['dead', 'frighten'], ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile']]

In [3]:
# Building embedding model
model = gensim.models.Word2Vec(
    window=10, # Number of words on both side of the center word
    min_count=2, # Minimum frequency of words in the sentence must be greater than 2
    vector_size=100, # Embedding dimension
    workers=8, # CPU cores
    epochs=5 # Model Training
)

In [4]:
# Build vocabulary (DataFrame) from a sequence of sentences
model.build_vocab(story)

In [5]:
# Model training
model.train(story, total_examples=model.corpus_count, epochs=model.epochs) # corpus_count -> number of sentences

(4416953, 4596315)

In [6]:
# Similarity score
model.wv.most_similar('daenerys')

[('unburnt', 0.8961785435676575),
 ('stormborn', 0.89205402135849),
 ('targaryen', 0.8480511903762817),
 ('myrcella', 0.82525235414505),
 ('elia', 0.8220338225364685),
 ('trystane', 0.8192443251609802),
 ('mellario', 0.8180558085441589),
 ('regent', 0.814903974533081),
 ('princess', 0.8097535967826843),
 ('rhaegar', 0.8051192164421082)]

In [7]:
# Odd one out
model.wv.doesnt_match(['jon','rikon','robb','arya','sansa','bran'])

'jon'

In [8]:
model.wv.doesnt_match(['cersei', 'jaime', 'bronn', 'tyrion'])

'bronn'

In [9]:
# Word embedding - 100 Dim
model.wv['king']

array([-1.7076805 ,  0.3378017 , -0.14002812,  1.4123836 , -0.70694906,
       -2.178242  , -0.83933663,  3.497339  ,  0.52729666, -2.326885  ,
        1.1667737 , -1.4921064 , -0.19102168,  0.722947  , -1.2649503 ,
        2.6134615 ,  2.3453095 , -1.8898864 ,  0.6683241 , -0.65343577,
        0.00441224,  2.24324   , -0.14758177, -2.756603  ,  2.375252  ,
       -0.4343123 , -0.95208514,  0.6880082 , -0.42334163, -0.46767798,
        1.1637002 ,  0.37063858,  1.7552576 ,  1.8531127 , -2.3360245 ,
        0.9880474 ,  0.29965758, -0.03597204,  0.7160568 ,  0.8822614 ,
        1.3992923 ,  0.8334106 , -0.16992705, -0.7842182 , -0.09359948,
        0.62809706, -0.8358456 ,  0.01073305, -0.45149785,  1.4737802 ,
        0.81154656,  1.9759731 , -1.2341201 ,  1.4714824 ,  0.16409235,
       -2.063168  , -0.32532382, -0.31593338,  0.315437  , -0.8316158 ,
        0.09527929,  2.263629  ,  0.56427777, -0.03446501, -1.7788727 ,
        0.48453107, -1.0970069 , -1.1295917 ,  0.95109653, -0.21

In [10]:
# Similarity score between two words
model.wv.similarity('arya','sansa')

np.float32(0.818137)

In [11]:
model.wv.similarity('cersei','sansa')

np.float32(0.72531044)

In [12]:
model.wv.similarity('tywin','sansa')

np.float32(0.35171378)

In [13]:
# Words embeddings of all unique words
model.wv.get_normed_vectors()

array([[ 0.09263884,  0.0588668 ,  0.03136105, ..., -0.00877592,
         0.01238426,  0.24706131],
       [-0.09221041,  0.11929084,  0.13559452, ..., -0.05017456,
        -0.07251734,  0.14086954],
       [-0.09155999,  0.06388756,  0.01005942, ..., -0.00671783,
         0.06718885, -0.08216917],
       ...,
       [-0.16364694,  0.21829976,  0.06539023, ..., -0.15042563,
         0.03285289,  0.06592924],
       [-0.14206408,  0.15002455,  0.00848819, ..., -0.21780747,
         0.07253899,  0.14084102],
       [-0.05010073, -0.01804507, -0.10845242, ..., -0.09725441,
        -0.10349167,  0.12259864]], shape=(17726, 100), dtype=float32)

In [14]:
# Label of word embeddings of above function
y = model.wv.index_to_key
y

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'men',
 'back',
 'king',
 'well',
 'like',
 'jon',
 'old',
 'hand',
 'even',
 'never',
 'tyrion',
 'know',
 'see',
 'made',
 'father',
 'eyes',
 'black',
 'told',
 'thought',
 'lady',
 'time',
 'long',
 'might',
 'us',
 'come',
 'still',
 'face',
 'head',
 'red',
 'way',
 'page',
 'boy',
 'must',
 'good',
 'two',
 'little',
 'brother',
 'took',
 'came',
 'though',
 'say',
 'night',
 'three',
 'away',
 'queen',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'white',
 'day',
 'first',
 'jaime',
 'look',
 'want',
 'much',
 'enough',
 'tell',
 'sword',
 'great',
 'looked',
 'bran',
 'girl',
 'left',
 'knew',
 'gave',
 'asked',
 'called',
 'wall',
 'every',
 'heard',
 'maester',
 'yet',
 'went',
 'let',
 'sansa',
 'turned',
 'need',
 'behind',
 'dany',
 'around',
 'another',
 'beneath',
 'across',
 'snow',
 'keep',
 'gods',
 'found',
 'knight',
 'woman',
 'gold',
 'last',
 'grace',
 'castle',
 'th

In [15]:
# Visualization
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [None]:
# Visualizing the vectors in 3D space
import plotly.express as px

fig = px.scatter_3d(X[0:500],x=0,y=1,z=2, color=y[0:500])
fig.show()