In [None]:
# Here, we will train a Word2Vec word-embedding model on the text of the Game of Thrones (A Song of Ice and Fire) books.

In [4]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
import gensim

In [15]:
import nltk
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [18]:
# Fetch NLTK's "punkt_tab" tokenizer data; sent_tokenize is used for sentence splitting further down.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
# Build the training corpus: one list of tokens per sentence, gathered from all five book files.
book_paths = [
  "/content/001ssb.txt",
  "/content/002ssb.txt",
  "/content/003ssb.txt",
  "/content/004ssb.txt",
  "/content/005ssb.txt",
]

story = []
for book_path in book_paths:
  # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
  with open(book_path, "r", encoding = "utf-8", errors = "ignore") as book_file:
    book_text = book_file.read()
    # Split the book into sentences, then tokenize each sentence with simple_preprocess.
    story.extend(simple_preprocess(sentence) for sentence in sent_tokenize(book_text))

In [25]:
# Sanity check: show the first two tokenized sentences.
print(story[0:2])

[['game', 'of', 'thrones', 'book', 'one', 'of', 'song', 'of', 'ice', 'and', 'fire', 'by', 'george', 'martin', 'prologue', 'we', 'should', 'start', 'back', 'gared', 'urged', 'as', 'the', 'woods', 'began', 'to', 'grow', 'dark', 'around', 'them'], ['the', 'wildlings', 'are', 'dead']]


In [26]:
# Configure the Word2Vec model. No corpus is passed here; the vocabulary is built and the model is trained in separate steps.
model = gensim.models.Word2Vec(
    window = 5,   # context window: up to 5 words to the left and 5 words to the right of the centre word
    vector_size = 100, # dimensionality of each word embedding
    min_count = 5 # ignore words that occur fewer than 5 times in the whole corpus (a per-word frequency cutoff, not a sentence-length filter)
)

In [27]:
# Scan the tokenized sentences once to build the vocabulary before training.
model.build_vocab(corpus_iterable = story)

In [28]:
# Train the Word2Vec model on the tokenized sentences.
#
# corpus_iterable -> the training corpus (list of tokenized sentences)
# total_examples  -> number of sentences, as counted by build_vocab (model.corpus_count)
# epochs          -> number of full passes over the corpus. model.epochs defaults to 5;
#                    to train for 10 passes, set model.epochs = 10 before calling model.train.
#
# In short: the model reads every sentence `epochs` times and learns word embeddings
# from the context windows around each word.
model.train(
    corpus_iterable = story,
    total_examples = model.corpus_count,
    epochs = model.epochs
)

(6485021, 8625265)

In [29]:
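# As we can see, model.train returned (trained_words, total_words); both counts are summed over all epochs.
# It means 6485021 word occurrences were actually used for training out of the 8625265 raw word occurrences read across the 5 epochs.
# The rest were skipped because the word is rarer than min_count or because very frequent words are randomly down-sampled.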

In [30]:
# Ten nearest neighbours of "daenerys" in the embedding space, with their similarity scores.
model.wv.most_similar("daenerys", topn = 10)

[('targaryen', 0.8108153939247131),
 ('rhaegar', 0.7588443160057068),
 ('stormborn', 0.7552147507667542),
 ('myrcella', 0.7427701354026794),
 ('martell', 0.7214646339416504),
 ('doran', 0.7076268792152405),
 ('aegon', 0.7033505439758301),
 ('unburnt', 0.6930316686630249),
 ('princess', 0.6911218762397766),
 ('elia', 0.683988630771637)]

In [34]:
# wv stands for Word Vectors.
# doesnt_match returns the word that fits least with the rest of the list.
# Each name must be spelled exactly as it appears in the vocabulary ('rickon', not 'rikon');
# a word that is missing from the vocabulary cannot take part in the comparison.
# Expected output: 'jon' -> everyone in the list other than jon are brothers and sisters.
model.wv.doesnt_match(['jon', 'rickon', 'robb', 'arya', 'sansa', 'bran'])



'jon'

In [35]:
# Odd-one-out check on a second group of character names.
model.wv.doesnt_match(["cersei", "jaime", "bronn", "tyrion"])

'bronn'

In [36]:
# Look up the learned embedding vector for "jon".
model.wv.get_vector("jon")

array([-1.1417359 , -0.76701295, -0.04729752, -1.9475315 ,  0.6180081 ,
        1.2416173 ,  0.35969636,  0.03677973, -1.584901  , -0.761219  ,
        0.22449465, -0.8442932 , -1.5748904 ,  1.0577421 , -1.8903779 ,
       -0.11233115,  0.5369906 ,  1.8036904 , -1.0232834 , -1.7875987 ,
        2.2877347 , -0.44566616,  0.3230564 ,  1.4278219 , -1.3681276 ,
        1.2708513 ,  0.6455826 ,  0.56810904,  0.04345945,  0.10140568,
        0.09782472,  0.23890033,  0.35742068,  0.28402495,  0.39344025,
        0.8905451 ,  0.33072942,  0.26133546,  0.72246754,  0.71952194,
       -1.0771602 , -1.825463  , -0.41441748,  1.852197  ,  0.8623074 ,
       -0.20993707,  0.12177436, -0.15658516, -0.27558976, -0.5777635 ,
        1.2757686 , -0.6370818 ,  2.0504487 , -0.6105739 , -0.39468333,
        1.4488578 , -0.8815424 , -2.2243495 , -0.11393473,  0.09227539,
        0.6915091 ,  0.26477367, -0.3430763 , -0.4684272 ,  0.09704905,
       -0.5732741 ,  1.3105139 ,  1.6315547 ,  0.20728004, -0.16

In [39]:
# Similarity score between the vectors of the two words.
model.wv.similarity(w1 = "tywin", w2 = "sansa")

np.float32(0.33564815)

In [41]:
# model.wv.get_normed_vectors()
# The call above returns the vectors of all words in the vocabulary in normalized (unit-length) form.

In [42]:
# Shape is (number of vocabulary words, embedding dimension).
normed_vectors = model.wv.get_normed_vectors()
normed_vectors.shape

(11975, 100)

In [None]:
# NOTE: Understanding Word2Vec tokens vs vocabulary
#
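# model.train() returns (trained_tokens, total_tokens), both accumulated over all epochs
# - trained_tokens = word occurrences actually used for training (after dropping words rarer than min_count = 5 and randomly down-sampling very frequent words)
# - total_tokens = all word occurrences read from the corpus (corpus size × number of epochs)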
#
# model.wv.get_normed_vectors().shape returns (vocab_size, embedding_dim)
# - vocab_size = number of unique words that have embeddings
# - embedding_dim = dimensionality of each word vector
#
# ✅ Key point:
# Many word occurrences (tokens) contribute to training each word vector,
# but only one vector is stored per unique word.

In [45]:
y = model.wv.index_to_key
# y is the list of all unique words in the vocabulary, ordered by decreasing frequency.
# y[0] -> the most frequent word in the corpus (after min_count filtering)
# y[1] -> the second most frequent word
# NOTE(review): the name `y` is kept because the next cell reads it; a more descriptive name would be clearer.

In [48]:
# The five most frequent words in the vocabulary.
y[0:5]

['the', 'and', 'to', 'of', 'he']

In [None]:
# The output consists entirely of stopwords because we trained without removing stopwords first.