In [1]:
import pandas as pd 
import numpy as np

In [2]:
import os
import gensim

In [23]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize an empty list to store the processed story
story = []

# Read and process each file
for filename in os.listdir('dataGOT'):
    with open(os.path.join('dataGOT', filename), 'r', encoding='cp1252') as f:
        corpus = f.read()
    
    # Tokenize the corpus into sentences
    raw_sent = sent_tokenize(corpus)
    
    # Process each sentence
    for sent in raw_sent:
        # Simple preprocessing (tokenizing and lowercasing)
        tokens = simple_preprocess(sent)
        
        # Remove stopwords
        filtered_tokens = [word for word in tokens if word not in stop_words]
        
        # Append the cleaned sentence to the story
        story.append(filtered_tokens)

# Print sample output
print(story[:5])  # Display first 5 processed sentences


[['game', 'thrones', 'book', 'one', 'song', 'ice', 'fire', 'george', 'martin', 'prologue', 'start', 'back', 'gared', 'urged', 'woods', 'began', 'grow', 'dark', 'around'], ['wildlings', 'dead'], ['dead', 'frighten'], ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile'], ['gared', 'rise', 'bait']]


In [24]:
len(story)

145020

In [25]:
story

[['game',
  'thrones',
  'book',
  'one',
  'song',
  'ice',
  'fire',
  'george',
  'martin',
  'prologue',
  'start',
  'back',
  'gared',
  'urged',
  'woods',
  'began',
  'grow',
  'dark',
  'around'],
 ['wildlings', 'dead'],
 ['dead', 'frighten'],
 ['ser', 'waymar', 'royce', 'asked', 'hint', 'smile'],
 ['gared', 'rise', 'bait'],
 ['old', 'man', 'past', 'fifty', 'seen', 'lordlings', 'come', 'go'],
 ['dead', 'dead', 'said'],
 ['business', 'dead'],
 ['dead'],
 ['royce', 'asked', 'softly'],
 ['proof'],
 ['saw', 'gared', 'said'],
 ['says', 'dead', 'proof', 'enough'],
 ['known', 'would', 'drag', 'quarrel', 'sooner', 'later'],
 ['wished', 'later', 'rather', 'sooner'],
 ['mother', 'told', 'dead', 'men', 'sing', 'songs', 'put'],
 ['wet', 'nurse', 'said', 'thing', 'royce', 'replied'],
 ['never', 'believe', 'anything', 'hear', 'woman', 'tit'],
 ['things', 'learned', 'even', 'dead'],
 ['voice', 'echoed', 'loud', 'twilit', 'forest'],
 ['page', 'long', 'ride', 'us', 'gared', 'pointed'],
 ['eig

In [26]:
model=gensim.models.Word2Vec(
    window= 10,
    min_count= 2
)

In [27]:
model.build_vocab(story)

In [28]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(4396539, 4579390)

In [29]:
model.wv.most_similar('commander')

[('steward', 0.85390704870224),
 ('wyman', 0.853133499622345),
 ('janos', 0.8506969809532166),
 ('manderly', 0.8468608260154724),
 ('randyll', 0.846759021282196),
 ('jason', 0.8384342193603516),
 ('tarly', 0.8297778367996216),
 ('locke', 0.8219822645187378),
 ('nestor', 0.8152021169662476),
 ('umber', 0.8147589564323425)]

In [30]:
model.wv.most_similar('daenerys')

[('stormborn', 0.8927571177482605),
 ('unburnt', 0.8858948945999146),
 ('dorne', 0.8336957097053528),
 ('myrcella', 0.831866443157196),
 ('viserys', 0.8242335319519043),
 ('elia', 0.8207799792289734),
 ('mopatis', 0.8186942934989929),
 ('court', 0.8173956274986267),
 ('doran', 0.8094561696052551),
 ('targaryen', 0.8071060180664062)]

In [31]:
model.wv.doesnt_match(['jon','arya','sansa','bran','robb','rikon'])

'jon'

In [32]:
model.wv['jon'].shape

(100,)

In [33]:
model.wv.similarity('arya','jon')

0.51321816

In [34]:
model.wv.similarity('arya','sansa')

0.7996197

In [35]:
model.wv.similarity('arya','wolf')

0.59991324

In [36]:
model.wv.get_normed_vectors().shape

(17310, 100)

In [37]:
y= model.wv.index_to_key
y

['said',
 'lord',
 'would',
 'one',
 'ser',
 'could',
 'man',
 'king',
 'men',
 'back',
 'well',
 'like',
 'jon',
 'father',
 'old',
 'hand',
 'even',
 'tyrion',
 'never',
 'know',
 'see',
 'made',
 'eyes',
 'black',
 'told',
 'lady',
 'thought',
 'time',
 'long',
 'might',
 'us',
 'come',
 'face',
 'still',
 'head',
 'red',
 'way',
 'boy',
 'page',
 'must',
 'queen',
 'good',
 'two',
 'brother',
 'night',
 'little',
 'took',
 'came',
 'though',
 'say',
 'three',
 'away',
 'dead',
 'son',
 'blood',
 'take',
 'go',
 'half',
 'make',
 'arya',
 'saw',
 'day',
 'white',
 'jaime',
 'first',
 'look',
 'want',
 'much',
 'enough',
 'sword',
 'tell',
 'girl',
 'bran',
 'great',
 'looked',
 'left',
 'knew',
 'asked',
 'gave',
 'maester',
 'called',
 'wall',
 'every',
 'heard',
 'sansa',
 'let',
 'yet',
 'went',
 'turned',
 'dany',
 'need',
 'behind',
 'around',
 'woman',
 'another',
 'snow',
 'beneath',
 'across',
 'knight',
 'keep',
 'grace',
 'found',
 'gold',
 'last',
 'cersei',
 'castle',
 '

In [38]:
len(y)

17310

In [39]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

In [40]:
X = pca.fit_transform(model.wv.get_normed_vectors())

In [41]:
X.shape

(17310, 3)

In [42]:
import plotly.express as px
fig = px.scatter_3d(X[:500],x = 0,y = 1,z= 2,color=y[:500])
fig.show()