<a href="https://colab.research.google.com/github/RajeevRanjany/Applied-Machine-Learning/blob/main/GOT_NLP_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install gensim
!pip install nltk
import pandas as pd



In [24]:
df = pd.read_csv('/content/Game_of_Thrones_Script.csv')

In [25]:
df.sample()

Unnamed: 0,Release Date,Season,Episode,Episode Title,Name,Sentence
10018,2013-06-02,Season 3,Episode 9,The Rains of Castamere,gilly,How do you know all that?


In [26]:
df = df[['Sentence', 'Name']]

In [27]:
df.sample()

Unnamed: 0,Sentence,Name
9543,So I could tear it off you.,jon snow


In [28]:
df.isna().sum()

Unnamed: 0,0
Sentence,1
Name,3


In [29]:
df = df.dropna()

In [49]:
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = text.replace("what's", "what is")

    words = simple_preprocess(text)
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]

    return words


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [51]:
df['tokens'] = df['Sentence'].apply(preprocess)

In [52]:
df.sample()

Unnamed: 0,Sentence,Name,tokens
10820,What's the prize to winning this stupid contest?,daenerys targaryen,"[whats, prize, winning, stupid, contest]"


In [54]:
top_chars = df['Name'].value_counts().head(8).index
df = df[df['Name'].isin(top_chars)]
df

Unnamed: 0,Sentence,Name,tokens
15,Go on. Father's watching.,jon snow,"[go, father, watching]"
16,And your mother.,jon snow,[mother]
18,Thank you.,sansa stark,[thank]
21,"Don't think too much, Bran.",jon snow,"[dont, think, much, bran]"
38,Don't look away.,jon snow,"[dont, look, away]"
...,...,...,...
23895,We have. These projects will begin as soon as ...,davos,"[project, begin, soon, master, coin, lord, lof..."
23897,Any more.,davos,[]
23899,"Grandmaester, ahem, it is my theory, based on ...",tyrion lannister,"[grandmaester, ahem, theory, based, year, work..."
23902,Find the best builders and set them to the task.,tyrion lannister,"[find, best, builder, set, task]"


In [55]:
df['tokens']

Unnamed: 0,tokens
15,"[go, father, watching]"
16,[mother]
18,[thank]
21,"[dont, think, much, bran]"
38,"[dont, look, away]"
...,...
23895,"[project, begin, soon, master, coin, lord, lof..."
23897,[]
23899,"[grandmaester, ahem, theory, based, year, work..."
23902,"[find, best, builder, set, task]"


In [56]:
df['clean_sentence'] = df['tokens'].apply(lambda x: " ".join(x))

In [57]:
df[['Sentence','clean_sentence']].head()

Unnamed: 0,Sentence,clean_sentence
15,Go on. Father's watching.,go father watching
16,And your mother.,mother
18,Thank you.,thank
21,"Don't think too much, Bran.",dont think much bran
38,Don't look away.,dont look away


In [59]:
top_chars = df['Name'].value_counts().head(8).index
df = df[df['Name'].isin(top_chars)]
df

Unnamed: 0,Sentence,Name,tokens,clean_sentence
15,Go on. Father's watching.,jon snow,"[go, father, watching]",go father watching
16,And your mother.,jon snow,[mother],mother
18,Thank you.,sansa stark,[thank],thank
21,"Don't think too much, Bran.",jon snow,"[dont, think, much, bran]",dont think much bran
38,Don't look away.,jon snow,"[dont, look, away]",dont look away
...,...,...,...,...
23895,We have. These projects will begin as soon as ...,davos,"[project, begin, soon, master, coin, lord, lof...",project begin soon master coin lord lofty titl...
23897,Any more.,davos,[],
23899,"Grandmaester, ahem, it is my theory, based on ...",tyrion lannister,"[grandmaester, ahem, theory, based, year, work...",grandmaester ahem theory based year work caste...
23902,Find the best builders and set them to the task.,tyrion lannister,"[find, best, builder, set, task]",find best builder set task


In [61]:
from sklearn.model_selection import train_test_split

X = df['clean_sentence']
y = df['Name']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


In [65]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
mf = model.fit(X_train_vec, y_train)


In [64]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.29161451814768463
                    precision    recall  f1-score   support

        arya stark       0.62      0.10      0.17       157
  cersei lannister       0.42      0.09      0.15       201
daenerys targaryen       0.43      0.18      0.25       210
             davos       1.00      0.02      0.04       105
   jaime lannister       0.30      0.05      0.09       189
          jon snow       0.37      0.28      0.32       227
       sansa stark       0.62      0.06      0.12       157
  tyrion lannister       0.25      0.88      0.39       352

          accuracy                           0.29      1598
         macro avg       0.50      0.21      0.19      1598
      weighted avg       0.44      0.29      0.23      1598



In [69]:
from gensim.models import Word2Vec

w2v = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,              # ðŸ”¥ skip-gram
    negative=10,
    sample=1e-3,       # ðŸ”¥ subsampling
    epochs=20,
    workers=4
)


In [76]:
w2v.wv.most_similar("starks", topn=10)

[('ned', 0.7935193181037903),
 ('beside', 0.7252316474914551),
 ('roose', 0.7247337698936462),
 ('umber', 0.7246590256690979),
 ('karstarks', 0.7238113880157471),
 ('rickon', 0.6734824180603027),
 ('bastard', 0.6679479479789734),
 ('youngest', 0.652161717414856),
 ('aside', 0.6518599987030029),
 ('eldest', 0.6495211124420166)]

In [78]:
w2v.wv.most_similar(
    positive=["stark", "king"],
    negative=["north"],
    topn=5
)


[('pardon', 0.47235560417175293),
 ('myrcella', 0.46053412556648254),
 ('commit', 0.4533063769340515),
 ('eddard', 0.45020246505737305),
 ('tommen', 0.44377195835113525)]

In [79]:
w2v.wv.most_similar(
    positive=["eddard", "catelyn"],
    negative=["stark"],
    topn=5
)


[('arryn', 0.7400678992271423),
 ('eldest', 0.7336087226867676),
 ('robb', 0.7262743711471558),
 ('youngest', 0.7121531367301941),
 ('treason', 0.7035139799118042)]

In [84]:
w2v.wv.doesnt_match(["robb", "jon", "bronn", "arya"])

'bronn'

In [85]:
w2v.wv.doesnt_match(["eddard", "catelyn", "robb", "tommen"])

'tommen'

In [102]:
words = list(w2v.wv.key_to_index.keys())[:200]
vectors = [w2v.wv[word] for word in words]

In [103]:
words

['dont',
 'im',
 'know',
 'youre',
 'one',
 'king',
 'want',
 'father',
 'would',
 'like',
 'lord',
 'well',
 'u',
 'men',
 'think',
 'never',
 'good',
 'back',
 'man',
 'need',
 'come',
 'cant',
 'see',
 'take',
 'brother',
 'right',
 'time',
 'people',
 'queen',
 'he',
 'get',
 'go',
 'could',
 'tell',
 'going',
 'make',
 'didnt',
 'thats',
 'say',
 'thing',
 'let',
 'ive',
 'kill',
 'lady',
 'ever',
 'look',
 'yes',
 'ser',
 'stark',
 'north',
 'army',
 'child',
 'way',
 'life',
 'ill',
 'grace',
 'told',
 'hand',
 'many',
 'son',
 'fight',
 'wont',
 'dead',
 'love',
 'first',
 'little',
 'long',
 'dragon',
 'mother',
 'night',
 'joffrey',
 'day',
 'city',
 'world',
 'wall',
 'better',
 'always',
 'boy',
 'girl',
 'war',
 'youll',
 'family',
 'give',
 'there',
 'much',
 'jon',
 'name',
 'sister',
 'youve',
 'house',
 'said',
 'woman',
 'die',
 'watch',
 'find',
 'every',
 'even',
 'stop',
 'nothing',
 'cersei',
 'enough',
 'doesnt',
 'killed',
 'still',
 'thank',
 'made',
 'keep',
 

In [105]:
vectors

[array([-0.21473914, -0.2293472 ,  0.31035993,  0.31060502, -0.5083165 ,
        -0.34034   , -0.1763137 ,  0.15155527,  0.04197933, -0.65233624,
        -0.02139463, -0.11671755,  0.07806996,  0.43960288,  0.33874977,
        -0.4373161 ,  0.6701412 , -0.2088908 , -0.22337626, -0.4787973 ,
         0.44950786, -0.04059947,  0.00150061, -0.17241003,  0.41683462,
        -0.03969378, -0.10154704,  0.6065648 , -0.09425066, -0.04800491,
        -0.19688532, -0.20365867,  0.43770948, -0.2263062 , -0.4636219 ,
         0.11944686,  0.32520184, -0.1179594 , -0.18389356, -0.38567597,
        -0.33498544,  0.11000372, -0.39042696, -0.03955133,  0.10511241,
         0.10326184, -0.28386524,  0.04617426,  0.14702246,  0.45293814,
         0.0857022 ,  0.07727686, -0.20374736, -0.07056855,  0.49120447,
        -0.23245299, -0.09575766, -0.10772128, -0.34706768,  0.18164265,
         0.02034723, -0.394173  , -0.1134917 , -0.26328906, -0.17223577,
         0.44294274,  0.08889271,  0.5642107 , -0.2

In [106]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
vectors_3d = pca.fit_transform(vectors)


In [109]:
!pip install plotly



In [111]:
import plotly.express as px
import pandas as pd

df_plot = pd.DataFrame({
    "x": vectors_3d[:, 0],
    "y": vectors_3d[:, 1],
    "z": vectors_3d[:, 2],
    "word": words
})

fig = px.scatter_3d(
    df_plot,
    x="x",
    y="y",
    z="z",
    text="word",
    title="Interactive 3D PCA of Word2Vec Embeddings"
)

fig.update_traces(textposition='top center')
fig.show()
