In [44]:
import pandas as pd
import numpy as np
import opendatasets as od
import pickle

In [45]:
od.download("https://www.kaggle.com/datasets/mathurinache/game-of-thrones-data?select=script-bag-of-words.json")

Skipping, found downloaded files in ".\game-of-thrones-data" (use force=True to force download)


In [46]:
df = pd.read_json("game-of-thrones-data/script-bag-of-words.json").set_index('episodeAlt')
df.head()

Unnamed: 0_level_0,seasonNum,episodeNum,episodeTitle,text
episodeAlt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S1E1,1,1,Winter Is Coming,"[{'text': 'Easy, boy.', 'name': 'Will'}, {'tex..."
S1E2,1,2,The Kingsroad,"[{'text': 'And You child... drink, eat. need t..."
S1E3,1,3,Lord Snow,[{'text': 'Council. Grand Lord Maester Pycelle...
S1E4,1,4,"Cripples, Bastards, and Broken Things",[{'text': 'Lord's The again. been dreaming lit...
S1E5,1,5,The Wolf and the Lion,[{'text': 'Does Hugh Ser any capital? family h...


In [47]:
df.iloc[-1]['text']

[{'name': 'Tyrion Lannister', 'text': "I'll find later. you"},
 {'name': 'Jon Snow', 'text': "It's Let me men not safe. send some with you."},
 {'name': 'Tyrion Lannister', 'text': "I'm alone. going"},
 {'name': 'Grey Worm',
  'text': 'Daenerys I In Queen, Targaryen, die. name of one sentence the the to true you'},
 {'name': 'Jon Snow',
  'text': "Grey It's These Worm! are men over. prisoners."},
 {'name': 'Grey Worm',
  'text': "It Queen's are defeated. enemies is not over the until"},
 {'name': 'Davos Seaworth',
  'text': "How They're be? defeated do knees. more much on their them to want you"},
 {'name': 'Grey Worm', 'text': 'They are breathing.'},
 {'name': 'Davos Seaworth', 'text': 'Look We around friend. won. you,'},
 {'name': 'Grey Worm', 'text': "I commands, my not obey queen's yours."},
 {'name': 'Jon Snow', 'text': "And Queen's are commands? the what"},
 {'name': 'Grey Worm',
  'text': '"Kill Cersei Lannister." These They all are chose fight follow for free her. men. to who'}

In [48]:
dialogue = {}
for index,row in df.iterrows():
    for item in row['text']:
        if item['name'] in dialogue:
            dialogue[item['name']] = dialogue[item['name']] + item['text']
        else:
            dialogue[item['name']] = item['text'] + " "

In [49]:
len(dialogue)

817

In [50]:
new_df = pd.DataFrame(columns=["Character name", "Words", "Word Count"])
new_df['Character name'] = dialogue.keys()
new_df['Words'] = dialogue.values()
new_df["Word Count"] = new_df['Words'].apply(lambda x:len(x.split()))

In [51]:
new_df = new_df.sort_values(ascending=False,by="Word Count")

In [52]:
new_df.iloc[91,:]

Character name                                        Doran Martell
Words             Captain. Let her pass. By Oberyn a by combat. ...
Word Count                                                      501
Name: 451, dtype: object

In [53]:
new_df = new_df[(new_df['Word Count'] >= 500)]

In [54]:
new_df.shape

(92, 3)

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english")

In [57]:
embeddings = cv.fit_transform(new_df['Words']).toarray()

In [58]:
embeddings.shape

(92, 15194)

In [59]:
embeddings = embeddings.astype('float64')

In [60]:
from sklearn.manifold import TSNE

In [62]:
tsne = TSNE(n_components=2,verbose=1,random_state=123)
z = tsne.fit_transform(embeddings)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 92 samples in 0.009s...
[t-SNE] Computed neighbors for 92 samples in 3.389s...
[t-SNE] Computed conditional probabilities for sample 92 / 92
[t-SNE] Mean sigma: 16.560709
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.788578
[t-SNE] KL divergence after 750 iterations: 0.254762


In [63]:
z.shape

(92, 2)

In [64]:
new_df['x'] = z.T[0]
new_df['y'] = z.T[1]
new_df

Unnamed: 0,Character name,Words,Word Count,x,y
17,Tyrion Lannister,It Mmh. Northern about girls. is say the they ...,25924,-1.027815,-5.102725
13,Cersei Lannister,And And Casterly One Rock. When about afraid. ...,14294,-0.897913,-4.897999
3,Jon Snow,Father's Go on. watching. And mother. yourBran...,11488,-1.039271,-4.697813
20,Daenerys Targaryen,We've a and anything. asked been for for guest...,11202,-0.327270,-4.638791
12,Jaime Lannister,"As I It's brother, duty feel it's much. my sho...",10823,-0.888325,-4.703313
...,...,...,...,...,...
590,Randyll Tarly,That's enough of that. Not already? enough fat...,581,0.667622,0.917512
315,Karl Tanner,"Whose cut, gonna man? old throat you It's and ...",576,0.801468,0.521908
320,Selyse Baratheon,"Burn For Help I Lord, Protect Use and and as a...",542,0.521088,0.588699
596,Lyanna Mormont,Bear Island. Welcome to I My Robb. She She a a...,534,-0.298424,1.346392


In [67]:
import plotly.express as px
fig = px.scatter(new_df.head(25), x = "x", y="y", color="Character name")
fig.show()