# Named Entity recognition

In [1]:
import numpy as np
import pandas as pd
import spacy
from nltk import sent_tokenize
import spacy_transformers
import os


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

# Load model

In [2]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

nlp_model = load_model()

  model.load_state_dict(torch.load(filelike, map_location=device))


# load Dataset

In [3]:
def load_subtitles_dataset(dataset_path):
    df = pd.read_csv(dataset_path)
    df['season_episode'] = (df['Season'].str.split(' ').str[1].astype(int) - 1) * 10 + df['Episode'].str.split(' ').str[1].astype(int)
    df_a = df.drop(['Season','Episode','Release Date','Episode Title'],axis=1)
    df_a['Sentence'] = df_a['Sentence'].astype(str)
    df_a.rename(columns={'Sentence':'Subtitles'},inplace=True)
    
    return df_a

In [4]:
dataset_path = "C:/work/ml/analyze_series_with_NLP/data/GOT.csv"
df = load_subtitles_dataset(dataset_path)

In [5]:
sample_script = df
sample_script

Unnamed: 0,Name,Subtitles,season_episode
0,waymar royce,What do you expect? They're savages. One lot s...,1
1,will,I've never seen wildlings do a thing like this...,1
2,waymar royce,How close did you get?,1
3,will,Close as any man would.,1
4,gared,We should head back to the wall.,1
...,...,...,...
23906,brienne,I think we can all agree that ships take prece...,76
23907,bronn,I think that's a very presumptuous statement.,76
23908,tyrion lannister,I once brought a jackass and a honeycomb into ...,76
23909,man,The Queen in the North!,76


## Run named entity recognition


In [6]:
# Characters in the script
sample_script['Name'].value_counts().head(10)

Name
tyrion lannister      1760
jon snow              1133
daenerys targaryen    1048
cersei lannister      1005
jaime lannister        945
sansa stark            784
arya stark             783
davos                  528
theon greyjoy          455
petyr baelish          449
Name: count, dtype: int64

In [7]:
# Map of characters with their aliases

character_map ={
    'daenerys': ['daenerys','daenerys targaryen','khaleesi','mother of dragons','dany','mhysa'],
    'eddard': ['eddard','ned'],
    'theon': ['theon','theon greyjoy','reek'],
    'cersei': ['cersei','cersei lannister'],
    'catelyn': ['catelyn','cat','catlyn'],
    'sandor': ['sandor','hound'],
    'sam': ['sam','samwell','piggy'],
    'petyr': ['petyr','littlefinger','baelish'],
    'bran': ['bran','brandon'],
    'ramsay': ['ramsay','bolton'],
    'jon': ['jon','snow'],
    'jorah': ['jorah','mormont'],
}

In [125]:
def get_ners_inference2(script):
    script_sentences = sent_tokenize(script)

    ner_output = []

    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ =="PERSON":

                first_name = entity.text.split(" ")[0]
                first_name = first_name.lower().strip()


                ners.add(first_name)

        ner_output.append(ners)

    return ner_output

In [None]:
# Get NERs for the sample script and save it to a CSV file

sample_script['ners'] = sample_script['Subtitles'].apply(get_ners_inference2)
with open('sample_script_ner.csv', 'w', newline='', encoding='utf-8') as f:
    sample_script.to_csv(f, index=False)

In [None]:
script_ner = pd.read_csv('stubs/sample_script_ner.csv',index_col=False)

In [10]:
def generate_character_network2(df):
    entity_relationship = []

    for name , row in zip(df['Name'],df['ners']):

        if(type(name) != str):
            continue

        row = eval(row) # convert string to list from csv

        
        name = name.lower().split(" ")[0]

        for sentence in row:
            for entity in sentence:
                current_entity = entity

                # check if the first name is in the character map
                for key,values in character_map.items():
                    if entity in values:
                        current_entity = key
                        break

                entity_relationship.append(sorted([current_entity,name ]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df

In [11]:
relationship_df = generate_character_network2(script_ner)

In [12]:
relationship_df.head(20)

Unnamed: 0,source,target,value
781,daenerys,jorah,59
1619,jon,sam,53
627,cersei,jaime,49
1210,gilly,sam,45
2217,petyr,sansa,44
780,daenerys,jon,44
1496,jaime,tyrion,41
1377,hodor,hodor,40
685,cersei,tyrion,37
831,daenerys,tyrion,36


# Character Network

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [14]:
relationship_df = relationship_df.sort_values('value', ascending=False)
relationship_df_g = relationship_df.head(200)

In [16]:
G = nx.from_pandas_edgelist(
    relationship_df_g, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, width="1920px", height="1080px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("results/got2.html")


results/got2.html
