# Name Entity Recognition

In [1]:
import spacy
from nltk import sent_tokenize # Sentence Tokenizer

In [4]:
!python -m spacy download en_core_web_md
# python -m spacy download en_core_web_trf

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.3/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.3/33.5 MB ? eta -:--:--
      -------------------------------------- 0.5/33.5 MB 509.0 kB/s eta 0:01:05
      -------------------------------------- 0.5/33.5 MB 509.0 kB/s eta 0:01:05
      -------------------------------------- 0.5/33.5 MB 509.0 kB/s eta 0:01:05
      -------------------------------------- 0.8/33.5 MB 466.0 kB/s eta 0:01:11
      -------------------------------------- 0.8/33.5 MB 466.0 kB/s eta 0:01:11
      -------------------------------------- 0.8/33.5 MB 466.0 k


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load model

In [None]:
def load_model():
    # nlp = spacy.load("en_core_web_trf") # Large
    nlp = spacy.load("en_core_web_md") # Small
    return nlp

In [10]:
nlp_model = load_model()

# load Dataset

In [49]:
import os 
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [50]:
dataset_path = "../Data/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [51]:
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [52]:
sample_script = df.iloc[0]['script']
sample_script

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [53]:
sentences = sent_tokenize(sample_script)

In [54]:
sentences[60:90] # a slice of a list of sentences

["Don't you know who the Hokage leaders are?",
 'Of course, I do!',
 'I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?',
 'Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox.',
 'Then why did you do that?',
 "Because I'll become a Hokage myself.",
 "And I'll be the greatest Hokage of all time!",
 'So that everyone will finally learn to accept me!',
 'By the way, Sensei, I have a favor to ask.',
 'You want another bowl?',
 'Mmmm…No…\n Can I borrow that Leaf headband for a while?',
 'This?',
 'No no!',
 'This is worn only by those who have graduated from Ninja Academy.',
 "Tomorrow, you will…\n You're so mean!",
 "So that's why you took off your goggles…\n Humph... One more bowl please!",
 'We are now about to begin the graduation test.',
 'When your name is called, proceed to the next classroom.',
 'The test is on the Clone Jutsu.',
 'Oh no…\n Of all the…!',
 'That is my weakest Jutsu!',
 'But sti

In [55]:
sentences = sentences[60:90]

In [56]:
sentence = ".".join(sentences) # Join the sentences by dot

In [57]:
sentence

"Don't you know who the Hokage leaders are?.Of course, I do!.I know they earned the title Lord Hokage\n because they were the best Ninja of their time, right?.Especially the Fourth Hokage was a hero\n who saved the village from the nine-tail demon fox..Then why did you do that?.Because I'll become a Hokage myself..And I'll be the greatest Hokage of all time!.So that everyone will finally learn to accept me!.By the way, Sensei, I have a favor to ask..You want another bowl?.Mmmm…No…\n Can I borrow that Leaf headband for a while?.This?.No no!.This is worn only by those who have graduated from Ninja Academy..Tomorrow, you will…\n You're so mean!.So that's why you took off your goggles…\n Humph... One more bowl please!.We are now about to begin the graduation test..When your name is called, proceed to the next classroom..The test is on the Clone Jutsu..Oh no…\n Of all the…!.That is my weakest Jutsu!.But still… I will do it no matter what!.Clone Jutsu!.Disqualified!.Iruka Sensei..His physica

# Run Model

In [58]:
doc = nlp_model(sentence)

In [59]:
doc.ents # Got the names from the sentneces,

(Hokage,
 Lord Hokage
  ,
 Ninja,
 the Fourth Hokage,
 nine,
 Hokage,
 Hokage,
 Sensei,
 bowl?.Mmmm,
 while?.This?.No,
 Ninja Academy,
 Tomorrow,
 One,
 Clone Jutsu,
 Jutsu!.But,
 three,
 Naruto)

In [60]:
# See the label of each one
for entity in doc.ents:
    print(entity, entity.label_)

# Ninja Academy ORG ( It is an org not a name)
# Tomorrow DATE
# We will only take the name entity

Hokage GPE
Lord Hokage
  ORG
Ninja PERSON
the Fourth Hokage FAC
nine CARDINAL
Hokage GPE
Hokage PERSON
Sensei PERSON
bowl?.Mmmm NORP
while?.This?.No ORG
Ninja Academy ORG
Tomorrow DATE
One CARDINAL
Clone Jutsu ORG
Jutsu!.But NORP
three CARDINAL
Naruto PERSON


In [64]:
# Get only person label
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)

    ner_output = []

    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ =="PERSON": # Get the person label
                full_name = entity.text
                first_name = entity.text.split(" ")[0] # First word
                first_name = first_name.strip()
                ners.add(first_name)
        ner_output.append(ners)

    return ner_output

In [65]:
df = df.head(10)

In [66]:
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


In [69]:
df['ners'] = df['script'].apply(get_ners_inference)  # Get only person label from the scripts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ners'] = df['script'].apply(get_ners_inference)  # Get only person label from the scripts


In [70]:
df # Got only the person name

Unnamed: 0,episode,script,ners
0,1,"A long time ago, a powerful demon fox appeared...","[{}, {}, {}, {}, {}, {}, {Ninja}, {Naruto}, {}..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Sasuke, Sakura}, {}, {Konohamaru..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Naruto}, {}, {}, {Iruka}, {}, {N..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {Jutsu}, {}, {}, {Ninja}, {}, {}, {},..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {Sakura}, {Naruto}, {}, {Naruto},..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {Jonin}, {}, {Sasuke}, {}..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas...","[{}, {}, {}, {}, {Zabuza}, {}, {}, {}, {Naruto..."


# Character Network
- Number of occcurance  per character pair( FIrst extract the name from the sentnece and then see how many times the person appeard in the sentence)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [None]:
def generate_character_network(df):

    windows=10 # If 2 character within 10 sentence
    entity_relationship = [] # Output list

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            previous_entities_in_window.append(list(sentence)) # All the entity in sentence , convert to list # TO do list
            previous_entities_in_window = previous_entities_in_window[-windows:] # LAst 10sentences

            # Flatten 2D List into 1D List
            previous_entities_flattened = sum(previous_entities_in_window, []) # From vertical list to horizontal list

            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window])) # Standarization
    
    relationship_df = pd.DataFrame({'value': entity_relationship}) # Value column
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index() # How many times they occured together
    relationship_df = relationship_df.sort_values('value', ascending=False) # sOrt the values

    return relationship_df

# Most characters are at top and least one are at botton

In [30]:
relationship_df = generate_character_network(df)

In [72]:
relationship_df

Unnamed: 0,source,target,value
161,Naruto,Sasuke,87
185,Sakura,Sasuke,53
84,Iruka,Naruto,45
160,Naruto,Sakura,33
157,Naruto,Ninja,29
...,...,...,...
62,Hokage,Mizuki,1
59,Hokage,K-Kakashi,1
57,Hinata,Shino,1
56,Hinata,Shikamaru,1


In [73]:
relationship_df = relationship_df.sort_values('value', ascending=False)
relationship_df = relationship_df.head(200)

In [74]:
# create a network visualization of a graph using the NetworkX
# How close and how often they occur

G = nx.from_pandas_edgelist(
    relationship_df, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph() # Convert to character network
)

net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("naruto.html")


naruto.html
