In [22]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [18]:
# Download English module
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------- -------------------------------- 2.4/12.8 MB 19.2 MB/s eta 0:00:01
     -------------------------------- ------ 10.7/12.8 MB 33.6 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 32.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [20]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

In [24]:
# Load the book
with open('Alice_article_Wiki.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')
book = NER(data)

In [26]:
# Visualize identified entities
displacy.render(book[273:20000], style = "ent", jupyter = True)

In [36]:
## Get named entity list per sentence
df_sentences = [] # empty shell to store results
# Loop through sentences, get entity list for each sentence
for sent in book.sents:
       entity_list = [ent.text for ent in sent.ents]
       df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [38]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Alice, 's, Adventures, in, Wonderland, ...",[Wonderland - Wikipedia ...
1,"(à¥€Bahasa, Melayuê¯ƒê¯¤ê¯‡ê¯)","[à¥€Bahasa, Melayuê¯ƒê¯¤ê¯‡ê¯]"
2,"(©, ê¯‚ê¯£ê¯Ÿá€, ™, á€¼á€”á€ºá€, ™, á€¬á€˜á€¬á...","[bokmÃ¥lNorsk, EnglishSlovenÄinaÅšlÅ¯nskiÚ]"
3,"(©, ÙˆØ±Ø¯ÛŒÐ¡Ñ€Ð¿ÑÐºÐ¸, /, srpskiSrpskohrvats...","[Edit, ArticleTalk ]"
4,"(informationCite, this, pageGet, shortened, UR...","[URLDownload, Print, Download, PDFPrintable, 1..."
5,"(For, other, uses, ,, see, Alice, in, Wonderla...","[Alice, Wonderland]"
6,"(Alice, 's, Adventures, in, Wonderland, First,...","[Alice, Wonderland First, 1865)AuthorLewis, 18..."
7,"(It, details, the, story, of, a, girl, named, ...",[Alice]
8,"(It, is, seen, as, an, example, of, the, liter...",[]
9,"(The, artist, John, Tenniel, provided, 42, woo...","[John Tenniel, 42]"


In [42]:
## Load character names
# Import characters
character_df = pd.read_csv("characters_alice.csv", index_col = 0)

In [44]:
character_df.head()

Unnamed: 0,character,character_alias
0,Alice,Alice
1,The White Rabbit,Rabbit
2,The Mouse,Mouse
3,The Dodo,Dodo
4,The Lory,Lory


In [48]:
## Filtering entities from the book
# Function to filter out entities not of interest
def filter_entity(ent_list, character_df):
       return [ent for ent in ent_list
                  if ent in list(character_df['character_alias'])]

In [50]:
# Check
filter_entity(["Alice", "CF", "2"], character_df)

['Alice']

In [54]:
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, character_df))

In [56]:
df_sentences['character_entities'].head(20)

0            [Alice]
1                 []
2                 []
3                 []
4                 []
5            [Alice]
6     [Alice, Alice]
7            [Alice]
8                 []
9                 []
10                []
11                []
12                []
13           [Alice]
14                []
15                []
16                []
17                []
18                []
19           [Alice]
Name: character_entities, dtype: object

In [58]:
# Filter out sentences that don’t have any character entities
df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

In [60]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
858,"(Alice, in, Wonderland, (, 1966, ))","[Alice, Wonderland]",[Alice]
862,"(Alice, through, the, Looking, Glass, (, 1998, ))","[Alice, 1998]",[Alice]
863,"(Alice, in, Wonderland, (, 1999, ))","[Alice, Wonderland]",[Alice]
864,"(Alice, (, 2009, ))","[Alice, 2009]",[Alice]
866,"(Alice, 's, Wonderland, Bakery, (, 2022, ), De...","[Alice, Wonderland Bakery, 2022, 1967]",[Alice]
868,"(Alice, in, Wonderland, (, 2010, ), Almost, Al...","[Alice, Wonderland, 2010, Almost Alice, 2010, ...",[Alice]
869,"(Alice, in, Wonderland, (, 1985, ))","[Alice, Wonderland, 1985]",[Alice]
874,"(Alice, in, Wonderland, (, 2000, ), American, ...","[Alice, Wonderland, 2000, American, McGee, Alice]","[Alice, Alice]"
875,"(Alice, in, the, Country, of, Hearts, (, 2007, ))",[Alice],[Alice]
876,"(Alice, in, Wonderland, (, 2010, ))","[Alice, Wonderland, 2010]",[Alice]


In [34]:
## Create relationships
# Defining relationships
# window size = 5 : this defines how many sentences will be looked at simultaneously
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].character_entities), [])

# Remove duplicated characters that are next to each other
char_unique = [char_list[i] for i in range(len(char_list))
    if (i==0) or char_list[i] != char_list[i-1]]
if len(char_unique) >1:
    for idx, a in enumerate( char_unique[ :-1]):
        b = char_unique[idx + 1]
        relationship.append({"source": a, "target": b})

NameError: name 'df_sentences_filtered' is not defined

In [156]:
relationship_df = pd.DataFrame(relationships)

In [158]:
relationship_df

In [160]:
# Sort the cases with a- >b and b- >a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

In [166]:
# Summarize the interactions
relationships_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

relationship_df.head(10)

KeyError: 'source'