In [1]:
import json
import os

import pandas as pd

## Read all data files
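The first column header of Movies.csv starts with a UTF-8 byte-order mark, which shows up as `ï»¿` when the file is decoded as ISO-8859-1, so that column is renamed manually after loading.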

In [2]:
# Load the five source tables. Places.csv is read with pandas' default
# encoding; the remaining files are decoded as ISO-8859-1.
place_df = pd.read_csv('Harry_Potter_Movies/Places.csv')
chars_df = pd.read_csv(
    'Harry_Potter_Movies/Characters.csv',
    encoding='ISO-8859-1',
)
dialogue_df = pd.read_csv(
    'Harry_Potter_Movies/Dialogue.csv',
    encoding='ISO-8859-1',
)
chaps_df = pd.read_csv(
    'Harry_Potter_Movies/Chapters.csv',
    encoding='ISO-8859-1',
)
# The UTF-8 byte-order mark at the start of Movies.csv decodes to 'ï»¿' under
# ISO-8859-1 and ends up glued to the first header, so restore the clean name.
movie_df = pd.read_csv(
    'Harry_Potter_Movies/Movies.csv',
    encoding='ISO-8859-1',
).rename(columns={'ï»¿Movie ID': 'Movie ID'})

## Merge to a single dataframe
This helps us to extract either the ID or the actual name very easily.

In [3]:
# Join every dialogue line with its chapter, character, place and movie lookup
# rows so each record carries both the IDs and the human-readable names.
df = pd.merge(dialogue_df, chaps_df, how="outer", on=["Chapter ID"])
df = pd.merge(df, chars_df, how="outer", on=["Character ID"])
df = pd.merge(df, place_df, how="outer", on=["Place ID"])
df = pd.merge(df, movie_df, how="outer", on=["Movie ID"])
# Turn titles into file-name-friendly slugs: drop apostrophes, lower-case, and
# join the whitespace-separated words with underscores. The vectorised .str
# accessor propagates missing values (which outer joins can introduce for
# unmatched rows) instead of raising AttributeError the way calling
# str methods on a float NaN inside .apply would.
df['Movie Title'] = (
    df['Movie Title']
    .str.replace("'", '', regex=False)
    .str.lower()
    .str.split()
    .str.join('_')
)
# Preview the columns used downstream.
df[['Movie Title', 'Movie ID', 'Chapter ID', 'Chapter Name', 'Place ID', 'Place Name', 'Character Name']]

Unnamed: 0,Movie Title,Movie ID,Chapter ID,Chapter Name,Place ID,Place Name,Character Name
0,harry_potter_and_the_philosophers_stone,1,1,Doorstep Delivery,8,4 Privet Drive,Albus Dumbledore
1,harry_potter_and_the_philosophers_stone,1,1,Doorstep Delivery,8,4 Privet Drive,Albus Dumbledore
2,harry_potter_and_the_philosophers_stone,1,1,Doorstep Delivery,8,4 Privet Drive,Albus Dumbledore
3,harry_potter_and_the_philosophers_stone,1,1,Doorstep Delivery,8,4 Privet Drive,Albus Dumbledore
4,harry_potter_and_the_philosophers_stone,1,1,Doorstep Delivery,8,4 Privet Drive,Albus Dumbledore
...,...,...,...,...,...,...,...
7439,harry_potter_and_the_deathly_hallows_part_2,8,225,Your Mother's Eyes,24,Boathouse,Voldemort
7440,harry_potter_and_the_deathly_hallows_part_2,8,224,He Will Come To Me,24,Boathouse,Lucius Malfoy
7441,harry_potter_and_the_deathly_hallows_part_2,8,224,He Will Come To Me,24,Boathouse,Lucius Malfoy
7442,harry_potter_and_the_deathly_hallows_part_2,8,224,He Will Come To Me,24,Boathouse,Sybill Trelawney


## Combine to a dictionary
For each chapter and place combination within a movie, we treat all the characters who speak there as one co-occurrence group in the network, unless only a single character speaks in that chapter and place.

In [4]:
# Build {movie title: {"<chapter> - <place>": "name1,name2,..."}}.
# Each inner entry lists the distinct characters who speak in one
# chapter/place combination of that movie; combinations with fewer than two
# distinct speakers are skipped because they contribute no co-occurrence.
data_dict = {}
for movie_id in range(1, 9):  # movie IDs 1..8 are processed
    movie_rows = df[df['Movie ID'] == movie_id]
    titles = movie_rows['Movie Title'].unique()
    assert len(titles) == 1, (
        f'expected exactly one title for movie {movie_id}, found {len(titles)}'
    )
    scenes = {}
    # A single groupby pass visits only the chapter/place pairs that actually
    # occur, instead of filtering the frame once per pair of the full
    # chapters x places product. Groups come out sorted by key, so the
    # resulting dictionary order is reproducible across runs.
    for (chapter, place), scene_rows in movie_rows.groupby(['Chapter Name', 'Place Name']):
        # Drop missing names (they cannot be joined as text) and sort so the
        # joined string does not depend on set iteration order.
        speakers = sorted(scene_rows['Character Name'].dropna().unique())
        if len(speakers) > 1:
            scenes[f'{chapter} - {place}'] = ','.join(speakers)
    data_dict[titles[0]] = scenes

Wrap up the process by writing one txt file per movie into the `harrypotter` directory, with one comma-separated group of characters per line.

In [5]:
# Write one text file per movie; each line holds the comma-separated
# characters of one chapter/place group.
# Create the output directory first so the cell also works on a fresh checkout.
os.makedirs('harrypotter', exist_ok=True)
for title, connections in data_dict.items():
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(f'harrypotter/{title}.txt', 'w') as f:
        for conn in connections.values():
            f.write(f'{conn}\n')

Export the dictionary as a JSON file as well, in case we need it later.

In [6]:
# Serialise the nested {title: {scene: characters}} dictionary and save it.
json_string = json.dumps(data_dict)
# The with-block closes the file on exit, so no explicit close() is needed.
with open('dialogues.json', 'w') as f:
    f.write(json_string)

## Network processing

In [7]:
import seaborn as sns
import networkprocessing as np

In [8]:
# Read in the text files from the "harrypotter" directory and process the networks.
# Note: `np` here is the local `networkprocessing` module, not NumPy.
# NOTE(review): presumably this call also writes the harrypotter_analysis/*.xlsx
# summaries that are loaded afterwards — confirm against networkprocessing.
np.make_network("harrypotter")

Processing harrypotter\harry_potter_and_the_chamber_of_secrets.txt
Processing harrypotter\harry_potter_and_the_deathly_hallows_part_1.txt
Processing harrypotter\harry_potter_and_the_deathly_hallows_part_2.txt
Processing harrypotter\harry_potter_and_the_goblet_of_fire.txt
Processing harrypotter\harry_potter_and_the_half-blood_prince.txt
Processing harrypotter\harry_potter_and_the_order_of_the_phoenix.txt
Processing harrypotter\harry_potter_and_the_philosophers_stone.txt
Processing harrypotter\harry_potter_and_the_prisoner_of_azkaban.txt


In [9]:
# Examine the dataset: load the per-network and per-node summary workbooks.
networkInfo = pd.read_excel('harrypotter_analysis/networkInfo.xlsx')
# NOTE(review): the displayed nodeInfo frame contains an 'Unnamed: 0' column,
# which suggests the sheet was saved together with its index — consider
# index_col=0 if nothing downstream relies on that column.
nodeInfo = pd.read_excel('harrypotter_analysis/nodeInfo.xlsx')

In [10]:
# Display the network-level summary table (one row per movie network).
networkInfo

Unnamed: 0,network_id,nodes,edges,avg path length,avg degree,avg weighted degree,diameter,radius,density
0,harry_potter_and_the_chamber_of_secrets,51,327,1.76549,12.823529,26.156863,3,2,0.256471
1,harry_potter_and_the_deathly_hallows_part_1,57,440,1.75188,15.438596,22.280702,3,2,0.275689
2,harry_potter_and_the_deathly_hallows_part_2,50,224,1.955918,8.96,14.24,4,2,0.182857
3,harry_potter_and_the_goblet_of_fire,42,265,1.772358,12.619048,25.761905,3,2,0.307782
4,harry_potter_and_the_half-blood_prince,39,213,1.805668,10.923077,20.871795,4,2,0.287449
5,harry_potter_and_the_order_of_the_phoenix,63,421,1.784434,13.365079,19.873016,2,1,0.215566
6,harry_potter_and_the_philosophers_stone,50,314,1.743673,12.56,24.52,2,1,0.256327
7,harry_potter_and_the_prisoner_of_azkaban,40,211,1.770513,10.55,22.75,3,2,0.270513


In [11]:
# Display the node-level table (one row per character per movie network).
nodeInfo

Unnamed: 0.1,Unnamed: 0,label,degree,weighted_degree,betweenness,normalized_betweenness,eccentricity,closeness,network
0,0,Ron Weasley,42,157,201.692868,0.164647,2,0.862069,harry_potter_and_the_chamber_of_secrets
1,1,Harry Potter,48,195,368.715704,0.300992,2,0.961538,harry_potter_and_the_chamber_of_secrets
2,2,Gilderoy Lockhart,26,60,38.266064,0.031238,2,0.675676,harry_potter_and_the_chamber_of_secrets
3,3,Moaning Myrtle,4,10,0.000000,0.000000,2,0.520833,harry_potter_and_the_chamber_of_secrets
4,4,Draco Malfoy,25,59,28.498016,0.023264,2,0.666667,harry_potter_and_the_chamber_of_secrets
...,...,...,...,...,...,...,...,...,...
387,387,Pansy Parkinson,6,6,0.000000,0.000000,3,0.534247,harry_potter_and_the_prisoner_of_azkaban
388,388,All,7,7,0.000000,0.000000,3,0.506494,harry_potter_and_the_prisoner_of_azkaban
389,389,Parvati Patil,7,7,0.000000,0.000000,3,0.506494,harry_potter_and_the_prisoner_of_azkaban
390,390,Dean Thomas,7,7,0.000000,0.000000,3,0.506494,harry_potter_and_the_prisoner_of_azkaban
