In [1]:
from HTMLParser import HTMLParser
import numpy as np
from __future__ import division
import vincent
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import pandas as pd

In [2]:
# Read the screenplay HTML and glue all of its lines into one long string.
with open('script_LOTRI.html', 'r') as myfile:
    script = "".join(myfile.read().split('\n'))

# Patch speaker names in the transcript: repair the split "FRO DO" and
# relabel every STRIDER as ARAGORN.
for old_name, new_name in (("FRO DO", "FRODO"), ("STRIDER", "ARAGORN")):
    script = script.replace(old_name, new_name)

In [3]:
# Custom HTML parser that collects candidate character names from bold text.
class MyHTMLCharacterParser(HTMLParser):
    """Collect speaker names from the text found inside ``<b>`` elements.

    A bold text chunk is kept as a character name when it starts with a
    space and contains neither ":" nor "(".  Surrounding spaces and
    punctuation are stripped before the name is stored.

    Attributes:
        is_b: True while the parser is between ``<b>`` and ``</b>``.
        characters: set of the cleaned names seen so far.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.is_b = False
        self.characters = set()

    def handle_starttag(self, tag, attrs):
        """Note that a ``<b>`` element has been opened."""
        if tag == "b":
            self.is_b = True

    def handle_endtag(self, tag):
        """Note that a ``<b>`` element has been closed."""
        if tag == "b":
            self.is_b = False

    def handle_data(self, data):
        """Store ``data`` as a character name if it looks like one."""
        if not self.is_b:
            return
        # startswith() rather than data[0]: an empty chunk is simply ignored
        # instead of raising IndexError.
        if data.startswith(" ") and ":" not in data and "(" not in data:
            self.characters.add(data.strip(" ,.·;:\\\"/-)"))
                

# Parse the screenplay and collect every candidate speaker name.
parser = MyHTMLCharacterParser()
parser.feed(script)

# Clean the character list: these two bold captions are not characters.
for caption in ("FRODO DISAPPEARS", "FADE TO BLACK"):
    parser.characters.remove(caption)

# Also drop entries containing "V/0" and entries made only of digits.
# Iterate over a copy because the set is modified inside the loop.
for elem in list(parser.characters):
    if "V/0" in elem or elem.isdigit():
        parser.characters.remove(elem)

print(len(parser.characters))

# Freeze the cleaned set into a list and show each name.
characters = list(parser.characters)
for elem in characters:
    print(elem)

27
GALADRIEL
WITCH KING
LURTZ
BILBO
CELEBORN
PIPPIN
GANDALF
ELROND
ODO PROUDFOOT
BUTTERBUR
SARUMAN
GIMLI
BLACK RIDER
ARWEN
SAM
GOLLUM
MERRY
HOBBIT BOUNDER
HALDIR
BOROMIR
ORC OVERSEER
ARAGORN
GATEKEEPER
ISILDUR
LEGOLAS
FRODO
FARMER MAGGOT


In [4]:
import networkx as nx
import matplotlib.pyplot as plt

# Start from an empty undirected graph and add one node per character.
G = nx.Graph()
G.add_nodes_from(characters)

In [5]:
# Scene parser: cut the script into scenes at the "EXT." / "INT." headings
# and record which known characters show up in bold within each scene.
class MyHTMLSceneParser(HTMLParser):
    """Group the known characters by the scene in which they appear.

    A text chunk starting with "EXT." or "INT." closes the current scene
    and opens a new one.  Any other chunk found inside ``<b>`` is stripped
    of surrounding punctuation and, if it is one of ``characters``, added
    to the current scene.

    Attributes:
        is_b: True while the parser is between ``<b>`` and ``</b>``.
        characters: collection of accepted character names.
        scenes: list of completed scenes, each a set of names.
        char_in_scene: set of names for the scene still being read; the
            caller has to append it to ``scenes`` after feeding.
    """

    _SCENE_MARKERS = ("EXT.", "INT.")
    _NAME_PADDING = " ,.·;:\\\"/-)"

    def __init__(self, characters):
        HTMLParser.__init__(self)
        self.is_b = False
        self.characters = characters
        self.scenes = []
        self.char_in_scene = set()

    def handle_starttag(self, tag, attrs):
        """Enter bold mode on ``<b>``."""
        if tag != "b":
            return
        self.is_b = True

    def handle_endtag(self, tag):
        """Leave bold mode on ``</b>``."""
        if tag != "b":
            return
        self.is_b = False

    def handle_data(self, data):
        """Close the scene on a heading, otherwise record a bold name."""
        # Scene headings are recognised whether or not they are bold.
        if data.startswith(self._SCENE_MARKERS):
            self.scenes.append(self.char_in_scene)
            self.char_in_scene = set()
            return
        if not self.is_b:
            return
        name = data.strip(self._NAME_PADDING)
        if name in self.characters:
            self.char_in_scene.add(name)
                
                

# Run the scene parser over the whole screenplay.
parser = MyHTMLSceneParser(characters)
parser.feed(script)

# A scene is only stored when the next heading is met, so the scene still
# being read at the end of the script is appended by hand.
all_scenes = parser.scenes
all_scenes.append(parser.char_in_scene)


In [6]:
# Add the edges to the graph: every pair of characters found in the same
# scene is linked, and the edge weight counts the scenes the pair shares.
# NOTE: weights are added to whatever G already holds, so re-running this
# cell without rebuilding G counts every scene a second time.
#
# Iterates all_scenes rather than parser.scenes because a later cell rebinds
# `parser` to an object that has no `scenes` attribute.
for scene in all_scenes:
    # Build the list once per scene instead of on every inner iteration.
    members = list(scene)
    for i, first in enumerate(members):
        for second in members[i + 1:]:
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)



In [7]:
# Remove the nodes with degree 0 and print the name of each removed node.
for char in characters:
    # `characters` still lists nodes removed by a previous run of this cell;
    # checking membership first keeps the cell re-runnable instead of asking
    # the graph for the degree of a node it no longer contains.
    if char in G and G.degree(char) == 0:
        G.remove_node(char)
        print(char)


CELEBORN
HOBBIT BOUNDER


In [8]:
# Score every character with PageRank and show the ten highest scores.
x = nx.algorithms.pagerank(G)

import operator
# Ascending sort followed by a reversal (not a descending sort), which also
# reverses the relative order of entries with equal scores.
sorted_x = sorted(x.items(), key=operator.itemgetter(1))[::-1]
sorted_x[:10]

[('FRODO', 0.1112640227785102),
 ('SAM', 0.10622423833917934),
 ('GANDALF', 0.10021622405885808),
 ('ARAGORN', 0.09146293723316566),
 ('MERRY', 0.08186887211931439),
 ('PIPPIN', 0.06792813427238063),
 ('BOROMIR', 0.05476240988844253),
 ('GIMLI', 0.05441090564628471),
 ('LEGOLAS', 0.05135240404147857),
 ('FARMER MAGGOT', 0.04)]

In [9]:
import community

# Assign each character to a community and list the (name, community id)
# pairs with the highest community id first.
# NOTE(review): the partition may differ between runs if best_partition is
# randomised — confirm and fix a seed if reproducibility matters.
com = community.best_partition(G)

sorted_x = sorted(com.items(), key=operator.itemgetter(1))[::-1]
sorted_x

[('FARMER MAGGOT', 3),
 ('BLACK RIDER', 3),
 ('ARAGORN', 2),
 ('WITCH KING', 2),
 ('ARWEN', 2),
 ('LEGOLAS', 2),
 ('HALDIR', 2),
 ('BOROMIR', 2),
 ('LURTZ', 2),
 ('GIMLI', 2),
 ('ELROND', 1),
 ('GOLLUM', 1),
 ('GANDALF', 1),
 ('ISILDUR', 1),
 ('BILBO', 1),
 ('ORC OVERSEER', 1),
 ('SARUMAN', 1),
 ('GALADRIEL', 1),
 ('PIPPIN', 0),
 ('FRODO', 0),
 ('SAM', 0),
 ('BUTTERBUR', 0),
 ('ODO PROUDFOOT', 0),
 ('MERRY', 0),
 ('GATEKEEPER', 0)]

In [10]:
# Score every character by degree centrality and show the ten highest scores.
x = nx.algorithms.degree_centrality(G)

# Ascending sort followed by a reversal, so ties keep the same relative
# order as the reversed ascending list.
sorted_x = sorted(x.items(), key=operator.itemgetter(1))[::-1]
sorted_x[:10]

[('SAM', 0.6666666666666666),
 ('PIPPIN', 0.5833333333333333),
 ('FRODO', 0.5833333333333333),
 ('GANDALF', 0.5416666666666666),
 ('MERRY', 0.5416666666666666),
 ('ARAGORN', 0.5),
 ('BOROMIR', 0.4583333333333333),
 ('ELROND', 0.41666666666666663),
 ('LEGOLAS', 0.41666666666666663),
 ('GIMLI', 0.41666666666666663)]

In [11]:
# Score every character by betweenness centrality and show the ten highest.
x = nx.algorithms.betweenness_centrality(G)

# Ascending sort followed by a reversal, so ties keep the same relative
# order as the reversed ascending list.
sorted_x = sorted(x.items(), key=operator.itemgetter(1))[::-1]
sorted_x[:10]

[('GANDALF', 0.1690217391304348),
 ('SAM', 0.1306763285024155),
 ('FRODO', 0.11316425120772948),
 ('ELROND', 0.07608695652173914),
 ('ARWEN', 0.07608695652173914),
 ('SARUMAN', 0.07608695652173914),
 ('PIPPIN', 0.06455314009661835),
 ('BILBO', 0.04981884057971015),
 ('ARAGORN', 0.0434782608695652),
 ('MERRY', 0.03134057971014492)]

In [38]:
# Save the network.
# Writes graph G to the relative path "LOTRI.gexf" using nx.write_gexf.
# NOTE(review): this cell's execution count (38) is out of sequence with the
# cells around it — confirm it was run after the graph was fully built.
nx.write_gexf(G, "LOTRI.gexf")

In [12]:
import collections
    
# Line-counting parser.  It reuses the name MyHTMLSceneParser, so from this
# cell on it replaces the scene-grouping parser defined earlier.
class MyHTMLSceneParser(HTMLParser):
    """Record every bold occurrence of a known character name, in order.

    Each text chunk found inside ``<b>`` is stripped of surrounding spaces
    and punctuation; if the result is one of ``characters`` it is appended
    to ``lines``, so a name appears once per occurrence.

    Attributes:
        is_b: True while the parser is between ``<b>`` and ``</b>``.
        characters: collection of accepted character names.
        lines: list of matched names in document order.
    """

    _NAME_PADDING = " ,.·;:\\\"/-)"

    def __init__(self, characters):
        HTMLParser.__init__(self)
        self.is_b = False
        self.characters = characters
        self.lines = []

    def handle_starttag(self, tag, attrs):
        """Enter bold mode on ``<b>``."""
        if tag != "b":
            return
        self.is_b = True

    def handle_endtag(self, tag):
        """Leave bold mode on ``</b>``."""
        if tag != "b":
            return
        self.is_b = False

    def handle_data(self, data):
        """Append the cleaned chunk to ``lines`` when it is a known name."""
        if not self.is_b:
            return
        name = data.strip(self._NAME_PADDING)
        if name in self.characters:
            self.lines.append(name)

                

# Run the line-counting parser over the whole screenplay.
parser = MyHTMLSceneParser(characters)
parser.feed(script)

# Tally how often each character was matched and print the ten most
# frequent (name, count) pairs.
counter = collections.Counter(parser.lines)
top_ten = counter.most_common(10)
for elem in top_ten:
    print(elem)



('GANDALF', 125)
('FRODO', 111)
('ARAGORN', 63)
('SAM', 53)
('BILBO', 36)
('BOROMIR', 36)
('MERRY', 33)
('PIPPIN', 31)
('SARUMAN', 23)
('GIMLI', 23)


In [13]:
import vincent
# Bar chart of the ten largest entries of `counter`, labelled "# lines".
pre_data = counter.most_common(10)

vincent.core.initialize_notebook()
# The comprehensions unpack into `label` / `count` instead of looping with
# `x`: under Python 2 a list-comprehension variable leaks into the notebook
# namespace and would overwrite the centrality dict stored in `x`.
data = [count for label, count in pre_data]
index = [label for label, count in pre_data]
s = pd.Series(data=data, index=index)

bars = vincent.Bar(s)
bars.colors(brew='Pastel2')
bars.axis_titles(x='Character', y='# lines')
bars.scales['x'].padding = 0.1
bars.display()

In [14]:
# Character vs number of scenes: flatten the per-scene sets into one list,
# so that each name occurs once for every scene it belongs to.
all_scenes2 = [name for scene in all_scenes for name in scene]

# Tally the occurrences and print the ten most frequent (name, count) pairs.
counter = collections.Counter(all_scenes2)
top_ten = counter.most_common(10)
for elem in top_ten:
    print(elem)


('FRODO', 43)
('GANDALF', 40)
('SAM', 27)
('ARAGORN', 27)
('MERRY', 18)
('BOROMIR', 13)
('PIPPIN', 13)
('SARUMAN', 11)
('GIMLI', 11)
('LEGOLAS', 10)


In [15]:
import vincent
# Bar chart of the ten largest entries of `counter`, labelled "# scenes".
pre_data = counter.most_common(10)

vincent.core.initialize_notebook()
# The comprehensions unpack into `label` / `count` instead of looping with
# `x`: under Python 2 a list-comprehension variable leaks into the notebook
# namespace and would overwrite the centrality dict stored in `x`.
data = [count for label, count in pre_data]
index = [label for label, count in pre_data]
s = pd.Series(data=data, index=index)

bars = vincent.Bar(s)
bars.colors(brew='Pastel2')
bars.axis_titles(x='Character', y='# scenes')
bars.scales['x'].padding = 0.1
bars.display()