In [5]:
from urllib.request import urlopen

In [37]:
# Source pages: maps each play title to the URL of its full-text HTML page
urls = {
    "Henry IV Part 1": "http://shakespeare.mit.edu/1henryiv/full.html",
}
# Titles to process, in the same order as the URL table above
plays = list(urls)

In [101]:
# data storage mechanism
import pandas as pd
# Column names, in order, for every row of data stored in a play's dataframe
line_attrs = [
    "id",       # number of line within entire text of play
    "act",      # the Act number
    "scene",    # the scene number within the Act
    "line",     # the line number within the Act and Scene
    "ref",      # the full line location in the format act.scene.line as numbers
    "speech",   # which speech/block of dialogue this line belongs to within the scene
    "speaker",  # the character to whom this speech is attributed
    "text",     # the actual text of the line
]
# Module-level list of rows, initially empty.
# NOTE(review): the parser class in this notebook keeps its own list of rows;
# confirm whether this module-level one is still needed.
temp_play_data = []

In [76]:
# helper function to convert roman numerals (act and scene numbers) to integers
def conv_roman(numeral):
    """
    Convert a roman numeral string to its integer value.

    Leading/trailing whitespace and letter case are ignored, so heading text
    such as "IV\n" or "iv" converts cleanly. Subtractive notation (IV, IX, XL,
    ...) is supported. The ordering of the symbols is not validated: any
    sequence made of roman digits is evaluated with the usual
    "smaller symbol before a larger one subtracts" rule.

    :param numeral: a string representing a roman numeral, e.g. "I", "IV", "X", "XIV"
    :return: an integer that is the decimal representation of the roman numeral
    :raises KeyError: if the numeral is empty or contains a character that is
        not one of the roman digits I, V, X, L, C, D, M
    """
    digit_values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    cleaned = numeral.strip().upper()
    if not cleaned:
        # keep the lookup-failure exception type callers already see for bad input
        raise KeyError(numeral)

    total = 0
    previous = 0
    # Walk right-to-left: a symbol smaller than the one to its right is subtracted.
    for symbol in reversed(cleaned):
        value = digit_values[symbol]
        if value < previous:
            total -= value
        else:
            total += value
        previous = value
    return total

In [123]:
from html.parser import HTMLParser
class MyParser(HTMLParser):
    """
    HTML parser that collects the lines of a play as a list of rows.

    While HTML is fed in, the parser tracks the current act, scene, speech and
    speaker, and appends one row per line anchor in the form
    [id, act, scene, line, ref, speech, speaker, text]:

    * ``<h3>`` text starting with "ACT" / "SCENE" updates the act / scene number
    * ``<a name="speechN">`` updates the speech number
    * ``<b>`` text updates the speaker
    * ``<a name="...">`` with any other name marks a line; the name is stored
      as ``ref`` and its last dot-separated component (a string) as ``line``

    End tags are ignored, so text is always attributed to the most recent
    start tag that was tracked.
    """

    def __init__(self, **kwargs):
        """
        Create a parser with its own, freshly initialised tracking state.

        :param kwargs: forwarded unchanged to ``HTMLParser.__init__``
        """
        super().__init__(**kwargs)
        # State lives on the instance (not the class) so that separate parsers
        # never share or overwrite each other's rows.
        self.tracker = {}
        self.temp_play_data = []
        # Makes the parser usable even if the caller feeds HTML before calling
        # reset_tracker() explicitly.
        self.reset_tracker(None)

    def reset_tracker(self, play):
        """
        helper function to reset and set the tracking variables

        Also resets the underlying HTMLParser so that unprocessed input left
        over from a previous document cannot leak into the next one.

        :param play: identifier of the play about to be parsed (stored in the tracker)
        """
        self.reset()
        self.tracker = {
            "id": 1,
            "play": play,
            "act": None,
            "scene": None,
            "line": None,
            "speech": None,
            "speaker": None,
            "last_tag": None
        }
        self.temp_play_data = []

    def get_data(self):
        """
        :return: the list of rows collected since the last reset_tracker() call
        """
        return self.temp_play_data

    def handle_starttag(self, tag, attrs):
        """
        Record the tag as the current context and pick up speech / line anchors.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of (name, value) attribute pairs supplied by HTMLParser
        """
        if tag != "a":
            self.tracker["last_tag"] = tag
            return

        # Anchors without a usable name (e.g. plain links, or a bare ``name``
        # attribute with no value) carry no location info and are ignored.
        name = dict(attrs).get("name")
        if not name:
            return

        if name.startswith("speech"):
            # marks new speech: the number follows the "speech" prefix
            self.tracker["speech"] = int(name[6:])
        else:
            # any other named anchor marks a line; only these anchors become
            # the current context, so only their text is stored as line text
            self.tracker["line"] = name
            self.tracker["last_tag"] = tag

    def handle_endtag(self, tag):
        """End tags carry no information needed for tracking; intentionally a no-op."""
        pass

    def handle_data(self, data):
        """
        Interpret a run of text according to the most recently tracked start tag.

        :param data: text content supplied by HTMLParser
        """
        last_tag = self.tracker["last_tag"]

        # ACT or SCENE
        if last_tag == "h3":
            heading = data.strip()
            if heading[:3] == "ACT":
                # everything after "ACT " is the roman numeral
                self.tracker["act"] = conv_roman(heading[4:].strip())
            elif heading[:5] == "SCENE":
                # everything before the first period, minus the leading "SCENE "
                self.tracker["scene"] = conv_roman(heading.split(".")[0][6:].strip())

        # Speaker
        if last_tag == "b" and data.strip() != "":
            self.tracker["speaker"] = data.strip()

        # Get the line text
        if last_tag == "a":
            ref = self.tracker["line"]
            # insert into data list for current play
            self.temp_play_data.append([
                self.tracker["id"],
                self.tracker["act"],
                self.tracker["scene"],
                ref.split(".")[-1],
                ref,
                self.tracker["speech"],
                self.tracker["speaker"],
                data,
            ])
            self.tracker["id"] += 1

In [124]:
# loop through each play to construct dataframe for its text
plays_data_df = {}
# create instance of the parser
parser = MyParser()
for play in plays:
    # get HTML as string; the with-block closes the HTTP response (and its
    # socket) as soon as the body has been read
    with urlopen(urls[play]) as response:
        html_data = response.read().decode('utf-8')
    # set parser for a new play and feed it the HTML
    parser.reset_tracker(play)
    parser.feed(html_data)
    # flush any text the parser is still buffering so it is handled as part of
    # this play instead of being carried into the next feed
    parser.close()
    # extract and store the result in a dataframe
    plays_data_df[play] = pd.DataFrame(parser.get_data(), columns=line_attrs)

In [126]:
# sanity check: preview the first five rows parsed for one play
plays_data_df["Henry IV Part 1"].head(n=5)

Unnamed: 0,id,act,scene,line,ref,speech,speaker,text
0,1,1,1,1,1.1.1,1,KING HENRY IV,"So shaken as we are, so wan with care,"
1,2,1,1,2,1.1.2,1,KING HENRY IV,"Find we a time for frighted peace to pant,"
2,3,1,1,3,1.1.3,1,KING HENRY IV,And breathe short-winded accents of new broils
3,4,1,1,4,1.1.4,1,KING HENRY IV,To be commenced in strands afar remote.
4,5,1,1,5,1.1.5,1,KING HENRY IV,No more the thirsty entrance of this soil
