In [37]:
import os
from urllib.parse import quote

import bs4
from bs4 import BeautifulSoup
import requests

BASE_URL = 'http://www.imsdb.com'
corpus_path = 'C:/Users/Rik/Documents/corpus/'
scripts_path = 'imsdb_parsed'

In [38]:
def clean_script(text):
    text = text.replace('Back to IMSDb', '')
    text = text.replace('''<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b>
''', '')
    text = text.replace('''          Scanned by http://freemoviescripts.com
          Formatting by http://simplyscripts.home.att.net
''', '')
    return text.replace(r'\r', '')

In [130]:
def get_script_blocks_from_link(script_url):
    print("Retrieving script on", script_url)
    script_soup = BeautifulSoup(requests.get(script_url).text, "html.parser")
    script_text_tag = script_soup.find_all('td', {'class': "scrtext"})[0]
    found_blocks = []
    def get_text_recursively(tag, is_bold=False):
        print(tag)
        if isinstance(tag, bs4.element.NavigableString):
            text = tag.string
            stripped_text = text.lstrip()
            leading_spaces = len(text) - len(stripped_text)
            is_empty = not stripped_text.strip()
            found_blocks.append({
                'text': stripped_text,
                'leading_spaces': leading_spaces,
                'is_bold': is_bold,
                'is_empty': is_empty
            })
        if isinstance(tag, bs4.element.Tag):
            if tag.name != 'head' and tag.name != 'script':
                for el in tag:
                    get_text_recursively(el, tag.name == 'b' or is_bold)
                
    get_text_recursively(script_text_tag.find_parent('table'))
    return found_blocks

In [131]:
def get_character_scene_leading_spaces(df_script_blocks):
    df_script_blocks_bold = df_script_blocks[np.logical_and(df_script_blocks['is_bold']==True, df_script_blocks['is_empty']==False)]
    if(len(df_script_blocks_bold)==0):
        return None, None
    count_by_leading = [{'leading_spaces': leading_spaces, 'count': len(bold_group)} for leading_spaces, bold_group in df_script_blocks_bold.groupby(['leading_spaces'])]
    df_count_by_leading = pd.DataFrame(count_by_leading).sort_values('count', ascending=False)
    if(len(df_count_by_leading)> 2):
        return df_count_by_leading.iloc[0]['leading_spaces'], df_count_by_leading.iloc[1]['leading_spaces']
    return None, None

In [132]:
import re

def clean_text(text):
    text = text.replace('\r', ' ').replace('\n', ' ')
    return re.sub(r"\s+", " ", text).strip()

In [133]:
import pandas as pd
import numpy as np
def get_script_from_page(script_url):
    script_blocks = get_script_blocks_from_link(script_url)
    df_script_blocks = pd.DataFrame(script_blocks)
    character_leading_spaces, scene_leading_spaces = get_character_scene_leading_spaces(df_script_blocks)
    if not character_leading_spaces:
        print("Couldn't find correct leading spaces")
        return None
    intro = ""
    scenes = []
    current_scene = None
    characters = []
    current_dialogue = None
    for idx, row in df_script_blocks.iterrows():
        cleaned_text = clean_text(row['text'])
        # if we don't have a scene and the row is not bold we're still in the intro
        if current_scene is None and not row['is_bold']:
            intro += cleaned_text + " "
        elif row['is_bold']:
            # if it's a bold row it's either a scene, a character, or an empty line
            if row['leading_spaces'] == scene_leading_spaces:
                # store previous scene
                if current_scene is not None:
                    scenes.append(current_scene)
                # create new scene
                current_scene = { 'heading': cleaned_text, 'body': []}
            elif row['leading_spaces'] == character_leading_spaces:
                # get character name
                character_name = cleaned_text
                # add to characters if not there yet
                if character_name not in characters:
                    characters.append(character_name)
                # add a dialogue
                current_dialogue = { 'type': 'dialogue', 'character': character_name, 'text': '' }
            elif row['is_empty']:
                # empty line might mean a dialogue is finished
                if current_dialogue is not None and current_scene is not None:
                    current_dialogue['text'] = current_dialogue['text'].strip() 
                    current_scene['body'].append(current_dialogue)
                    current_dialogue = None
            elif current_scene is None:
                # if we're still in the intro but it's bold
                intro += cleaned_text + " "
        elif current_scene is not None:
            # if we still have a dialogue it's a dialogue
            if current_dialogue is not None:
                current_dialogue['text'] += cleaned_text + " "
            else:
                current_scene['body'].append({'type': 'action', 'text': cleaned_text})
    if current_dialogue is not None and current_scene is not None:
        current_dialogue['text'] = current_dialogue['text'].strip() 
        current_scene['body'].append(current_dialogue)
    if current_scene is not None:
        scenes.append(current_scene)
    
    intro = intro.strip()
    print("Found", len(scenes), "scenes and", len(characters), "characters")
    return {
        'intro': intro,
        'scenes': scenes,
        'characters': characters
    }
get_script_from_page('http://www.imsdb.com/scripts/Blade-II.html')

Retrieving script on http://www.imsdb.com/scripts/Blade-II.html
<table width="100%"><tr><td class="scrtext">
<pre><html><head><script>
<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b></script><title>"BLADE II" -- by David Goyer</title>
<b></b></head>
</html></pre></td></tr></table>
<tr><td class="scrtext">
<pre><html><head><script>
<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b></script><title>"BLADE II" -- by David Goyer</title>
<b></b></head>
</html></pre></td></tr>
<td class="scrtext">
<pre><html><head><script>
<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b></script><title>"BLADE II" -- by David Goyer</title>
<b></b></head>
</html></pre></td>


<pre><html><head><script>
<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b></script><title>"BLADE II" -- by David Goyer</title>
<b></b></head>
</html></pre>
<html><head><script>
<b><!--
</b>if (window!= top)
top.location.href=locat

In [121]:
def get_script(anchor):
    title = anchor.text
    print(title)
    relative_link = anchor['href']
    tail = relative_link.split('/')[-1]
    script_front_url = BASE_URL + quote(relative_link)
    front_page_response = requests.get(script_front_url)
    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

    try:
        script_link = front_soup.find_all('p', align="center")[0].a['href']
    except IndexError:
        print('%s has no script :(' % tail)
        return None, None

    if script_link.endswith('.html'):
        file_title = script_link.split('/')[-1].split(' Script')[0].strip('.html')
        script_url = BASE_URL + script_link
        script = get_script_from_page(script_url)
        if script is None:
            return None, None
        script['title'] = title
        script['file_title'] = file_title
        return file_title, script
    else:
        print('%s is a pdf :(' % tail)
        return None, None

In [123]:
import json

response = requests.get('http://www.imsdb.com/all%20scripts/')
html = response.text

soup = BeautifulSoup(html, "html.parser")
paragraphs = soup.find_all('p')

correct_parse_count = 0
no_script_count = 0
error_count = 0

for p in paragraphs:
    
    anchor = p.a
    
    title, script = get_script(anchor)
    if not script:
        no_script_count+=1
        continue
    try:
        with open(os.path.join(corpus_path, scripts_path, title + '.json'), 'w') as outfile:
            outfile.write(json.dumps(script))
        correct_parse_count += 1
    except:
        print("error while writing")
        error_count += 1
print("Correctly parsed", correct_parse_count)
print("No script", no_script_count)
print("Errors", error_count)

10 Things I Hate About You
Retrieving script on http://www.imsdb.com/scripts/10-Things-I-Hate-About-You.html
Found 141 scenes and 38 characters
12
Retrieving script on http://www.imsdb.com/scripts/12.html
Couldn't find correct leading spaces
12 and Holding
Retrieving script on http://www.imsdb.com/scripts/12-and-Holding.html
Found 266 scenes and 35 characters
12 Monkeys
Retrieving script on http://www.imsdb.com/scripts/12-Monkeys.html
Found 201 scenes and 105 characters
12 Years a Slave
Retrieving script on http://www.imsdb.com/scripts/12-Years-a-Slave.html
Found 228 scenes and 136 characters
127 Hours
Retrieving script on http://www.imsdb.com/scripts/127-Hours.html
Found 262 scenes and 49 characters
1492: Conquest of Paradise
Retrieving script on http://www.imsdb.com/scripts/1492-Conquest-of-Paradise.html
Found 176 scenes and 54 characters
15 Minutes
Retrieving script on http://www.imsdb.com/scripts/15-Minutes.html
Found 111 scenes and 96 characters
17 Again
Retrieving script on http:

Found 238 scenes and 78 characters
American President, The
Retrieving script on http://www.imsdb.com/scripts/American-President,-The.html
Found 145 scenes and 88 characters
American Psycho
Retrieving script on http://www.imsdb.com/scripts/American-Psycho.html
Couldn't find correct leading spaces
American Shaolin: King of Kickboxers II
Retrieving script on http://www.imsdb.com/scripts/American-Shaolin-King-of-Kickboxers-II.html
Couldn't find correct leading spaces
American Sniper
Retrieving script on http://www.imsdb.com/scripts/American-Sniper.html
Found 293 scenes and 50 characters
American Splendor
Retrieving script on http://www.imsdb.com/scripts/American-Splendor.html
Found 247 scenes and 56 characters
American Werewolf in London
Retrieving script on http://www.imsdb.com/scripts/American-Werewolf-in-London.html
Couldn't find correct leading spaces
American, The
Retrieving script on http://www.imsdb.com/scripts/American,-The.html
Found 189 scenes and 13 characters
Amityville Asylum,

Couldn't find correct leading spaces
Basquiat
Retrieving script on http://www.imsdb.com/scripts/Basquiat.html
Found 166 scenes and 76 characters
Batman
Retrieving script on http://www.imsdb.com/scripts/Batman.html
Found 62 scenes and 86 characters
Batman 2
Retrieving script on http://www.imsdb.com/scripts/Batman-2.html
Found 358 scenes and 83 characters
Batman and Robin
Batman and Robin Script.html has no script :(
Batman Begins
Batman Begins Script.html has no script :(
Batman Forever
Batman Forever Script.html has no script :(
Batman Returns
Batman Returns Script.html has no script :(
Battle of Algiers, The
Retrieving script on http://www.imsdb.com/scripts/Battle-of-Algiers,-The.html
Found 145 scenes and 69 characters
Battle of Shaker Heights, The
Retrieving script on http://www.imsdb.com/scripts/Battle-of-Shaker-Heights,-The.html
Found 120 scenes and 39 characters
Battle: Los Angeles
Retrieving script on http://www.imsdb.com/scripts/Battle-Los-Angeles.html
Found 161 scenes and 297 c

Couldn't find correct leading spaces
Box, The
Retrieving script on http://www.imsdb.com/scripts/Box,-The.html
Found 393 scenes and 105 characters
Boxtrolls, The
Retrieving script on http://www.imsdb.com/scripts/Boxtrolls,-The.html
Found 214 scenes and 46 characters
Boyhood
Retrieving script on http://www.imsdb.com/scripts/Boyhood.html
Found 397 scenes and 105 characters
Braveheart
Retrieving script on http://www.imsdb.com/scripts/Braveheart.html
Found 198 scenes and 93 characters
Brazil
Retrieving script on http://www.imsdb.com/scripts/Brazil.html
Couldn't find correct leading spaces
Break
Retrieving script on http://www.imsdb.com/scripts/Break.html
Found 3 scenes and 5 characters
Breakdown
Retrieving script on http://www.imsdb.com/scripts/Breakdown.html
Couldn't find correct leading spaces
Breakfast Club, The
Retrieving script on http://www.imsdb.com/scripts/Breakfast-Club,-The.html
Found 75 scenes and 5 characters
Breaking Away
Retrieving script on http://www.imsdb.com/scripts/Breaki

Retrieving script on http://www.imsdb.com/scripts/Color-of-Night.html
Found 120 scenes and 40 characters
Commando
Retrieving script on http://www.imsdb.com/scripts/Commando.html
Found 274 scenes and 270 characters
Conan the Barbarian
Retrieving script on http://www.imsdb.com/scripts/Conan-the-Barbarian.html
Found 161 scenes and 65 characters
Confessions of a Dangerous Mind
Retrieving script on http://www.imsdb.com/scripts/Confessions-of-a-Dangerous-Mind.html
Found 337 scenes and 99 characters
Confidence


KeyboardInterrupt: 