In [None]:
import os
from urllib.parse import quote

import bs4
from bs4 import BeautifulSoup
import requests

# Root of the IMSDb site; relative links scraped from its pages are appended to it.
BASE_URL = 'http://www.imsdb.com'
# Root directory for the output files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
corpus_path = 'C:/Users/Rik/Documents/corpus/'
# Sub-directory of corpus_path that receives one JSON file per parsed script.
scripts_path = 'imsdb_parsed'

In [None]:
def get_script_blocks_from_link(script_url, timeout=30):
    """Download a script page and split its text into formatting-aware blocks.

    The page is parsed with BeautifulSoup and walked depth-first in document
    order. Entering a ``<td>`` whose first CSS class is ``scrtext`` pushes a
    marker; every ``<table>`` tag met while a marker is present pops one.
    Text nodes are collected only while at least one marker is present.
    The contents of ``<head>`` and ``<script>`` tags are never visited.

    Parameters
    ----------
    script_url : str
        Absolute URL of the page that holds the script text.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot block the
        scrape forever. Defaults to 30 seconds.

    Returns
    -------
    list of dict
        One dict per collected text node with the keys ``text`` (the node text
        with leading whitespace removed), ``leading_spaces`` (how many leading
        whitespace characters were removed -- this counts every kind of
        whitespace, not only spaces), ``is_bold`` (True when the node has a
        ``<b>`` ancestor) and ``is_empty`` (True when only whitespace is left).

    Raises
    ------
    requests.RequestException
        When the page cannot be retrieved, including when ``timeout`` expires.
    """
    print("Retrieving script on", script_url)
    response = requests.get(script_url, timeout=timeout)
    script_soup = BeautifulSoup(response.text, "html.parser")
    # Stack of "currently inside the script cell" markers (see docstring).
    reading_state = []
    found_blocks = []

    def get_text_recursively(tag, is_bold=False):
        if isinstance(tag, bs4.element.NavigableString):
            if len(reading_state) > 0:
                text = tag.string
                stripped_text = text.lstrip()
                found_blocks.append({
                    'text': stripped_text,
                    'leading_spaces': len(text) - len(stripped_text),
                    'is_bold': is_bold,
                    'is_empty': not stripped_text.strip()
                })
            # Text nodes have no children to descend into.
            return
        if not isinstance(tag, bs4.element.Tag):
            return
        if tag.name == 'td' and tag.has_attr('class') and tag['class'][0] == 'scrtext':
            reading_state.append(True)
        if tag.name == 'table' and len(reading_state) > 0:
            reading_state.pop()
        if tag.name != 'head' and tag.name != 'script':
            for el in tag:
                # Boldness is inherited by everything nested below a <b> tag.
                get_text_recursively(el, tag.name == 'b' or is_bold)

    get_text_recursively(script_soup)
    return found_blocks

In [None]:
def get_character_scene_leading_spaces(df_script_blocks):
    """Guess which indentation marks character names and which marks scene headings.

    Only rows that are bold and not empty are considered. Their
    ``leading_spaces`` values are counted; the most frequent value is returned
    as the character indentation and the second most frequent one as the
    scene-heading indentation.

    Parameters
    ----------
    df_script_blocks : pandas.DataFrame
        Frame with at least the columns ``is_bold``, ``is_empty`` and
        ``leading_spaces``.

    Returns
    -------
    tuple
        ``(character_leading_spaces, scene_leading_spaces)`` as scalar values
        taken from the ``leading_spaces`` column, or ``(None, None)`` when
        there is no bold text or fewer than three distinct indentations.
    """
    is_bold_text = df_script_blocks['is_bold'].eq(True) & df_script_blocks['is_empty'].eq(False)
    df_script_blocks_bold = df_script_blocks[is_bold_text]
    if len(df_script_blocks_bold) == 0:
        return None, None
    # Group by the bare column name. Grouping by the one-element list
    # ['leading_spaces'] yields tuple keys such as (20,) on pandas 2.x, and a
    # tuple never compares equal to the integer indentation of a block.
    count_by_leading = (
        df_script_blocks_bold
        .groupby('leading_spaces')
        .size()
        # A stable sort keeps the ordering of equally frequent indentations deterministic.
        .sort_values(ascending=False, kind='mergesort')
    )
    # NOTE(review): more than two distinct indentations are required although
    # only the top two are used -- confirm whether ">= 2" was intended.
    if len(count_by_leading) > 2:
        return count_by_leading.index[0], count_by_leading.index[1]
    return None, None

In [None]:
import re

def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    # Line breaks become plain spaces first; the regex then squeezes any run
    # of whitespace down to a single space.
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, ' ')
    squeezed = re.sub(r"\s+", " ", text)
    return squeezed.strip()

def clean_character(character):
    """Split a character cue into ``(name, suffix)`` at its last opening bracket.

    The suffix starts with the last "(" and runs to the end of the string;
    both parts are stripped. A cue without any "(" is returned unchanged
    together with an empty suffix.
    """
    if "(" in character:
        cut = character.rindex("(")
        return character[:cut].strip(), character[cut:].strip()
    return character, ""

In [None]:
import pandas as pd
import numpy as np
def _close_dialogue(dialogue, scene):
    """Trim the accumulated dialogue text and append the dialogue to the scene body."""
    dialogue['text'] = dialogue['text'].strip()
    scene['body'].append(dialogue)


def get_script_from_page(script_url):
    """Parse the script at ``script_url`` into an intro, scenes and characters.

    The text blocks of the page are classified by boldness and indentation:
    bold blocks at the scene indentation open a new scene, bold blocks at the
    character indentation open a new dialogue, other empty bold blocks close
    the open dialogue, and non-bold text is added to the open dialogue or,
    when none is open, stored as an action of the current scene. Everything
    seen before the first scene heading is collected into the intro.

    Parameters
    ----------
    script_url : str
        Absolute URL of the page that holds the script text.

    Returns
    -------
    dict or None
        ``{'intro': str, 'scenes': list, 'characters': list}``. Each scene is
        ``{'heading': str, 'body': list}`` and each body item is either
        ``{'type': 'dialogue', 'character': str, 'suffix': str, 'text': str}``
        or ``{'type': 'action', 'text': str}``. ``None`` is returned when the
        page cannot be fetched, holds no text blocks, or no character/scene
        indentation could be determined.
    """
    try:
        script_blocks = get_script_blocks_from_link(script_url)
    except Exception as exc:
        # "Exception" rather than a bare except, so an interrupt still stops the scrape.
        print("error getting blocks", exc)
        return None
    if len(script_blocks) == 0:
        print("No blocks found")
        return None
    df_script_blocks = pd.DataFrame(script_blocks)
    character_leading_spaces, scene_leading_spaces = get_character_scene_leading_spaces(df_script_blocks)
    # Compare with None explicitly: an indentation of 0 is falsy but still a valid answer.
    if character_leading_spaces is None:
        print("Couldn't find correct leading spaces")
        return None
    intro = ""
    scenes = []
    current_scene = None
    characters = []
    current_dialogue = None
    # Walk the block dicts directly; they hold the same values as the frame rows.
    for block in script_blocks:
        cleaned_text = clean_text(block['text'])
        # if we don't have a scene and the block is not bold we're still in the intro
        if current_scene is None and not block['is_bold'] and cleaned_text != '':
            intro += cleaned_text + " "
        elif block['is_bold']:
            # a bold block is either a scene, a character, or an empty line
            if block['leading_spaces'] == scene_leading_spaces:
                # NOTE(review): an open dialogue is not closed here, so it is
                # carried over into the new scene -- confirm this is intended.
                if current_scene is not None:
                    scenes.append(current_scene)
                current_scene = {'heading': cleaned_text, 'body': []}
            elif block['leading_spaces'] == character_leading_spaces:
                cleaned_character_name, dialogue_suffix = clean_character(cleaned_text)
                if cleaned_character_name not in characters:
                    characters.append(cleaned_character_name)
                # NOTE(review): a dialogue that is still open at this point is
                # replaced without being stored -- confirm this is intended.
                current_dialogue = {
                    'type': 'dialogue',
                    'character': cleaned_character_name,
                    'suffix': dialogue_suffix,
                    'text': ''
                }
            elif block['is_empty']:
                # an empty bold line means the open dialogue is finished
                if current_dialogue is not None and current_scene is not None:
                    _close_dialogue(current_dialogue, current_scene)
                    current_dialogue = None
            elif current_scene is None and cleaned_text != '':
                # still in the intro, but the text happens to be bold
                intro += cleaned_text + " "
        elif current_scene is not None:
            # non-bold text belongs to the open dialogue, otherwise it is an action
            if current_dialogue is not None and cleaned_text != '':
                current_dialogue['text'] += cleaned_text + " "
            elif cleaned_text != '':
                current_scene['body'].append({'type': 'action', 'text': cleaned_text})
    # store whatever is still open once the input is exhausted
    if current_dialogue is not None and current_scene is not None:
        _close_dialogue(current_dialogue, current_scene)
    if current_scene is not None:
        scenes.append(current_scene)

    intro = intro.strip()
    # Lists have no .filter(); count the scenes whose body is empty explicitly.
    empty_scene_count = sum(1 for scene in scenes if len(scene['body']) == 0)
    print("Found", len(scenes), "scenes (", empty_scene_count, " empty) and", len(characters), "characters")
    return {
        'intro': intro,
        'scenes': scenes,
        'characters': characters
    }

In [None]:
def get_script(anchor, timeout=30):
    """Fetch and parse the script that an index-page link points to.

    The link leads to a "front page"; the first centred paragraph of that page
    is expected to contain the link to the actual script. Only ``.html``
    scripts are parsed.

    Parameters
    ----------
    anchor : bs4.element.Tag
        ``<a>`` tag from the index page; its text is used as the title and its
        ``href`` as the relative link to the front page.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` for the front-page request. Defaults to
        30 seconds.

    Returns
    -------
    tuple
        ``(file_title, script)`` where ``script`` is the dict produced by
        ``get_script_from_page`` extended with the keys ``title`` and
        ``file_title``. ``(None, None)`` when the front page cannot be
        fetched, links to no script, links to a non-HTML script, or the
        script could not be parsed.
    """
    title = anchor.text
    print(title)
    relative_link = anchor['href']
    tail = relative_link.split('/')[-1]
    script_front_url = BASE_URL + quote(relative_link)
    try:
        front_page_response = requests.get(script_front_url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable front page must not abort the whole scrape.
        print('%s could not be retrieved: %s' % (tail, exc))
        return None, None
    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

    try:
        script_link = front_soup.find_all('p', align="center")[0].a['href']
    except (IndexError, TypeError, KeyError):
        # IndexError: no centred paragraph; TypeError: the paragraph has no
        # <a> (``.a`` is None); KeyError: the <a> has no href.
        print('%s has no script :(' % tail)
        return None, None

    if not script_link.endswith('.html'):
        print('%s is a pdf :(' % tail)
        return None, None

    file_title = script_link.split('/')[-1].split(' Script')[0]
    # Remove the extension as a suffix. str.strip('.html') would strip any of
    # the characters ".", "h", "t", "m", "l" from both ends
    # (e.g. "Hamlet.html" -> "Hamle").
    if file_title.endswith('.html'):
        file_title = file_title[:-len('.html')]
    script_url = BASE_URL + script_link
    script = get_script_from_page(script_url)
    if script is None:
        return None, None
    script['title'] = title
    script['file_title'] = file_title
    return file_title, script

In [None]:
import json

# Download the index page that links to every script front page.
response = requests.get('http://www.imsdb.com/all%20scripts/', timeout=30)
html = response.text

soup = BeautifulSoup(html, "html.parser")
paragraphs = soup.find_all('p')

# Links of the index page, sorted by what happened to them.
correct_parsed = []
no_script = []
error = []

# Create the output directory up front; without it every write below would fail.
output_dir = os.path.join(corpus_path, scripts_path)
os.makedirs(output_dir, exist_ok=True)

for p in paragraphs:
    anchor = p.a
    # A paragraph without a link cannot point at a script.
    if anchor is None or not anchor.has_attr('href'):
        continue

    # "title" is the file title derived from the script link, not the display title.
    title, script = get_script(anchor)
    if script is None:
        no_script.append(anchor['href'])
        continue
    try:
        with open(os.path.join(output_dir, title + '.json'), 'w') as outfile:
            outfile.write(json.dumps(script))
        correct_parsed.append(anchor['href'])
    except Exception as exc:
        # "Exception" rather than a bare except, so an interrupt still stops the loop.
        print("error while writing", exc)
        error.append(anchor['href'])
print("Correctly parsed", len(correct_parsed))
print("No script", len(no_script))
print("Errors", len(error))
for error_url in error:
    print(error_url)