In [1]:
import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
import urllib.parse
import re
import ssl
import json
import os
from cleantext import clean

In [2]:
# Site root; relative hrefs scraped from the catalog page are resolved against it.
BASE_URL = 'http://www.kacl780.net'
# Size of the semaphore that caps how many page fetches run at the same time.
CONCURRENT_LIMIT = 5

In [3]:
def parse_catalog(html_doc):
    """Collect the links listed in the season list of a catalog page.

    Args:
        html_doc: HTML source of the catalog page.

    Returns:
        A list with one URL per anchor matched by ``#rightCol .SeasonList a``,
        each ``href`` resolved against ``BASE_URL``.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    anchors = soup.select('#rightCol .SeasonList a')
    return [urllib.parse.urljoin(BASE_URL, anchor.get('href')) for anchor in anchors]

In [4]:
async def fetch_page(session, url, semaphore):
    """Fetch ``url`` and return the response body as text.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the page to download.
        semaphore: Async context manager held for the duration of the request
            so that callers can cap the number of simultaneous downloads.

    Returns:
        The decoded body of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an HTTP error
        status, in addition to any error raised by the request itself.
    """
    async with semaphore:
        async with session.get(url) as response:
            # Fail loudly on 4xx/5xx so that an error page is never handed to
            # the transcript parser as if it were real content.
            response.raise_for_status()
            return await response.text()

async def fetch_pages(urls):
    """Download all of ``urls`` through one shared HTTP session.

    At most ``CONCURRENT_LIMIT`` requests are in flight at once. The fetches are
    combined with ``asyncio.gather(..., return_exceptions=True)``, so a failed
    download shows up in the result as an exception object rather than
    aborting the whole batch.

    Args:
        urls: Iterable of page addresses.

    Returns:
        The list produced by ``asyncio.gather``: one entry per URL, either the
        page text or the exception raised while fetching it.
    """
    limiter = asyncio.Semaphore(CONCURRENT_LIMIT)
    async with aiohttp.ClientSession() as session:
        pending = [fetch_page(session, url, limiter) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)

In [5]:
def split_title(title_string):
    """Split an episode heading into season, episode number and title.

    Example input: ``'[2.1]Slow Tango In South Seattle'``.

    Args:
        title_string: Heading text that starts (after optional whitespace) with
            ``[<season>.<episode>...]`` followed by the episode title.

    Returns:
        A dict with integer ``season`` and ``episode`` and the stripped
        ``title`` string.

    Raises:
        ValueError: if the heading does not start with a bracketed
            ``season.episode`` marker.
    """
    title_string = title_string.strip()
    marker = re.match(r"\[(\d+)\.(\d+)[^\]]*\]", title_string)
    if marker is None:
        raise ValueError('Unrecognised episode heading: {!r}'.format(title_string))
    # Everything after the first closing bracket is the title, so a title that
    # itself contains ']' (e.g. '... [Part 1]') is kept whole.
    title = title_string[marker.end():].strip()
    return {'season': int(marker.group(1)), 'episode': int(marker.group(2)), 'title': title}

In [6]:
def get_speech(transcript):
    """Extract the spoken lines from a raw transcript.

    Bracketed passages (``[...]``, possibly spanning several lines) are removed
    first. A line then starts at a newline followed by optional spaces, a
    single word and a colon; that word is taken as the speaker. Each line is
    cut at the first blank line and its whitespace runs are collapsed to single
    spaces. Speakers whose line ends up empty are dropped.

    Args:
        transcript: Transcript text, or ``None``/empty when the page had none.

    Returns:
        A list of ``{'character': ..., 'line': ...}`` dicts in transcript
        order; empty when there is no transcript.
    """
    if not transcript:
        return []

    # DOTALL lets '.' cross newlines, so multi-line bracketed passages go too.
    text = re.sub(r'\[.*?\]', '', transcript, flags=re.DOTALL)

    # Splitting on a pattern with a capture group interleaves the captured
    # speaker names with the text between them:
    #   [preamble, name1, text1, name2, text2, ...]
    pieces = re.split(r'\n *(\w+):', text)
    whitespace_run = re.compile(r'[\n\t\r ]+')

    speech = []
    for character, raw_line in zip(pieces[1::2], pieces[2::2]):
        # Only the first paragraph after the speaker tag belongs to the line.
        first_paragraph = raw_line.split('\n\n')[0]
        line = whitespace_run.sub(' ', first_paragraph).strip()
        if line:
            speech.append({'character': character, 'line': line})
    return speech

In [7]:
def parse_transcript_page(html_doc):
    """Turn one episode page into a dict of metadata, transcript and dialogue.

    The title is read from the first ``h1`` inside ``#rightCol``, falling back
    to the first ``h2`` there. The transcript is the ``pre`` element following
    the first ``#rightCol h2`` whose text mentions "Transcript" or
    "Quotes & Scene Summary"; if that cannot be found, the first text node that
    starts with "ACT"/"Act" is used instead.

    Args:
        html_doc: HTML source of an episode page.

    Returns:
        The dict produced by ``split_title`` extended with ``transcript`` (the
        raw transcript text, or ``None`` when none could be located) and
        ``dialogue`` (the result of ``get_speech``, or an empty list when there
        is no transcript).

    Raises:
        ValueError: if ``#rightCol`` contains neither an ``h1`` nor an ``h2``
            to read the title from.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')

    title_node = soup.select_one('#rightCol h1')
    if title_node is None:
        title_node = soup.select_one('#rightCol h2')
    if title_node is None:
        raise ValueError('No title heading found in #rightCol')
    data = split_title(title_node.text)

    heading_pattern = re.compile('(Transcript|Quotes & Scene Summary)')
    transcript_heading = next(
        (h2 for h2 in soup.select('#rightCol h2') if h2.find(string=heading_pattern)),
        None,
    )
    transcript_block = None
    if transcript_heading is not None:
        transcript_block = transcript_heading.find_next_sibling('pre')

    if transcript_block is not None:
        transcript = transcript_block.get_text(' ')
    else:
        # Fallback for pages without the usual heading + <pre> layout.
        transcript = soup.find(string=re.compile('^(ACT|Act)'))

    data['transcript'] = transcript
    # The fallback lookup can come back empty-handed; record no dialogue then
    # rather than passing None on to the dialogue extractor.
    data['dialogue'] = get_speech(transcript) if transcript else []
    return data

In [8]:
r = requests.get('http://www.kacl780.net/frasier/transcripts/')
urls = parse_catalog(r.text)

responses = await fetch_pages(urls)

In [9]:
# fetch_pages() gathers with return_exceptions=True, so a failed download is
# present in `responses` as an exception object. Report and skip those instead
# of handing them to the HTML parser.
transcripts = []
for url, response in zip(urls, responses):
    if isinstance(response, BaseException):
        print('Skipping {url}: {error!r}'.format(url=url, error=response))
        continue
    transcripts.append(parse_transcript_page(response))

# Write one JSON file per episode, partitioned by site and season.
for script in transcripts:
    filepath = 'data/site=kacl780/season={season}/Frasier_{season}x{episode}.json'.format(season=script['season'], episode=script['episode'])
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'w') as file:
        json.dump(script, file)
    print(filepath)

data/site=kacl780/season=1/Frasier_1x1.json
data/site=kacl780/season=1/Frasier_1x2.json
data/site=kacl780/season=1/Frasier_1x3.json
data/site=kacl780/season=1/Frasier_1x4.json
data/site=kacl780/season=1/Frasier_1x5.json
data/site=kacl780/season=1/Frasier_1x6.json
data/site=kacl780/season=1/Frasier_1x7.json
data/site=kacl780/season=1/Frasier_1x8.json
data/site=kacl780/season=1/Frasier_1x9.json
data/site=kacl780/season=1/Frasier_1x10.json
data/site=kacl780/season=1/Frasier_1x11.json
data/site=kacl780/season=1/Frasier_1x12.json
data/site=kacl780/season=1/Frasier_1x13.json
data/site=kacl780/season=1/Frasier_1x14.json
data/site=kacl780/season=1/Frasier_1x15.json
data/site=kacl780/season=1/Frasier_1x16.json
data/site=kacl780/season=1/Frasier_1x17.json
data/site=kacl780/season=1/Frasier_1x18.json
data/site=kacl780/season=1/Frasier_1x19.json
data/site=kacl780/season=1/Frasier_1x20.json
data/site=kacl780/season=1/Frasier_1x21.json
data/site=kacl780/season=1/Frasier_1x22.json
data/site=kacl780/s