# Arrested Development Network From Webscraping Transcripts
## By Oliver Gladfelter
### Nov 14, 2019

In [10]:
import pandas as pd
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

## Create list of links to each episode

In [11]:
html = requests.get("https://arresteddevelopment.fandom.com/wiki/Category:Season_One_Transcripts")
html = html.text
bs = BeautifulSoup(html, "lxml")
links1 = bs.find("ol").findAll("li")

html = requests.get("https://arresteddevelopment.fandom.com/wiki/Category:Season_Two_Transcripts")
html = html.text
bs = BeautifulSoup(html, "lxml")
links2 = bs.find("ol").findAll("li")

html = requests.get("https://arresteddevelopment.fandom.com/wiki/Category:Season_Three_Transcripts")
html = html.text
bs = BeautifulSoup(html, "lxml")
links3 = bs.find("ol").findAll("li")

html = requests.get("https://arresteddevelopment.fandom.com/wiki/Category:Season_Four_Transcripts")
html = html.text
bs = BeautifulSoup(html, "lxml")
links4 = bs.find("ol").findAll("li")

links = links1 + links2 + links3 + links4

## Function to count how many scenes each character appeared in in an episode

In [12]:
def countCharacterCountsInEpisode(transcriptLink):
    """
    Doesn't return anything - just counts number of each character's scenes in a given episode and updates sceneCount dictionary
    """
    
    # web scrape entire transcript, grab all <p> (lines of dialogue) and <h3>, <h2>, <d1> (denotes new scene) tags
    html = requests.get(transcriptLink)
    html = html.text
    bs = BeautifulSoup(html, "lxml")
    script = bs.findAll(['p','h3', 'h2', 'dl'])

    # get rid of title, credits, cast data
    for num in range(0,len(script)):
        if script[num].text == 'Act 1': # this should be the start of the script
            script = script[num+1:]
            break
    
    allScenes = []
    oneScene = []

    for tag in script:
        # span tags represent official new scenes/acts in the script
        # dd tags represent flashbacks
        # i tags with 'cut' in the text represent cuts / cutaways
        # all three cases are interpreted as new scenes
        if (tag.find('span') is not None) | (tag.find('dd') is not None) | ((tag.find('i') is not None) & ('cut' in tag.text.lower())): #  represents new scene
            allScenes.append(oneScene) # append current scene to allScenes list
            oneScene = [] # next scene restarts with empty list
        elif tag.find('b') is not None: # represents dialogue. The bold text specifically is the speaker:
            oneScene.append(tag.find('b').text.strip(" ").strip(":")) # add character speaking the dialogue to list 
                  
    allScenes = [x for x in allScenes if x != []] # remove all error, empty scenes
    
    #sceneCount = {} # used to count number of scenes each character in episode appear in

    for scene in allScenes:
        scene = set(scene) # show only unique values; show each name just once

        for name in scene: # iterate over all names in each scene
            if name in sceneCount: # if already in dictionary,
                sceneCount[name] = sceneCount[name] + 1 # increment count value by 1
            else: # if not already in dictionary
                sceneCount[name] = 1 # add with count of 1

## Run character counting function on all episodes in season 1

In [13]:
sceneCount = {} # used to count number of scenes each character in episode appear in

for link in links: # iterate over each episode link
    countCharacterCountsInEpisode("https://arresteddevelopment.fandom.com" + link.find('a')['href']) # update sceneCount dictionary
    
# remove the quite verbose narrator
sceneCount.pop('Narrator')
sceneCount.pop('Guest Starring')
sceneCount.pop('Starring')

sceneCount.update({'George': sceneCount['George'] + sceneCount['George, Sr.']}) # George's lines are sometimes attributed to 'George, Sr.' so we need to combine these two results in the dictionary
sceneCount.pop('George, Sr.') # now remove because it's redundant 

119

## Convert resulting dictionary into dataframe; extract top characters by count into list

In [14]:
characterSceneCountsDF = pd.DataFrame({'character':list(sceneCount.keys()), 'sceneCount':list(sceneCount.values())}).sort_values('sceneCount',ascending=False).reset_index()
del characterSceneCountsDF['index']

In [41]:
mainCharacters = list(characterSceneCountsDF['character'][0:10])

In [18]:
characterSceneCountsDF.head(10)

Unnamed: 0,character,sceneCount
0,Michael,541
1,G.O.B.,297
2,Lindsay,258
3,Lucille,239
4,George Michael,221
5,Tobias,203
6,George,182
7,Buster,170
8,Maeby,158
9,Oscar,53


## Function to scrape episode transcript, count pairs between characters, return df

In [38]:
def countPairsInEpisode(transcriptLink):
    # web scrape entire transcript, grab all <p> (lines of dialogue) and <h3>, <h2>, <d1> (denotes new scene) tags
    html = requests.get(transcriptLink)
    html = html.text
    bs = BeautifulSoup(html, "lxml")
    script = bs.findAll(['p','h3', 'h2', 'dl'])

    # get rid of title, credits, cast data
    for num in range(0,len(script)):
        if script[num].text == 'Act 1': # this should be the start of the script
            script = script[num+1:]
            break
    
    allScenes = []
    oneScene = []

    for tag in script:
        # span tags represent official new scenes/acts in the script
        # dd tags represent flashbacks
        # i tags with 'cut' in the text represent cuts / cutaways
        # all three cases are interpreted as new scenes
        if (tag.find('span') is not None) | (tag.find('dd') is not None) | ((tag.find('i') is not None) & ('cut' in tag.text.lower())): #  represents new scene
            allScenes.append(oneScene) # append current scene to allScenes list
            oneScene = [] # next scene restarts with empty list
        elif tag.find('b') is not None: # represents dialogue. The bold text specifically is the speaker:
            oneScene.append(tag.find('b').text.strip(" ").strip(":")) # add character speaking the dialogue to list 
                  
    allScenes = [x for x in allScenes if x != []] # remove all error, empty scenes

    # remove all character who aren't in mainCharacters list defined above
    for num in range(0,len(allScenes)):
        allScenes[num] = ['George' if x == 'George, Sr.' else x for x in allScenes[num]] # change all instances of 'George, Sr' to 'George'
        allScenes[num] = [x for x in allScenes[num] if x in mainCharacters]

    comboDictionary = {}

    for scene in allScenes:
        combos = list(set(itertools.combinations(scene, 2))) # finds all possible combinations of any two main characters in a scene together

        combos.sort()

        for combo in combos:
            if combo[0] != combo[1]: # because 'Michael and Michael' is technically a possible combo. Omit these. 
                combo = list(combo) # convert into a list, so we can perform the next step
                combo.sort() # sort, so we can avoid measuring both 'Michael and Lindsay' AND 'Lindsay and Michael'
                combo = str(combo) # convert to string so we can add to dictionary
                if combo in comboDictionary: 
                    comboDictionary[combo] = comboDictionary[combo] + 1 # if already in dictionary, increment count value by 1
                else:
                    comboDictionary[combo] = 1 # if not in dictionary already, add with count value of 1
                    
    df = pd.DataFrame([comboDictionary]) # flat dataframe
    df['episode'] = transcriptLink
    return df

    # not flat
    # data = pd.DataFrame({'pair':list(comboDictionary.keys()), 'sceneCount':list(comboDictionary.values())}).sort_values('sceneCount',ascending=False)

## Calculate # of common scenes between all unique character pairs for each episode

In [42]:
pairsAllEpisodes = countPairsInEpisode("https://arresteddevelopment.fandom.com/wiki/Transcript_of_Pilot")

linksNoPilot = links[1:]

for link in linksNoPilot: # iterate over each episode link
    data = countPairsInEpisode("https://arresteddevelopment.fandom.com" + link.find('a')['href'])
    pairsAllEpisodes = pd.concat([pairsAllEpisodes, data], sort = True)
    
pairsAllEpisodes = pairsAllEpisodes.fillna(0)
pairsAllEpisodes = pairsAllEpisodes.reset_index()
del pairsAllEpisodes['index']

In [43]:
pairsAllEpisodes.head()

Unnamed: 0,"['Buster', 'G.O.B.']","['Buster', 'George Michael']","['Buster', 'George']","['Buster', 'Lindsay']","['Buster', 'Lucille']","['Buster', 'Maeby']","['Buster', 'Michael']","['Buster', 'Oscar']","['Buster', 'Tobias']","['G.O.B.', 'George Michael']",...,"['Lucille', 'Michael']","['Lucille', 'Oscar']","['Lucille', 'Tobias']","['Maeby', 'Michael']","['Maeby', 'Oscar']","['Maeby', 'Tobias']","['Michael', 'Oscar']","['Michael', 'Tobias']","['Oscar', 'Tobias']",episode
0,8.0,4.0,2.0,7.0,9.0,2.0,8.0,0.0,6.0,7.0,...,14.0,0.0,10.0,2.0,0.0,2.0,0.0,11.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,7.0,0.0,2.0,6.0,0.0,3.0,0.0,7.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...
2,5.0,0.0,1.0,5.0,5.0,0.0,16.0,0.0,1.0,1.0,...,4.0,0.0,0.0,3.0,0.0,8.0,0.0,6.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...
3,2.0,0.0,0.0,1.0,6.0,0.0,4.0,0.0,0.0,0.0,...,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...
4,0.0,1.0,0.0,1.0,4.0,0.0,7.0,0.0,0.0,6.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...


In [44]:
pairsAllEpisodes.to_csv("C:\\Users\\glol7001\\Documents\\ArrestedDevelopmentPairs.csv", index=False)

## Scrape ratings from IMDb

In [46]:
html = requests.get("https://www.imdb.com/title/tt0367279/episodes?season=1")
html = html.text
bs = BeautifulSoup(html, "lxml")
episodes1 = bs.find("div", {'class':'list detail eplist'}).findAll('div',{'class':'info'})

html = requests.get("https://www.imdb.com/title/tt0367279/episodes?season=2")
html = html.text
bs = BeautifulSoup(html, "lxml")
episodes2 = bs.find("div", {'class':'list detail eplist'}).findAll('div',{'class':'info'})

html = requests.get("https://www.imdb.com/title/tt0367279/episodes?season=3")
html = html.text
bs = BeautifulSoup(html, "lxml")
episodes3 = bs.find("div", {'class':'list detail eplist'}).findAll('div',{'class':'info'})

html = requests.get("https://www.imdb.com/title/tt0367279/episodes?season=4")
html = html.text
bs = BeautifulSoup(html, "lxml")
episodes4 = bs.find("div", {'class':'list detail eplist'}).findAll('div',{'class':'info'})

episodes = episodes1 + episodes2 + episodes3 + episodes4

In [47]:
title = []
rating = []
numVotes = []

for episode in episodes:
    title.append(episode.find('a',{'itemprop':'name'}).text)
    rating.append(episode.find('span', {'class':'ipl-rating-star__rating'}).text)
    numVotes.append(episode.find('span', {'class':'ipl-rating-star__total-votes'}).text.strip("()"))
    
ratingData = pd.DataFrame({'title':title,'rating':rating,'numVotes':numVotes})

## Merge dataframes

In [48]:
pairsAllEpisodes = pairsAllEpisodes.reset_index()
ratingData = ratingData.reset_index()

df = pd.merge(left=pairsAllEpisodes, right=ratingData, on='index')
del df['index']

## Rename columns so something cleaner, for convenience in Javascript

In [51]:
newColumns = list(df.columns)

for index in range(0,len(newColumns)):
    newColumns[index] = newColumns[index].strip("['']").replace("', '","+").replace(" ","_")
    
df.columns = newColumns

In [57]:
df.head(1)

Unnamed: 0,Buster+G.O.B.,Buster+George_Michael,Buster+George,Buster+Lindsay,Buster+Lucille,Buster+Maeby,Buster+Michael,Buster+Oscar,Buster+Tobias,G.O.B.+George_Michael,...,Maeby+Michael,Maeby+Oscar,Maeby+Tobias,Michael+Oscar,Michael+Tobias,Oscar+Tobias,episode,title,rating,numVotes
0,8.0,4.0,2.0,7.0,9.0,2.0,8.0,0.0,6.0,7.0,...,2.0,0.0,2.0,0.0,11.0,0.0,https://arresteddevelopment.fandom.com/wiki/Tr...,Pilot,8.2,2804.0


In [55]:
df.to_csv("C:\\Users\\glol7001\\Documents\\ArrestedDevelopmentPairsAndRatingData.csv", index=False)