# IMDb Web Scraping, Data Merging, & Munging

Author: Oliver Gladfelter

Date: 4/8/18 (updated 6/6/2018)

Objective: Compile a data set with information about every episode of the top 100 comedy series, including the names of each episode's writing staff

In [102]:
import pandas as pd
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the four IMDb tab-separated tables used throughout this notebook.
# NOTE: missing values show up as the literal string '\N' in the previews below,
# and later cells compare against that marker explicitly.

# Title-level info: title ID (tconst), titleType, startYear, endYear, genres
basicInfo = pd.read_csv('basicTitle.tsv', sep = '\t')

# TV episode info: title ID (tconst), parent series ID (parentTconst), seasonNumber, episodeNumber
episodeInfo = pd.read_csv('title.episode.tsv', sep = '\t')

# Crew per title: title ID plus the 'directors' and 'writers' ID strings
crew = pd.read_csv('title.crew.tsv', sep = '\t')

# People: writer ID (nconst) and writer name (primaryName), among other columns
names = pd.read_csv('name.basics.tsv', sep = '\t')

In [104]:
# Note: this cell subsets the data set as early as possible,
# to make the first merge easier on my CPU. With a faster processor it can be skipped,
# since the data set is subset to the top 100 comedies in a later step anyway

def comedyDummy(value):
    """
    Flag a genres value: returns 1 when its string form contains 'Comedy', 0 otherwise
    """
    return int('Comedy' in str(value))
    
# tag every title with a 0/1 comedy flag
basicInfo['comedy'] = basicInfo['genres'].apply(comedyDummy)

# keep only the rows that are both tv episodes and flagged as comedy
isComedyEpisode = (basicInfo['titleType'] == 'tvEpisode') & (basicInfo['comedy'] == 1)
episodes = basicInfo[isComedyEpisode]

In [5]:
# Preview the first rows of the comedy tv-episode subset
episodes.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,comedy
46971,tt0047810,tvEpisode,All My Sons,All My Sons,0,1958,\N,\N,"Comedy,Drama",1
47019,tt0047858,tvEpisode,The Barretts of Wimpole Street,The Barretts of Wimpole Street,0,1956,\N,90,"Comedy,Drama,Family",1
47456,tt0048302,tvEpisode,Life of Vernon Hathaway,Life of Vernon Hathaway,0,1955,\N,30,"Comedy,Drama,Romance",1
47711,tt0048562,tvEpisode,Rookie of the Year,Rookie of the Year,0,1955,\N,29,"Comedy,Drama,Romance",1
48712,tt0049572,tvEpisode,One,One,0,1956,\N,90,"Comedy,Drama",1


In [105]:
# discard the columns that are not needed once the comedy / tvEpisode filter has been applied
episodes = episodes.drop(columns = ['isAdult', 'endYear', 'genres', 'comedy', 'titleType', 'runtimeMinutes'])

# attach each episode's parent series ID, season number and episode number
# (no key is given, so the merge joins on the column names the two tables share)
episodesAndEpisodeInfo = episodes.merge(episodeInfo)

episodesAndEpisodeInfo.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,parentTconst,seasonNumber,episodeNumber
0,tt0047810,All My Sons,All My Sons,1958,tt0914702,3,36
1,tt0047858,The Barretts of Wimpole Street,The Barretts of Wimpole Street,1956,tt0046637,2,9
2,tt0048302,Life of Vernon Hathaway,Life of Vernon Hathaway,1955,tt0047768,1,6
3,tt0048562,Rookie of the Year,Rookie of the Year,1955,tt0047768,1,10
4,tt0049572,One,One,1956,tt0914702,1,30


## Webscrape IMDb to retrieve the IDs of the top 100 comedy series, then subset the dataset to include only episodes from those series

In [134]:
def webscrapeComedies(url):
    """
    Webscrapes a given 'top IMDb comedies' page and returns a list of the IDs to each comedy

    url: address of the IMDb search-results page to scrape
    Returns a list of title ID strings ('tt' followed by digits), in the order they appear on the page
    Raises RuntimeError if the page does not come back with HTTP status 200
    """

    r = requests.get(url)

    # Stop with a clear message on a failed request; carrying on would only fail later
    # because there is no page content to parse
    if r.status_code != 200:
        raise RuntimeError("Error: received status " + str(r.status_code) + " from " + url)

    # Name the parser explicitly so the result does not depend on which parsers
    # BeautifulSoup happens to find installed (lxml is already imported by this notebook)
    soup = BeautifulSoup(r.content, "lxml")

    # all <h3 class="lister-item-header"> tags, one per series listed on the page
    headers = soup.find_all("h3", {"class": "lister-item-header"})

    # for each header, access the link containing the series ID, then use regEx to extract the ID
    idsList = []
    for header in headers:
        anchor = header.find('a')
        if anchor is None:
            continue

        # '{7,}' rather than '{7}': title IDs with more than seven digits must not be truncated
        match = re.search(r'tt\d{7,}', anchor.get('href', ''))
        if match:
            idsList.append(match.group(0))

    return idsList

# Scrape the first two result pages of the IMDb search for comedy tv series / mini series
# with at least 5,000 votes, sorted by number of votes in descending order
idsPage1 = webscrapeComedies("https://www.imdb.com/search/title?genres=comedy&title_type=tv_series,mini_series&num_votes=5000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2295992002&pf_rd_r=0TNGC2T28F4V8BM518BR&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=toptv&sort=num_votes,desc&ref_=chttvtp_gnr_5")
idsPage2 = webscrapeComedies("https://www.imdb.com/search/title?genres=comedy&title_type=tv_series,mini_series&num_votes=5000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2295992002&pf_rd_r=0TNGC2T28F4V8BM518BR&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=toptv&sort=num_votes,desc&page=2&ref_=adv_nxt")

# A list of the IDs of the top comedies, page 1 followed by page 2
# (presumably 50 results per page, i.e. 100 IDs in total -- verify with len(ids))
ids = idsPage1 + idsPage2

In [135]:
# Manually append further comedies that weren't web scraped
# NOTE(review): this comment originally said "the next 8 comedies", but the list below
# holds 7 IDs -- confirm whether one ID is missing
ids = ids + ['tt4742876', 'tt0096548', 'tt0083399', 'tt1663676', 'tt1480684', 'tt0118375', 'tt1225901']

In [136]:
def topComedyDummy(value):
    """
    Flag an episode's parentTconst: 1 when it appears in the 'ids' list of top comedies, 0 when it does not
    """
    return 1 if str(value) in ids else 0
    
# Flag every episode whose parent series is in the list of top comedies
# (no placeholder column is needed first; this assignment creates the column)
episodesAndEpisodeInfo['topComedy'] = episodesAndEpisodeInfo['parentTconst'].apply(topComedyDummy)

# Select all episodes whose parentTconst was found in the list of top comedies
topComediesData = episodesAndEpisodeInfo[(episodesAndEpisodeInfo['topComedy'] == 1)]

# Remove the helper columns with drop(), which returns a new DataFrame, instead of
# deleting columns in place on a filtered selection of episodesAndEpisodeInfo
topComediesData = topComediesData.drop(['originalTitle', 'topComedy'], axis = 1)

In [138]:
# Preview the first rows of the top-comedy episodes
topComediesData.head()

Unnamed: 0,tconst,primaryTitle,startYear,parentTconst,seasonNumber,episodeNumber
260,tt0098286,"Good News, Bad News",1989,tt0098904,1,0
441,tt0177842,Treehouse of Horror VI,1995,tt0096697,7,6
555,tt0213826,"Goodbye, Farewell, and Amen",1983,tt0068098,11,16
620,tt0238966,Enemies,1996,tt0092400,10,23
769,tt0291751,The Best Bits of Mr. Bean,1995,tt0096657,1,15


## Attribute series' names by web scraping the IMDb webpage of each unique series ID

In [118]:
# Dictionary mapping each series ID (parentTconst) to the series name
idAndName = {}

# Webscrapes the IMDb page of every series ID and records the series name found there
for show in ids:

    url = 'https://www.imdb.com/title/' + str(show)

    r = requests.get(url)

    # Stop on a failed request. Carrying on would either fail on an undefined page or,
    # worse, reuse the previous show's page and store the wrong name for this ID
    if r.status_code != 200:
        raise RuntimeError("Error: received status " + str(r.status_code) + " from " + url)

    root = lh.parse(io.BytesIO(r.content)).getroot()

    title = root.xpath('//h1//text()')

    if not title:
        raise RuntimeError("Error: no <h1> title text found at " + url)

    # Parent ID is added to dictionary as key, corresponds to series name
    # NOTE(review): the slice drops the last 13 characters of the heading text, presumably
    # a trailing year-range suffix -- confirm this still matches IMDb's page layout
    idAndName[show] = title[0][0:-13]
    
def attributeName(value):
    """
    Look up a series ID in the idAndName dictionary and hand back the series name stored for it
    (an ID that is not in the dictionary raises KeyError)
    """
    seriesName = idAndName[value]
    return seriesName

# topComediesDataWithName is a second name for topComediesData, not a copy,
# so the new column is visible through both names
topComediesDataWithName = topComediesData

# translate each episode's parent series ID into the series name
topComediesDataWithName['series'] = topComediesDataWithName['parentTconst'].apply(attributeName)

In [119]:
# Preview the first rows, now including the 'series' name column
topComediesDataWithName.head()

Unnamed: 0,tconst,primaryTitle,startYear,parentTconst,seasonNumber,episodeNumber,series
260,tt0098286,"Good News, Bad News",1989,tt0098904,1,0,Seinfeld
441,tt0177842,Treehouse of Horror VI,1995,tt0096697,7,6,The Simpsons
555,tt0213826,"Goodbye, Farewell, and Amen",1983,tt0068098,11,16,M*A*S*H
620,tt0238966,Enemies,1996,tt0092400,10,23,Married with Children
769,tt0291751,The Best Bits of Mr. Bean,1995,tt0096657,1,15,Mr. Bean


## Add a 'total episode count' variable, measuring each episode's overall position within its series

In [65]:
def convertToInt(value):
    """
    Cast a value to int, unless it is the '\\N' missing-value marker, in which case None is returned
    """
    if value == '\\N':
        return None
    return int(value)

# convert seasonNumber and episodeNumber column values from strings to integers
# (convertToInt returns None for the '\N' missing-value marker)
topComediesDataWithName['episodeNumber'] = topComediesDataWithName['episodeNumber'].apply(convertToInt)
topComediesDataWithName['seasonNumber'] = topComediesDataWithName['seasonNumber'].apply(convertToInt)

# order the episodes by series, then season, then episode, and renumber the rows from 0
topComediesOrdered = topComediesDataWithName.sort_values(by = ['series', 'seasonNumber', 'episodeNumber'])
topComediesOrdered = topComediesOrdered.reset_index(drop = True)

# totalNum is the running episode count within each series: 1 for the first row of a
# series, incremented by one for every following row of the same series.
# cumcount() numbers the rows of each group from 0 in their current (sorted) order.
# It replaces a row-by-row loop that wrote through topComediesOrdered['totalNum'][i],
# a chained assignment that pandas does not guarantee will modify the DataFrame
topComediesOrdered['totalNum'] = topComediesOrdered.groupby('series').cumcount() + 1

topComediesOrdered.head()

Unnamed: 0,tconst,primaryTitle,startYear,parentTconst,seasonNumber,episodeNumber,series,totalNum
0,tt1228840,We're Not in Kansas Anymore,2008,tt1225901,1,1,90210,1
1,tt1280050,The Jet Set,2008,tt1225901,1,2,90210,2
2,tt1280051,Lucky Strike,2008,tt1225901,1,3,90210,3
3,tt1280052,The Bubble,2008,tt1225901,1,4,90210,4
4,tt1280053,Wide Awake and Dreaming,2008,tt1225901,1,5,90210,5


## Retrieve and add each episode's writing staff members

In [128]:
# join the crew table onto the ordered episodes and discard the directors column,
# keeping the 'writers' ID string for each episode
episodesCrew = topComediesOrdered.merge(crew).drop(columns = ['directors'])

episodesCrew.head()

Unnamed: 0,tconst,primaryTitle,startYear,parentTconst,seasonNumber,episodeNumber,writers
0,tt0552605,David Cross & Bob Odenkirk,1996,tt0115147,1,49,"nm0112944,nm0247243,nm0369870,nm0416853,nm0426..."
1,tt0552606,James Doohan,1996,tt0115147,1,52,"nm0112944,nm0247243,nm0369870,nm0416853,nm0426..."
2,tt0552607,Scott Thompson,1998,tt0115147,2,153,"nm0112944,nm0170306,nm0247243,nm0369870,nm0416..."
3,tt0552608,David Cross & Bob Odenkirk,1998,tt0115147,3,49,"nm0112944,nm0170306,nm0247243,nm0329634,nm0334..."
4,tt0552609,Todd Rundgren,1998,tt0115147,3,64,"nm0112944,nm0170306,nm0247243,nm0329634,nm0334..."


In [129]:
# The crew table stores all writer IDs of a project together in one comma-separated
# string, so a merge with the names table would only succeed for projects with a
# single writer. Reshape it here into one [episode ID, writer ID] pair per writer.

writingStaff = []

for episodeId, writerIds in zip(episodesCrew['tconst'], episodesCrew['writers']):

    # episodes with no writers listed contribute no rows
    if writerIds == r'\N':
        continue

    writingStaff.extend([episodeId, writerId] for writerId in writerIds.split(','))

writingStaffDF = pd.DataFrame(writingStaff, columns = ['tconst', 'nconst'])

writingStaffDF.head()

Unnamed: 0,tconst,nconst
0,tt0552605,nm0112944
1,tt0552605,nm0247243
2,tt0552605,nm0369870
3,tt0552605,nm0416853
4,tt0552605,nm0426351


In [None]:
# Look up every writer ID in the names table to bring in the writer's name
episodeWriters = writingStaffDF.merge(names)

# Sorting by episode ID places all writers of one episode on adjacent rows, which is
# needed before they can be collapsed back into one row per episode. The rows are then
# renumbered from 0 and the columns that are no longer needed are removed
episodeWriters = episodeWriters.sort_values(by='tconst').reset_index(drop=True)
episodeWriters = episodeWriters.drop(columns=['nconst', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'])

In [None]:
# Build the full list of writers for each episode, rather than having the writing
# staff separated over multiple rows: for every row, 'writers' holds the
# comma-separated names of all writers on that row's episode, in row order.
#
# groupby/transform handles every episode, including the last one in the table.
# A row-by-row scan that compares each row with the following rows cannot fill the
# final row and reads past the end of the table when the last episode has several
# writers. It also does not rebind the name 'names', which still refers to the
# names table loaded earlier.
episodeWriters['writers'] = episodeWriters.groupby('tconst')['primaryName'].transform(','.join)

In [91]:
# drop duplicate episodes in the data frame, always keeping the first instance of each
# repeated episode, so that one episode = one row
episodeWriters = episodeWriters.drop_duplicates(subset = 'tconst', keep = 'first')

# Manually set the writing staff of the final row. The row and column are addressed in a
# single indexing step so the value is written into episodeWriters itself; the chained
# form .iloc[-1]['writers'] = ... may assign to a temporary copy instead.
# NOTE(review): this value is hard-coded for this particular data set -- re-check it
# whenever the input data changes
episodeWriters.iloc[-1, episodeWriters.columns.get_loc('writers')] = 'Rob Thomas,Lara Olsen,Darren Star,Jeff Judah,Brian Dawson,Gabe Sachs,Allen Clary'

del episodeWriters['primaryName']

episodeWriters.tail()

Unnamed: 0,tconst,writers
518,tt2644094,"Brian Dawson,Allen Clary,Jeff Judah,Darren Sta..."
524,tt2672248,"Gabe Sachs,Allen Clary,Jeff Judah,Darren Star,..."
531,tt2724880,"Darren Star,Terrence Coli,Allen Clary,Brian Da..."
539,tt2726702,"Jeff Judah,Patricia Carr,Allen Clary,Darren St..."
546,tt2794682,"Rob Thomas,Lara Olsen,Darren Star,Jeff Judah,B..."


In [92]:
# attach each episode's writing staff to the episode table topComediesDataWithName
# NOTE(review): topComediesDataWithName has no 'totalNum' column; if the total episode
# count should be part of the result, topComediesOrdered may be the intended table -- confirm
topComediesDataWithWriters2 = pd.merge(topComediesDataWithName, episodeWriters)

topComediesDataWithWriters2.head()

Unnamed: 0,tconst,primaryTitle,startYear,parentTconst,seasonNumber,episodeNumber,series,writers
0,tt1228840,We're Not in Kansas Anymore,2008,tt1225901,1,1,90210,"Rob Thomas,Darren Star,Jeff Judah,Gabe Sachs"
1,tt1280046,Secrets and Lies,2008,tt1225901,1,9,90210,"Dailyn Rodriguez,Darren Star"
2,tt1280047,Games People Play,2008,tt1225901,1,10,90210,"Kristin Long,Darren Star"
3,tt1280048,"Hello, Goodbye, Amen",2009,tt1225901,1,12,90210,"Jennifer Cecil,Darren Star"
4,tt1280049,Love Me or Leave Me,2009,tt1225901,1,13,90210,Paul Sciarrotta


In [98]:
# Save the merged episode / writer table. The merge above stores its result in
# topComediesDataWithWriters2; no variable named topComediesDataWithWriters is defined
# in this notebook, so writing that name fails on a fresh run.
# NOTE(review): confirm this is the table that was meant to be exported
topComediesDataWithWriters2.to_csv("top100ComediesWithWriters.csv")