# IMDb Data Merging & Manipulation

Author: Oliver Gladfelter

Date: 4/8/18

Objective: Merge multiple IMDb data tables to collect information - including the names of the writing staff - for each episode of the top US comedies of the 21st century

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
def _readTsv(path):
    """Load one tab-separated IMDb export into a DataFrame."""
    return pd.read_csv(path, sep = '\t')

# Title basics: title ID, titleType, startYear, endYear, genres
basicInfo = _readTsv('basicTitle.tsv')

# Ratings info
ratings = _readTsv('ratings.tsv')

# TV episode info: title ID, parent ID, seasonNumber, episodeNumber
episodeInfo = _readTsv('data.tsv')

# Crew: the title ID and the writer ID
crew = _readTsv('data_1.tsv')

# People: writer ID and writer name
names = _readTsv('data_2.tsv')

In [3]:
def convertToInt(value):
    """
    Convert a raw cell value to an integer.

    Returns None when the value is the literal placeholder string "\\N";
    any other value is passed to int().
    """
    if value == "\\N":
        return None
    return int(value)

# Run every startYear value through convertToInt, element by element
basicInfo['startYear'] = basicInfo['startYear'].map(convertToInt)

In [8]:
def comedyDummy(value):
    """
    Return 1 when the text form of value contains 'Comedy' (case-sensitive),
    otherwise 0.
    """
    return int('Comedy' in str(value))
    
# 1/0 indicator column: does the title's genres string mention 'Comedy'?
basicInfo['comedy'] = basicInfo['genres'].map(comedyDummy)

In [41]:
# Keep only rows that are TV episodes, flagged as comedy, and released 1987 or later.
# Each condition is built as its own boolean mask and the three are combined at the end.
isTvEpisode = basicInfo['titleType'] == 'tvEpisode'
releasedAfter1986 = basicInfo['startYear'] > 1986
isComedy = basicInfo['comedy'] == 1

episodes = basicInfo[isTvEpisode & releasedAfter1986 & isComedy]

In [11]:
episodes.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,comedy
88839,tt0090780,tvEpisode,Bulls of a Feather,Bulls of a Feather,0,1992.0,\N,\N,"Animation,Comedy,Family",1
93622,tt0095671,tvEpisode,La mujer fría,La mujer fría,0,1990.0,\N,\N,"Comedy,Drama",1
93623,tt0095672,tvEpisode,La mujer infiel,La mujer infiel,0,1990.0,\N,57,"Comedy,Drama",1
93624,tt0095673,tvEpisode,La mujer oriental,La mujer oriental,0,1990.0,\N,\N,"Comedy,Drama",1
93625,tt0095674,tvEpisode,La mujer perdida,La mujer perdida,0,1990.0,\N,\N,"Comedy,Drama",1


In [42]:
# Remove these five columns from the episode subset
columnsToDrop = ['isAdult', 'endYear', 'genres', 'comedy', 'titleType']
episodes = episodes.drop(columns = columnsToDrop)

In [43]:
# Merge with ratings, then with episodeInfo.
# The join key is stated explicitly: without `on`, pandas joins on every column
# name the two frames happen to share, so an unrelated shared column would
# silently change the result. Both joins are inner joins, so an episode
# survives only if it has a row in ratings AND a row in episodeInfo.
episodesRatings = episodes.merge(ratings, on = 'tconst', how = 'inner')

episodesRatingsSeries = episodesRatings.merge(episodeInfo, on = 'tconst', how = 'inner')

In [149]:
episodesRatingsSeries.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber
0,tt0087632,The Little Mermaid,The Little Mermaid,1987.0,47,7.1,110,tt0199214,6,2
1,tt0090607,All My Sons,All My Sons,1987.0,\N,7.4,163,tt0176357,6,1
2,tt0090732,The Birthday Party,The Birthday Party,1987.0,110,7.7,50,tt0820333,2,6
3,tt0091111,Ghosts,Ghosts,1987.0,104,7.8,95,tt0820333,2,5
4,tt0091821,Pilot,Pilot,1987.0,96,7.7,273,tt0092434,1,0


### Subset to the top 10 comedy series, based on this IMDb list: 

https://www.imdb.com/search/title?genres=comedy&title_type=tv_series,mini_series&num_votes=5000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=2295992002&pf_rd_r=0TNGC2T28F4V8BM518BR&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=toptv&sort=num_votes,desc&ref_=chttvtp_gnr_5

In [278]:
# parentTconst IDs of the selected top comedy series (eleven IDs are listed)
_TOP_COMEDY_TCONSTS = frozenset((
    'tt2861424', 'tt0898266', 'tt0108778', 'tt0460649', 'tt0096697', 'tt1632701',
    'tt1442437', 'tt0121955', 'tt0182576', 'tt0386676', 'tt0367279',
))

def topComedyDummy(value):
    """
    Return 1 if the given parentTconst is exactly one of the selected top
    comedy series' IDs, otherwise return 0.

    The value is converted with str() before the lookup. Membership is an
    exact match against a set of IDs: testing against one long string would
    also accept the empty string and any fragment of an ID (e.g. 'tt0').
    """
    return 1 if str(value) in _TOP_COMEDY_TCONSTS else 0
    
# Flag each episode whose parentTconst belongs to one of the selected series
episodesRatingsSeries['topComedy'] = episodesRatingsSeries['parentTconst'].apply(topComedyDummy)

# Select all episodes whose parentTconst was found in the list of top comedies.
# .copy() makes topComediesData an independent frame, so the column changes made
# to it afterwards are not writes to a slice of episodesRatingsSeries.
topComediesData = episodesRatingsSeries[episodesRatingsSeries['topComedy'] == 1].copy()

# The flag has served its purpose and originalTitle is not kept
topComediesData = topComediesData.drop(columns = ['originalTitle', 'topComedy'])

In [279]:
# parentTconst -> series name
_SERIES_TITLES = {
    'tt2861424': 'Rick and Morty',
    'tt0898266': 'The Big Bang Theory',
    'tt0108778': 'Friends',
    'tt0460649': 'How I Met Your Mother',
    'tt0096697': 'The Simpsons',
    'tt1632701': 'Suits',
    'tt1442437': 'Modern Family',
    'tt0121955': 'South Park',
    'tt0182576': 'Family Guy',
    'tt0386676': 'The Office',
    'tt0367279': 'Arrested Development',
}

def seriesTitle(value):
    """
    Look up the TV series name for a parentTconst.

    Returns the series name for a known ID and 0 for anything else.
    """
    return _SERIES_TITLES.get(value, 0)

# Name the parent series of every episode (seriesTitle yields 0 for an unknown ID)
topComediesData['series'] = topComediesData['parentTconst'].map(seriesTitle)

In [280]:
topComediesData.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,series
139,tt0177842,Treehouse of Horror VI,1995.0,30,8.5,1654,tt0096697,7,6,The Simpsons
254,tt0348034,Simpsons Roasting on an Open Fire,1989.0,30,8.2,4344,tt0096697,1,1,The Simpsons
357,tt0394893,Cartman Gets an Anal Probe,1997.0,22,8.2,2593,tt0121955,1,1,South Park
422,tt0458217,The Passion of the Jew,2004.0,22,8.7,2003,tt0121955,8,3,South Park
2009,tt0515207,Afternoon Delight,2004.0,30,9.1,1603,tt0367279,2,6,Arrested Development


### Add a 'total episode count' variable, giving each episode's overall number within its series

In [281]:
# cast the episodeNumber and seasonNumber columns from strings to integers
for numberColumn in ('episodeNumber', 'seasonNumber'):
    topComediesData[numberColumn] = topComediesData[numberColumn].astype(int)

# order the rows by series, then season, then episode, and renumber them from 0
topComediesOrdered = (
    topComediesData
    .sort_values(by = ['series', 'seasonNumber', 'episodeNumber'])
    .reset_index(drop = True)
)

topComediesOrdered.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,series
0,tt0515236,Pilot,2003.0,21,8.3,2290,tt0367279,1,1,Arrested Development
1,tt0515256,Top Banana,2003.0,22,8.6,1986,tt0367279,1,2,Arrested Development
2,tt0515212,Bringing Up Buster,2003.0,22,8.2,1826,tt0367279,1,3,Arrested Development
3,tt0515223,Key Decisions,2003.0,21,8.5,1739,tt0367279,1,4,Arrested Development
4,tt0515214,Charity Drive,2003.0,21,8.4,1647,tt0367279,1,5,Arrested Development


In [288]:
# Overall episode number within each series: the first row of a series gets 1,
# the next row 2, and so on, restarting at 1 whenever the series changes.
#
# cumcount() numbers the rows of each 'series' group from 0 in their current row
# order, so this relies on topComediesOrdered already being sorted by series,
# season and episode. Every series starts at 1, including a series whose first
# listed episode is not numbered 1. The assignment writes a whole column at once,
# avoiding element-by-element chained assignment (df[col][i] = ...), which is not
# guaranteed to modify the frame.
topComediesOrdered['totalNum'] = topComediesOrdered.groupby('series').cumcount() + 1

topComediesOrdered.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,series,totalNum
0,tt0515236,Pilot,2003.0,21,8.3,2290,tt0367279,1,1,Arrested Development,1
1,tt0515256,Top Banana,2003.0,22,8.6,1986,tt0367279,1,2,Arrested Development,2
2,tt0515212,Bringing Up Buster,2003.0,22,8.2,1826,tt0367279,1,3,Arrested Development,3
3,tt0515223,Key Decisions,2003.0,21,8.5,1739,tt0367279,1,4,Arrested Development,4
4,tt0515214,Charity Drive,2003.0,21,8.4,1647,tt0367279,1,5,Arrested Development,5


In [211]:
# Write the ordered episode table to a CSV file
orderedOutputPath = "C:\\Users\\Oliver\\Documents\\imbd episodes\\top10Comedies-4-23-18.csv"
topComediesOrdered.to_csv(orderedOutputPath)

### Retrieve and add each episode's writing staff members

In [243]:
# Attach each episode's crew row (directors, writers). The join key is explicit
# so the merge cannot silently pick up any other column name the two frames
# might share; an inner join keeps only episodes that have a crew row.
episodesCrew = topComediesOrdered.merge(crew, on = 'tconst', how = 'inner')

episodesCrew.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,series,episodeNumberInt,seasonNumberInt,totalNum,directors,writers
0,tt0515236,Pilot,2003.0,21,8.3,2290,tt0367279,Arrested Development,1,1,1,"nm0751577,nm0751648",nm0403804
1,tt0515256,Top Banana,2003.0,22,8.6,1986,tt0367279,Arrested Development,2,1,2,nm0751577,"nm0403804,nm0505276,nm1545076"
2,tt0515212,Bringing Up Buster,2003.0,22,8.2,1826,tt0367279,Arrested Development,3,1,3,nm0751648,"nm0403804,nm0742711,nm1545076"
3,tt0515223,Key Decisions,2003.0,21,8.5,1739,tt0367279,Arrested Development,4,1,4,nm0751577,"nm0403804,nm0178589,nm1545076"
4,tt0515214,Charity Drive,2003.0,21,8.4,1647,tt0367279,Arrested Development,5,1,5,nm0609549,"nm0403804,nm0271003,nm1545076"


In [234]:
# The crew table stores all writer IDs of a project as one comma-separated string,
# so a merge with the names table would only succeed for projects with exactly
# one writer. Reshape the data into one (tconst, nconst) row per writer; projects
# whose writers field is the placeholder \N contribute no rows.

writingStaff = [
    [titleId, writerId]
    for titleId, writerIds in zip(episodesCrew['tconst'], episodesCrew['writers'])
    if writerIds != r'\N'
    for writerId in writerIds.split(',')
]

writingStaffDF = pd.DataFrame(writingStaff, columns = ['tconst', 'nconst'])

writingStaffDF.head()

Unnamed: 0,tconst,nconst
0,tt0515236,nm0403804
1,tt0515256,nm0403804
2,tt0515256,nm0505276
3,tt0515256,nm1545076
4,tt0515212,nm0403804


In [271]:
# Merge with the names table to add the writer names associated with writer IDs.
# The join key is explicit; an inner join drops any writer ID that has no row
# in the names table.
episodeWriters = writingStaffDF.merge(names, on = 'nconst', how = 'inner')

episodeWriters.head()

Unnamed: 0,tconst,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0515236,nm0403804,Mitchell Hurwitz,\N,\N,"producer,writer,miscellaneous","tt4789300,tt0320052,tt0106039,tt0367279"
1,tt0515256,nm0403804,Mitchell Hurwitz,\N,\N,"producer,writer,miscellaneous","tt4789300,tt0320052,tt0106039,tt0367279"
2,tt0515212,nm0403804,Mitchell Hurwitz,\N,\N,"producer,writer,miscellaneous","tt4789300,tt0320052,tt0106039,tt0367279"
3,tt0515223,nm0403804,Mitchell Hurwitz,\N,\N,"producer,writer,miscellaneous","tt4789300,tt0320052,tt0106039,tt0367279"
4,tt0515214,nm0403804,Mitchell Hurwitz,\N,\N,"producer,writer,miscellaneous","tt4789300,tt0320052,tt0106039,tt0367279"


In [275]:
# to put all the writers back in one row, so that one episode = one row,
# we need to first sort the dataframe by episode ID so that all rows of an
# episode sit next to each other
episodeWriters = episodeWriters.sort_values(by='tconst')

# renumber the sorted rows 0..n-1; the reset must be applied to the sorted frame
# itself (drop=True discards the old index instead of adding it as a column)
episodeWriters = episodeWriters.reset_index(drop=True)

# keep only the episode ID and the writer's name
episodeWriters = episodeWriters.drop(columns=['nconst', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'])

episodeWriters.head()

Unnamed: 0,tconst,primaryName
0,tt0177842,Dan Greaney
1,tt0177842,Sam Simon
2,tt0177842,David X. Cohen
3,tt0177842,Steve Tompkins
4,tt0177842,James L. Brooks


In [None]:
# Add a 'writers' column holding, for every row, the full comma-separated list of
# writers of that row's episode, in the order the rows appear in the frame.
#
# groupby + transform computes the joined string once per episode and writes it to
# every row of that episode, including the very last row of the frame. It never
# indexes past the end of the frame, needs no element-by-element assignment, and
# does not rebind the name `names`, which holds the names table.
episodeWriters['writers'] = (
    episodeWriters
    .groupby('tconst')['primaryName']
    .transform(lambda writerNames: ', '.join(writerNames))
)

In [266]:
# Collapse to one row per episode: keep the first row of every tconst (the row
# expected to carry the full writing staff) and then discard the per-writer
# primaryName column.
episodeWriters = (
    episodeWriters
    .drop_duplicates(subset = 'tconst', keep = 'first')
    .drop(columns = 'primaryName')
)

episodeWriters.head()

Unnamed: 0,tconst,writers
0,tt0177842,"Dan Greaney, Sam Simon, David X. Cohen, Steve ..."
7,tt0348034,"Mimi Pond, Sam Simon, Matt Groening, James L. ..."
11,tt0394893,"Trey Parker, Dan Sterling, Matt Stone, Brian G..."
15,tt0458217,"Brian Graden, Trey Parker, Matt Stone"
18,tt0515207,"Mitchell Hurwitz, Abraham Higginbotham, Chuck ..."


In [269]:
# attach each episode's writing staff to our main dataframe, topComediesOrdered.
# The join key is explicit. NOTE: this is an inner join, so an episode with no
# row in episodeWriters is dropped from the result.
topComediesDataWithWriters = topComediesOrdered.merge(episodeWriters, on = 'tconst', how = 'inner')

topComediesDataWithWriters.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,parentTconst,series,episodeNumberInt,seasonNumberInt,totalNum,writers
0,tt0515236,Pilot,2003.0,21,8.3,2290,tt0367279,Arrested Development,1,1,1,Mitchell Hurwitz
1,tt0515256,Top Banana,2003.0,22,8.6,1986,tt0367279,Arrested Development,2,1,2,"Abraham Higginbotham, Mitchell Hurwitz, John L..."
2,tt0515212,Bringing Up Buster,2003.0,22,8.2,1826,tt0367279,Arrested Development,3,1,3,"Richard Rosenstock, Mitchell Hurwitz, Abraham ..."
3,tt0515223,Key Decisions,2003.0,21,8.5,1739,tt0367279,Arrested Development,4,1,4,"Brad Copeland, Mitchell Hurwitz, Abraham Higgi..."
4,tt0515214,Charity Drive,2003.0,21,8.4,1647,tt0367279,Arrested Development,5,1,5,"Barbie Adler, Mitchell Hurwitz, Abraham Higgin..."


In [270]:
# Write the final episode table, including the writers column, to a CSV file
writersOutputPath = "C:\\Users\\Oliver\\Documents\\imbd episodes\\top10ComediesWithWriters.csv"
topComediesDataWithWriters.to_csv(writersOutputPath)