# Measuring Film Genres - Data Collection

## Author: Oliver Gladfelter

## Date: September 14th, 2018

In [1]:
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import re
import math
from math import pi

In [68]:
def movieInfoFromSearch(genre,numPages):
    """
    Scrape IMDb's advanced title search for the most-voted films of one genre.

    The search URL restricts results to US, English-language feature films and
    sorts them by number of votes (descending). Collection stops early at the
    first film with fewer than 50,000 votes.

    Parameters
    ----------
    genre : str
        Genre value inserted into the search URL (e.g. 'action', 'sci-fi').
    numPages : int
        How many result pages to collect movie data from. There are 50 movies
        per page, so a value of 4 collects data on up to 200 movies.

    Returns
    -------
    pandas.DataFrame
        One row per movie with columns 'title', 'ID', 'genre', 'year' and
        'numVotes' (empty if nothing was collected).
    """
    minVotes = 50000     # stop collecting once films drop below this many votes
    moviesPerPage = 50

    titles = []
    IDs = []
    genres = []
    years = []
    numVotes = []
    reachedCutoff = False

    for num in range(0,numPages):
        startNum = num * moviesPerPage + 1 # page 1 starts at movie 1, page 2 starts at movie 51, page 3 starts at movie 101, and so on
        URL = "https://www.imdb.com/search/title?title_type=feature&genres={}&countries=us&languages=en&sort=num_votes,desc&start={}&ref_=adv_prv".format(genre,startNum)

        html = requests.get(URL).text
        bs = BeautifulSoup(html, "lxml")
        searchResults = bs.findAll("div",{"class":"lister-item-content"})

        for movie in searchResults:

            # parse the vote count once; it drives both the cut-off and the stored value
            voteCount = int(movie.find("span", {"name":"nv"}).text.replace(",",""))

            # results are sorted by votes, so everything after the first
            # below-threshold film is below the threshold as well
            if voteCount < minVotes:
                reachedCutoff = True
                break

            titleLink = movie.find("a")
            titles.append(titleLink.text)
            # raw string avoids the invalid '\d' escape; '{7,}' keeps IDs with
            # more than 7 digits intact instead of truncating them
            IDs.append(re.search(r'tt\d{7,}',str(titleLink)).group(0))
            genres.append(movie.find("span", {"class":"genre"}).text.strip('\n').strip(' '))
            years.append(movie.find("span", {"class":"lister-item-year"}).text.strip("()"))
            numVotes.append(voteCount)

        if reachedCutoff:
            break

    df = pd.DataFrame({'title':titles,'ID':IDs,'genre':genres,'year':years,'numVotes':numVotes})

    return df

# Function to Retrieve IMDb's plot key words / tags for each movie

In [3]:
def getKeyWords(movieID):
    """
    Download the IMDb keywords page for ``movieID`` and return its plot tags.

    Each <div class="sodatext"> element on the page contributes one entry: its
    text with leading/trailing newline characters stripped. Returns a list of
    strings (empty when the page has no such elements).
    """
    keywordsURL = "https://www.imdb.com/title/" + movieID + '/keywords?ref_=tt_stry_kw'
    page = BeautifulSoup(requests.get(keywordsURL).text, "lxml")

    return [tag.text.strip('\n') for tag in page.findAll("div",{"class":"sodatext"})]

# Function to Use Key Words to Create a Data Frame, Sorted by Frequency. 

In [4]:
def mostCommonWords(df, genre):
    """
    Count how often each key word appears across ``df['keyWords']``.

    ``df['keyWords']`` must hold one list of words per row. The result is a
    DataFrame indexed by 'word' with a single column named ``genre + "Count"``,
    sorted from the most to the least frequent word.
    """
    countColumn = genre + "Count"

    # flatten the column of per-movie lists into one long list of words
    flatWords = []
    for movieWords in list(df['keyWords']):
        for singleWord in movieWords:
            flatWords.append(singleWord)

    # one row per occurrence, each worth 1, then total the occurrences per word
    occurrences = pd.DataFrame({'word':flatWords})
    occurrences[countColumn] = 1
    wordTotals = occurrences.groupby('word').sum()

    return wordTotals.sort_values(by=countColumn, ascending = False)

# Collect Movie Data & Most Frequent Tagged Words for Each Genre (https://www.imdb.com/feature/genre/) 

In [None]:
maxPageNum = 15


def collectGenreData(searchGenre, label, displayName):
    """
    Collect the movies and tag counts for a single genre.

    Parameters
    ----------
    searchGenre : str
        Genre value passed to movieInfoFromSearch (e.g. 'sci-fi').
    label : str
        Prefix for the count column produced by mostCommonWords (e.g. 'scifi'
        yields a 'scifiCount' column).
    displayName : str
        Name printed as a progress message before the collection starts.

    Returns
    -------
    (movies, words) : tuple of pandas.DataFrame
        ``movies`` is the search result without rows whose year is empty, plus
        a 'keyWords' column holding each movie's list of plot tags; ``words``
        is the per-word count frame built from those lists.
    """
    print(displayName + "...")
    movies = movieInfoFromSearch(searchGenre, maxPageNum)
    # .copy() so the new column is added to an independent frame rather than
    # to a filtered slice (avoids pandas' SettingWithCopyWarning)
    movies = movies[movies['year'] != ''].copy()
    movies['keyWords'] = movies['ID'].apply(getKeyWords)  # one HTTP request per movie
    return movies, mostCommonWords(movies, label)


actionMovies, actionWords = collectGenreData('action', 'action', "Action")
scifiMovies, scifiWords = collectGenreData('sci-fi', 'scifi', "Sci-Fi")
horrorMovies, horrorWords = collectGenreData('horror', 'horror', "Horror")
comedyMovies, comedyWords = collectGenreData('comedy', 'comedy', "Comedy")
dramaMovies, dramaWords = collectGenreData('drama', 'drama', "Drama")
fantasyMovies, fantasyWords = collectGenreData('fantasy', 'fantasy', "Fantasy")
thrillerMovies, thrillerWords = collectGenreData('thriller', 'thriller', "Thriller")

# Creating and Cleaning Movies Data Set

In [None]:
def numWords(value):
    """Return how many items ``value`` contains (here: a movie's list of key words)."""
    itemCount = len(value)
    return itemCount

def yearFix(year):
    """
    Extract the first run of four digits from ``year`` and return it as an int.

    Handles values that carry extra text around the year, e.g. 'I) (2010'.
    Raises AttributeError if ``year`` contains no four consecutive digits
    (``re.search`` returns None in that case).
    """
    # raw string: '\d' inside a normal string literal is an invalid escape sequence
    return int(re.search(r'\d{4}',year).group(0))

# Stack the per-genre frames. A movie returned by several genre searches is kept
# once (drop_duplicates keeps the first occurrence, i.e. the earliest frame in the list).
allGenreMovies = pd.concat([actionMovies, scifiMovies, horrorMovies, comedyMovies, dramaMovies, fantasyMovies, thrillerMovies])
allGenreMovies = allGenreMovies.drop_duplicates(subset = 'ID')
allGenreMovies = allGenreMovies.reset_index(drop=True)

allGenreMovies['numWords'] = allGenreMovies['keyWords'].apply(numWords)

# only interested in evaluating movies with at least 50 key words
# .copy() makes the subset an independent frame, so the column assignment below
# does not write to a slice of allGenreMovies (avoids SettingWithCopyWarning)
allGenreMovies2 = allGenreMovies[allGenreMovies['numWords'] > 49].copy()

allGenreMovies2['year'] = allGenreMovies2['year'].apply(yearFix)

# renumber rows 0..n-1; later cells rely on a gap-free positional index
allGenreMovies2 = allGenreMovies2.reset_index(drop=True)

In [72]:
# Number of movies left after keeping only those with at least 50 key words
len(allGenreMovies2)

2167

# Using mostCommonWords() function to determine how many movies each plot tag is used to describe

In [82]:
# Count every plot tag across the de-duplicated movie set, express each count as a
# share of the number of movies, keep only tags used by more than 3% of the films,
# and turn the 'word' index back into a regular column.
allWordsAllMovies = (
    mostCommonWords(allGenreMovies2,"allMovies")
    .assign(**{'%used': lambda tagCounts: tagCounts['allMoviesCount'] / len(allGenreMovies2)})
    .loc[lambda tagCounts: tagCounts['%used'] > .03]
    .reset_index()
)

# What Are the Most Commonly Tagged Plot Key Words / Phrases In Each Genre?
## Merging All Word DataFrames Together; Outer Merge so no words are lost

In [83]:
# Outer-merge the per-genre count frames one after another on 'word', so a tag that
# appears in any genre gets a row; genres where it never appears receive a count of 0.
allGenreWordsBeforeSubsetting = actionWords
for otherGenreWords in [scifiWords, horrorWords, comedyWords, dramaWords, fantasyWords, thrillerWords]:
    allGenreWordsBeforeSubsetting = allGenreWordsBeforeSubsetting.merge(otherGenreWords,on='word',how='outer')

allGenreWordsBeforeSubsetting = allGenreWordsBeforeSubsetting.reset_index().fillna(0)

## Inner merge with allWordsAllMovies dataframe in order to remove any plot tags that weren't used to describe at least 3% of all films in the dataset
### This approach is necessary because multiple genre counts double count certain tag/movie combinations

In [84]:
# Join on the 'word' column explicitly instead of relying on merge's default of
# "all columns the two frames have in common", so an accidental shared column
# name can never silently become part of the join key.
allGenreWords = allWordsAllMovies.merge(allGenreWordsBeforeSubsetting, on = 'word', how = "inner")

In [85]:
# Number of plot tags that survive the inner merge with allWordsAllMovies
len(allGenreWords)

1077

# Calculating Plot Tag Relative Usage For Each Genre

### For each keyword, calculate the percentage of a genre's movies that use it and divide that by the keyword's usage across all other genres. For example, 'explosion' is used to describe action films 1.97x as often as films in other genres, but drama films only 0.27x as often, and so on.

In [87]:
# relative usage = percentage word is used in genre X / percentage word is used across all OTHER genres
genreMovieCounts = [
    ('action', len(actionMovies)),
    ('scifi', len(scifiMovies)),
    ('horror', len(horrorMovies)),
    ('comedy', len(comedyMovies)),
    ('drama', len(dramaMovies)),
    ('fantasy', len(fantasyMovies)),
    ('thriller', len(thrillerMovies)),
]

for genreName, genreMovieCount in genreMovieCounts:
    otherGenres = [(otherName, otherCount) for otherName, otherCount in genreMovieCounts if otherName != genreName]

    # tag occurrences and movie totals pooled over the six other genres
    otherTagCount = sum(allGenreWords[otherName + 'Count'] for otherName, otherCount in otherGenres)
    otherMovieCount = sum(otherCount for otherName, otherCount in otherGenres)

    # NOTE: a tag never used in any other genre has a pooled share of 0, so the
    # division yields inf (or NaN for 0/0) — presumably acceptable; confirm if such tags matter
    allGenreWords[genreName + 'RelativeUsage'] = (allGenreWords[genreName + 'Count'] / genreMovieCount) / (otherTagCount / otherMovieCount)

# Showing The 5 'Most Sci-Fi' Words

In [88]:
# Rows with the highest scifiRelativeUsage first; head() shows the top five
allGenreWords.sort_values('scifiRelativeUsage', ascending = False).head()

Unnamed: 0,word,allMoviesCount,%used,actionCount,scifiCount,horrorCount,comedyCount,dramaCount,fantasyCount,thrillerCount,actionRelativeUsage,scifiRelativeUsage,horrorRelativeUsage,comedyRelativeUsage,dramaRelativeUsage,fantasyRelativeUsage,thrillerRelativeUsage
798,space travel,87,0.040148,60.0,85.0,5.0,9.0,12.0,21.0,18.0,1.8192,5.538637,0.323171,0.203642,0.275636,1.005636,0.426375
612,spacecraft,108,0.049838,70.0,104.0,11.0,16.0,16.0,21.0,25.0,1.649534,5.327583,0.578373,0.294607,0.294607,0.785393,0.477731
886,planet,79,0.036456,56.0,72.0,6.0,12.0,7.0,22.0,13.0,1.929455,5.055551,0.436813,0.310091,0.17589,1.199494,0.337851
526,future,122,0.056299,77.0,116.0,11.0,11.0,27.0,14.0,49.0,1.535947,4.999081,0.495748,0.170163,0.441712,0.43543,0.870516
411,alien,146,0.067374,88.0,143.0,31.0,29.0,18.0,30.0,41.0,1.37063,4.914527,1.176934,0.375761,0.226144,0.775776,0.550053


# The 'Most Horror' Words

In [89]:
# Rows with the highest horrorRelativeUsage first; head() shows the top five
allGenreWords.sort_values('horrorRelativeUsage', ascending = False).head()

Unnamed: 0,word,allMoviesCount,%used,actionCount,scifiCount,horrorCount,comedyCount,dramaCount,fantasyCount,thrillerCount,actionRelativeUsage,scifiRelativeUsage,horrorRelativeUsage,comedyRelativeUsage,dramaRelativeUsage,fantasyRelativeUsage,thrillerRelativeUsage
1052,slasher,67,0.030918,4.0,3.0,60.0,2.0,7.0,4.0,20.0,0.1895,0.251909,19.875,0.092816,0.342323,0.377114,1.137
1055,supernatural horror,67,0.030918,2.0,1.0,68.0,2.0,12.0,7.0,31.0,0.075174,0.066763,16.381818,0.075174,0.491676,0.546164,1.532478
914,body count,77,0.035533,27.0,11.0,48.0,7.0,7.0,5.0,34.0,1.096393,0.699966,6.989011,0.241182,0.241182,0.337714,1.472686
656,serial killer,103,0.047531,12.0,9.0,53.0,5.0,30.0,5.0,60.0,0.336889,0.444276,5.803719,0.134556,0.9475,0.267773,2.393684
646,darkness,104,0.047993,27.0,22.0,61.0,4.0,24.0,24.0,50.0,0.663762,0.943112,5.352649,0.087462,0.580596,1.155412,1.403704


# Calculating Genre Scores and Percentages of Movies

In [None]:
# Use the relative genre usage of each plot key word tag to calculate genre scores for each film

scoreGenres = ['action', 'scifi', 'horror', 'comedy', 'drama', 'fantasy', 'thriller']
usageColumns = [genreName + 'RelativeUsage' for genreName in scoreGenres]

# word -> tuple of its seven relative-usage values, in scoreGenres order.
# A dict lookup per tag replaces a boolean scan of the whole allGenreWords frame.
# (Each word is expected to appear at most once in allGenreWords.)
usageByWord = dict(zip(
    allGenreWords['word'],
    allGenreWords[usageColumns].itertuples(index=False, name=None),
))

movieScores = []   # one list of seven genre scores per movie
tagsUsed = []      # per movie: number of its tags that were found in allGenreWords

for movieKeyWords, numTags in zip(allGenreMovies2['keyWords'], allGenreMovies2['numWords']):

    scores = [0.0] * len(scoreGenres)

    for word in movieKeyWords:
        wordUsages = usageByWord.get(word)

        # tags that are not in allGenreWords carry no score and don't count as used
        if wordUsages is None:
            numTags = numTags - 1
            continue

        # relative usage scores of <=1 don't impact the movie's genre's score
        # relative usage scores of >1 are reduced by 1 (so the marginal difference of .99 and 1.01, for example,
        # isn't overexaggerated), then added to the movie's genre's score
        for position, relativeUsage in enumerate(wordUsages):
            if relativeUsage > 1:
                scores[position] = scores[position] + (relativeUsage - 1)

    movieScores.append(scores)
    tagsUsed.append(numTags)

# Assign whole columns at once: the values are numeric from the start and no
# chained df[col][row] = value assignment is needed.
for position, genreName in enumerate(scoreGenres):
    allGenreMovies2[genreName + 'Score'] = [scores[position] for scores in movieScores]

allGenreMovies2['numTagsUsed'] = tagsUsed # this can be used to control for number of tags when comparing more popular movies with movies with fewer reviews and plot tags

In [92]:
# Each genre's share of a movie's total score across the seven genres.
perGenres = ['action', 'scifi', 'horror', 'comedy', 'drama', 'fantasy', 'thriller']

# Cast to float so the arithmetic is numeric even if the score columns were
# stored with object dtype; a movie whose scores are all 0 then gets NaN
# percentages instead of raising ZeroDivisionError.
genreScoreColumns = {genreName: allGenreMovies2[genreName + 'Score'].astype(float) for genreName in perGenres}

# total score per movie, computed once and summed in the listed genre order
totalScore = sum(genreScoreColumns[genreName] for genreName in perGenres)

for genreName in perGenres:
    allGenreMovies2[genreName + 'Per'] = genreScoreColumns[genreName] / totalScore

In [98]:
# Preview the first two movies together with their genre scores and percentages
allGenreMovies2.head(2)

Unnamed: 0,title,ID,genre,year,numVotes,keyWords,numWords,actionScore,scifiScore,horrorScore,...,fantasyScore,thrillerScore,numTagsUsed,actionPer,scifiPer,horrorPer,comedyPer,dramaPer,fantasyPer,thrillerPer
0,The Dark Knight,tt0468569,"Action, Crime, Drama",2008,2008917,"[dc comics, moral dilemma, psychopath, clown, ...",637,232.63,102.743,67.0001,...,17.958,136.537,327,0.393399,0.173748,0.113303,0.010052,0.0482338,0.0303685,0.230896
1,Inception,tt1375666,"Action, Adventure, Sci-Fi",2010,1788343,"[dream, ambiguous ending, subconscious, thief,...",340,142.366,62.5072,22.5254,...,13.6539,73.4009,194,0.427399,0.187653,0.0676236,0.0123863,0.0435901,0.0409906,0.220357


In [93]:
# Save the per-tag and per-movie results as CSV files; the paths are relative, so
# the files land in the current working directory. to_csv writes each frame's
# index as an unnamed first column by default.
allGenreWords.to_csv("wordsDataJan.csv")
allGenreMovies2.to_csv("moviesDataJan.csv")