### Merging datasets

In [1]:
import re

import pandas as pd

def remove_year(string):
    return string.split('(')[0]

def get_year(date):
    return date.year

# Load datasets
imdb = pd.read_pickle('imdb.pkl')
rt = pd.read_pickle('rt_clean.pkl')
bom = pd.read_pickle('bom_clean.pkl')
bud = pd.read_pickle('budgets_clean.pkl')

# Remove years from bom titles
bom['title'] = bom['title'].apply(remove_year)

# Rename columns in imdb
imdb.rename(columns = {'primaryTitle': 'title', 'startYear': 'year'}, inplace = True)

# Make year column in Rotten Tomatoes
rt['year'] = rt['releaseDate'].apply(get_year)

# Normalise titles (lowercase, no whitespace or punctuation) for use as merge keys
def adjust_title(string):
    string = string.lower()
    string = ''.join(string.split())
    
    chars = [':', '.', ',', ';', '(', ')', '[', ']', '/', '\\', '$', '%', '"', '\'', '#', '@', '=', '?', '+', '-']
    for char in chars:
        string = string.replace(char, '')
        
    string = string.replace('&', 'and')
    
    return string

imdb_lower = imdb.copy()
imdb_lower['title'] = imdb['title'].apply(adjust_title)
rt['title'] = rt['title'].apply(adjust_title)
bom['title'] = bom['title'].apply(adjust_title)
bud['title'] = bud['title'].apply(adjust_title)

# Merge datasets
print(rt.shape)
df = rt.merge(imdb_lower, on = ['title', 'year'])
print(df.shape)
df = df.merge(bom, on = ['title', 'year'])
print(df.shape)
#df = df.merge(bud[['title', 'year', 'budget']], on = ['title', 'year'])
#print(df.shape)

(9984, 14)
(5937, 20)
(2847, 26)


### Structuring data

In [2]:
# Drop NA
#df = df.dropna()

# Select years
df = df[df['year'] >= 2010]

# Set index
df = df.set_index('tconst')

# Get titles from IMDB
imdb = imdb.set_index('tconst')
df['title'] = imdb['title']

df = df.drop_duplicates(subset = ['title', 'releaseDate'])

In [3]:
actors = pd.read_pickle('actors.pkl')
directors = pd.read_pickle('directors.pkl')

top_500 = actors.iloc[:500]['actor'].tolist()
top_100 = directors.iloc[:100]['director'].tolist()

In [4]:
def get_top_actors(lst):
    count = 0
    for item in lst:
        if item in top_500:
            count += 1
    return count

def get_top_directors(lst):
    count = 0
    for item in lst:
        if item in top_100:
            count += 1
    return count

In [5]:
df['topActors'] = df['actors'].apply(get_top_actors)
df['topDirectors'] = df['directors'].apply(get_top_directors)

In [6]:
df.to_pickle('full_no_budgets.pkl')