# Merging datasets

### Merge

In [None]:
# Imports
import pandas as pd

# Function for removing year from movie title
def remove_year(string):
    """Return the part of ``string`` that comes before its first '('.

    Used to strip a trailing "(YYYY)" suffix from a movie title. Everything
    from the first opening parenthesis onwards is discarded, so any
    whitespace directly before the parenthesis is kept, and a title that
    contains an earlier '(' is cut at that earlier position. A title
    without '(' is returned unchanged.
    """
    head, _paren, _tail = string.partition('(')
    return head

# Function for getting the year of a datetime
def get_year(date):
    """Return the ``year`` attribute of ``date``.

    ``date`` can be any object exposing ``.year`` (for example a
    ``datetime.date`` or a pandas ``Timestamp``).
    """
    year = date.year
    return year

# Read the three pickled source tables (IMDb, Rotten Tomatoes, and `bom`)
imdb = pd.read_pickle('imdb.pkl')
rt = pd.read_pickle('rt_clean.pkl')
bom = pd.read_pickle('bom_clean.pkl')

# bom titles: keep only the text before the first '(' (drops the year suffix)
bom_titles = bom['title']
bom['title'] = bom_titles.apply(remove_year)

# IMDb: use the same column names ('title', 'year') as the other tables
imdb_column_names = {'primaryTitle': 'title', 'startYear': 'year'}
imdb = imdb.rename(columns = imdb_column_names)

# Rotten Tomatoes: derive a 'year' column from each release date
rt_release_dates = rt['releaseDate']
rt['year'] = rt_release_dates.apply(get_year)

# Function for making titles more comparable
def adjust_title(string):
    """Normalize a title so that differently formatted spellings compare equal.

    The title is lower-cased, all whitespace is removed, a fixed set of
    punctuation characters is deleted and every '&' is spelled out as 'and'.
    """
    unwanted = (':', '.', ',', ';', '(', ')', '[', ']', '/', '\\', '$', '%',
                '"', '\'', '#', '@', '=', '?', '+', '-')

    # One translation table: punctuation maps to nothing, '&' maps to 'and'
    table = {ord(symbol): None for symbol in unwanted}
    table[ord('&')] = 'and'

    # split() without arguments drops every run of whitespace
    squeezed = ''.join(string.lower().split())

    return squeezed.translate(table)

# Make a copy of the IMDb dataframe; the normalized titles go into the copy,
# leaving the titles in `imdb` itself untouched
imdb_lower = imdb.copy()

# Make titles more comparable in every dataframe that takes part in the merge
imdb_lower['title'] = imdb['title'].apply(adjust_title)
rt['title'] = rt['title'].apply(adjust_title)
bom['title'] = bom['title'].apply(adjust_title)
# NOTE: a `bud` dataframe used to be normalized here as well, but `bud` is
# never loaded or merged in this file, so that statement raised a NameError
# before the merge could run. It has been removed.

# Merge datasets on normalized title and year. `merge` defaults to an inner
# join, so only rows matched in all three dataframes are kept.
df = rt.merge(imdb_lower, on = ['title', 'year'])
df = df.merge(bom, on = ['title', 'year'])

### Structuring data

In [None]:
# Select years: keep only rows whose year is 2010 or later
df = df[df['year'] >= 2010]

# Set index
df = df.set_index('tconst')

# Get titles from IMDb: `imdb` still holds the non-normalized titles, and the
# column assignment aligns on the shared `tconst` index
imdb = imdb.set_index('tconst')
df['title'] = imdb['title']

# Drop duplicates (same title and release date)
df = df.drop_duplicates(subset = ['title', 'releaseDate'])

# Get the list of actors and directors
actors = pd.read_pickle('actors.pkl')
directors = pd.read_pickle('directors.pkl')

# Create list of top 500 actors and top 100 directors (the first 500 / 100
# rows of each table)
# NOTE(review): presumably both tables are already sorted by rank - confirm
top_500 = actors.iloc[:500]['actor'].tolist()
top_100 = directors.iloc[:100]['director'].tolist()

# Sets give constant-time membership tests while counting
_top_actor_names = set(top_500)
_top_director_names = set(top_100)


def _count_top(people, top_names):
    """Count how many entries of ``people`` are contained in ``top_names``.

    NOTE(review): the cell format of the 'actors' / 'directors' columns is not
    visible in this file. This helper assumes each cell is an iterable of
    names, or a comma-separated string of names - confirm against the data.
    Cells that are not iterable (for example None or NaN) count as 0.
    """
    if isinstance(people, str):
        people = [name.strip() for name in people.split(',')]
    elif not hasattr(people, '__iter__'):
        return 0
    return sum(1 for person in people if person in top_names)


def get_top_actors(people):
    """Return the number of names in ``people`` that are in ``top_500``."""
    return _count_top(people, _top_actor_names)


def get_top_directors(people):
    """Return the number of names in ``people`` that are in ``top_100``."""
    return _count_top(people, _top_director_names)


# Count the number of top actors and directors
# (get_top_actors / get_top_directors were previously called without being
# defined anywhere in this file, which raised a NameError)
df['topActors'] = df['actors'].apply(get_top_actors)
df['topDirectors'] = df['directors'].apply(get_top_directors)

# Save dataframe
df.to_pickle('full.pkl')