### Merging datasets

In [134]:
import pandas as pd

def remove_year(string):
    """Return ``string`` with a trailing parenthesised 4-digit year removed.

    Only a ``(YYYY)`` group at the very end of the title is stripped, together
    with the whitespace around it, e.g. ``'Avatar (2009)' -> 'Avatar'``.
    Parentheses elsewhere in the title are left untouched, so
    ``'(500) Days of Summer'`` is returned unchanged instead of being
    truncated to an empty string.

    Parameters
    ----------
    string : str
        Title, optionally ending in ``(YYYY)``.

    Returns
    -------
    str
        The title without the year suffix and without leading or trailing
        whitespace, so it can be used as an exact-match merge key.
    """
    import re  # local import: keeps the helper self-contained in its cell

    # Anchor on the end of the string so only a year *suffix* is removed; the
    # final strip() drops any whitespace that would break equality on merge.
    return re.sub(r'\s*\(\d{4}\)\s*$', '', string).strip()

def get_year(date):
    """Return the ``year`` attribute of ``date``.

    ``date`` may be any object exposing ``.year``; objects without that
    attribute raise ``AttributeError``.
    """
    year = date.year
    return year

# --- Load the pickled source tables ---------------------------------------
# NOTE(review): unpickling can run arbitrary code; only load trusted files.
imdb = pd.read_pickle('imdb.pkl')
rt = pd.read_pickle('rt_clean.pkl')
bom = pd.read_pickle('bom_clean.pkl')

# --- Harmonise column names ------------------------------------------------
# Rename the imdb columns to the 'title' / 'year' names used as merge keys.
imdb = imdb.rename(columns={'primaryTitle': 'title', 'startYear': 'year'})

# Derive the Rotten Tomatoes year from its 'releaseDate' column.
rt['year'] = rt['releaseDate'].apply(get_year)

# --- Normalise the titles used as merge keys -------------------------------
# bom: drop the year from the title via remove_year, then lowercase.
bom['title'] = bom['title'].apply(remove_year).str.lower()
rt['title'] = rt['title'].str.lower()
# imdb keeps its original-case titles; only this copy is lowercased.
imdb_lower = imdb.assign(title=imdb['title'].str.lower())

# --- Join the three tables on (title, year) --------------------------------
# merge() defaults to an inner join, so only titles present in all three
# tables for the same year survive.
df = (
    rt
    .merge(imdb_lower, on=['title', 'year'])
    .merge(bom, on=['title', 'year'])
)

### Structuring data

In [135]:
# Optional filters, currently disabled:
# Drop NA
#df = df.dropna()

# Select years
#df = df[df['year'] >= 2013]

# Index both frames by 'tconst'. The guards make this cell safe to re-run:
# once 'tconst' has become the index it is no longer a column, so an
# unconditional set_index('tconst') would raise KeyError on a second run.
# A frame that never had a 'tconst' column still raises KeyError as before.
if df.index.name != 'tconst':
    df = df.set_index('tconst')
if imdb.index.name != 'tconst':
    imdb = imdb.set_index('tconst')

# Get titles from IMDB: the merge was done on lowercased titles, so restore
# the original-case ones. The assignment aligns rows on the 'tconst' index.
df['title'] = imdb['title']

# Persist the merged table.
df.to_pickle('full.pkl')

In [61]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Density plots (disabled alternative view)
# data = df['audienceScore']
# sns.set_style('whitegrid')
# sns.kdeplot(np.array(data), bw = 5)

# Scatter of averageRating against audienceScore with a fitted regression
# line, drawn on an explicitly created 10x6 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='audienceScore', y='averageRating', ax=ax)