# Machine learning

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso, LinearRegression, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, validation_curve, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

### Structuring data

In [None]:
# Load dataframe
# NOTE(review): unpickling can execute arbitrary code stored in the file, so this
# is only safe if 'full_w_sentiment.pkl' comes from a trusted source — confirm.
df = pd.read_pickle('full_w_sentiment.pkl')

# Function for getting possible values in dataframe columns
def possible_values(df, colname):
    """Collect the distinct elements found across all cells of one column.

    Every cell of ``df[colname]`` is iterated and each element it yields is
    gathered into a single set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    colname : str
        Name of a column whose cells are iterables (e.g. lists of genres).

    Returns
    -------
    set
        All elements that occur in at least one cell.
    """
    return {value for cell in df[colname] for value in cell}

# Function for getting dummies for dataframe with columns of lists
def dummies_from_nested_categories(df, colname):
    """Add one 0/1 indicator column per distinct value found in a column of lists.

    For every value returned by ``possible_values(df, colname)`` a column named
    ``'d_' + cleaned value`` is written into ``df`` (spaces and '&' removed,
    lower-cased). The column is 1 where the value occurs in the row's list and
    0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    colname : str
        Name of the column whose cells are collections of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added.
    """
    def clean(s):
        # Normalise a category label into a column-name suffix
        return s.replace(' ', '').replace('&', '').lower()

    # Iterate in sorted order so the indicator columns are appended in the same
    # order on every run; the iteration order of a set of strings can differ
    # between Python processes, which would reorder the feature matrix.
    # NOTE(review): two labels that clean to the same name share one column and
    # the later one overwrites the earlier — confirm this cannot happen in the data.
    for pos in sorted(possible_values(df, colname)):
        df['d_' + clean(pos)] = df[colname].apply(lambda x: pos in x).astype(int)
    return df


# One-hot encode the MPAA rating and append the indicator columns to the frame
dummies = pd.get_dummies(df['mpaaRating'])
df = pd.concat([df, dummies], axis = 'columns')

# Genres hold several values per film; the helper writes the 'd_*' columns into df in place
dummies_from_nested_categories(df, 'genres')

# One-hot encode the studio and append those indicator columns as well
dummies = pd.get_dummies(df['studio'])
df = pd.concat([df, dummies], axis = 'columns')

# Columns that are dropped because they are not used as features
non_features = [
    'actors', 'mpaaRating', 'synopsis', 'title', 'tomatoIcon', 'releaseDate',
    'genres', 'directors', 'studio', 'tomatoCount', 'audienceCount', 'year',
    'titleType', 'isAdult', 'numVotes', 'boRank', 'studioAcronym',
    'totalTheaters', 'boOpening', 'openingTheaters', 'sentimentAfter',
    'numCommentsAfter', 'positiveWordsAfter',
]
data = df.drop(columns = non_features)

# Function for turning string into float
def get_float(string):
    """Convert a value to ``float``, or return ``None`` if it cannot be converted.

    Parameters
    ----------
    string : object
        Value to convert, typically a string such as ``'97'``.

    Returns
    -------
    float or None
        ``float(string)`` when the conversion succeeds, otherwise ``None``.
    """
    try:
        return float(string)
    # Only conversion failures count as "not available"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit and make a long .apply() uninterruptible.
    except (TypeError, ValueError, OverflowError):
        return None

# Columns to convert into floats
items = ['tomatoMeter', 'audienceScore', 'runtime', 'boWorldwide']

# Convert every listed column value by value with get_float, then drop the
# observations where any of these columns is not available
data = (
    data
    .assign(**{item: data[item].apply(get_float) for item in items})
    .dropna(subset = items)
)

### Setup for ML

In [None]:
# Columns that are predicted; every other column of data is used as a feature
targets = ['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide']

# Split data into training and test (fixed seed so the split, and therefore
# every score reported below, is the same on each run)
train, test = train_test_split(data, test_size = 0.25, random_state = 1)

# Create target dataframes with worldwide box office on a log scale.
# .assign returns a new frame, so the transform is never written into a slice
# of train/test (the original assignment raised SettingWithCopyWarning).
# The column order of targets is preserved, which later cells index by position.
y_train = train[targets].assign(boWorldwide = lambda frame: np.log(frame['boWorldwide']))
y_test = test[targets].assign(boWorldwide = lambda frame: np.log(frame['boWorldwide']))

# Create feature dataframes
X_train = train.drop(columns = targets)
X_test = test.drop(columns = targets)

### Linear regression

In [None]:
# Create list for storing linear regressions (one per target, in y_train column order)
lregs = []

# Fit linear regressions for different targets, outputting test MSE and R-squared for each
for target in y_train:
    lreg = LinearRegression()
    lreg.fit(X_train, y_train[target])

    # Predict once and reuse the predictions for both metrics
    predictions = lreg.predict(X_test)
    print(target + ':', mean_squared_error(y_test[target], predictions), r2_score(y_test[target], predictions))
    lregs.append(lreg)

### Lasso

In [None]:
# Make pipeline for Lasso: features are standardised, then the Lasso is fitted
pipe = make_pipeline(StandardScaler(), Lasso(random_state = 1))

# Create list for storing the fitted grid searches (one per target, in y_train column order)
gs_lassos = []

# Fit Lassos for different targets, outputting test MSE and R-squared for each
for target in y_train:
    # return_train_score is requested explicitly: the validation-curve cell reads
    # cv_results_['mean_train_score'], which is not computed by default in
    # scikit-learn >= 0.21.
    # The 'iid' argument is no longer passed: it was removed in scikit-learn 0.24
    # (passing it raises TypeError) and the behaviour without it matches iid = False
    # from 0.22 onwards.
    gs_lasso = GridSearchCV(estimator = pipe, param_grid = {'lasso__alpha': np.logspace(-4, 4, 12)},
                      scoring = 'neg_mean_squared_error', n_jobs = -1, cv = 10, verbose = 0,
                      return_train_score = True)
    gs_lasso.fit(X_train, y_train[target])

    # Predict once with the refitted best pipeline and reuse for both metrics
    predictions = gs_lasso.best_estimator_.predict(X_test)
    print(target + ':', mean_squared_error(y_test[target], predictions), r2_score(y_test[target], predictions))
    gs_lassos.append(gs_lasso)

# Get coefficients regarding Reddit comments for the searches at positions 0, 2 and 3
# NOTE(review): assumes columns 3:6 of X_train are the Reddit comment features — confirm
for index in (0, 2, 3):
    print(gs_lassos[index].best_estimator_.named_steps['lasso'].coef_[3:6])

### Validation curves for Lasso

In [None]:
# Set style. 'seaborn-deep' was renamed to 'seaborn-v0_8-deep' in matplotlib 3.6,
# so try the new name first and fall back to the old one on older versions.
try:
    plt.style.use('seaborn-v0_8-deep')
except OSError:
    plt.style.use('seaborn-deep')

# Create figure
fig, axes = plt.subplots(1, 3, figsize = (8, 2))


def _validation_frame(grid_search):
    """Return validation and training MSE of a fitted grid search, indexed by 'Lambda'.

    The scores are negated because the searches use 'neg_mean_squared_error'.
    'mean_train_score' is only present in cv_results_ when the search was run
    with training scores enabled.
    """
    results = grid_search.cv_results_
    frame = pd.DataFrame({'Validation': -results['mean_test_score'],
                          'Train': -results['mean_train_score']})
    # Must be the same grid that was passed as 'lasso__alpha' to the search
    frame['Lambda'] = np.logspace(-4, 4, 12)
    return frame.set_index('Lambda')


# Create dataframes for Lasso with targets average rating, Tomatometer and box office
lasso_average_val = _validation_frame(gs_lassos[2])
lasso_tomato_val = _validation_frame(gs_lassos[0])
lasso_bo_val = _validation_frame(gs_lassos[3])

# Plot one panel per target, with its y label and title
panels = [(lasso_average_val, '(A)  IMDb \n average rating'),
          (lasso_tomato_val, '(B)  RT \n Tomatometer'),
          (lasso_bo_val, '(C)  Log worldwide \n box office')]
for ax, (frame, title) in zip(axes, panels):
    frame.plot(logx = True, logy = True, ax = ax, legend = False)
    ax.set_ylabel('MSE')
    ax.set_title(title)

# Adjust spacing
fig.subplots_adjust(hspace = 0.6, wspace = 0.65)

# Creating common legend from the first panel's lines, placed below the panels
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc = (0.455, 0.02))
fig.subplots_adjust(bottom = 0.5)

# Save figures
fig.savefig('validation-curves.pdf', bbox_inches = 'tight')

### Elastic net

In [None]:
# Make pipeline for elastic net
# NOTE(review): unlike the Lasso pipeline, no StandardScaler is applied before
# the penalised fit — confirm this is intentional.
pipe = make_pipeline(ElasticNet(random_state = 1))

# Create lists for storing the fitted grid searches (one per target, in y_train column order)
gs_nets = []

# Fit elastic nets for different targets, outputting test MSE and R-squared for each
for target in y_train:
    # The 'iid' argument is no longer passed: it was removed in scikit-learn 0.24
    # (passing it raises TypeError) and the behaviour without it matches iid = False
    # from 0.22 onwards.
    gs_net = GridSearchCV(estimator = pipe, param_grid = {'elasticnet__alpha': np.logspace(-4, 4, 12),
                      'elasticnet__l1_ratio': [0, 0.25, 0.5, 0.75, 1]},
                      scoring = 'neg_mean_squared_error', n_jobs = -1, cv = 10, verbose = 0)
    gs_net.fit(X_train, y_train[target])

    # Predict once with the refitted best pipeline and reuse for both metrics
    predictions = gs_net.best_estimator_.predict(X_test)
    print(target + ':', mean_squared_error(y_test[target], predictions), r2_score(y_test[target], predictions))

    gs_nets.append(gs_net)

# Get coefficients regarding Reddit comments for the searches at positions 0, 2 and 3
# NOTE(review): assumes columns 3:6 of X_train are the Reddit comment features — confirm
for index in (0, 2, 3):
    print(gs_nets[index].best_estimator_.named_steps['elasticnet'].coef_[3:6])

### Learning curves

In [None]:
# Create pipes
lasso_pipe = make_pipeline(StandardScaler(), Lasso(random_state = 1))
elastic_pipe = make_pipeline(ElasticNet(random_state = 1))


def _learning_curve_mse(estimator, target):
    """Return validation and training MSE for growing training sizes.

    Runs a 10-fold learning curve of ``estimator`` on ``X_train`` against the
    ``target`` column of ``y_train``. The scores are negated because the scorer
    is 'neg_mean_squared_error', averaged over the folds, and returned as a
    frame with columns 'Validation' and 'Train' indexed by 'Sample size'.
    """
    # NOTE(review): the fractions come from np.arange with a float step; if exact
    # fractions matter, np.linspace(0.2, 1.0, 17) may be preferable — confirm.
    train_sizes, train_scores, test_scores = learning_curve(estimator = estimator,
                       X = X_train,
                       y = y_train[target],
                       train_sizes = np.arange(0.2, 1.05, .05),
                       scoring = 'neg_mean_squared_error',
                       cv = 10)
    return pd.DataFrame({'Validation': -test_scores.mean(axis = 1),
                         'Train': -train_scores.mean(axis = 1)})\
             .set_index(pd.Index(train_sizes, name = 'Sample size'))


# Create dataframes for Lasso with audience score, Tomatometer and box office
mse_lasso_audience = _learning_curve_mse(lasso_pipe, 'audienceScore')
mse_lasso_tomatom = _learning_curve_mse(lasso_pipe, 'tomatoMeter')
mse_lasso_boworld = _learning_curve_mse(lasso_pipe, 'boWorldwide')

# Create dataframes for elastic net with audience score, Tomatometer and box office
mse_elastic_audience = _learning_curve_mse(elastic_pipe, 'audienceScore')
mse_elastic_tomatom = _learning_curve_mse(elastic_pipe, 'tomatoMeter')
mse_elastic_boworld = _learning_curve_mse(elastic_pipe, 'boWorldwide')

# Create figure
fig, axes = plt.subplots(2, 3, figsize = (8, 4.5))

# Create plots with titles and y-labels; panels are listed row by row,
# matching the order in which axes.flat walks the 2 x 3 grid
panels = [(mse_lasso_audience, '(A)  Lasso with \n audience score'),
          (mse_lasso_tomatom, '(B)  Lasso with \n Tomatometer'),
          (mse_lasso_boworld, '(C)  Lasso with \n box office'),
          (mse_elastic_audience, '(D)  Elastic net \n with audience score'),
          (mse_elastic_tomatom, '(E)  Elastic net \n with Tomatometer'),
          (mse_elastic_boworld, '(F)  Elastic net \n with box office')]
for ax, (frame, title) in zip(axes.flat, panels):
    frame.plot(ax = ax, legend = False)
    ax.set_title(title)
    ax.set_ylabel('MSE')

# Adjust spacing
fig.subplots_adjust(hspace = 1.1, wspace = 0.5)

# Create common legend from the first panel's lines, placed below the panels
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc = (0.475, 0.025))
fig.subplots_adjust(bottom = 0.25)

# Save figure
fig.savefig('learning-curves.pdf', bbox_inches = 'tight')