### Imports

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

### Structuring data

In [3]:
df = pd.read_pickle('full_no_budgets_w_sentiment.pkl')

### TEMPORARY
#df = df.drop_duplicates(subset = ['title', 'year'])

#budgets = pd.read_pickle('budgets_clean.pkl')
#budgets = budgets.drop('releaseDate', axis = 1)
#df = df.merge(budgets, on = ['title', 'year'])
### TEMPORARY

def possible_values(df, colname):
    out = set([])
    for row in df[colname]:
        for g in row:
            out.add(g)
    return out

def dummies_from_nested_categories(df, colname):
    possible_valuess = possible_values(df, colname)
    
    def clean(s):
        return s.replace(' ', '').replace('&', '').lower()
    
    for pos in possible_valuess:
        df['d_' + clean(pos)] = df[colname].apply(lambda x: pos in x).astype(int)
    return df


# Create dummies for mpaaRating
dummies = pd.get_dummies(df['mpaaRating'])
df = pd.concat([df, dummies], axis = 1)

# Create dummies for genres
dummies_from_nested_categories(df, 'genres')

# Create dummies for studio
dummies = pd.get_dummies(df['studio'])
df = pd.concat([df, dummies], axis = 1)

# Create dummies for actors
#dummies_from_nested_categories(df, 'actors')
#dummies_from_nested_categories(df, 'directors')

data = df.drop(['actors', 'mpaaRating', 'synopsis', 'title', 'tomatoIcon', 'releaseDate', 'genres', 'directors', \
                'studio', 'tomatoCount', 'audienceCount', 'year', 'titleType', \
                'isAdult', 'numVotes', 'boRank', 'studioAcronym', 'totalTheaters', \
                'boOpening', 'openingTheaters', 'sentimentAfter', 'numCommentsAfter'], axis = 1)

def get_float(string):
    try:
        return float(string)
    except:
        return None

items = ['tomatoMeter', 'audienceScore', 'runtime', 'boWorldwide']
for item in items:
    data[item] = data[item].apply(get_float)
    #print(item)
    #print(data.shape)

data = data.dropna(subset = items)

In [9]:
# Temporary
# def get_top_actors(lst):
#     count = 0
#     for item in lst:
#         if item in top_500:
#             count += 1
#     return count

# def get_top_directors(lst):
#     count = 0
#     for item in lst:
#         if item in top_100:
#             count += 1
#     return count

# actors = pd.read_pickle('actors.pkl')
# directors = pd.read_pickle('directors.pkl')

# top_500 = actors.iloc[:500]['actor'].tolist()
# top_100 = directors.iloc[:100]['director'].tolist()

# data['topActors'] = df['actors'].apply(get_top_actors)
# data['topDirectors'] = df['directors'].apply(get_top_directors)

### Setup for ML

In [5]:
train, test = train_test_split(data, test_size = 0.2)

y_train = train['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide']
y_train['boWorldwide'] = np.log(y_train['boWorldwide'])
y_test = test['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide']
y_test['boWorldwide'] = np.log(y_train['boWorldwide'])

X_train = train.drop(['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide'], axis = 1)
X_test = test.drop(['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide'], axis = 1)

### Linear regression

In [None]:
lreg = LinearRegression()

for target in y_train:
    lreg.fit(X_train, y_train['audienceScore'])

### Lasso

In [6]:
pipe = make_pipeline(StandardScaler(), Lasso(random_state = 1))

gs_lasso = GridSearchCV(estimator = pipe, param_grid = {'lasso__alpha': np.logspace(-4, 4, 12)}, \
                  scoring = 'r2', n_jobs = -1, iid = False, cv = 10, verbose = 10)

gs_lasso.fit(X_train, train['audienceScore'])

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1989s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 105 out of 120 | elapsed:    6.3s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed:    6.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.4s finished
  return self.partial_fit(X, y)
  return self.fit(

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'lasso__alpha': array([1.00000e-04, 5.33670e-04, 2.84804e-03, 1.51991e-02, 8.11131e-02,
       4.32876e-01, 2.31013e+00, 1.23285e+01, 6.57933e+01, 3.51119e+02,
       1.87382e+03, 1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=10)

### Elastic net

In [None]:
from sklearn.linear_model import ElasticNet

train, test = train_test_split(data, test_size = 0.2)

X_train = train.drop(['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide'], axis = 1)
X_test = test.drop(['tomatoMeter', 'audienceScore', 'averageRating', 'boWorldwide'], axis = 1)

pipe = make_pipeline(ElasticNet(random_state = 1))
gs = GridSearchCV(estimator = pipe, param_grid = {'elasticnet__alpha': np.logspace(-4, 4, 12), \
                  'elasticnet__l1_ratio': [0, 0.25, 0.5, 0.75, 1]}, \
                  scoring = 'r2', n_jobs = -1, iid = False, cv = 10, verbose = 10)

gs.fit(X_train, train['audienceScore'])

In [7]:
test['predicted'] = gs.predict(X_test)
print(gs.best_score_)
print(gs.best_params_)

0.1984595342737087
{'lasso__alpha': 0.43287612810830617}


  Xt = transform.transform(Xt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
import seaborn as sns
%matplotlib inline

sns.regplot('audienceScore', 'predicted', ci = None, data = test)

In [None]:
for item in X_train.columns:
    print(item)

In [None]:
X_train