In [1]:
from sklearn import cross_validation, datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math
%matplotlib inline

In [2]:
def lm_regress( data_df, y ):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                                            data_df, y, test_size=0)

    X_train = sm.add_constant(X_train,has_constant='add')
    ols_model = sm.OLS(y_train, X_train).fit()
    return ols_model

In [106]:
mojo_final = pd.read_csv("mojo_final.csv", index_col=0)
scores_df = pd.read_csv("scores.csv", index_col=0)
scores_df.drop('a_score', inplace=True, axis=1)


key_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Drama', 'Horror', 'Thriller',
              'Crime', 'Fantasy', 'Romance', 'Sci-Fi']
cols = ['title', 'budget', 'Gross', 'rel_date', 'ROI']
# cols += key_genres


data_df = pd.merge(mojo_final[cols], scores_df, on='title')

data_df['ROI'] = data_df.apply(lambda x: math.log10(x.Gross/x.budget), axis=1)

data_df = data_df[data_df['budget']>5e7]

y=data_df['ROI']

drop_cols = ['title', 'Gross', 'rel_date', 'ROI', 'budget']
data_df.drop(drop_cols, axis=1, inplace=True)
data_df.drop(['a_exp', 'd_exp'], axis=1, inplace=True)
# data_df.drop(key_genres, axis=1, inplace=True)

ols_model = lm_regress(data_df, y)
ols_model.summary()

1199                               5 Days of War
571                                     Margaret
1090                         Waiting for Forever
549                                   Love Ranch
1220                                       Agora
272                 The Good, the Bad, the Weird
998                                  The Tempest
468                         The Killer Inside Me
555                         Machine Gun Preacher
281                                 The Greatest
1355                               Brighton Rock
801                                     Restless
928                           A Sound of Thunder
1041                                       Trust
670                                Not Fade Away
684                                 Oliver Twist
1044                                      Twelve
599                                   Middle Men
729            Perfume - The Story of a Murderer
1155                                   The Yards
688                 

0,1,2,3
Dep. Variable:,ROI,R-squared:,0.037
Model:,OLS,Adj. R-squared:,0.03
Method:,Least Squares,F-statistic:,5.071
Date:,"Thu, 06 Oct 2016",Prob (F-statistic):,0.000143
Time:,14:26:44,Log-Likelihood:,-219.37
No. Observations:,664,AIC:,450.7
Df Residuals:,658,BIC:,477.7
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,-0.0536,0.022,-2.440,0.015,-0.097 -0.010
a_star,0.0261,0.015,1.796,0.073,-0.002 0.055
d_score,7.053e-06,4.09e-05,0.172,0.863,-7.33e-05 8.74e-05
da_coop,0.0827,0.040,2.046,0.041,0.003 0.162
dp_coop,0.0092,0.019,0.500,0.617,-0.027 0.046
pw_coop,0.0586,0.024,2.411,0.016,0.011 0.106

0,1,2,3
Omnibus:,129.173,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,367.039
Skew:,-0.956,Prob(JB):,1.99e-80
Kurtosis:,6.1,Cond. No.,1590.0


In [73]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn import cross_validation

In [85]:
from sklearn import linear_model
from sklearn import grid_search

def rf_grid(df, y):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                                            df, y, test_size=0.1)
    rf = RandomForestRegressor()

    parameters ={ "max_depth": [2,3,4,5,None],
                  "max_features": list(range(1, 5)),
                  "min_samples_split": list(range(1, 5))}

    grid_searcher = grid_search.GridSearchCV(rf, parameters)
    grid_searcher.fit(X_train, y_train)
    print('best_param: ', grid_searcher.best_params_)
    best_model = grid_searcher.best_estimator_
    print('best_score: ', best_model.score(X_test,y_test))
    sorted_features = sorted(zip(data_df.columns, best_model.feature_importances_), key=lambda tup: abs(tup[1]), reverse=True)
    print('feature importance:\n')
    for feature in sorted_features:
        print(feature)

In [86]:
rf_grid(data_df, y)

best_param:  {'max_depth': 3, 'min_samples_split': 1, 'max_features': 1}
best_score:  0.0591816849329
feature importance:

('da_coop', 0.28938887366995913)
('d_score', 0.27013373389461554)
('pw_coop', 0.2088108418441533)
('dp_coop', 0.12185005393173068)
('a_star', 0.10981649665954134)
