## DEALING WITH CATEGORICAL FEATURES

#### Read the data

In [1]:
import sys

import pandas as pd

# The project package is imported from the parent directory, so that
# directory has to be on the import path before the import below runs.
sys.path.append('..')
from project_1.loaddata import get_boxofficemojo_movies

# Load the movie records and wrap them in a DataFrame (one row per movie).
movie_list = get_boxofficemojo_movies()
movies = pd.DataFrame(movie_list)
movies.head()

Parsed 3728 movies from 3728 files


Unnamed: 0,alt_title,director,domestic_gross,mojo_slug,opening_per_theater,opening_weekend_take,production_budget,release_date_limited,release_date_wide,title,widest_release,worldwide_gross,year
0,"10,000 B.C. (2008)",Roland Emmerich,94784201,10000bc,10518,35867488,105000000.0,,2008-03-07,"10,000 B.C.",3454,269784201.0,2008
1,102 Dalmatians (2000),,66957026,102dalmatians,7353,19883351,85000000.0,,2000-11-22,102 Dalmatians,2704,183611771.0,2000
2,10 Things I Hate About You (1999),,38178166,10thingsihateaboutyou,3668,8330681,30000000.0,,1999-03-31,10 Things I Hate About You,2311,53478166.0,1999
3,10 Years (2012),Jamie Linden,203373,10years,7569,22707,,,2012-09-14,10 Years,63,203373.0,2012
4,11 Flowers (2013),,9213,11flowers,1758,3516,,,2013-02-22,11 Flowers,2,,2013


#### Which directors have the most movies?

In [2]:
# Number of movies per director, most frequent first; show the top five.
movies['director'].value_counts().head(5)

Steven Spielberg     19
Woody Allen          19
Ridley Scott         15
Joel Schumacher      15
Steven Soderbergh    14
dtype: int64

#### Let's only take movies of the top directors

In [3]:
N = 4
top_directors = movies.director.value_counts().index[:N]
top_dir_movies = movies[movies['director'].isin(top_directors)]

print '%i movies by top %i directors: %s.' % (len(top_dir_movies),
                                              N,
                                              ', '.join(top_directors))

top_dir_movies.head()

68 movies by top 4 directors: Steven Spielberg, Woody Allen, Ridley Scott, Joel Schumacher.


Unnamed: 0,alt_title,director,domestic_gross,mojo_slug,opening_per_theater,opening_weekend_take,production_budget,release_date_limited,release_date_wide,title,widest_release,worldwide_gross,year
71,8MM (1999),Joel Schumacher,36663315,8mm,6013,14252888,40000000,,1999-02-26,8MM,2370,96618699.0,1999
138,Alien (1979),Ridley Scott,80931801,alien,8366,5312945,11000000,1979-05-25,1979-06-22,Alien,757,104931801.0,1979
169,American Gangster (2007),Ridley Scott,130164645,americangangster,14264,43565115,100000000,,2007-11-02,American Gangster,3110,266465037.0,2007
183,Amistad (1997),Steven Spielberg,44229441,amistad,7270,5176006,36000000,1997-12-10,1997-12-26,Amistad,1019,,1997
226,Anything Else (2003),Woody Allen,3212310,anythingelse,1619,1673125,18000000,,2003-09-19,Anything Else,1033,13585075.0,2003


We need to convert the categorical `director` feature into one column per director, where the value is 1 if the movie was directed by that director and 0 otherwise (these are called dummy, or one-hot, variables). Pandas has a quick way of doing this: `pd.get_dummies`.

In [4]:
# One indicator column per director: 1 where the movie is by that director,
# 0 otherwise.
pd.get_dummies(top_dir_movies.director)

Unnamed: 0,Joel Schumacher,Ridley Scott,Steven Spielberg,Woody Allen
71,1,0,0,0
138,0,1,0,0
169,0,1,0,0
183,0,0,1,0
226,0,0,0,1
313,1,0,0,0
343,1,0,0,0
344,1,0,0,0
451,0,1,0,0
462,0,1,0,0


#### Let's put the dummy variables for director into our data frame

In [5]:
# Indicator (0/1) columns, one per director present in top_dir_movies.
dummies = pd.get_dummies(top_dir_movies['director'])

# Re-running this cell must not append the dummy columns a second time:
# duplicate column labels would make later selections such as
# top_dir_movies['Woody Allen'] return several columns and break the models.
# So keep only the columns that are not director dummies before
# concatenating the fresh ones.
non_dummy_columns = [col for col in top_dir_movies.columns
                     if col not in dummies.columns]
top_dir_movies = pd.concat([top_dir_movies[non_dummy_columns], dummies],
                           axis=1)
top_dir_movies.head()

Unnamed: 0,alt_title,director,domestic_gross,mojo_slug,opening_per_theater,opening_weekend_take,production_budget,release_date_limited,release_date_wide,title,widest_release,worldwide_gross,year,Joel Schumacher,Ridley Scott,Steven Spielberg,Woody Allen
71,8MM (1999),Joel Schumacher,36663315,8mm,6013,14252888,40000000,,1999-02-26,8MM,2370,96618699.0,1999,1,0,0,0
138,Alien (1979),Ridley Scott,80931801,alien,8366,5312945,11000000,1979-05-25,1979-06-22,Alien,757,104931801.0,1979,0,1,0,0
169,American Gangster (2007),Ridley Scott,130164645,americangangster,14264,43565115,100000000,,2007-11-02,American Gangster,3110,266465037.0,2007,0,1,0,0
183,Amistad (1997),Steven Spielberg,44229441,amistad,7270,5176006,36000000,1997-12-10,1997-12-26,Amistad,1019,,1997,0,0,1,0
226,Anything Else (2003),Woody Allen,3212310,anythingelse,1619,1673125,18000000,,2003-09-19,Anything Else,1033,13585075.0,2003,0,0,0,1


#### Build the model, use dummies among the features in the model

In [23]:
features = ['production_budget',
            'Steven Spielberg',
            'Woody Allen',
            'Ridley Scott',
            'Joel Schumacher']

# only pick columns you'll use in the model and dropna so we get
# rid of movies without budget info, etc.
related_columns = features + ['domestic_gross']
clean_top_dir_movies = top_dir_movies[related_columns].dropna()
print '%i movies with all necessary info.' % len(clean_top_dir_movies)

# build the model
import statsmodels.api as sm
Y = clean_top_dir_movies['domestic_gross']
X = sm.add_constant(clean_top_dir_movies[features])
director_model = sm.OLS(Y, X).fit()
director_model.summary()

45 movies with all necessary info.


0,1,2,3
Dep. Variable:,domestic_gross,R-squared:,0.405
Model:,OLS,Adj. R-squared:,0.345
Method:,Least Squares,F-statistic:,6.795
Date:,"Thu, 17 Sep 2015",Prob (F-statistic):,0.000285
Time:,01:14:09,Log-Likelihood:,-880.68
No. Observations:,45,AIC:,1771.0
Df Residuals:,40,BIC:,1780.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.538e+07,1.75e+07,3.168,0.003,2.01e+07 9.07e+07
production_budget,0.3099,0.343,0.903,0.372,-0.384 1.003
Steven Spielberg,1.09e+08,1.85e+07,5.878,0.000,7.15e+07 1.46e+08
Woody Allen,-4.222e+07,2.85e+07,-1.482,0.146,-9.98e+07 1.53e+07
Ridley Scott,5.409e+06,2.46e+07,0.220,0.827,-4.42e+07 5.51e+07
Joel Schumacher,-1.679e+07,2.37e+07,-0.708,0.483,-6.47e+07 3.12e+07

0,1,2,3
Omnibus:,17.9,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.661
Skew:,1.233,Prob(JB):,2.68e-06
Kurtosis:,5.758,Cond. No.,3.96e+23


In [22]:
import numpy as np
mse = director_model.mse_resid
rmse = np.sqrt(mse)
print '2 x Root Mean Squared Error = %g' % (2 * rmse)

2 x Root Mean Squared Error = 1.62108e+08


## Separating training and test sets

##### Define your X and Y

In [27]:
# Target first, then the predictors used by the train/test model.
model_columns = [
    'domestic_gross',
    'opening_weekend_take',
    'Steven Spielberg',
    'Woody Allen',
]

# Keep just those columns and discard movies missing any of them.
df = top_dir_movies.loc[:, model_columns].dropna()

In [33]:
# Target: domestic gross.
Y = df['domestic_gross']

# Predictors plus an explicit intercept column ('const') for statsmodels OLS.
predictors = df[['opening_weekend_take', 'Steven Spielberg', 'Woody Allen']]
X = sm.add_constant(predictors)

##### scikit-learn has a function to do this split

In [72]:
from sklearn.cross_validation import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.33)

print len(X_train), len(X_test)

44 23


##### Fit model to the training set

In [73]:
# Ordinary least squares fitted on the training rows only; the test rows
# stay unseen until evaluation.
training_ols = sm.OLS(Y_train, X_train)
model = training_ols.fit()
model.summary()

0,1,2,3
Dep. Variable:,domestic_gross,R-squared:,0.598
Model:,OLS,Adj. R-squared:,0.567
Method:,Least Squares,F-statistic:,19.8
Date:,"Thu, 17 Sep 2015",Prob (F-statistic):,5.01e-08
Time:,02:07:23,Log-Likelihood:,-847.42
No. Observations:,44,AIC:,1703.0
Df Residuals:,40,BIC:,1710.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.444e+07,1.61e+07,1.518,0.137,-8.09e+06 5.7e+07
opening_weekend_take,2.3163,0.648,3.576,0.001,1.007 3.625
Steven Spielberg,1.095e+08,2.27e+07,4.828,0.000,6.36e+07 1.55e+08
Woody Allen,-1.188e+07,2.28e+07,-0.521,0.606,-5.8e+07 3.43e+07

0,1,2,3
Omnibus:,48.787,Durbin-Watson:,1.941
Prob(Omnibus):,0.0,Jarque-Bera (JB):,273.676
Skew:,2.621,Prob(JB):,3.7299999999999997e-60
Kurtosis:,14.036,Cond. No.,63900000.0


##### Evaluate performance on the test set

In [74]:
from sklearn.metrics import mean_squared_error

# Root mean squared error is the square root of the average squared
# difference between predicted and actual values (it equals the standard
# deviation of the errors only when their mean is zero).
def RMSE(model, X_, Y_):
    """Return the root mean squared error of ``model``'s predictions.

    model -- fitted object exposing ``predict(X_)``
    X_    -- features to predict from
    Y_    -- true target values matching the rows of ``X_``

    The result is in the same units as ``Y_``.  The (model, X, y) argument
    order also lets this function be passed as a ``scoring`` callable.
    """
    Y_true = Y_
    Y_pred = model.predict(X_)
    # mean_squared_error expects (y_true, y_pred).  The value is the same
    # either way for MSE, but the conventional order keeps this correct if
    # an asymmetric metric is swapped in later.
    MSE = mean_squared_error(Y_true, Y_pred)
    return np.sqrt(MSE)

train_RMSE = RMSE(model, X_train, Y_train)
test_RMSE  = RMSE(model, X_test, Y_test)

print 'Training RMSE is $%.1f Million' % (train_RMSE / 1e6)
print 'Test RMSE     is $%.1f Million' % (test_RMSE / 1e6)

Training RMSE is $56.0 Million
Test RMSE     is $53.5 Million


### Cross Validation

##### For the same X and Y, instead of creating a single training/test split, let's do a 5-fold cross validation

In [78]:
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was renamed to sklearn.model_selection in newer
# scikit-learn releases; try the new location first, fall back to the old.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

model = LinearRegression()

# Call the imported function directly: only cross_val_score is imported
# above, so the name `cross_validation` is undefined on a fresh kernel.
# RMSE is used as the scorer; it is called as RMSE(estimator, X_fold, Y_fold)
# on the held-out part of each of the 5 folds.
RMSE_folds = cross_val_score(model, X, Y,
                             cv=5,
                             scoring=RMSE)
for i, error in enumerate(RMSE_folds):
    print 'Cross val fold %i: %.1f Million' % (i, error/1e6)

Cross val fold 0: 35.6 Million
Cross val fold 1: 83.4 Million
Cross val fold 2: 64.6 Million
Cross val fold 3: 49.9 Million
Cross val fold 4: 54.1 Million
