In [367]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import math as m


# Seed NumPy's global random generator. scikit-learn utilities called with
# random_state=None draw from this generator, so results are only repeatable
# when the notebook is run top to bottom from a fresh kernel.
np.random.seed(521)

In [296]:
# Read the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv('Project_1_Clean.csv', index_col=0)
# Preview the first five rows as a sanity check on the load.
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,Dev_same_publisher,Years_Since_Release,Good
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E,1,11.0,1
1,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E,1,9.0,1
2,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E,1,8.0,1
3,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.28,9.14,6.5,2.88,29.8,89.0,65.0,8.5,431.0,Nintendo,E,1,11.0,1
4,Wii Play,Wii,2006.0,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41.0,6.6,129.0,Nintendo,E,1,11.0,0


In [297]:
# Drop the release-year column from `df` in place.
# NOTE: this cell is not idempotent — running it twice raises KeyError.
del df['Year_of_Release']

In [298]:
# Work on a copy so that `df` keeps the Global_Sales column.
df2 = df.copy()

In [299]:
# Remove the regression target from df2 and keep it as the Series `y`.
y = df2.pop('Global_Sales')

In [300]:
# Separate copy of the predictors; the encoding cell further down writes into it.
df3 = df2.copy()

***Feature selection was done in part 1, where Platform, Genre, Critic_Score, Critic_Count, User_Score, User_Count, Developer and Rating were found to be significant at the univariate level, so we build the modelling dataframe from these features. (Note: the encoding code below also adds Publisher dummy columns.)***

***The following code is taken from part 1***

In [301]:
# One-hot encode the categorical predictors and assemble the numeric frame `df4`.
#
# Publisher/Developer labels get a 'P_'/'D_' prefix so that a company appearing in
# both columns (e.g. Nintendo in the preview rows) yields two distinct dummy columns.
# The prefixed labels are rebuilt from df2, which still holds the unprefixed values,
# so re-running this cell no longer stacks prefixes ('P_P_...') and then fails on
# the reference-group drop.
df3['Publisher'] = 'P_' + df2['Publisher']
df3['Developer'] = 'D_' + df2['Developer']

# Dummy-encode each categorical column and drop one level as the reference group.
df_Platform = pd.get_dummies(df3['Platform']).drop('PS2', axis=1)
df_Genre = pd.get_dummies(df3['Genre']).drop('Action', axis=1)
df_Publisher = pd.get_dummies(df3['Publisher']).drop('P_Other', axis=1)
df_Rating = pd.get_dummies(df3['Rating']).drop('T', axis=1)
df_Developer = pd.get_dummies(df3['Developer']).drop('D_Other', axis=1)

# Columns that must not reach the model:
#   - Name, Dev_same_publisher, Years_Since_Release, Good: not in the part-1 feature list
#   - the raw categorical columns: replaced by their dummy columns
#   - regional sales: they add up to roughly Global_Sales in the preview rows,
#     so keeping them would leak the target
dropped_columns = [
    'Name', 'Dev_same_publisher', 'Years_Since_Release', 'Good',
    'Platform', 'Genre', 'Publisher', 'Rating', 'Developer',
    'NA_Sales', 'JP_Sales', 'EU_Sales', 'Other_Sales',
]

# pd.concat builds a new frame, so df3 itself is left with its categorical columns.
df4 = pd.concat(
    [df3, df_Platform, df_Genre, df_Publisher, df_Rating, df_Developer],
    axis=1,
).drop(dropped_columns, axis=1)

In [302]:
# Feature matrix for modelling: an independent copy of the encoded frame df4.
X = df4.copy()

In [303]:
# Hold out 20% of the rows as the test set. random_state pins the shuffle so the
# split (and every score computed from it) no longer depends on how many times, or
# in which order, cells that consume NumPy's global RNG were executed. 521 is the
# same value the notebook passes to np.random.seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=521)

## Linear Regression

In [304]:
param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}

In [305]:
linear_reg = LinearRegression()

In [306]:
linear_reg_cv = GridSearchCV(linear_reg, param_grid, cv=5)

In [307]:
linear_reg_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [308]:
predictions = linear_reg_cv.predict(X_test)

In [309]:
mean_squared_error(y_test, predictions)

2.0051443897534571

In [310]:
# Hyper-parameter combination with the best mean cross-validated score.
linear_reg_cv.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': True}

In [311]:
# Mean cross-validated score achieved by that best combination.
linear_reg_cv.best_score_

0.28191117600827664

In [312]:
linear_reg = LinearRegression(fit_intercept = True, normalize = True, copy_X = True)

In [313]:
cv = cross_val_score(linear_reg, X_train, y_train, cv=5)
print("Cross-validated scores: {}".format(cv))

Cross-validated scores: [ 0.32622432  0.23606413  0.34889257  0.2752973   0.22307755]


In [314]:
predictions = cross_val_predict(linear_reg, X_test, y_test, cv=5)

In [315]:
mean_squared_error(y_test, predictions)

2.4186543171248021

***Repeat the fit with a natural-log (ln) transformation of Global_Sales, as in part 1.***

In [323]:
# Natural-log transform of the training target, vectorised with NumPy instead of a
# per-element math.log call. The log is only defined for strictly positive values,
# so fail loudly here rather than let -inf/NaN flow into the model fits.
assert (y_train > 0).all(), "Global_Sales must be strictly positive for the log transform"
y_train_log = np.log(y_train)

In [324]:
param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}

In [325]:
linear_reg = LinearRegression()

In [326]:
linear_reg_cv = GridSearchCV(linear_reg, param_grid, cv=5)

In [327]:
linear_reg_cv.fit(X_train, y_train_log)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [328]:
predictions = linear_reg_cv.predict(X_test)

In [329]:
mean_squared_error(y_test, np.exp(predictions))

2.2080726682829543

In [206]:
# Best hyper-parameter combination for the log-target search.
# NOTE(review): this cell's execution count (206) predates the fit above (327), so the
# displayed output may come from an earlier run — re-run to confirm.
linear_reg_cv.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': False}

In [207]:
# Mean cross-validated score of the best log-target combination.
# NOTE(review): execution count (207) predates the fit above (327); output may be stale.
linear_reg_cv.best_score_

0.52212497831885196

In [208]:
linear_reg = LinearRegression(fit_intercept = True, normalize = False, copy_X = True)

In [168]:
cv = cross_val_score(linear_reg, X_train, y_train_log, cv=5)
print("Cross-validated scores: {}".format(cv))

Cross-validated scores: [ 0.55155524  0.53260263  0.50873208  0.51247546  0.50525948]


In [None]:
# Out-of-fold predictions for the log-target model, cross-validated within the test
# split. The folds are fit on ln(Global_Sales), matching the cross_val_score call on
# y_train_log in this section, and the predictions are exponentiated so `predictions`
# is back in sales units. (Previously this cell was fit on raw y_test, which repeated
# the untransformed experiment instead of evaluating the log model.)
# NOTE(review): as with the raw-target variant, each fold is trained on test rows
# only, not on X_train — confirm this is the intended evaluation.
predictions = np.exp(cross_val_predict(linear_reg, X_test, np.log(y_test), cv=5))

## Ridge

In [392]:
param_grid = {'alpha':np.logspace(-4, 0, 10), 'fit_intercept':[True, False], 'normalize':[True, False], 'copy_X': [True,False],
              'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}

In [393]:
ridge = Ridge()

In [394]:
ridge_cv = RandomizedSearchCV(ridge, param_grid, cv=5, n_iter = 100)

In [395]:
ridge_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': array([  1.00000e-04,   2.78256e-04,   7.74264e-04,   2.15443e-03,
         5.99484e-03,   1.66810e-02,   4.64159e-02,   1.29155e-01,
         3.59381e-01,   1.00000e+00]), 'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False], 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [396]:
predictions = ridge_cv.predict(X_test)

In [397]:
mean_squared_error(y_test, predictions)

2.0154563394788734

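With log transform (the model is fit on ln(Global_Sales); predictions are exponentiated back before computing the MSE)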

In [398]:
ridge = Ridge()

In [399]:
ridge_cv = RandomizedSearchCV(ridge, param_grid, cv=5, n_iter = 100)

In [400]:
ridge_cv.fit(X_train, y_train_log)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': array([  1.00000e-04,   2.78256e-04,   7.74264e-04,   2.15443e-03,
         5.99484e-03,   1.66810e-02,   4.64159e-02,   1.29155e-01,
         3.59381e-01,   1.00000e+00]), 'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False], 'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [401]:
predictions = ridge_cv.predict(X_test)

In [402]:
mean_squared_error(y_test, np.exp(predictions))

2.210552425269543

## Support Vector Regressor

In [412]:
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1.0], 'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}

In [413]:
svr = SVR(cache_size = 10000)

In [414]:
svr_cv = GridSearchCV(svr, param_grid, cv=5)

In [415]:
svr_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=10000, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0], 'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [416]:
predictions = svr_cv.predict(X_test)

In [417]:
mean_squared_error(y_test, predictions)

2.4542831820538034

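With log transform (the model is fit on ln(Global_Sales); predictions are exponentiated back before computing the MSE)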

In [418]:
svr = SVR(cache_size = 2000)

In [419]:
svr_cv = GridSearchCV(svr, param_grid, cv=5)

In [420]:
svr_cv.fit(X_train, y_train_log)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=2000, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0], 'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [421]:
predictions = svr_cv.predict(X_test)

In [422]:
mean_squared_error(y_test, np.exp(predictions))

2.5249133868714431

## Random Forest Regressor

In [427]:
param_grid = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
              'max_features': ['auto', 'sqrt', 'log2'],
              'min_samples_leaf': [1, 2, 4, 8, 16, 32],
              'min_samples_split': [2, 5, 10, 20, 40],
              'n_estimators': [200, 400, 600, 800, 1000, 1500, 2000, 3000, 4000, 5000]}

In [428]:
forest = RandomForestRegressor()

In [430]:
forest_cv = RandomizedSearchCV(forest, param_grid, cv=5, n_iter = 50)

In [None]:
forest_cv.fit(X_train, y_train)

In [None]:
predictions = forest_cv.predict(X_test)

In [None]:
mean_squared_error(y_test, predictions)

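With log transform (the model is fit on ln(Global_Sales); predictions are exponentiated back before computing the MSE)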

In [None]:
forest = RandomForestRegressor()

In [None]:
forest_cv = RandomizedSearchCV(forest, param_grid, cv=5, n_iter = 50)

In [None]:
forest_cv.fit(X_train, y_train_log)

In [None]:
predictions = forest_cv.predict(X_test)

In [None]:
mean_squared_error(y_test, np.exp(predictions))