In [None]:
# Scores for each model (normal, median):
# RF R2: 0.416722914171, 0.446112871477
# RF RMSE: 2.74341996991, 2.86137821844

In [1]:
# Importing necessary packages
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15,15)})
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import itertools

In [2]:
# Load the dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# your own copy of vg_nlp_df.csv when running the notebook elsewhere.
DATA_PATH = '/Users/dwreiter/Desktop/Work/Springboard/Video Game Capstone Project/Data Wrangling/vg_nlp_df.csv'

# The file is tab-delimited and its first column is used as the row index.
df = pd.read_csv(DATA_PATH, delimiter='\t', index_col=0)
df.head()

Unnamed: 0,Name,Platform,a,about,absolut,ac,account,across,act,action,...,PlayStation,PlayStation 2,PlayStation 3,PlayStation 4,PlayStation Vita,PSP,Sega Dreamcast,Xbox,Xbox 360,Xbox One
0,2010 FIFA World Cup South Africa,PS3,0.077199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
1,2010 FIFA World Cup South Africa,X360,0.036675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,ATV Offroad Fury,PS2,0.034426,0.0,0.0,0.0,0.0,0.0,0.0,0.039822,...,0,1,0,0,0,0,0,0,0,0
3,ATV Offroad Fury 2,PS2,0.035777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
4,Ace Combat 04: Shattered Skies,PS2,0.0533,0.001371,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# Converting Categorical variables into Dummy Variables.
# `df` is rebound to the encoded frame, so blindly re-running this cell would
# raise a KeyError (the source columns no longer exist after the first run).
# A column is skipped only when it is gone *and* its dummy columns
# ("<column>_...") are already present; a column that is simply missing is
# still passed to get_dummies and still raises, exactly as before.
CATEGORICAL_COLS = ['Publisher', 'Developer', 'Rating']
existing_cols = [str(col) for col in df.columns]
cols_to_encode = [
    col for col in CATEGORICAL_COLS
    if col in df.columns
    or not any(name.startswith(col + '_') for name in existing_cols)
]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode)

# Setting X and y for Global Sales.
# Every sales column (including the target) is removed from the features,
# together with the name, platform, release year, genre and 'index' columns.
GS_X = df.drop(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'index'], axis=1)
GS_y = df['Global_Sales']

In [4]:
# Hold out 20% of the Global Sales rows as a test set; the seed keeps the
# split repeatable between runs.
GS_split_options = {'test_size': 0.2, 'random_state': 42}
(GS_X_train, GS_X_test,
 GS_y_train, GS_y_test) = train_test_split(GS_X, GS_y, **GS_split_options)

In [5]:
# Random Forest Regressor on Global Sales, tuned with GridSearchCV.
#
# Tuning notes:
# - n_estimators grids tried: [10, 100], [1000, 3000] and [5000, 10000, 1000].
#   The first is fastest; the second is sometimes better but takes time.
# - Max_Depth and Max_Leaf_Nodes had an impact.
# - Oob_Score, N_Jobs, Warm_Start, Min_Impurity_Split (both ways) and Verbose
#   had essentially no impact.
# - Min_Samples_Split, Max_Features, Bootstrap, Min_Weight_Fraction_Leaf and
#   Min_Impurity_Decrease made R2 smaller.
# - Min_Samples_Leaf of 3 put R2 over 0.42 with an RMSE of 2.72.
# - Best so far: min_samples_leaf=3, max_leaf_nodes=1000, random_state=42
#   with c_space = [10, 100].
c_space = [10, 100]
GS_PG = dict(n_estimators=c_space)

GS_rf = RandomForestRegressor(
    random_state=42,
    max_leaf_nodes=1000,
    min_samples_leaf=3
)

# 5-fold cross-validated search over the grid, used to help avoid overfitting
GS_CV = GridSearchCV(estimator=GS_rf, param_grid=GS_PG, cv=5)
GS_CV.fit(GS_X_train, GS_y_train)

# Evaluate on the held-out test set: the search object's score (reported
# below as R squared) and the RMSE of its predictions.
GS_rf_R2 = GS_CV.score(GS_X_test, GS_y_test)
GS_rf_pred = GS_CV.predict(GS_X_test)
GS_rf_rmse = np.sqrt(mean_squared_error(y_true=GS_y_test, y_pred=GS_rf_pred))

for label, value in (('R Squared: ', GS_rf_R2),
                     ('Root Mean Squared Error: ', GS_rf_rmse)):
    print(label + str(value))

R Squared: 0.401885158174
Root Mean Squared Error: 2.77809522218


In [None]:
# Scores noted for reference (not the output of this cell):
# R squared: 0.427177708777
# Root Mean Squared Error: 2.71872194631

In [15]:
# Feature importances of the best estimator found by the grid search,
# labelled by feature column and listed from largest to smallest.
GS_importances = pd.Series(
    data=GS_CV.best_estimator_.feature_importances_,
    index=GS_X.columns
)
GS_importances.sort_values(ascending=False)

Critic_Score                                                0.083531
mw                                                          0.074927
super                                                       0.072384
User_Count                                                  0.067672
PC                                                          0.055662
mario                                                       0.051665
Critic_Count                                                0.027872
hasnt                                                       0.022519
a                                                           0.017193
perk                                                        0.015752
car                                                         0.012758
drive                                                       0.012724
cod                                                         0.012298
grand                                                       0.011764
fanboy                            

In [12]:
# Random Forest Regressor with 1000 trees, fitted directly (no grid search)
GS_rf = RandomForestRegressor(random_state=42, n_estimators=1000)
GS_rf.fit(GS_X_train, GS_y_train)

# Evaluate on the held-out test set: the forest's score (reported below as
# R squared) and the RMSE of its predictions.
GS_rf_R2 = GS_rf.score(GS_X_test, GS_y_test)
GS_rf_pred = GS_rf.predict(GS_X_test)
GS_rf_rmse = np.sqrt(mean_squared_error(y_true=GS_y_test, y_pred=GS_rf_pred))

for label, value in (('R squared: ', GS_rf_R2),
                     ('Root Mean Squared Error: ', GS_rf_rmse)):
    print(label + str(value))

R squared: -0.0352958692423
Root Mean Squared Error: 3.65499745196


In [10]:
# Feature importances of the fitted forest, labelled by feature column and
# listed from smallest to largest.
GS_feature_importance = pd.Series(
    data=GS_rf.feature_importances_,
    index=GS_X.columns
)
GS_feature_importance.sort_values(ascending=True)

Developer_Koei/Inis                   0.000000
Developer_Core Design Ltd.            0.000000
choic                                 0.000000
Developer_Croteam                     0.000000
experienc                             0.000000
Publisher_1C Company                  0.000000
Publisher_2D Boy                      0.000000
Developer_The Coalition               0.000000
industri                              0.000000
compar                                0.000000
Developer_Ryu ga Gotoku Studios       0.000000
Developer_Dimps Corporation           0.000000
Publisher_BAM! Entertainment          0.000000
Publisher_Blue Byte                   0.000000
Publisher_Codemasters Online          0.000000
experi                                0.000000
incr                                  0.000000
Developer_Dreamworks Interactive      0.000000
Publisher_Devolver Digital            0.000000
Developer_Runic Games                 0.000000
Developer_Orbital Media, Inc.         0.000000
Publisher_Dre

**Trying the same models when Global Sales is at or above the median.**

In [6]:
# Restrict the data to games whose Global Sales are at or above the median.
# The scalar threshold gets its own name; GS_median ends up as the filtered
# DataFrame.
GS_median_value = df['Global_Sales'].median()
GS_median = df.loc[df['Global_Sales'] >= GS_median_value]

# Setting X and y for Global Sales on that subset: every sales column
# (including the target) is removed from the features, together with the
# name, platform, release year, genre and 'index' columns.
GS_dropped_columns = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'index']
GS_y = GS_median['Global_Sales']
GS_X = GS_median.drop(GS_dropped_columns, axis=1)

In [7]:
# Hold out 20% of the median-filtered Global Sales rows as a test set; the
# seed keeps the split repeatable between runs.
GS_split_options = {'test_size': 0.2, 'random_state': 42}
(GS_X_train, GS_X_test,
 GS_y_train, GS_y_test) = train_test_split(GS_X, GS_y, **GS_split_options)

In [9]:
# Random Forest Regressor on the median-filtered Global Sales data.
# Did better without the extra estimator parameters, so only random_state is
# set here.
c_space = [10, 100]
GS_PG = dict(n_estimators=c_space)

GS_rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, used to help avoid overfitting
GS_CV = GridSearchCV(estimator=GS_rf, param_grid=GS_PG, cv=5)
GS_CV.fit(GS_X_train, GS_y_train)

# Evaluate on the held-out test set: the search object's score (reported
# below as R squared) and the RMSE of its predictions.
GS_rf_R2 = GS_CV.score(GS_X_test, GS_y_test)
GS_rf_pred = GS_CV.predict(GS_X_test)
GS_rf_rmse = np.sqrt(mean_squared_error(y_true=GS_y_test, y_pred=GS_rf_pred))

for label, value in (('R Squared: ', GS_rf_R2),
                     ('Root Mean Squared Error: ', GS_rf_rmse)):
    print(label + str(value))

R Squared: 0.450507280728
Root Mean Squared Error: 2.85000486672


In [23]:
# Random Forest Regressor with 1000 trees on the median-filtered data,
# fitted directly (no grid search)
GS_rf = RandomForestRegressor(random_state=42, n_estimators=1000)
GS_rf.fit(GS_X_train, GS_y_train)

# Evaluate on the held-out test set: the forest's score (reported below as
# R squared) and the RMSE of its predictions.
GS_rf_R2 = GS_rf.score(GS_X_test, GS_y_test)
GS_rf_pred = GS_rf.predict(GS_X_test)
GS_rf_rmse = np.sqrt(mean_squared_error(y_true=GS_y_test, y_pred=GS_rf_pred))

for label, value in (('R squared: ', GS_rf_R2),
                     ('Root Mean Squared Error: ', GS_rf_rmse)):
    print(label + str(value))

R squared: 0.446112871477
Root Mean Squared Error: 2.86137821844


In [24]:
# Feature importances of the forest fitted on the median-filtered data,
# labelled by feature column and listed from smallest to largest.
GS_feature_importance = pd.Series(
    data=GS_rf.feature_importances_,
    index=GS_X.columns
)
GS_feature_importance.sort_values(ascending=True)

Developer_Frontier Developments                         0.000000
Developer_Nintendo, Headstrong Games                    0.000000
Developer_Nippon Ichi Software                          0.000000
Developer_1st Playable Productions                      0.000000
Developer_Nitro+                                        0.000000
Developer_Nixxes Software                               0.000000
Developer_1C: Maddox Games                              0.000000
Developer_1C, 1C Company                                0.000000
Developer_Nordic Games Publishing                       0.000000
Publisher_inXile Entertainment                          0.000000
Publisher_Zoo Digital Publishing                        0.000000
Publisher_Yacht Club Games                              0.000000
Developer_Number None Inc., Knockout Games              0.000000
Developer_Opus                                          0.000000
Developer_Opus, Project Sora                            0.000000
Developer_Orbital Media, 

**Trying the model for North American sales.**

In [4]:
# Setting X and y for North American Sales: every sales column (including
# the target) is removed from the features, together with the name,
# platform, release year, genre and 'index' columns.
NA_dropped_columns = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'index']
NA_y = df['NA_Sales']
NA_X = df.drop(NA_dropped_columns, axis=1)

In [5]:
# Hold out 20% of the North American Sales rows as a test set; the seed
# keeps the split repeatable between runs.
NA_split_options = {'test_size': 0.2, 'random_state': 42}
(NA_X_train, NA_X_test,
 NA_y_train, NA_y_test) = train_test_split(NA_X, NA_y, **NA_split_options)

In [7]:
# Random Forest Regressor on North American Sales.
# Better with the extra estimator parameters (min_samples_leaf,
# max_leaf_nodes), so they are kept here.
c_space = [10, 100]
NA_PG = dict(n_estimators=c_space)

NA_rf = RandomForestRegressor(
    random_state=42,
    max_leaf_nodes=1000,
    min_samples_leaf=3
)

# 5-fold cross-validated search over the grid, used to help avoid overfitting
NA_CV = GridSearchCV(estimator=NA_rf, param_grid=NA_PG, cv=5)
NA_CV.fit(NA_X_train, NA_y_train)

# Evaluate on the held-out test set: the search object's score (reported
# below as R squared) and the RMSE of its predictions.
NA_rf_R2 = NA_CV.score(NA_X_test, NA_y_test)
NA_rf_pred = NA_CV.predict(NA_X_test)
NA_rf_rmse = np.sqrt(mean_squared_error(y_true=NA_y_test, y_pred=NA_rf_pred))

for label, value in (('R Squared: ', NA_rf_R2),
                     ('Root Mean Squared Error: ', NA_rf_rmse)):
    print(label + str(value))

R Squared: 0.427934081067
Root Mean Squared Error: 1.32674922366


**Trying the same models when North American Sales is at or above the median.**

In [13]:
# Restrict the data to games whose North American Sales are at or above the
# median. The scalar threshold gets its own name; NA_median ends up as the
# filtered DataFrame.
NA_median_value = df['NA_Sales'].median()
NA_median = df.loc[df['NA_Sales'] >= NA_median_value]

# Setting X and y for North American Sales on that subset: every sales
# column (including the target) is removed from the features, together with
# the name, platform, release year, genre and 'index' columns.
NA_dropped_columns = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'index']
NA_y = NA_median['NA_Sales']
NA_X = NA_median.drop(NA_dropped_columns, axis=1)

In [14]:
# Hold out 20% of the median-filtered North American Sales rows as a test
# set; the seed keeps the split repeatable between runs.
NA_split_options = {'test_size': 0.2, 'random_state': 42}
(NA_X_train, NA_X_test,
 NA_y_train, NA_y_test) = train_test_split(NA_X, NA_y, **NA_split_options)

In [17]:
# Random Forest Regressor on the median-filtered North American Sales data.
# Author's note: "Better without parameters".
# NOTE(review): the estimator below is still built with min_samples_leaf=3 and
# max_leaf_nodes=1000, which does not match that note — confirm which
# configuration was intended before relying on the reported scores.
c_space = [10, 100]
NA_PG = dict(n_estimators=c_space)

NA_rf = RandomForestRegressor(
    random_state=42,
    max_leaf_nodes=1000,
    min_samples_leaf=3
)

# 5-fold cross-validated search over the grid, used to help avoid overfitting
NA_CV = GridSearchCV(estimator=NA_rf, param_grid=NA_PG, cv=5)
NA_CV.fit(NA_X_train, NA_y_train)

# Evaluate on the held-out test set: the search object's score (reported
# below as R squared) and the RMSE of its predictions.
NA_rf_R2 = NA_CV.score(NA_X_test, NA_y_test)
NA_rf_pred = NA_CV.predict(NA_X_test)
NA_rf_rmse = np.sqrt(mean_squared_error(y_true=NA_y_test, y_pred=NA_rf_pred))

for label, value in (('R Squared: ', NA_rf_R2),
                     ('Root Mean Squared Error: ', NA_rf_rmse)):
    print(label + str(value))

R Squared: 0.155557221297
Root Mean Squared Error: 1.51378394604
