In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('vgsales_webscrap.csv', index_col = 'Rank')
df.head()

Unnamed: 0_level_0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# platform, Genre, Publisher, webscrap for years 2019

In [4]:
X = df.drop(['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'], axis = 1)
y = df.Global_Sales

In [5]:
display(X.isna().sum().sum(), y.isna().sum())

144

0

In [6]:
X.Publisher = X.Publisher.astype('category')
X.Publisher = X.Publisher.cat.codes

In [7]:
X.Genre = X.Genre.astype('category')
X.Genre = X.Genre.cat.codes

In [8]:
X.Platform = X.Platform.astype('category')
X.Platform = X.Platform.cat.codes

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [10]:
X_train.fillna(method = 'bfill', inplace = True)
X_test.fillna(method = 'bfill', inplace = True)

In [11]:
X_train.head()

Unnamed: 0_level_0,Platform,Year,Genre,Publisher
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3158,10,1999.0,8,425
12663,16,2006.0,3,275
16377,28,2009.0,7,98
7216,6,2004.0,3,53
1954,17,2008.0,0,21


In [13]:
#  Simple Models with no hyperparameter tuning
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


Lr = LinearRegression()
Lr.fit(X_train, y_train)
Lr_pred = Lr.predict(X_test)
print('The RMSE with the Linear Regression Model is :', np.round(np.sqrt(mean_squared_error(y_test, Lr_pred)), 3))
print('The R2 score with the Linear Regression Model is :', np.round(r2_score(y_test, Lr_pred), 5))

Dt = DecisionTreeRegressor()
Dt.fit(X_train, y_train)
Dt_pred = Dt.predict(X_test)
print('The RMSE with the Decision Tree Regressor is :', np.round(np.sqrt(mean_squared_error(y_test, Dt_pred)), 3))
print('The R2 score with the Decision Tree Regressor is :', np.round(r2_score(y_test, Dt_pred), 5))

SVR = SVR()
SVR.fit(X_train, y_train)
SVR_pred = SVR.predict(X_test)
print('The RMSE with the Support Vector Regressor is :', np.round(np.sqrt(mean_squared_error(y_test, SVR_pred)), 3))
print('The R2 score with the Support Vector Regressor is :', np.round(r2_score(y_test, SVR_pred), 5))


The RMSE with the Linear Regression Model is : 1.356
The R2 score with the Linear Regression Model is : 0.0127
The RMSE with the Decision Tree Regressor is : 1.492
The R2 score with the Decision Tree Regressor is : -0.19527
The RMSE with the Support Vector Regressor is : 1.399
The R2 score with the Support Vector Regressor is : -0.05062


In [14]:
# Tuning Decision Tree 
from sklearn.model_selection import GridSearchCV

Dt_paramgrid = {'max_depth' : np.arange(1, 20)}
grid_dt = GridSearchCV(estimator = DecisionTreeRegressor(), scoring = 'neg_mean_squared_error', param_grid=Dt_paramgrid)
grid_dt.fit(X_train, y_train)
print("\n The best score across ALL searched params:\n", grid_dt.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_dt.best_params_)


 The best score across ALL searched params:
 -2.36334227452721

 The best parameters across ALL searched params:
 {'max_depth': 3}


In [21]:
Dt_tuned = DecisionTreeRegressor(max_depth = 4)
Dt_tuned.fit(X_train, y_train)
Dt_tuned_pred = Dt_tuned.predict(X_test)
print('The RMSE with the tuned Decision Tree Regressor is :', np.round(np.sqrt(mean_squared_error(y_test, Dt_tuned_pred)), 3))

The RMSE with the tuned Decision Tree Regressor is : 1.292


In [20]:
# Ensemble Models

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_test)
print('The RMSE with the Random Forest Model is :', np.round(np.sqrt(mean_squared_error(y_test, RF_pred)), 3))

ADA = AdaBoostRegressor()
ADA.fit(X_train, y_train)
ADA_pred = ADA.predict(X_test)
print('The RMSE with the Adaptative Boosting Model is :', np.round(np.sqrt(mean_squared_error(y_test, ADA_pred)), 3))

XGB = XGBRegressor()
XGB.fit(X_train, y_train)
XGB_pred = XGB.predict(X_test)
print('The RMSE with the Extreme Gradient Boosting Model is :', np.round(np.sqrt(mean_squared_error(y_test, XGB_pred)), 3))



The RMSE with the Random Forest Model is : 1.342
The RMSE with the Adaptative Boosting Model is : 1.322
The RMSE with the Extreme Gradient Boosting Model is : 1.268


In [25]:
# tuning Random Forest

RF_paramgrid = {'n_estimators' : [10, 25, 50, 100, 200], 'max_depth' : np.arange(1, 26)}
grid_RF = GridSearchCV(estimator = RandomForestRegressor(), scoring = 'neg_mean_squared_error', param_grid=RF_paramgrid)
grid_RF.fit(X_train, y_train)
print("\n The best score across ALL searched params:\n", grid_RF.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF.best_params_)


 The best score across ALL searched params:
 -2.3436803805440136

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 50}


In [26]:
# tuning the adaptative boosting model
ADA_paramgrid = {'n_estimators' : [10, 50, 100, 500, 1000, 5000], 'learning_rate' : [0.1, 0.01, 0.001, 0.001, 0.0001, 0.00001]}
grid_ADA = GridSearchCV(estimator = AdaBoostRegressor(), scoring = 'neg_mean_squared_error', param_grid=ADA_paramgrid)
grid_ADA.fit(X_train, y_train)
print("\n The best score across ALL searched params:\n", grid_ADA.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_ADA.best_params_)


 The best score across ALL searched params:
 -2.3547696080112908

 The best parameters across ALL searched params:
 {'learning_rate': 0.1, 'n_estimators': 10}


In [None]:
# tuning the extreme gradient boosting model
XGB_paramgrid = {'n_estimators' : np.arange(100, 1001, 100), 'learning_rate' : [0.1, 0.01, 0.001, 0.001, 0.0001, 0.00001]}
grid_XGB = GridSearchCV(estimator = AdaBoostRegressor(), scoring = 'neg_mean_squared_error', param_grid=XGB_paramgrid)
grid_XGB.fit(X_train, y_train)
print("\n The best score across ALL searched params:\n", grid_XGB.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_XGB.best_params_)

In [54]:
BestRF = RandomForestRegressor(max_depth = 4, n_estimators = 50, random_state=4)
BestRF.fit(X_train, y_train)
y_pred_bestRF = BestRF.predict(X_test)
np.round(np.sqrt(mean_squared_error(y_test, y_pred_bestRF)), 3)

1.272

In [56]:
BestADA = AdaBoostRegressor(learning_rate = 0.1, n_estimators = 10, random_state=4)
BestADA.fit(X_train, y_train)
y_pred_bestADA = BestADA.predict(X_test)
np.round(np.sqrt(mean_squared_error(y_test, y_pred_bestADA)), 3)

1.319

In [57]:
BestXGB = XGBRegressor(learning_rate = 0.0001, n_estimators = 500, random_state=4)
BestXGB.fit(X_train, y_train)
y_pred_bestXGB = BestXGB.predict(X_test)
np.round(np.sqrt(mean_squared_error(y_test, y_pred_bestXGB)), 3)

1.356

In [58]:
from joblib import dump, load

dump(BestRF, 'BestRF.joblib')

my_model = load('BestRF.joblib')

In [60]:
# and then my_model.predict(X_test)
# np.sqrt(mean_squared_error(y_test, my_model.predict(X_test)))

array([0.22323262, 0.34582302, 2.44672993, ..., 0.50455108, 0.59182655,
       0.45357838])