# Split data

In [98]:
sales = pd.DataFrame(newdata['Global_Sales'])

pred = ['encoded_Platform', 'Year', 'encoded_Genre', 'encoded_Publisher']
x_data = pd.DataFrame(newdata[pred])

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, sales, test_size = 0.25)

# Model building

In [99]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [100]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [101]:
gbr = GradientBoostingRegressor(random_state = 42)
gbr.fit(x_train, y_train.values.ravel())

GradientBoostingRegressor(random_state=42)

In [102]:
rf = RandomForestRegressor(random_state = 42)
rf.fit(x_train, y_train.values.ravel())

RandomForestRegressor(random_state=42)

In [103]:
y_train_predlr = lr.predict(x_train)
y_test_predlr = lr.predict(x_test)

In [104]:
y_train_predgbr = gbr.predict(x_train)
y_test_predgbr = gbr.predict(x_test)

In [105]:
y_train_predrf = rf.predict(x_train)
y_test_predrf = rf.predict(x_test)

In [106]:
mae_lr1 = mean_absolute_error(y_train, y_train_predlr)
mse_lr1 = mean_squared_error(y_train, y_train_predlr)
r_lr1 = lr.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr1)
print("LR: Mean Squared Error (MSE) \t:", mse_lr1)
print("LR: Explained Variance (R^2) \t:", r_lr1)
print()

mae_lr2 = mean_absolute_error(y_test, y_test_predlr)
mse_lr2 = mean_squared_error(y_test, y_test_predlr)
r_lr2 = lr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr2)
print("LR: Mean Squared Error (MSE) \t:", mse_lr2)
print("LR: Explained Variance (R^2) \t:", r_lr2)

Goodness of Fit of Model 	Train Dataset
LR: Mean Absolute Error (MAE) 	: 186.9406885205486
LR: Mean Squared Error (MSE) 	: 59679.550341333954
LR: Explained Variance (R^2) 	: 0.02748711090588918

Goodness of Fit of Model 	Test Dataset
LR: Mean Absolute Error (MAE) 	: 190.10513148858502
LR: Mean Squared Error (MSE) 	: 61511.939529926145
LR: Explained Variance (R^2) 	: 0.01801954068816769


In [107]:
mae_gbr1 = mean_absolute_error(y_train, y_train_predgbr)
mse_gbr1 = mean_squared_error(y_train, y_train_predgbr)
r_gbr1 = gbr.score(x_train,y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr1)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr1)
print("GBR: Explained Variance (R^2) \t:", r_gbr1)
print()

mae_gbr2 = mean_absolute_error(y_test, y_test_predgbr)
mse_gbr2 = mean_squared_error(y_test, y_test_predgbr)
r_gbr2 = gbr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr2)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr2)
print("GBR: Explained Variance (R^2) \t:", r_gbr2)

Goodness of Fit of Model 	Train Dataset
GBR: Mean Absolute Error (MAE) 	: 161.7392549061337
GBR: Mean Squared Error (MSE) 	: 47733.56497739404
GBR: Explained Variance (R^2) 	: 0.22215387151173804

Goodness of Fit of Model 	Test Dataset
GBR: Mean Absolute Error (MAE) 	: 169.06495877666904
GBR: Mean Squared Error (MSE) 	: 50895.83542433594
GBR: Explained Variance (R^2) 	: 0.18749569223494056


In [108]:
mae_rf1 = mean_absolute_error(y_train, y_train_predrf)
mse_rf1 = mean_squared_error(y_train, y_train_predrf)
r_rf1 = rf.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf1)
print("RF: Mean Squared Error (MSE) \t:", mse_rf1)
print("RF: Explained Variance (R^2) \t:", r_rf1)
print()

mae_rf2 = mean_absolute_error(y_test, y_test_predrf)
mse_rf2 = mean_squared_error(y_test, y_test_predrf)
r_rf2 = rf.score(x_test, y_test)
print("Goodness of Fit of Model \tTrain Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf2)
print("RF: Mean Squared Error (MSE) \t:", mse_rf2)
print("RF: Explained Variance (R^2) \t:", r_rf2)

Goodness of Fit of Model 	Train Dataset
RF: Mean Absolute Error (MAE) 	: 85.3625140816884
RF: Mean Squared Error (MSE) 	: 16545.889440446488
RF: Explained Variance (R^2) 	: 0.7303751343579461

Goodness of Fit of Model 	Train Dataset
RF: Mean Absolute Error (MAE) 	: 171.89298305468458
RF: Mean Squared Error (MSE) 	: 57682.51089895985
RF: Explained Variance (R^2) 	: 0.07915277944921628


# Prediction

In [109]:
game_pred = newdata[newdata['Name'].isin(['Jungle Hunt', 'Megamania', 'Know How 2', 'Guitar Hero 5'])]
game_pred

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales,encoded_Platform,encoded_Genre,encoded_Publisher,encoded_Year
1848,Megamania,2600,1981,Shooter,Activision,1100.0,2,9,22,1
1851,Jungle Hunt,2600,1982,Platform,Atari,1100.0,2,5,54,2
1847,Guitar Hero 5,PS3,2009,Misc,Activision,1100.0,19,4,22,29
16596,Know How 2,DS,2010,Puzzle,7G//AMES,10.0,6,6,9,30


In [110]:
# Extract Predictors for Prediction
X_pred = pd.DataFrame(game_pred[pred])

# Predict LR Response corresponding to Predictors
y_predlr = lr.predict(X_pred)
y_predlr

# Predict GBR Response corresponding to Predictors
y_predgbr = gbr.predict(X_pred)
y_predgbr

# Predict RF Response corresponding to Predictors
y_predrf = rf.predict(X_pred)
y_predrf

array([809.9       , 950.12333333, 733.69186147,  17.8       ])

In [111]:
y_predlr = pd.DataFrame(y_predlr, columns = ["PredTotal"], index = game_pred.index)
gamedata_acclr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predlr], axis = 1)

y_errslr = 100 * abs(gamedata_acclr["Global_Sales"] - gamedata_acclr["PredTotal"]) / gamedata_acclr["Global_Sales"]
y_errslr = pd.DataFrame(y_errslr, columns = ["Error"], index = game_pred.index)
game_acclr = pd.concat([gamedata_acclr, y_errslr], axis = 1)

gamedata_acclr

Unnamed: 0,Name,Global_Sales,PredTotal
1848,Megamania,1100.0,341.207645
1851,Jungle Hunt,1100.0,330.518332
1847,Guitar Hero 5,1100.0,214.692994
16596,Know How 2,10.0,171.915635


In [112]:
y_predgbr = pd.DataFrame(y_predgbr, columns = ["PredTotal"], index = game_pred.index)
gamedata_accgbr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predgbr], axis = 1)

y_errsgbr = 100 * abs(gamedata_accgbr["Global_Sales"] - gamedata_accgbr["PredTotal"]) / gamedata_accgbr["Global_Sales"]
y_errsgbr = pd.DataFrame(y_errsgbr, columns = ["Error"], index = game_pred.index)
game_accgbr = pd.concat([gamedata_accgbr, y_errsgbr], axis = 1)

gamedata_accgbr

Unnamed: 0,Name,Global_Sales,PredTotal
1848,Megamania,1100.0,577.970499
1851,Jungle Hunt,1100.0,601.505439
1847,Guitar Hero 5,1100.0,352.147298
16596,Know How 2,10.0,102.859196


In [113]:
y_predrf = pd.DataFrame(y_predrf, columns = ["PredTotal"], index = game_pred.index)
gamedata_accrf = pd.concat([game_pred[["Name", "Global_Sales"]], y_predrf], axis = 1)

y_errsrf = 100 * abs(gamedata_accrf["Global_Sales"] - gamedata_accrf["PredTotal"]) / gamedata_accrf["Global_Sales"]
y_errsrf = pd.DataFrame(y_errsrf, columns = ["Error"], index = game_pred.index)
game_accrf = pd.concat([gamedata_accrf, y_errsrf], axis = 1)

gamedata_accrf

Unnamed: 0,Name,Global_Sales,PredTotal
1848,Megamania,1100.0,809.9
1851,Jungle Hunt,1100.0,950.123333
1847,Guitar Hero 5,1100.0,733.691861
16596,Know How 2,10.0,17.8
