In [49]:
import pandas as pd
import pandas_profiling
import seaborn as sb
import matplotlib.pyplot as plt 
import numpy as np

In [50]:
rawdata = pd.read_csv('vgsales.csv')

In [51]:
data = rawdata.drop(columns = ['Rank', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'])

In [52]:
data.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales
0,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37


In [53]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

platnum = le.fit_transform(data['Platform'])
gennum = le.fit_transform(data['Genre'])
pubnum = le.fit_transform(data['Publisher'])

In [54]:
#encoded_plat = pd.DataFrame(platnum)
data['encoded_Platform'] = platnum
data['encoded_Genre'] = gennum
data['encoded_Publisher'] = pubnum
data['Global_Sales'] = data['Global_Sales'].apply(lambda x: x*1000)
print(data)


                                                   Name Platform    Year  \
0                                            Wii Sports      Wii  2006.0   
1                                     Super Mario Bros.      NES  1985.0   
2                                        Mario Kart Wii      Wii  2008.0   
3                                     Wii Sports Resort      Wii  2009.0   
4                              Pokemon Red/Pokemon Blue       GB  1996.0   
...                                                 ...      ...     ...   
16591                Woody Woodpecker in Crazy Castle 5      GBA  2002.0   
16592                     Men in Black II: Alien Escape       GC  2003.0   
16593  SCORE International Baja 1000: The Official Game      PS2  2008.0   
16594                                        Know How 2       DS  2010.0   
16595                                  Spirits & Spells      GBA  2003.0   

              Genre   Publisher  Global_Sales  encoded_Platform  \
0            Sports 

In [58]:
newdata = data.dropna()

target = newdata[['Global_Sales']].copy()

Q1 = target.quantile(0.25)
Q3 = target.quantile(0.75)

rule = ((target<(Q1-1.5*(Q3-Q1))) | (target>(Q3+1.5*(Q3-Q1))))

outliers = rule.any(axis = 1)

outlierindices = outliers.index[outliers == True]

newdata.drop(index=outlierindices, inplace = True)

In [59]:
# randomdata = data.dropna()
# split train/test

# platform = pd.DataFrame(newdata['encoded_Platform'])
# year = pd.DataFrame(newdata['Year'])
# genre = pd.DataFrame(newdata['encoded_Genre'])
# publisher = pd.DataFrame(newdata['encoded_Publisher'])
sales = pd.DataFrame(newdata['Global_Sales'])

pred = ['encoded_Platform', 'Year', 'encoded_Genre', 'encoded_Publisher']
x_data = pd.DataFrame(newdata[pred])

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, sales, test_size = 0.25)

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [61]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [62]:
gbr = GradientBoostingRegressor(random_state = 42)
gbr.fit(x_train, y_train.values.ravel())

GradientBoostingRegressor(random_state=42)

In [63]:
rf = RandomForestRegressor(random_state = 42)
rf.fit(x_train, y_train.values.ravel())

RandomForestRegressor(random_state=42)

In [64]:
y_train_predlr = lr.predict(x_train)
y_test_predlr = lr.predict(x_test)

In [65]:
y_train_predgbr = gbr.predict(x_train)
y_test_predgbr = gbr.predict(x_test)

In [66]:
y_train_predrf = rf.predict(x_train)
y_test_predrf = rf.predict(x_test)

In [67]:
mae_lr1 = mean_absolute_error(y_train, y_train_predlr)
mse_lr1 = mean_squared_error(y_train, y_train_predlr)
r_lr1 = lr.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr1)
print("LR: Mean Squared Error (MSE) \t:", mse_lr1)
print("LR: Explained Variance (R^2) \t:", r_lr1)
print()

mae_lr2 = mean_absolute_error(y_test, y_test_predlr)
mse_lr2 = mean_squared_error(y_test, y_test_predlr)
r_lr2 = lr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr2)
print("LR: Mean Squared Error (MSE) \t:", mse_lr2)
print("LR: Explained Variance (R^2) \t:", r_lr2)

Goodness of Fit of Model 	Train Dataset
LR: Mean Absolute Error (MAE) 	: 187.0237539938945
LR: Mean Squared Error (MSE) 	: 59970.382656138936
LR: Explained Variance (R^2) 	: 0.025935904177879276

Goodness of Fit of Model 	Test Dataset
LR: Mean Absolute Error (MAE) 	: 189.37036120751964
LR: Mean Squared Error (MSE) 	: 60671.070180621835
LR: Explained Variance (R^2) 	: 0.021920405259943654


In [68]:
mae_gbr1 = mean_absolute_error(y_train, y_train_predgbr)
mse_gbr1 = mean_squared_error(y_train, y_train_predgbr)
r_gbr1 = gbr.score(x_train,y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr1)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr1)
print("GBR: Explained Variance (R^2) \t:", r_gbr1)
print()

mae_gbr2 = mean_absolute_error(y_test, y_test_predgbr)
mse_gbr2 = mean_squared_error(y_test, y_test_predgbr)
r_gbr2 = gbr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr2)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr2)
print("GBR: Explained Variance (R^2) \t:", r_gbr2)

Goodness of Fit of Model 	Train Dataset
GBR: Mean Absolute Error (MAE) 	: 162.66298083203003
GBR: Mean Squared Error (MSE) 	: 48270.6970726173
GBR: Explained Variance (R^2) 	: 0.21596710215539483

Goodness of Fit of Model 	Test Dataset
GBR: Mean Absolute Error (MAE) 	: 165.66905782819157
GBR: Mean Squared Error (MSE) 	: 50048.27631837251
GBR: Explained Variance (R^2) 	: 0.19317068788829406


In [69]:
mae_rf1 = mean_absolute_error(y_train, y_train_predrf)
mse_rf1 = mean_squared_error(y_train, y_train_predrf)
r_rf1 = rf.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf1)
print("RF: Mean Squared Error (MSE) \t:", mse_rf1)
print("RF: Explained Variance (R^2) \t:", r_rf1)
print()

mae_rf2 = mean_absolute_error(y_test, y_test_predrf)
mse_rf2 = mean_squared_error(y_test, y_test_predrf)
r_rf2 = rf.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf2)
print("RF: Mean Squared Error (MSE) \t:", mse_rf2)
print("RF: Explained Variance (R^2) \t:", r_rf2)

Goodness of Fit of Model 	Train Dataset
RF: Mean Absolute Error (MAE) 	: 87.18183931156851
RF: Mean Squared Error (MSE) 	: 17065.019032525663
RF: Explained Variance (R^2) 	: 0.7228228068942826

Goodness of Fit of Model 	Test Dataset
RF: Mean Absolute Error (MAE) 	: 166.3652415129334
RF: Mean Squared Error (MSE) 	: 54988.529495242015
RF: Explained Variance (R^2) 	: 0.11352876281987634


In [70]:
game_pred = newdata[newdata['Name'].isin(['Jungle Hunt', 'Megamania', 'Know How 2', 'Guitar Hero 5'])]
game_pred

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales,encoded_Platform,encoded_Genre,encoded_Publisher
1847,Guitar Hero 5,PS3,2009.0,Misc,Activision,1100.0,19,3,21
1848,Megamania,Atari2600,1981.0,Shooter,Activision,1100.0,4,8,21
1851,Jungle Hunt,Atari2600,1982.0,Platform,Atari,1100.0,4,4,53
16594,Know How 2,DS,2010.0,Puzzle,7G//AMES,10.0,6,5,8


In [71]:
# Extract Predictors for Prediction
X_pred = pd.DataFrame(game_pred[pred])

# Predict LR Response corresponding to Predictors
y_predlr = lr.predict(X_pred)
y_predlr

# Predict GBR Response corresponding to Predictors
y_predgbr = gbr.predict(X_pred)
y_predgbr

# Predict RF Response corresponding to Predictors
y_predrf = rf.predict(X_pred)
y_predrf


array([675.70865884, 893.55      , 948.31373016,  18.875     ])

In [72]:
y_predlr = pd.DataFrame(y_predlr, columns = ["PredTotal"], index = game_pred.index)
gamedata_acclr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predlr], axis = 1)

y_errslr = 100 * abs(gamedata_acclr["Global_Sales"] - gamedata_acclr["PredTotal"]) / gamedata_acclr["Global_Sales"]
y_errslr = pd.DataFrame(y_errslr, columns = ["Error"], index = game_pred.index)
game_acclr = pd.concat([gamedata_acclr, y_errslr], axis = 1)

gamedata_acclr

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,215.894234
1848,Megamania,1100.0,345.838085
1851,Jungle Hunt,1100.0,331.914469
16594,Know How 2,10.0,177.221991


In [73]:
y_predgbr = pd.DataFrame(y_predgbr, columns = ["PredTotal"], index = game_pred.index)
gamedata_accgbr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predgbr], axis = 1)

y_errsgbr = 100 * abs(gamedata_accgbr["Global_Sales"] - gamedata_accgbr["PredTotal"]) / gamedata_accgbr["Global_Sales"]
y_errsgbr = pd.DataFrame(y_errsgbr, columns = ["Error"], index = game_pred.index)
game_accgbr = pd.concat([gamedata_accgbr, y_errsgbr], axis = 1)

gamedata_accgbr

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,339.641652
1848,Megamania,1100.0,548.895434
1851,Jungle Hunt,1100.0,530.540456
16594,Know How 2,10.0,54.984387


In [74]:
y_predrf = pd.DataFrame(y_predrf, columns = ["PredTotal"], index = game_pred.index)
gamedata_accrf = pd.concat([game_pred[["Name", "Global_Sales"]], y_predrf], axis = 1)

y_errsrf = 100 * abs(gamedata_accrf["Global_Sales"] - gamedata_accrf["PredTotal"]) / gamedata_accrf["Global_Sales"]
y_errsrf = pd.DataFrame(y_errsrf, columns = ["Error"], index = game_pred.index)
game_accrf = pd.concat([gamedata_accrf, y_errsrf], axis = 1)

gamedata_accrf

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,675.708659
1848,Megamania,1100.0,893.55
1851,Jungle Hunt,1100.0,948.31373
16594,Know How 2,10.0,18.875
