In [1]:
import pandas as pd
import pandas_profiling
import seaborn as sb
import matplotlib.pyplot as plt 
import numpy as np

In [2]:
rawdata = pd.read_csv('vgsales.csv')

In [3]:
data = rawdata.drop(columns = ['Rank', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'])

In [4]:
data = data.drop(data[data.Global_Sales > 80].index)

In [5]:
data.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37
5,Tetris,GB,1989.0,Puzzle,Nintendo,30.26


In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

platnum = le.fit_transform(data['Platform'])
gennum = le.fit_transform(data['Genre'])
pubnum = le.fit_transform(data['Publisher'])

In [7]:
#encoded_plat = pd.DataFrame(platnum)
data['encoded_Platform'] = platnum
data['encoded_Genre'] = gennum
data['encoded_Publisher'] = pubnum
data['Global_Sales'] = data['Global_Sales'].apply(lambda x: x*1000)
print(data)


                                                   Name Platform    Year  \
1                                     Super Mario Bros.      NES  1985.0   
2                                        Mario Kart Wii      Wii  2008.0   
3                                     Wii Sports Resort      Wii  2009.0   
4                              Pokemon Red/Pokemon Blue       GB  1996.0   
5                                                Tetris       GB  1989.0   
...                                                 ...      ...     ...   
16591                Woody Woodpecker in Crazy Castle 5      GBA  2002.0   
16592                     Men in Black II: Alien Escape       GC  2003.0   
16593  SCORE International Baja 1000: The Official Game      PS2  2008.0   
16594                                        Know How 2       DS  2010.0   
16595                                  Spirits & Spells      GBA  2003.0   

              Genre   Publisher  Global_Sales  encoded_Platform  \
1          Platform 

In [32]:
newdata = data.dropna()

target = newdata[['Global_Sales']].copy()

Q1 = target.quantile(0.25)
Q3 = target.quantile(0.75)

rule = ((target<(Q1-1.5*(Q3-Q1))) | (target>(Q3+1.5*(Q3-Q1))))

outliers = rule.any(axis = 1)

outlierindices = outliers.index[outliers == True]

newdata.drop(index=outlierindices, inplace = True)

In [33]:
# randomdata = data.dropna()
# split train/test

# platform = pd.DataFrame(newdata['encoded_Platform'])
# year = pd.DataFrame(newdata['Year'])
# genre = pd.DataFrame(newdata['encoded_Genre'])
# publisher = pd.DataFrame(newdata['encoded_Publisher'])
sales = pd.DataFrame(newdata['Global_Sales'])

pred = ['encoded_Platform', 'Year', 'encoded_Genre', 'encoded_Publisher']
x_data = pd.DataFrame(newdata[pred])

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, sales, test_size = 0.25)

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [35]:
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [36]:
gbr = GradientBoostingRegressor(random_state = 42)
gbr.fit(x_train, y_train.values.ravel())

GradientBoostingRegressor(random_state=42)

In [37]:
rf = RandomForestRegressor(random_state = 42)
rf.fit(x_train, y_train.values.ravel())

RandomForestRegressor(random_state=42)

In [38]:
y_train_predlr = lr.predict(x_train)
y_test_predlr = lr.predict(x_test)

In [39]:
y_train_predgbr = gbr.predict(x_train)
y_test_predgbr = gbr.predict(x_test)

In [40]:
y_train_predrf = rf.predict(x_train)
y_test_predrf = rf.predict(x_test)

In [41]:
mae_lr1 = mean_absolute_error(y_train, y_train_predlr)
mse_lr1 = mean_squared_error(y_train, y_train_predlr)
r_lr1 = lr.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr1)
print("LR: Mean Squared Error (MSE) \t:", mse_lr1)
print("LR: Explained Variance (R^2) \t:", r_lr1)
print()

mae_lr2 = mean_absolute_error(y_test, y_test_predlr)
mse_lr2 = mean_squared_error(y_test, y_test_predlr)
r_lr2 = lr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("LR: Mean Absolute Error (MAE) \t:", mae_lr2)
print("LR: Mean Squared Error (MSE) \t:", mse_lr2)
print("LR: Explained Variance (R^2) \t:", r_lr2)

Goodness of Fit of Model 	Train Dataset
LR: Mean Absolute Error (MAE) 	: 188.54126804161274
LR: Mean Squared Error (MSE) 	: 60572.10590554968
LR: Explained Variance (R^2) 	: 0.02721321261335563

Goodness of Fit of Model 	Test Dataset
LR: Mean Absolute Error (MAE) 	: 185.54429202808706
LR: Mean Squared Error (MSE) 	: 58864.925702075634
LR: Explained Variance (R^2) 	: 0.017954145009471456


In [42]:
mae_gbr1 = mean_absolute_error(y_train, y_train_predgbr)
mse_gbr1 = mean_squared_error(y_train, y_train_predgbr)
r_gbr1 = gbr.score(x_train,y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr1)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr1)
print("GBR: Explained Variance (R^2) \t:", r_gbr1)
print()

mae_gbr2 = mean_absolute_error(y_test, y_test_predgbr)
mse_gbr2 = mean_squared_error(y_test, y_test_predgbr)
r_gbr2 = gbr.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("GBR: Mean Absolute Error (MAE) \t:", mae_gbr2)
print("GBR: Mean Squared Error (MSE) \t:", mse_gbr2)
print("GBR: Explained Variance (R^2) \t:", r_gbr2)

Goodness of Fit of Model 	Train Dataset
GBR: Mean Absolute Error (MAE) 	: 164.07951061859566
GBR: Mean Squared Error (MSE) 	: 48817.348191762554
GBR: Explained Variance (R^2) 	: 0.21599438212946576

Goodness of Fit of Model 	Test Dataset
GBR: Mean Absolute Error (MAE) 	: 164.41710227422888
GBR: Mean Squared Error (MSE) 	: 48429.43208742002
GBR: Explained Variance (R^2) 	: 0.19204989263548633


In [43]:
mae_rf1 = mean_absolute_error(y_train, y_train_predrf)
mse_rf1 = mean_squared_error(y_train, y_train_predrf)
r_rf1 = rf.score(x_train, y_train)
print("Goodness of Fit of Model \tTrain Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf1)
print("RF: Mean Squared Error (MSE) \t:", mse_rf1)
print("RF: Explained Variance (R^2) \t:", r_rf1)
print()

mae_rf2 = mean_absolute_error(y_test, y_test_predrf)
mse_rf2 = mean_squared_error(y_test, y_test_predrf)
r_rf2 = rf.score(x_test, y_test)
print("Goodness of Fit of Model \tTest Dataset")
print("RF: Mean Absolute Error (MAE) \t:", mae_rf2)
print("RF: Mean Squared Error (MSE) \t:", mse_rf2)
print("RF: Explained Variance (R^2) \t:", r_rf2)

Goodness of Fit of Model 	Train Dataset
RF: Mean Absolute Error (MAE) 	: 87.98970685546628
RF: Mean Squared Error (MSE) 	: 17513.600095702575
RF: Explained Variance (R^2) 	: 0.7187319391001727

Goodness of Fit of Model 	Test Dataset
RF: Mean Absolute Error (MAE) 	: 163.20209459192958
RF: Mean Squared Error (MSE) 	: 52507.375912460564
RF: Explained Variance (R^2) 	: 0.12401739650130583


In [44]:
game_pred = newdata[newdata['Name'].isin(['Jungle Hunt', 'Megamania', 'Know How 2', 'Guitar Hero 5'])]
game_pred

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales,encoded_Platform,encoded_Genre,encoded_Publisher
1847,Guitar Hero 5,PS3,2009.0,Misc,Activision,1100.0,19,3,21
1848,Megamania,Atari2600,1981.0,Shooter,Activision,1100.0,4,8,21
1851,Jungle Hunt,Atari2600,1982.0,Platform,Atari,1100.0,4,4,53
16594,Know How 2,DS,2010.0,Puzzle,7G//AMES,10.0,6,5,8


In [45]:
# Extract Predictors for Prediction
X_pred = pd.DataFrame(game_pred[pred])

# Predict LR Response corresponding to Predictors
y_predlr = lr.predict(X_pred)
y_predlr

# Predict GBR Response corresponding to Predictors
y_predgbr = gbr.predict(X_pred)
y_predgbr

# Predict RF Response corresponding to Predictors
y_predrf = rf.predict(X_pred)
y_predrf


array([601.81730159, 449.4       , 842.12      ,  31.31428571])

In [46]:
y_predlr = pd.DataFrame(y_predlr, columns = ["PredTotal"], index = game_pred.index)
gamedata_acclr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predlr], axis = 1)

y_errslr = 100 * abs(gamedata_acclr["Global_Sales"] - gamedata_acclr["PredTotal"]) / gamedata_acclr["Global_Sales"]
y_errslr = pd.DataFrame(y_errslr, columns = ["Error"], index = game_pred.index)
game_acclr = pd.concat([gamedata_acclr, y_errslr], axis = 1)

gamedata_acclr

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,213.909543
1848,Megamania,1100.0,349.340969
1851,Jungle Hunt,1100.0,335.990333
16594,Know How 2,10.0,174.013059


In [47]:
y_predgbr = pd.DataFrame(y_predgbr, columns = ["PredTotal"], index = game_pred.index)
gamedata_accgbr = pd.concat([game_pred[["Name", "Global_Sales"]], y_predgbr], axis = 1)

y_errsgbr = 100 * abs(gamedata_accgbr["Global_Sales"] - gamedata_accgbr["PredTotal"]) / gamedata_accgbr["Global_Sales"]
y_errsgbr = pd.DataFrame(y_errsgbr, columns = ["Error"], index = game_pred.index)
game_accgbr = pd.concat([gamedata_accgbr, y_errsgbr], axis = 1)

gamedata_accgbr

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,339.287022
1848,Megamania,1100.0,525.09184
1851,Jungle Hunt,1100.0,521.464468
16594,Know How 2,10.0,63.899746


In [48]:
y_predrf = pd.DataFrame(y_predrf, columns = ["PredTotal"], index = game_pred.index)
gamedata_accrf = pd.concat([game_pred[["Name", "Global_Sales"]], y_predrf], axis = 1)

y_errsrf = 100 * abs(gamedata_accrf["Global_Sales"] - gamedata_accrf["PredTotal"]) / gamedata_accrf["Global_Sales"]
y_errsrf = pd.DataFrame(y_errsrf, columns = ["Error"], index = game_pred.index)
game_accrf = pd.concat([gamedata_accrf, y_errsrf], axis = 1)

gamedata_accrf

Unnamed: 0,Name,Global_Sales,PredTotal
1847,Guitar Hero 5,1100.0,601.817302
1848,Megamania,1100.0,449.4
1851,Jungle Hunt,1100.0,842.12
16594,Know How 2,10.0,31.314286
