# Global Sales Predictions

Importing Modules

In [23]:
import numpy as np
import pandas as pd
import plotly.express as px

Reading Data Set

In [24]:
df=pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')

Dropping Unnecessary Columns

In [25]:
df.drop(columns = ['Year_of_Release', 'Developer', 'Publisher', 'Platform'], inplace = True)

In [26]:
x=df.iloc[:,:].values
y=df.iloc[:,6:7]
x=np.delete(x,6,1)

Spliting The DataSet for Training

In [27]:
from sklearn.model_selection import train_test_split as tts
x_train,x_test,Y_train,Y_test=tts(x,y,test_size=0.3)
games_in_training_set = x_train[:, 0]
games_in_test_set = x_test[:, 0]

Dropping Name Column

In [28]:
x_train=x_train[:,1:]
x_test=x_test[:,1:]

Imputing The Test and training Case

In [29]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer()
x_train[:, [5 ,6, 7, 8]] = imputer.fit_transform(x_train[:, [5, 6, 7, 8]])
x_test[:, [5 ,6, 7, 8]] = imputer.fit_transform(x_test[:, [5, 6, 7, 8]])

Using Categorical imputer to Fill nan Values with 'NA'


In [30]:
from sklearn_pandas import CategoricalImputer
categorical_imputer = CategoricalImputer(strategy = 'constant', fill_value = 'NA')
x_train[:, [0, 9]] = categorical_imputer.fit_transform(x_train[:, [0, 9]])
x_test[:, [0, 9]] = categorical_imputer.transform(x_test[:, [0, 9]])

Using Column Transformer Transfrom every Column

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0, 9])], remainder = 'passthrough') 
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

Creating and Training The Model

In [32]:
from xgboost import XGBRegressor
model = XGBRegressor(n_estimators = 200, learning_rate= 0.08)
model.fit(x_train, Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.08, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

# Predicting The Sales and Putting them in a seperate Dataset

In [37]:
Y_pred = model.predict(x_test)
games_in_test_set = games_in_test_set.reshape(-1,1)
Y_pred = Y_pred.reshape(-1, 1)
predictions = np.concatenate([games_in_test_set, Y_pred, Y_test], axis = 1)
predictions = pd.DataFrame(predictions, columns = ['Name', 'Predicted_Global_Sales', 'Actual_Global_Sales'])

In [38]:
predictions.head(10)

Unnamed: 0,Name,Predicted_Global_Sales,Actual_Global_Sales
0,Majesty 2: The Fantasy Kingdom Sim,0.0223673,0.02
1,Pro Yakyuu Team o Tsukurou!,0.2235,0.23
2,Naruto Shippuden: Ultimate Ninja Storm Revolution,0.471086,0.43
3,Wheel of Fortune,0.256758,0.25
4,Ar tonelico 2: Melody of Metafalica,0.153101,0.18
5,Samurai Warriors 4,0.23364,0.24
6,Secret Files: Tunguska,0.0546963,0.06
7,Million God,0.0841893,0.09
8,Rocket League,0.0530906,0.06
9,Max Payne,1.20482,1.22


Calculating r2 Score

In [39]:
from sklearn.metrics import r2_score, mean_squared_error
import math
r2_score = r2_score(Y_test, Y_pred)
rmse = math.sqrt(mean_squared_error(Y_test, Y_pred))
print(f"r2 score of the model : {r2_score:.3f}")
print(f"Root Mean Squared Error of the model : {rmse:.3f}")

r2 score of the model : 0.861
Root Mean Squared Error of the model : 0.648


In [40]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(Y_test,Y_pred))

0.035204369492793194


Scatter Plot of Prediction and Test Data

In [41]:
px.scatter(predictions,title='Scatter Plot of Predicted_values and Actual Values',x='Actual_Global_Sales',y='Predicted_Global_Sales',hover_data=['Name'],log_x=True,size_max=500)