In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import category_encoders as ce
import sklearn.metrics as sm_metrics
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
import time
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score



vgsales = pd.read_csv("vgsales.csv", index_col=0)
vgsales.head(10)


Unnamed: 0_level_0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [3]:
vgsales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16598 entries, 1 to 16600
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          16598 non-null  object 
 1   Platform      16598 non-null  object 
 2   Year          16327 non-null  float64
 3   Genre         16598 non-null  object 
 4   Publisher     16540 non-null  object 
 5   NA_Sales      16598 non-null  float64
 6   EU_Sales      16598 non-null  float64
 7   JP_Sales      16598 non-null  float64
 8   Other_Sales   16598 non-null  float64
 9   Global_Sales  16598 non-null  float64
dtypes: float64(6), object(4)
memory usage: 1.4+ MB


# Data Preparation & Feature Engineering

Here we select and encode the categorical features. Then we split the data into train and test sets.

In [3]:
X = vgsales[['NA_Sales', 'EU_Sales', 'Platform', 'Genre', 'Publisher', 'Name', 'Year']]
y = vgsales['JP_Sales']

# Perform one-hot encoding for categorical variables
encoder = ce.OneHotEncoder(cols=['Platform', 'Genre', 'Publisher', 'Name'], drop_invariant=True, use_cat_names=True)

X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=91)

print(X_train.shape, y_train.shape)

(13278, 3204) (13278,)


In [5]:
X_train.dtypes

NA_Sales                                        float64
EU_Sales                                        float64
Platform_Wii                                      int64
Platform_NES                                      int64
Platform_GB                                       int64
                                                 ...   
Name_WSC REAL 09: World Snooker Championship      int64
Name_Sakura-Sou no Pet na Kanojo                  int64
Name_Heart no Kuni no Alice                       int64
Name_Akatsuki no Goei Trinity                     int64
Year                                            float64
Length: 3204, dtype: object

# Run an XGBoost with starting parameters

In [6]:
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
import time
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score



In [7]:
xgb = XGBRegressor(n_estimators=100, max_depth=3, colsample_bytree=0.8)




xgb.fit(X_train.values, y_train.values)

In [8]:
feature_importance = pd.DataFrame(index=X_train.columns, data=xgb.feature_importances_)

feature_importance = feature_importance.sort_values(0, ascending=False)

feature_importance.head(10)
#feature_importance.to_csv('feature_importance2.csv')


The top 10 features of importance with respect to Sales in Japan are:
1. Publisher_Nintendo
2. Genre_Role-Playing
3. Publisher_Enix Corporation
4. EU_Sales
5. Platform_DS
6. NA_Sales
7. Platform_NES	
8. Publisher_SquareSoft
9. Platform_SNES
10. Genre_Misc 

# Testing the Model

In [9]:
mse_scorer = make_scorer(mean_squared_error)
mae_scorer = make_scorer(mean_absolute_error)
r2_scorer = make_scorer(r2_score)

def calculate_metrics(y_real, y_pred, metric):
    return metric(y_real, y_pred)

def model_evaluate(model, X_train, y_train, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    metrics = {}
    #RMSE Test
    rmse_test = np.sqrt(calculate_metrics(y_test, y_pred, mean_squared_error))
    #RMSE Train
    rmse_train = np.sqrt(calculate_metrics(y_train, y_pred_train, mean_squared_error))
    r2_test = calculate_metrics(y_test, y_pred, r2_score)
    r2_train = calculate_metrics(y_train, y_pred_train, r2_score)
    metrics = {
              'RMSE Test': rmse_test,
              'RMSE Train': rmse_train,
              'r2 Test': r2_test,
              'r2 Train': r2_train}

    return metrics 

In [10]:
print(X_test.columns.shape)
print(X_train.columns.shape)

(3204,)
(3204,)


In [11]:
model_evaluate(xgb, X_train, y_train, X_test, y_test)

{'RMSE Test': 0.22060830453487518,
 'RMSE Train': 0.14328200202366212,
 'r2 Test': 0.5136219890255578,
 'r2 Train': 0.7828744229293322}

The low RMSE value suggests that, on average, the model's predictions are accurate and close to the true values in the test dataset. This indicates that the model is making reasonably accurate predictions. The R-squared value of approximately 0.4865 indicates that your model explains around 48.65% of the variance in the target variable. While this is not extremely high, it still suggests that the model is capturing some patterns in the data.

In [12]:
raw_preds = xgb.predict(X_test.values)

test_predictions = pd.DataFrame(np.array([X_test.index, raw_preds, y_test]).T, columns= ['index', 'raw_preds', 'real value'])
test_predictions = test_predictions.set_index("index")

In [13]:
mean_absolute_error(y_test, test_predictions.raw_preds.values)

0.06682554813414718

In [14]:
train_pred = xgb.predict(X_train.values)
mean_squared_error(y_train, train_pred)

0.020529732103908717

In [15]:
r2_score(y_train, train_pred)

0.7828744229293322

In [16]:
r2_score(y_test, raw_preds)

0.5136219890255578

In [17]:
test_predictions[test_predictions.raw_preds < 0]

Unnamed: 0_level_0,raw_preds,real value
index,Unnamed: 1_level_1,Unnamed: 2_level_1
11146.0,-0.001374,0.0
16357.0,-0.011738,0.0
15498.0,-0.005650,0.0
16540.0,-0.011738,0.0
7205.0,-0.002092,0.0
...,...,...
4775.0,-0.009891,0.0
3950.0,-0.003076,0.0
16060.0,-0.003423,0.0
11097.0,-0.004923,0.0


In [18]:
raw_preds2 = np.where(raw_preds<0, 0, raw_preds)
r2_score(y_test, raw_preds2)

0.5137274727550244

In [19]:
%%skip

# Visualizing the learning curve

from sklearn.model_selection import learning_curve

def plot_learning_curves(model, X, y, cv, train_sizes=np.linspace(0.1, 1.0, 10)):
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, train_sizes=train_sizes, cv=cv, scoring='neg_mean_squared_error'
    )
    
    train_rmse = np.sqrt(-train_scores.mean(axis=1))
    test_rmse = np.sqrt(-test_scores.mean(axis=1))
    
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_rmse, label='Training RMSE')
    plt.plot(train_sizes, test_rmse, label='Validation RMSE')
    plt.xlabel('Training Set Size')
    plt.ylabel('Root Mean Squared Error (RMSE)')
    plt.title('Learning Curves')
    plt.legend()
    plt.grid(True)
    plt.show()

# Example usage
xgb = XGBRegressor(n_estimators=100, max_depth=4, colsample_bytree=0.8)
plot_learning_curves(xgb, X_train, y_train, cv=5)


UsageError: Cell magic `%%skip` not found.


# Conclusion and Further Improvements


The XGBoost model appears to have achieved a relatively good performance on both the training and testing datasets. The RMSE values, which measure the average deviation of the model's predictions from the actual values, are quite low for both the training (0.1433) and testing (0.2206) sets. This suggests that the model's predictions are generally accurate and close to the true target values, indicating good predictive accuracy.

The R-squared (r^2) values for both the training (0.7829) and testing (0.5136) sets provide insights into the model's explanatory capability. The R-squared value indicates the proportion of variance in the target variable that is explained by the model. The model has achieved a reasonably high R-squared value on the training set, suggesting that it can explain around 78.29% of the variance in the target variable within the training data. The R-squared value on the testing set (0.5136) indicates that your model explains around 51.36% of the variance in the target variable within the testing data. While the R-squared value on the testing set is not as high as that on the training set, it's still a positive sign, and the model appears to generalize reasonably well to unseen data.

Overall, the XGBoost model with the chosen hyperparameters (n_estimators=100, max_depth=3, colsample_bytree=0.8) demonstrates a good balance between prediction accuracy and explanatory power. However, there is room for further improvement, especially in increasing the model's R-squared value on the testing set to better capture the variability in the target variable. Apart from additional hyperparameter tuning and exploring more complex models,  conducting feature engineering can be done to enhance the model's performance further.

Ideas for further feature engineering include:
- Obtaining the actual dates of release, instead of just the release year.
- Acquiring review data from influencial game critics in japan as their opinion will affect public sentiment and hence the sales.
- Including lagged versions of the target variable or other relevant features can be beneficial. As genre or platform specific trends can play a role in Sales in Japan.

