In [1]:
# Step 1: Understanding the Dataset
# The dataset contains 16 columns related to video game sales, ratings, and metadata. Key observations:

# Target Variable: Global_Sales (Total sales worldwide)
# Feature Candidates: Platform, Year_of_Release, Genre, Publisher, Critic_Score, etc.
# Missing Data: Some columns like Critic_Score, User_Score, User_Count, and Developer have missing values.
# Data Types: User_Score is stored as an object, likely requiring conversion.
# Next, I'll clean the dataset and prepare it for training. ​​

# Step 2: Preparing Data for Training
# Target Variable: Global_Sales
# Features: All other numerical and encoded categorical columns
# Now, I'll split the dataset into training and testing sets and train a Regression Tree Model (Decision Tree Regressor). ​​

# Step 3: Model Evaluation
# The Decision Tree Regressor gives the following performance:

# Mean Absolute Error (MAE): 0.0606
# Mean Squared Error (MSE): 0.6223
# R² Score: 0.8493 (Indicates a good fit, explaining ~85% of variance)
# The model performs well, but we can improve it by hyperparameter tuning or using Random Forest Regressor.

In [2]:
# We will be following the below steps to solve this problem:

# Importing the libraries

# Using some pre-defined utility functions

# Loading the data

# Cleaning the data

# Dividing the dataset into training and test dataset

# using train_test_split in the ratio 70:30
# Training several models and analyzing their performance to select a model

# Fine-tuning the model by finding the best hyper-parameters and features

# Evaluating selected model using test dataset

In [3]:
# In[13]:


# Please import the required libraries as mentioned below:

# Import numpy as np
# Import pandas as pd
# From sklearn import preprocessing
# Please import StandardScaler from Scikit Learn - preprocessing
# Please import mean_squared_error from Scikit Learn - metrics
# Please import linear_model from Scikit Learn
# Please import matplotlib's pyplot as plt
# Import os
# set random seed as follows:

In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import matplotlib.pyplot as plt
import os
np.random.seed(42)


In [5]:
import chardet

with open('./Video_Games_Sales.csv', 'rb') as f:
    result = chardet.detect(f.read(100000))  # Read first 100000 bytes
    print(result['encoding'])

# data = pd.read_csv('./water_quality.csv', encoding=result['encoding'])

# import pandas as pd

# data = pd.read_csv('your_file.csv', encoding='ISO-8859-1')
# # OR
# data = pd.read_csv('your_file.csv', encoding='latin1')


utf-8


In [6]:
filepath = "./Video_Games_Sales.csv"
video_sales_Data =  pd.read_csv(filepath, encoding='ISO-8859-1')
video_sales_Data.describe()

Unnamed: 0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Count
count,16450.0,16719.0,16719.0,16719.0,16719.0,16719.0,8137.0,8137.0,7590.0
mean,2006.487356,0.26333,0.145025,0.077602,0.047332,0.533543,68.967679,26.360821,162.229908
std,5.878995,0.813514,0.503283,0.308818,0.18671,1.547935,13.938165,18.980495,561.282326
min,1980.0,0.0,0.0,0.0,0.0,0.01,13.0,3.0,4.0
25%,2003.0,0.0,0.0,0.0,0.0,0.06,60.0,12.0,10.0
50%,2007.0,0.08,0.02,0.0,0.01,0.17,71.0,21.0,24.0
75%,2010.0,0.24,0.11,0.04,0.03,0.47,79.0,36.0,81.0
max,2020.0,41.36,28.96,10.22,10.57,82.53,98.0,113.0,10665.0


In [7]:
video_sales_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


In [8]:
#Drop rows with a missing target variable (Global Sales)
df = video_sales_Data.dropna(subset=["Global_Sales","Name"])

In [9]:
df["Publisher"].mode()

0    Electronic Arts
dtype: object

In [10]:
#Fill the missing numerical values with median or mode
df["User_Score"] = pd.to_numeric(df["User_Score"],errors="coerce")
df["User_Score"] = df["User_Score"].fillna(df["User_Score"].median())

df["User_Count"] = pd.to_numeric(df["User_Count"],errors="coerce")
df["User_Count"] = df["User_Count"].fillna(df["User_Count"].median())

df['Year_of_Release'].fillna(df['Year_of_Release'].mode()[0], inplace=True)


df["Critic_Score"] = pd.to_numeric(df["Critic_Score"],errors="coerce")
df["Critic_Score"] = df["Critic_Score"].fillna(df["Critic_Score"].median())

df["Critic_Count"] = pd.to_numeric(df["Critic_Count"],errors="coerce")
df["Critic_Count"] = df["Critic_Count"].fillna(df["Critic_Count"].median()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [11]:
# #Drop less useful columns 
df = df.drop(columns=["Developer","NA_Sales","EU_Sales","JP_Sales","Other_Sales"])


In [12]:
df.isna().sum()[df.isna().sum() > 0]

Publisher      54
Rating       6767
dtype: int64

In [13]:
#Convert categorical variables into numerical using One Hot Encoding
# df = pd.get_dummies(df, columns=["Publisher","Platform", "Genre", "Rating"],drop_first=True)

# df.isna().sum()[df.isna().sum() > 0]

df['Publisher'].fillna(df['Publisher'].mode()[0], inplace=True)

# df["Platform"] = df["Platform"].fillna(df["Platform"].mode())

df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)

df['Rating'].fillna(df['Rating'].mode()[0], inplace=True)

In [14]:
from sklearn.preprocessing import OneHotEncoder

# # Initialize OneHotEncoder
# encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' avoids dummy variable trap

# # Fit and transform data
# encoded_array = encoder.fit_transform(df[["Publisher","Platform", "Genre", "Rating"]])

# # Convert to DataFrame
# encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names(["Publisher","Platform", "Genre", "Rating"]))

# # Concatenate with original DataFrame
# df_final = pd.concat([df, encoded_df], axis=1)

# print(df_final)

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=["Publisher","Platform", "Genre", "Rating"], drop_first=True)

print(df_encoded)


                                Name  Year_of_Release  Global_Sales  \
0                         Wii Sports           2006.0         82.53   
1                  Super Mario Bros.           1985.0         40.24   
2                     Mario Kart Wii           2008.0         35.52   
3                  Wii Sports Resort           2009.0         32.77   
4           Pokemon Red/Pokemon Blue           1996.0         31.37   
...                              ...              ...           ...   
16714  Samurai Warriors: Sanada Maru           2016.0          0.01   
16715               LMA Manager 2007           2006.0          0.01   
16716        Haitaka no Psychedelica           2016.0          0.01   
16717               Spirits & Spells           2003.0          0.01   
16718            Winning Post 8 2016           2016.0          0.01   

       Critic_Score  Critic_Count  User_Score  User_Count  \
0              76.0          51.0         8.0       322.0   
1              71.0      

In [15]:
# #Display the clean dataset info 
# df_final.info(), df_final.describe()


In [16]:
# Re-import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# # Re-load dataset
# df = pd.read_csv("./Video_Games_Sales.csv")

# # Drop columns with more than 50% missing values
# missing_threshold = 0.5 * len(df)
# df = df.dropna(thresh=missing_threshold, axis=1)

# # Fill missing values for numerical columns with median
# num_cols = df.select_dtypes(include=[np.number]).columns
# df[num_cols] = df[num_cols].apply(lambda col: col.fillna(col.median()))

# # Fill missing values for categorical columns with most frequent value (mode)
# cat_cols = df.select_dtypes(include=["object"]).columns
# df[cat_cols] = df[cat_cols].apply(lambda col: col.fillna(col.mode()[0]))

# # Verify if any NaN values remain
# nan_counts_after = df.isna().sum().sum()
# nan_counts_after



In [17]:

# End to End Project - Video Games Sales - Basic - Divide into training/ test dataset
# Now, since we have cleaned the video_sales_Data data set, let us split it into Training and Test data sets into 70:30 ratio using scikit-learn's train_test_split() function.

# Also, train_test_split() function uses 'Random Sampling', hence resulting train_set and test_set data sets have to be sorted by dayCount. Random Sampling may not be the best way to split the data, what other types of best Sampling method you can think of?

# We will also define an utility function named display_scores. This function is used to calculate the basics stats of observed scores from cross-validation of models. Please copy this function in your code, we will be using it often in this project.



In [18]:
#Decision Tree Regressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Define features and target
X = df.drop(columns=["Global_Sales"])
y = df["Global_Sales"]

# Identify categorical columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Apply Label Encoding to categorical columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for future use

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Predictions
y_pred = regressor.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

(mae, mse, r2)


(0.5127661483253588, 4.505380352870813, -0.09094197796573522)

In [19]:
# Train a Linear Regression Model

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# # Re-load dataset
# df = pd.read_csv("./Video_Games_Sales.csv")

# # Drop columns with more than 50% missing values
# missing_threshold = 0.5 * len(df)
# df = df.dropna(thresh=missing_threshold, axis=1)

# # Fill missing values for numerical columns with median
# num_cols = df.select_dtypes(include=[np.number]).columns
# df[num_cols] = df[num_cols].apply(lambda col: col.fillna(col.median()))

# # Fill missing values for categorical columns with most frequent value (mode)
# cat_cols = df.select_dtypes(include=["object"]).columns
# df[cat_cols] = df[cat_cols].apply(lambda col: col.fillna(col.mode()[0]))

# Define features and target
X1 = df_encoded.drop(columns=["Global_Sales","Name"])
y1 = df_encoded["Global_Sales"]

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)


# Evaluate the model
mae_lr = mean_absolute_error(y_test, y_pred)
mse_lr = mean_squared_error(y_test, y_pred)
r2_lr = r2_score(y_test, y_pred)

(mae_lr, mse_lr, r2_lr)


(0.5662084287360779, 3.863633167691587, 0.06445199295777637)

In [20]:
# Random Forest Regression 

from sklearn.ensemble import RandomForestRegressor

X = df.drop(columns=["Global_Sales"])
y = df["Global_Sales"]

# Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

(mae_rf, mse_rf, r2_rf)


(0.4273270260167463, 2.9695939295635836, 0.28093647560547474)

In [21]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the function to optimize
def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )
    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))

# Define the parameter search space
param_bounds = {
    'n_estimators': (50, 500),
    'max_depth': (5, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 5)
}

# Initialize Bayesian Optimization
optimizer = BayesianOptimization(
    f=rf_cv,  # Function to optimize
    pbounds=param_bounds,  # Hyperparameter ranges
    random_state=42,
    verbose=2
)

# Run Bayesian Optimization
optimizer.maximize(init_points=5, n_iter=1)  # 5 random initial points, 20 optimization steps

# Best Parameters
print("Best Parameters:", optimizer.max)


|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-1.164   [0m | [0m21.85    [0m | [0m4.803    [0m | [0m7.856    [0m | [0m319.4    [0m |
| [0m2        [0m | [0m-1.243   [0m | [0m12.02    [0m | [0m1.624    [0m | [0m2.465    [0m | [0m439.8    [0m |
| [95m3        [0m | [95m-1.144   [0m | [95m32.05    [0m | [95m3.832    [0m | [95m2.165    [0m | [95m486.5    [0m |
| [0m4        [0m | [0m-1.23    [0m | [0m42.46    [0m | [0m1.849    [0m | [0m3.455    [0m | [0m132.5    [0m |
| [0m5        [0m | [0m-1.153   [0m | [0m18.69    [0m | [0m3.099    [0m | [0m5.456    [0m | [0m181.1    [0m |
| [0m6        [0m | [0m-1.163   [0m | [0m18.16    [0m | [0m2.583    [0m | [0m6.417    [0m | [0m181.7    [0m |
Best Parameters: {'target': -1.1440179517081694, 'params': {'max_depth': 32.0501755284444, 'min_samples_leaf': 3.8

In [22]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# # Create a dictionary of hyperparameters for GridSearchCV:

# param_grid = {
#     'n_estimators': [50, 100, 200],  # Number of trees
#     'max_depth': [None, 10, 20, 30],  # Depth of trees
#     'min_samples_split': [2, 5, 10],  # Minimum samples per split
#     'min_samples_leaf': [1, 2, 4]  # Minimum samples per leaf
# }


In [23]:
# # Now, apply Grid Search on your RandomForestRegressor:
# grid_search = GridSearchCV(
#     estimator=RandomForestRegressor(random_state=42),
#     param_grid=param_grid,
#     cv=5,  # 5-fold cross-validation
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
#     verbose=2
# )

# grid_search.fit(X_train, y_train)


In [24]:
# pip install shap

In [25]:
# Check the Best Parameters & Score

# print("Best Parameters:", optimizer.max)
# print("Best RMSE:", (-optimizer.max.best_score_) ** 0.5)

In [26]:
best_params_int = {k: int(v) for k, v in optimizer.max["params"].items()}

In [27]:
# Once the best hyperparameters are found, retrain your model with them:


best_model = RandomForestRegressor(**best_params_int, random_state=42)

best_model.fit(X_train, y_train)

# Evaluate again
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
print("Final RMSE:", mse_best ** 0.5)


Final RMSE: 1.7357584534647064


In [28]:
# pip install --force-reinstall --no-cache-dir shap numba


In [29]:
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Create SHAP Explainer
explainer = shap.Explainer(best_model, X_train)
shap_values = explainer(X_train)

# Summary Plot
shap.summary_plot(shap_values, X_train)




ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was 14.908616, while the model output was 14.756196. If this difference is acceptable you can set check_additivity=False to disable this check.

In [None]:
import matplotlib.pyplot as plt

def plot_predictions(y_test, y_pred, title):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.5, color="blue", label="Predicted vs Actual")
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linestyle="--", label="Perfect Fit")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title(title)
    plt.legend()
    plt.show()

# Plot for Decision Tree
plot_predictions(y_test, y_pred, "Decision Tree: Predicted vs Actual")

# Plot for Linear Regression
plot_predictions(y_test, y_pred_lr, "Linear Regression: Predicted vs Actual")

# Plot for Random Forest
plot_predictions(y_test, y_pred_rf, "Random Forest: Predicted vs Actual")

# Plot for Best Random Forest Model (Grid Search)
plot_predictions(y_test, y_pred_best, "Tuned Random Forest: Predicted vs Actual")
