In [None]:
#Modules imported
import kagglehub
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [4]:
# Load the dataset
# Download latest version
path = kagglehub.dataset_download("ibriiee/video-games-sales-dataset-2022-updated-extra-feat")
print(path)

C:\Users\User\.cache\kagglehub\datasets\ibriiee\video-games-sales-dataset-2022-updated-extra-feat\versions\1


In [5]:
#Convert the dataset to a pandas dataframe
df = pd.read_csv(path+"/Video_Games.csv")
# Display the first few rows of the dataset
print(df.head())

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count User_Score  User_Count Developer Rating 

In [None]:
# Preprocess the data
# Use One-Hot Encoding for categorical variables
# Convert 'Platform', 'Genre', and 'Publisher' to one-hot encoded variables
df = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher'], drop_first=True)

#remove unnecessary columns
df.drop(columns=['Name', 'Year_of_Release', 'Rating'], inplace=True)

#Clean the data
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

#Label encode the target variable
le = LabelEncoder()


In [7]:
#We need to predict if how much sales a game will make
#We will use the Global_Sales as the target variable
print(df["Global_Sales"].describe())
print(df.dtypes)

count    7013.000000
mean        0.767403
std         1.940812
min         0.010000
25%         0.110000
50%         0.290000
75%         0.750000
max        82.530000
Name: Global_Sales, dtype: float64
NA_Sales                          float64
EU_Sales                          float64
JP_Sales                          float64
Other_Sales                       float64
Global_Sales                      float64
                                   ...   
Publisher_id Software                bool
Publisher_imageepoch Inc.            bool
Publisher_inXile Entertainment       bool
Publisher_mixi, Inc                  bool
Publisher_responDESIGN               bool
Length: 631, dtype: object


In [8]:
#Split the data into features and target variable
X = df.drop(columns=['Global_Sales'])
y = df['Global_Sales']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# We are going to use MLP (Multi-Layer Perceptron) for this task
# Convert categorical columns to numerical values using Label Encoding
# Loop through and find any colums tagged as object and convert them to numerical values

for column in X_train.select_dtypes(include=['object']).columns:
	combined_data = pd.concat([X_train[column], X_test[column]], axis=0)
	le.fit(combined_data)  # Fit the LabelEncoder on combined data
	X_train[column] = le.transform(X_train[column])
	X_test[column] = le.transform(X_test[column])
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
#Ceate the MLP model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='relu'),
    Dense(1, activation='relu'),
])


# Compile the model
# We will use Adam optimizer and Mean Absolute Error as the loss function
# We will also use Mean Absolute Error as the metric
# We will also use Root Mean Squared Error as the metric
model.compile(optimizer = tf.keras.optimizers.Adam(), loss=tf.keras.losses.MeanAbsoluteError(),
              metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])

# Convert target variables to float32
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

# Train the model
model.fit(X_train, y_train, epochs=60, batch_size=32, validation_split=0.2)
print(model.summary())

Epoch 1/60


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.5957 - mean_absolute_error: 0.5957 - root_mean_squared_error: 2.1837 - val_loss: 0.1642 - val_mean_absolute_error: 0.1642 - val_root_mean_squared_error: 0.2709
Epoch 2/60
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1451 - mean_absolute_error: 0.1451 - root_mean_squared_error: 0.2701 - val_loss: 0.1045 - val_mean_absolute_error: 0.1045 - val_root_mean_squared_error: 0.1583
Epoch 3/60
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1018 - mean_absolute_error: 0.1018 - root_mean_squared_error: 0.1774 - val_loss: 0.0925 - val_mean_absolute_error: 0.0925 - val_root_mean_squared_error: 0.1676
Epoch 4/60
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0874 - mean_absolute_error: 0.0874 - root_mean_squared_error: 0.1667 - val_loss: 0.0771 - val_mean_absolute_error: 0.0771 - val_root_mean_squared_e

None


In [11]:

# Evaluate the model on the test set
model.evaluate(X_test, y_test, verbose = 2)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Print the predictions
print(y_pred[:10])
# Print the actual values
print(y_test[:10])

#Output what the model predicted in what context:
#Ensure X_test is a pandas DataFrame for column access
X_test_df = pd.DataFrame(X_test, columns=X.columns)

predictions_df = pd.DataFrame({
    'Predicted Global Sales (in millions)': model.predict(X_test).flatten(),
    'Actual Global Sales (in millions)': y_test.values,
    'Genre': X_test_df.filter(like='Genre_').idxmax(axis=1).str.replace('Genre_', ''),  # Extract the original Genre
    'Publisher': X_test_df.filter(like='Publisher_').idxmax(axis=1).str.replace('Publisher_', ''),  # Extract the original Publisher
    'Platform': X_test_df.filter(like='Platform_').idxmax(axis=1).str.replace('Platform_', '')  # Extract the original Platform
})

#Print out the predictions
for index, row in predictions_df.iterrows():
    print(f"Game {index+1}:")
    print(f"  Genre: {row['Genre']}")
    print(f"  Publisher: {row['Publisher']}")
    print(f"  Platform: {row['Platform']}")
    print(f"  Predicted Sales = {row['Predicted Global Sales (in millions)']:.2f} million")
    print(f"  Actual Sales = {row['Actual Global Sales (in millions)']:.2f} million")
    print("-" * 50)  # Separator for better readability

44/44 - 0s - 2ms/step - loss: 0.0355 - mean_absolute_error: 0.0355 - root_mean_squared_error: 0.1016
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[0.06194393]
 [0.07830182]
 [3.6077313 ]
 [0.2028926 ]
 [0.20369616]
 [0.7355924 ]
 [1.241912  ]
 [0.27298468]
 [0.1656518 ]
 [0.08617288]]
12097    0.07
5961     0.29
357      3.59
8261     0.17
7430     0.21
2794     0.73
1630     1.23
6240     0.28
9102     0.14
9479     0.13
Name: Global_Sales, dtype: float32
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 936us/step
Game 1:
  Genre: Simulation
  Publisher: Electronic Arts
  Platform: PC
  Predicted Sales = 0.06 million
  Actual Sales = 0.07 million
--------------------------------------------------
Game 2:
  Genre: Shooter
  Publisher: Midas Interactive Entertainment
  Platform: PS2
  Predicted Sales = 0.08 million
  Actual Sales = 0.29 million
--------------------------------------------------
Game 3:
  Genre: Shooter
  Publisher: Electronic 