In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Load cleaned dataset
df = pd.read_csv("vgs_cleaned.csv")

# Select important columns (features)
features = ['Platform', 'Genre', 'Publisher', 'Year_of_Release', 'Critic_Score', 'User_Score', 'Rating']
target = 'Global_Sales'

# Drop rows with missing values in selected columns
df_model = df.dropna(subset=features + [target])

In [4]:
x = df_model[features]
y = df_model[target]

In [5]:
# Categorical and numerical columns
cat_cols = ['Platform', 'Genre', 'Publisher', 'Rating']
num_cols = ['Year_of_Release', 'Critic_Score', 'User_Score']

# Column Transformer
preprocessor = ColumnTransformer([('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                                 remainder='passthrough')  # numeric columns will pass through

In [6]:
# Create pipeline
model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [8]:
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [9]:
# Predict
y_pred = model.predict(X_test)
print(y_pred)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


[0.3693     0.02840619 0.33873667 ... 0.2046     0.93759167 0.4164    ]
Root Mean Squared Error: 1.87
R² Score: 0.17


In [10]:
# Predict for a new game (example input)
new_game = pd.DataFrame({
    'Platform': ['PS4'],
    'Genre': ['Action'],
    'Publisher': ['Sony'],
    'Year_of_Release': [2024],
    'Critic_Score': [85],
    'User_Score': [8.5],
    'Rating': ['M']
})

prediction = model.predict(new_game)
print(f"Predicted Global Sales: {prediction[0]:.2f} million copies")

Predicted Global Sales: 7.99 million copies
