## **Import Libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

## **Load Dataset**

In [2]:


df = pd.read_csv("https://raw.githubusercontent.com/RaiyanEOF/Machine-Learning-2-/refs/heads/main/vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


## **Y-Data Profiling**

In [3]:
!pip install ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.18.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Downloading minify_html-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting phik<0.13,>=0.12.5 (from ydata-profiling)
  Downloading phik-0.12.5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.2 (from ydata-profiling)
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting dacite<2,>=1.9 (from ydata-profiling)
  Downloading

In [4]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Profiling Report",explorative=True)
profile.to_file("y_data.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:00<00:01,  6.78it/s][A
 18%|█▊        | 2/11 [00:01<00:05,  1.69it/s][A
100%|██████████| 11/11 [00:01<00:00,  6.65it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## **Data Preprocessing**

In [5]:

df = df.dropna(subset=["Global_Sales"])


df["Year"] = df["Year"].fillna(df["Year"].median())


df = df.drop(columns=["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Name", "Publisher"])


X = df[["Rank", "Platform", "Genre", "Year"]]
y = df["Global_Sales"]


numeric_features = ["Rank", "Year"]
categorical_features = ["Platform", "Genre"]


## **Pipeline Creation**

In [6]:
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features)
])

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ))
])


## **Model Selection**

**The Random Forest Classifier was selected as the primary model because it is well suited for datasets that contain both numerical and categorical features, such as release year, decade, platform, and genre. Video game sales are influenced by multiple interacting factors and do not follow a simple linear pattern, and Random Forest is capable of capturing these complex, non-linear relationships by combining the predictions of many decision trees. The model is also robust to noise and outliers, which are common in real-world sales data, and it does not require strong assumptions about data distribution. Additionally, Random Forest generally provides stable performance on structured tabular datasets without extensive manual feature tuning, making it a reliable and appropriate choice for this classification task.**

## **Model Training**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)


## **Cross Validation**

In [8]:
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

print("Cross-validation R² scores:", cv_scores)
print("Mean R²:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-validation R² scores: [0.99988177 0.9973079  0.99745449 0.99929479 0.98145347]
Mean R²: 0.9950784825319328
Standard Deviation: 0.006886296493100083


## **Hyper Parameter Tuning**

In [9]:
param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [10, 20],
    "model__min_samples_split": [2, 5]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV R² Score:", grid_search.best_score_)


Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best CV R² Score: 0.9925022609828326


## **Best Model Selection**

In [10]:
best_model = grid_search.best_estimator_


## **Model Performance Evaluation**

In [11]:
y_pred = best_model.predict(X_test)

print("Final Test Performance")
print("R² Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred))


Final Test Performance
R² Score: 0.835141390454656
MAE: 0.016235284317949843
RMSE: 0.6926310233511083


## **Save Model**

In [12]:
import pickle

model_name = "final_rf_model.pkl"

with open(model_name, "wb") as file:
    pickle.dump(best_model, file)

print("Model saved successfully as final_rf_model.pkl")


Model saved successfully as final_rf_model.pkl


**Load and Predict**

In [13]:
import pickle

with open("final_rf_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

print("Model loaded successfully")
sample = pd.DataFrame({
    "Rank": [50],
    "Platform": ["Wii"],
    "Genre": ["Sports"],
    "Year": [2008]
})

prediction = loaded_model.predict(sample)
print("Predicted Global Sales:", prediction[0])

Model loaded successfully
Predicted Global Sales: 11.339249999999991


## **RF Using MLFLOW**

In [14]:
!pip install mlflow


Collecting mlflow
  Downloading mlflow-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.8.1 (from mlflow)
  Downloading mlflow_skinny-3.8.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.8.1 (from mlflow)
  Downloading mlflow_tracing-3.8.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.8.1->mlflow)
  Downloading databricks_sdk-0.78.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [15]:
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


mlflow.set_tracking_uri("file:///tmp/mlruns")
mlflow.set_experiment("VGSales_RF_Model")


my_params = {
    "n_estimators": best_model.named_steps['model'].n_estimators,
    "max_depth": best_model.named_steps['model'].max_depth,
    "min_samples_split": best_model.named_steps['model'].min_samples_split,
    "random_state": best_model.named_steps['model'].random_state,
    "n_jobs": best_model.named_steps['model'].n_jobs
}

with mlflow.start_run(run_name="RandomForest_Best_Model"):


    mlflow.log_params(my_params)
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("numeric_features", numeric_features)
    mlflow.log_param("categorical_features", categorical_features)


    y_train_pred = best_model.predict(X_train)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("train_mae", train_mae)
    mlflow.log_metric("train_rmse", train_rmse)


    y_test_pred = best_model.predict(X_test)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_rmse", test_rmse)


    mlflow.sklearn.log_model(best_model, artifact_path="rf_pipeline_model")

    print("MLflow run completed!")
    print(f"Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}")
    print(f"Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    print(f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")


  return FileStore(store_uri, store_uri)
2026/01/22 07:29:52 INFO mlflow.tracking.fluent: Experiment with name 'VGSales_RF_Model' does not exist. Creating a new experiment.


MLflow run completed!
Train R²: 0.9996, Test R²: 0.8351
Train MAE: 0.0010, Test MAE: 0.0162
Train RMSE: 0.0276, Test RMSE: 0.8322
