## **Import Libraries**

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

## **Load Dataset**

In [11]:


# Location of the raw CSV that this notebook reads.
DATA_URL = "https://raw.githubusercontent.com/RaiyanEOF/Machine-Learning-2-/refs/heads/main/vgsales.csv"

# Read the CSV over HTTP into a DataFrame, then preview the first ten rows.
df = pd.read_csv(DATA_URL)
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


## **Data Preprocessing**

In [12]:
# Build the modelling table as a NEW frame instead of mutating `df` in place.
# The previous in-place version removed "Global_Sales" from `df`, so running
# this cell a second time raised KeyError: 'Global_Sales'. Deriving everything
# from the untouched `df` makes the cell safe to re-run.
model_df = (
    # Discard rows where either "Year" or "Publisher" is missing.
    df.dropna(subset=["Year", "Publisher"])
    .assign(
        # Floor the release year to the start of its decade (2006.0 -> 2000.0).
        Decade=lambda frame: (frame["Year"] // 10) * 10,
        # Binary target: 1 when Global_Sales is strictly greater than 1, else 0.
        High_Sales=lambda frame: (frame["Global_Sales"] > 1).astype(int),
    )
    # "Global_Sales" defines the target, so it must not remain as a feature;
    # "Name" and "Rank" are removed along with it.
    .drop(columns=["Name", "Rank", "Global_Sales"])
)

# NOTE(review): X still carries NA_Sales, EU_Sales, JP_Sales, Other_Sales and
# Publisher. In the previewed rows the four regional columns add up to
# Global_Sales, so they would leak the target if they were ever added to the
# feature lists used by the preprocessing step; at present that step selects
# only Year, Decade, Platform and Genre by name.
X = model_df.drop(columns=["High_Sales"])
y = model_df["High_Sales"]

## **Pipeline Creation**

In [13]:
# Columns routed to each preprocessing branch; any other column in the input
# frame is not listed here and is therefore not passed to either branch.
numeric_features = ["Year", "Decade"]
categorical_features = ["Platform", "Genre"]

# Numeric branch: fill missing values with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode; handle_unknown="ignore" lets transform
# accept categories that were not seen during fit instead of raising.
categorical_pipeline = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Apply each branch to its own column list and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# Baseline classifier; random_state pins the forest's randomness and
# n_jobs=-1 asks scikit-learn to use all available cores.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Full model: preprocessing followed by the classifier, fitted as one unit.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)

## **Model Selection**

**The Random Forest Classifier was selected as the primary model because it is well suited for datasets that contain both numerical and categorical features, such as release year, decade, platform, and genre. Video game sales are influenced by multiple interacting factors and do not follow a simple linear pattern, and Random Forest is capable of capturing these complex, non-linear relationships by combining the predictions of many decision trees. The model is also robust to noise and outliers, which are common in real-world sales data, and it does not require strong assumptions about data distribution. Additionally, Random Forest generally provides stable performance on structured tabular datasets without extensive manual feature tuning, making it a reliable and appropriate choice for this classification task.**

## **Model Training**

In [14]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# the target the same in both partitions, and random_state fixes the shuffle.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit preprocessing and classifier together on the training partition only.
pipeline.fit(X_train, y_train)

## **Cross Validation**

In [15]:
# 5-fold cross-validation of the whole pipeline on the training partition,
# scored with F1 on the positive class (High_Sales == 1).
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring="f1",
    cv=5,
)

# Report the per-fold scores followed by their mean and spread.
cv_report = (
    ("Cross-Validation F1 Scores:", cv_scores),
    ("Mean F1 Score:", cv_scores.mean()),
    ("Standard Deviation:", cv_scores.std()),
)
for label, value in cv_report:
    print(label, value)

Cross-Validation F1 Scores: [0.12637363 0.10497238 0.11538462 0.14168937 0.11142061]
Mean F1 Score: 0.11996812071184453
Standard Deviation: 0.012897938577559779


## **Hyperparameter Tuning**

In [16]:
# Candidate settings for the random forest. The "classifier__" prefix routes
# each parameter to the pipeline step named "classifier".
param_grid = {
    "classifier__n_estimators": [200, 300],
    "classifier__max_depth": [15, 20],
    "classifier__min_samples_split": [2, 5],
}

# Exhaustive search over the 2 x 2 x 2 grid, scored by F1 with 3-fold CV.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Show the winning combination and its mean cross-validated F1.
search_report = (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Cross-Validation F1 Score:", grid_search.best_score_),
)
for label, value in search_report:
    print(label, value)

Best Parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best Cross-Validation F1 Score: 0.11816445995820919


## **Best Model Selection**

In [17]:
# Take the pipeline configured with the best parameter combination found by
# the grid search. The search above does not set `refit`, so it relies on
# GridSearchCV's default of refitting the best configuration on the data
# passed to `grid_search.fit` (here X_train / y_train).
best_model = grid_search.best_estimator_

## **Model Performance Evaluation**

In [18]:
# Predict the held-out test partition with the tuned pipeline.
y_pred = best_model.predict(X_test)

# Headline metrics, printed in a fixed order; each scorer is called as
# scorer(y_true, y_pred) with its default settings.
headline_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, scorer in headline_metrics:
    print(label, scorer(y_test, y_pred))

# Per-class breakdown of precision, recall, F1 and support.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8821724455354403
Precision: 0.6833333333333333
Recall: 0.10098522167487685
F1 Score: 0.1759656652360515

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.99      0.94      2853
           1       0.68      0.10      0.18       406

    accuracy                           0.88      3259
   macro avg       0.78      0.55      0.56      3259
weighted avg       0.86      0.88      0.84      3259

