## **Import Libraries**

In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor


## **Load Dataset**

In [52]:


df = pd.read_csv("https://raw.githubusercontent.com/RaiyanEOF/Machine-Learning-2-/refs/heads/main/vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


## **Y Data Profiling**

In [46]:
!pip install ydata-profiling



In [47]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Profiling Report",explorative=True)
profile.to_file("y_data.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/9 [00:00<?, ?it/s][A
 22%|██▏       | 2/9 [00:00<00:00, 15.43it/s][A
100%|██████████| 9/9 [00:00<00:00, 35.00it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## **Data Preprocessing**

In [53]:
#Drop irrelevant text columns
df = df.drop(columns=['Name', 'Publisher'])

#Handle missing values
df = df.dropna()

#Feature selection
features = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
            'Rank', 'Platform', 'Genre']
X = df[features]
y = df['Global_Sales']

#Separate feature types
num_features = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Rank']
cat_features = ['Platform', 'Genre']


## **Pipeline Creation**

In [34]:
numeric_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features)
])

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', XGBRegressor(
        objective='reg:squarederror',
        n_estimators=200,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42
    ))
])


## **Model Selection**

**A classification report cannot be used here because this problem is a regression task, not a classification task. The model is predicting Global_Sales, which is a continuous numerical value, not a category or class. Classification reports only work when the output is discrete such as low, medium, or high. Since the goal is to predict an exact sales number, it is more correct to evaluate the model using regression metrics like R2, RMSE, and MAE, which measure how close the predicted values are to the actual sales.**

## **Model Training**

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)


## **Cross Validation**

In [36]:
cv_scores = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='r2'
)

print("Mean CV R2:", cv_scores.mean())
print("CV Std:", cv_scores.std())


Mean CV R2: 0.9459251685055682
CV Std: 0.03142674959131748


## **Hyper Parameter Tuning**

In [37]:
param_grid = {
    'model__n_estimators': [150, 200],
    'model__max_depth': [3, 4],
    'model__learning_rate': [0.05],
    'model__subsample': [0.9],
    'model__colsample_bytree': [0.9]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)


Best Parameters: {'model__colsample_bytree': 0.9, 'model__learning_rate': 0.05, 'model__max_depth': 4, 'model__n_estimators': 200, 'model__subsample': 0.9}
Best CV Score: 0.9403194069302389


## **Best Model Selection**

In [38]:
best_model = grid.best_estimator_


## **Model Performance Evaluation**

In [45]:
y_pred = best_model.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


R2 Score: 0.8162153041865099
RMSE: 0.7852274795123227
MAE: 0.03900136133935359
