In [1]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os

In [3]:
train_files = [
    "One-Hot/Train_Orig_OH.csv",
    "One-Hot/Scaled/Train_Scaled_All_OH.csv",
    "One-Hot/Scaled/Train_Scaled_Cont_OH.csv",
    "One-Hot/MinMax/Train_MM_OH.csv",
    "One-Hot/MinMax/train_OH_MM_PCA15.csv",
    "One-Hot/MinMax/train_OH_MM_PCA20.csv",
    "One-Hot/MinMax/train_OH_MM_PCA25.csv",
    "One-Hot/MinMax/train_OH_MM_PCA30.csv",
    "One-Hot/MinMax/train_OH_MM_PCA35.csv",
    "IntClasses/Train_Orig_Int.csv",
    "IntClasses/Scaled/Train_Scaled_All_Int.csv",
    "IntClasses/Scaled/Train_Scaled_Cont_Int.csv",
    "IntClasses/MinMax/Train_MM_Int.csv",
    "IntClasses/MinMax/train_Int_MM_PCA10.csv",
    "IntClasses/MinMax/train_Int_MM_PCA15.csv",
    "IntClasses/MinMax/train_Int_MM_PCA20.csv",
    "IntClasses/MinMax/train_Int_MM_PCA25.csv",
]

results = []

In [4]:
def evaluate_model(name, model, X, y, dataset_type):
    y_pred = model.predict(X)
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, y_pred)
    rmse_pct = (rmse / y.mean()) * 100

    print(f"\n{name} - {dataset_type} Regression Evaluation")
    print("MAE :", mae)
    print("MSE :", mse)
    print("RMSE:", rmse)
    print("RMSE % Err :", rmse_pct)
    print("R2  :", r2)

    return mae, mse, rmse, rmse_pct, r2

In [5]:
for path in train_files:
    print("\n================================================================================")
    print(f"Processing dataset: {path}")

    train_df = pd.read_csv(path)
    valid_path = path.replace("Train", "Valid").replace("train", "valid")
    test_path = path.replace("Train", "Test").replace("train", "test")

    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]

    X_valid = valid_df.iloc[:, :-1]
    y_valid = valid_df.iloc[:, -1]

    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    model = XGBRegressor(objective='reg:squarederror', random_state=42)
    model.fit(X_train, y_train)

    name = os.path.basename(path)
    evaluate_model(name, model, X_valid, y_valid, "Validation")
    mae, mse, rmse, rmse_pct, r2 = evaluate_model(name, model, X_test, y_test, "Test")

    results.append({
        "Dataset": name,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "RMSE %": rmse_pct,
        "R2": r2
    })

results_df = pd.DataFrame(results)


Processing dataset: One-Hot/Train_Orig_OH.csv

Train_Orig_OH.csv - Validation Regression Evaluation
MAE : 0.3078185495376588
MSE : 0.25742627472860574
RMSE: 0.5073719293857375
RMSE % Err : 0.9975294923224661
R2  : 0.9958383095231809

Train_Orig_OH.csv - Test Regression Evaluation
MAE : 0.3140824020385743
MSE : 0.267293703461135
RMSE: 0.5170045487818603
RMSE % Err : 1.0203255121206702
R2  : 0.9955297804857566

Processing dataset: One-Hot/Scaled/Train_Scaled_All_OH.csv

Train_Scaled_All_OH.csv - Validation Regression Evaluation
MAE : 0.03964864662587075
MSE : 0.004270854163350779
RMSE: 0.06535177245760652
RMSE % Err : 528.4168009520321
R2  : 0.9958383707296835

Train_Scaled_All_OH.csv - Test Regression Evaluation
MAE : 0.04045583577210341
MSE : 0.004434645928652672
RMSE: 0.06659313724891382
RMSE % Err : -536.9628372192582
R2  : 0.9955297602591788

Processing dataset: One-Hot/Scaled/Train_Scaled_Cont_OH.csv

Train_Scaled_Cont_OH.csv - Validation Regression Evaluation
MAE : 0.039648646625


train_Int_MM_PCA25.csv - Validation Regression Evaluation
MAE : 0.0209764753107211
MSE : 0.0007247338649759151
RMSE: 0.026920881578728344
RMSE % Err : 6.735451961762892
R2  : 0.9642995561589882

train_Int_MM_PCA25.csv - Test Regression Evaluation
MAE : 0.020822901161347072
MSE : 0.0007036245515123063
RMSE: 0.026525922255640922
RMSE % Err : 6.694988962378077
R2  : 0.9641442127734383


In [6]:
best_r2 = results_df.loc[results_df['R2'].idxmax()]
best_rmse_pct = results_df.loc[results_df['RMSE %'].idxmin()]
best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
best_mae = results_df.loc[results_df['MAE'].idxmin()]
best_mse = results_df.loc[results_df['MSE'].idxmin()]

print("BEST REGRESSION PERFORMING DATASETS")
print("Lowest MAE  :", best_mae['Dataset'], f"({best_mae['MAE']:.4f})")
print("Lowest MSE  :", best_mse['Dataset'], f"({best_mse['MSE']:.4f})")
print("Lowest RMSE :", best_rmse['Dataset'], f"({best_rmse['RMSE']:.4f})")
print("Lowest RMSE % :", best_rmse_pct['Dataset'], f"({best_rmse['RMSE %']:.4f})")
print("Highest R²  :", best_r2['Dataset'], f"({best_r2['R2']:.4f})")

BEST REGRESSION PERFORMING DATASETS
Lowest MAE  : Train_MM_OH.csv (0.0059)
Lowest MSE  : Train_MM_Int.csv (0.0001)
Lowest RMSE : Train_MM_Int.csv (0.0094)
Lowest RMSE % : Train_Scaled_All_Int.csv (2.3808)
Highest R²  : Train_Orig_OH.csv (0.9955)


In [7]:
results_df

Unnamed: 0,Dataset,MAE,MSE,RMSE,RMSE %,R2
0,Train_Orig_OH.csv,0.314082,0.267294,0.517005,1.020326,0.99553
1,Train_Scaled_All_OH.csv,0.040456,0.004435,0.066593,-536.962837,0.99553
2,Train_Scaled_Cont_OH.csv,0.040456,0.004435,0.066593,-536.962837,0.99553
3,Train_MM_OH.csv,0.005861,9.1e-05,0.00955,2.410483,0.995352
4,train_OH_MM_PCA15.csv,0.05542,0.005261,0.07253,18.306276,0.731923
5,train_OH_MM_PCA20.csv,0.054686,0.005011,0.070785,17.865706,0.744671
6,train_OH_MM_PCA25.csv,0.035509,0.001964,0.04432,11.186121,0.899904
7,train_OH_MM_PCA30.csv,0.033955,0.001827,0.042746,10.788757,0.906889
8,train_OH_MM_PCA35.csv,0.020093,0.000663,0.025756,6.500701,0.966195
9,Train_Orig_Int.csv,0.317741,0.269632,0.519261,1.024778,0.995491
