In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Feature matrices for the train and test splits.
X_train, X_test = np.load('../Data/X_train.npy'), np.load('../Data/X_test.npy')
# Labels for the same two splits.
y_train, y_test = np.load('../Data/y_train.npy'), np.load('../Data/y_test.npy')

# Companion feature matrices (presumably the scaled versions of the arrays
# above, judging by the file names -- TODO confirm where they are produced).
X_train_scaled, X_test_scaled = (
    np.load('../Data/X_train_scaled.npy'),
    np.load('../Data/X_test_scaled.npy'),
)

In [4]:
# Collapse the labels to a binary target: 0 stays 0, every other label becomes 1.
y_train_bin = np.not_equal(y_train, 0).astype(int)
y_test_bin = np.not_equal(y_test, 0).astype(int)

# Show the class counts of each split; sep="\n" puts the header and the counts
# on separate lines, exactly like two consecutive print calls.
print("Train distribution:", pd.Series(y_train_bin).value_counts(), sep="\n")
print("\nTest distribution:", pd.Series(y_test_bin).value_counts(), sep="\n")

Train distribution:
0    1817055
1     445245
Name: count, dtype: int64

Test distribution:
0    454265
1    111311
Name: count, dtype: int64


In [9]:
# Number of negative (0) and positive (1) samples in the binarised training labels.
neg = (y_train_bin == 0).sum()
pos = (y_train_bin == 1).sum()

# Negative-to-positive ratio, later passed to XGBoost as scale_pos_weight.
scale_weight = neg / pos
print(scale_weight)

4.081022807667688


Added this because XGBoost learned that classifying everything as Benign already yields an accuracy above 80%, hence the model was badly trained.

Unlike Random Forest, XGBoost does NOT automatically handle imbalance well.

Even if you oversampled training data:

The test set is still imbalanced

XGBoost’s loss optimization may collapse to majority class

Especially if scale_pos_weight is not set

So it learns:

“Predict 0 always → still 80% accuracy.”

This matches the 0.8032 accuracy seen in the earlier run.

In [19]:
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.metrics import precision_recall_curve

def _best_f1_threshold(y_true, y_score):
    """Return the decision threshold that maximises F1 for the given scores."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)

    # precision/recall carry one trailing point (precision=1, recall=0) that
    # has no matching threshold; drop it so every index maps onto `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    # F1 is 0/0 wherever precision and recall are both zero. Define it as 0
    # there: np.argmax treats NaN as the maximum and would select that point.
    denom = precision + recall
    f1_scores = np.divide(
        2 * precision * recall,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )
    return thresholds[np.argmax(f1_scores)]


def train_xgb(name, X_train_res, y_train_res, X_val=None, y_val=None, pos_weight=None):
    """Fit a binary XGBoost classifier and print test metrics at an F1-tuned threshold.

    Parameters
    ----------
    name : str
        Label of the resampling strategy; only used in the printed banner.
    X_train_res, y_train_res : array-like
        Training features and binary (0/1) labels passed to ``model.fit``.
    X_val, y_val : array-like, optional
        Held-out data used only to pick the decision threshold. If either is
        omitted, the threshold is tuned on the notebook-level test set
        (``X_test_scaled`` / ``y_test_bin``). NOTE: in that fallback the
        threshold has seen the test labels, so the printed report is
        optimistically biased.
    pos_weight : float, optional
        Value for ``scale_pos_weight``. Defaults to the notebook-level
        ``scale_weight`` (negative/positive ratio of the original training
        labels). NOTE(review): for already-balanced resampled data a value
        near 1 may be more appropriate -- confirm experimentally.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model.
    """
    print(f"\n===== XGBOOST + {name.upper()} =====")

    if pos_weight is None:
        pos_weight = scale_weight

    model = xgb.XGBClassifier(
        max_depth=6,
        n_estimators=500,
        learning_rate=0.05,
        reg_alpha=1,
        reg_lambda=1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=pos_weight,
        # Binary task: 'logloss' is the matching metric ('mlogloss' is the
        # multiclass variant).
        eval_metric='logloss',
        tree_method='hist',
        n_jobs=-1,
        random_state=42
    )

    model.fit(X_train_res, y_train_res)

    # Positive-class probabilities on the test set; metrics are always
    # reported on this set.
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # Choose the threshold on validation data when supplied, otherwise fall
    # back to the test set (original behaviour).
    if X_val is None or y_val is None:
        best_threshold = _best_f1_threshold(y_test_bin, y_prob)
    else:
        val_prob = model.predict_proba(X_val)[:, 1]
        best_threshold = _best_f1_threshold(y_val, val_prob)

    print("Best threshold:", best_threshold)

    y_pred = (y_prob >= best_threshold).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test_bin, y_pred, digits=4))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test_bin, y_pred))

    # ROC-AUC is threshold-independent, so it is computed from the raw scores.
    print("ROC-AUC:", roc_auc_score(y_test_bin, y_prob))

    return model

Added **`scale_pos_weight=scale_weight`** to the model parameters.

scale_pos_weight tells XGBoost:

“Mistakes on minority class are more costly.”

Without it, boosting collapses toward majority.

Random Forest did not have this problem because:

Bagging behaves differently

Implicit balancing effect

Boosting is more sensitive.

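Even though SMOTE was used,

we STILL pass **scale_pos_weight**, because

the test set remains imbalanced and the boosting loss is influenced by the class distribution.

Caveat: `scale_pos_weight` only re-weights the training loss. On the SMOTE-resampled sets the classes are already roughly 1:1, so the weight of ≈4.08 (computed from the original labels) puts extra emphasis on the positive class and shifts the predicted probabilities; the decision threshold should therefore be tuned rather than fixed at 0.5.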


In [7]:
# Load a pre-computed resampled training set (features, then labels) from disk;
# presumably produced by plain SMOTE, judging by the file names.
x_smote_vanilla = np.load('../Data/Sampled_Data/x_smote.npy')
y_smote_vanilla = np.load('../Data/Sampled_Data/y_smote.npy')

In [12]:
# Sanity check: print the distinct label values in the resampled training
# labels and in the binarised test labels.
print("Unique train labels:", np.unique(y_smote_vanilla))
print("Unique test labels:", np.unique(y_test_bin))

Unique train labels: [0 1]
Unique test labels: [0 1]


In [16]:
# Per-class sample counts of the resampled labels (shown as the cell result).
pd.Series(y_smote_vanilla).value_counts()

0    1335735
1    1335735
Name: count, dtype: int64

In [20]:
# Train and evaluate XGBoost on this resampled set. The returned model is not
# assigned to a variable, so it is only available as the cell result.
train_xgb("SMOTE", x_smote_vanilla, y_smote_vanilla)


===== XGBOOST + SMOTE =====
Best threshold: 0.07472793

Classification Report:
              precision    recall  f1-score   support

           0     0.9674    0.9844    0.9759    454265
           1     0.9316    0.8647    0.8969    111311

    accuracy                         0.9609    565576
   macro avg     0.9495    0.9246    0.9364    565576
weighted avg     0.9604    0.9609    0.9603    565576

Confusion Matrix:
[[447201   7064]
 [ 15062  96249]]
ROC-AUC: 0.9858314981669705


In [21]:
# Drop the references to the resampled arrays (presumably to free kernel
# memory before the next variant is loaded).
del x_smote_vanilla, y_smote_vanilla

In [22]:
# Load a second pre-computed resampled training set (features, then labels);
# presumably produced by a k-means SMOTE variant, judging by the file names.
x_smote_kmeans = np.load('../Data/Sampled_Data/x_smote_kmeans.npy')
y_smote_kmeans = np.load('../Data/Sampled_Data/y_smote_kmeans.npy')

In [24]:
# Sanity check: print the distinct label values in the resampled training
# labels and in the binarised test labels.
print("Unique train labels:", np.unique(y_smote_kmeans))
print("Unique test labels:", np.unique(y_test_bin))

Unique train labels: [0 1]
Unique test labels: [0 1]


In [25]:
# Per-class sample counts of the resampled labels (shown as the cell result).
pd.Series(y_smote_kmeans).value_counts()

1    1335837
0    1335735
Name: count, dtype: int64

In [27]:
# Train and evaluate XGBoost on this resampled set. The returned model is not
# assigned to a variable, so it is only available as the cell result.
train_xgb("SMOTE_KMeans", x_smote_kmeans, y_smote_kmeans)


===== XGBOOST + SMOTE_KMEANS =====
Best threshold: 0.04168745

Classification Report:
              precision    recall  f1-score   support

           0     0.9345    0.9889    0.9609    454265
           1     0.9408    0.7170    0.8138    111311

    accuracy                         0.9354    565576
   macro avg     0.9376    0.8530    0.8874    565576
weighted avg     0.9357    0.9354    0.9320    565576

Confusion Matrix:
[[449245   5020]
 [ 31505  79806]]
ROC-AUC: 0.9171585570231052


In [28]:
# Drop the references to the resampled arrays (presumably to free kernel
# memory before the next variant is loaded).
del x_smote_kmeans, y_smote_kmeans

In [29]:
# Load a third pre-computed resampled training set (features, then labels);
# presumably produced by Borderline-SMOTE, judging by the file names and the
# run label used for it.
x_smote_bl = np.load('../Data/Sampled_Data/x_smote_bl.npy')
y_smote_bl = np.load('../Data/Sampled_Data/y_smote_bl.npy')

In [31]:
# Sanity check: print the distinct label values in the resampled training labels.
print("Unique train labels:", np.unique(y_smote_bl))

Unique train labels: [0 1]


In [32]:
# Per-class sample counts of the resampled labels (shown as the cell result).
pd.Series(y_smote_bl).value_counts()

0    1335735
1    1335735
Name: count, dtype: int64

In [33]:
# Train and evaluate XGBoost on this resampled set. The returned model is not
# assigned to a variable, so it is only available as the cell result.
train_xgb("SMOTE_BorderLine", x_smote_bl, y_smote_bl)


===== XGBOOST + SMOTE_BORDERLINE =====
Best threshold: 0.20184046

Classification Report:
              precision    recall  f1-score   support

           0     0.9881    0.9840    0.9861    454265
           1     0.9359    0.9517    0.9437    111311

    accuracy                         0.9777    565576
   macro avg     0.9620    0.9679    0.9649    565576
weighted avg     0.9778    0.9777    0.9777    565576

Confusion Matrix:
[[447011   7254]
 [  5376 105935]]
ROC-AUC: 0.9943845781996453


In [34]:
# Drop the references to the resampled arrays (presumably to free kernel memory).
del x_smote_bl, y_smote_bl

In [37]:
# Baseline without resampling: train on the original scaled training features
# and binarised labels, so the class imbalance is addressed only through the
# scale_pos_weight setting inside train_xgb.
train_xgb("None", X_train_scaled, y_train_bin)


===== XGBOOST + NONE =====
Best threshold: 0.008634087

Classification Report:
              precision    recall  f1-score   support

           0     0.9793    0.9757    0.9775    454265
           1     0.9022    0.9159    0.9090    111311

    accuracy                         0.9639    565576
   macro avg     0.9407    0.9458    0.9432    565576
weighted avg     0.9641    0.9639    0.9640    565576

Confusion Matrix:
[[443210  11055]
 [  9364 101947]]
ROC-AUC: 0.9899239855274019
