In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib

In [2]:
X_train = np.load('../Data/X_train.npy')
X_test = np.load('../Data/X_test.npy')

X_train_scaled = np.load('../Data/X_train_scaled.npy')
X_test_scaled = np.load('../Data/X_test_scaled.npy')

y_train = np.load('../Data/y_train.npy')
y_test = np.load('../Data/y_test.npy')


print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

y_train_bin = (y_train != 0).astype(int)
print(pd.Series(y_train_bin).value_counts())


Train shape: (2262300, 78)
Test shape: (565576, 78)
0    1817055
1     445245
Name: count, dtype: int64


In [3]:
print("Before balancing:")
print(pd.Series(y_train).value_counts())

Before balancing:
0     1817055
4      184099
10     127043
2      102420
3        8234
7        6348
11       4717
6        4637
5        4399
1        1565
12       1206
14        522
9          29
13         17
8           9
Name: count, dtype: int64


In [4]:
attack_total = np.sum(y_train_bin == 1)

rus = RandomUnderSampler(
    sampling_strategy={0: attack_total * 3, 1: attack_total},
    random_state=42
)

X_under, y_under = rus.fit_resample(X_train_scaled, y_train_bin)
print("After RUS:\n", pd.Series(y_under).value_counts())


After RUS:
 0    1335735
1     445245
Name: count, dtype: int64


In [5]:
def run_sampler(name, sampler, X, y):
    print(f"\n===== {name} =====")

    X_res, y_res = sampler.fit_resample(X, y)

    print("Class distribution:")
    print(pd.Series(y_res).value_counts())
    print("Shape:", X_res.shape)

    return X_res, y_res


In [6]:
smote = SMOTE(
    sampling_strategy="auto",
    k_neighbors=5,
    random_state=42
)

X_smote, y_smote = run_sampler("Vanilla SMOTE", smote, X_under, y_under)


===== Vanilla SMOTE =====
Class distribution:
0    1335735
1    1335735
Name: count, dtype: int64
Shape: (2671470, 78)


In [7]:
np.save("../Data/Sampled_Data/X_smote.npy", X_smote); np.save("../Data/Sampled_Data/y_smote.npy", y_smote)
del X_smote, y_smote

In [8]:
X_smote = np.load("../Data/Sampled_Data/X_smote.npy")
y_smote = np.load("../Data/Sampled_Data/y_smote.npy")

In [9]:
from sklearn.model_selection import train_test_split

X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(
    X_smote,
    y_smote,
    test_size=0.2,
    random_state=42,
    stratify=y_smote
)


In [10]:
rf = RandomForestClassifier(
    n_estimators=200,      # strong but still memory safe
    max_depth=None,        # let trees grow fully (RF handles it well)
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,             # use all CPU cores
    random_state=42
)

rf.fit(X_train_rf, y_train_rf)

In [11]:
y_pred = rf.predict(X_val_rf)
y_prob = rf.predict_proba(X_val_rf)[:, 1]


In [12]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

print("Classification Report:\n")
print(classification_report(y_val_rf, y_pred, digits=4))

print("Confusion Matrix:\n")
print(confusion_matrix(y_val_rf, y_pred))

roc_auc = roc_auc_score(y_val_rf, y_prob)
print("ROC-AUC Score:", round(roc_auc, 4))


Classification Report:

              precision    recall  f1-score   support

           0     0.9992    0.9990    0.9991    267147
           1     0.9990    0.9992    0.9991    267147

    accuracy                         0.9991    534294
   macro avg     0.9991    0.9991    0.9991    534294
weighted avg     0.9991    0.9991    0.9991    534294

Confusion Matrix:

[[266890    257]
 [   209 266938]]
ROC-AUC Score: 0.9997


In [13]:
joblib.dump(rf, "../Data/Sampled_Data/rf_smote.pkl")

['../Data/Sampled_Data/rf_smote.pkl']

In [17]:
from imblearn.over_sampling import KMeansSMOTE

kmeans_smote = KMeansSMOTE(
    sampling_strategy="auto",
    k_neighbors=5,
    kmeans_estimator=500, 
    random_state=42,
    n_jobs=-1
)

X_smote_kmeans, y_smote_kmeans = run_sampler("KMeans SMOTE", kmeans_smote, X_under, y_under)


===== KMeans SMOTE =====
Class distribution:
1    1335837
0    1335735
Name: count, dtype: int64
Shape: (2671572, 78)


In [18]:
from sklearn.model_selection import train_test_split

X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(
    X_smote_kmeans,
    y_smote_kmeans,
    test_size=0.2,
    random_state=42,
    stratify=y_smote_kmeans
)

In [19]:
rf = RandomForestClassifier(
    n_estimators=200,      # strong but still memory safe
    max_depth=None,        # let trees grow fully (RF handles it well)
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,             # use all CPU cores
    random_state=42
)

rf.fit(X_train_rf, y_train_rf)

In [20]:
y_pred = rf.predict(X_val_rf)
y_prob = rf.predict_proba(X_val_rf)[:, 1]

In [22]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

print("Classification Report:\n")
print(classification_report(y_val_rf, y_pred, digits=4))

print("Confusion Matrix:\n")
print(confusion_matrix(y_val_rf, y_pred))

roc_auc = roc_auc_score(y_val_rf, y_prob)
print("ROC-AUC Score:", round(roc_auc, 4))

Classification Report:

              precision    recall  f1-score   support

           0     0.9991    0.9992    0.9992    267147
           1     0.9992    0.9991    0.9992    267168

    accuracy                         0.9992    534315
   macro avg     0.9992    0.9992    0.9992    534315
weighted avg     0.9992    0.9992    0.9992    534315

Confusion Matrix:

[[266934    213]
 [   231 266937]]
ROC-AUC Score: 0.9998


In [23]:
joblib.dump(rf, "../Data/Sampled_Data/rf_smote_kmeans.pkl")

['../Data/Sampled_Data/rf_smote_kmeans.pkl']

In [21]:
np.save("../Data/Sampled_Data/X_smote_kmeans.npy", X_smote_kmeans); np.save("../Data/Sampled_Data/y_smote_kmeans.npy", y_smote_kmeans)
del X_smote_kmeans, y_smote_kmeans

In [25]:
from imblearn.over_sampling import BorderlineSMOTE

Bl_smote = BorderlineSMOTE(
    sampling_strategy="auto",
    k_neighbors=5,
    random_state=42,
)

X_smote_bl, y_smote_bl = run_sampler("Borderline SMOTE", Bl_smote, X_under, y_under)


===== Borderline SMOTE =====
Class distribution:
0    1335735
1    1335735
Name: count, dtype: int64
Shape: (2671470, 78)


In [26]:
from sklearn.model_selection import train_test_split

X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(
    X_smote_bl ,
    y_smote_bl,
    test_size=0.2,
    random_state=42,
    stratify=y_smote_bl
)

In [28]:
rf = RandomForestClassifier(
    n_estimators=200,      # strong but still memory safe
    max_depth=None,        # let trees grow fully (RF handles it well)
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,             # use all CPU cores
    random_state=42
)

rf.fit(X_train_rf, y_train_rf)

In [29]:
y_pred = rf.predict(X_val_rf)
y_prob = rf.predict_proba(X_val_rf)[:, 1]

In [30]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

print("Classification Report:\n")
print(classification_report(y_val_rf, y_pred, digits=4))

print("Confusion Matrix:\n")
print(confusion_matrix(y_val_rf, y_pred))

roc_auc = roc_auc_score(y_val_rf, y_prob)
print("ROC-AUC Score:", round(roc_auc, 4))

Classification Report:

              precision    recall  f1-score   support

           0     0.9994    0.9989    0.9991    267147
           1     0.9989    0.9994    0.9991    267147

    accuracy                         0.9991    534294
   macro avg     0.9991    0.9991    0.9991    534294
weighted avg     0.9991    0.9991    0.9991    534294

Confusion Matrix:

[[266844    303]
 [   159 266988]]
ROC-AUC Score: 0.9998


In [31]:
joblib.dump(rf, "../Data/Sampled_Data/rf_smote_bl.pkl")

['../Data/Sampled_Data/rf_smote_bl.pkl']

In [32]:
np.save("../Data/Sampled_Data/X_smote_bl.npy", X_smote_bl); np.save("../Data/Sampled_Data/y_smote_bl.npy", y_smote_bl)
del X_smote_bl, y_smote_bl