## Handling Class Imbalance

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN

# Apply seaborn's 'whitegrid' style globally so every plot below shares it.
sns.set(style='whitegrid')

In [None]:
# Load the processed, scaled train/validation/test splits from Parquet.

# DATA_DIR must be a pathlib.Path: the `/` joins below are not defined for
# plain strings (str / str raises TypeError).
DATA_DIR = Path('../data/processed/')

X_train_scaled = pd.read_parquet(DATA_DIR / 'X_train.parquet')
X_val_scaled   = pd.read_parquet(DATA_DIR / 'X_val.parquet')
X_test_scaled  = pd.read_parquet(DATA_DIR / 'X_test.parquet')

# pd.read_parquet returns DataFrames, so the label sets are frames as well
# (presumably one label column each -- TODO confirm against the step that
# wrote these files).
y_train = pd.read_parquet(DATA_DIR / 'y_train.parquet')
y_val   = pd.read_parquet(DATA_DIR / 'y_val.parquet')
y_test  = pd.read_parquet(DATA_DIR / 'y_test.parquet')

print("Train shape:", X_train_scaled.shape)
print("Valid shape:", X_val_scaled.shape)
print("Test shape:", X_test_scaled.shape)
print("Class distribution (train):\n", y_train.value_counts())


In [None]:
# Visualise the original class imbalance in the training labels.

# y_train is loaded with pd.read_parquet and is therefore a DataFrame; flatten
# it to a 1-D Series so countplot and the ratio below operate on plain label
# values. Assumes a single label column -- TODO confirm.
train_labels = pd.Series(np.ravel(y_train))
class_counts = train_labels.value_counts()

sns.countplot(x=train_labels)
plt.title("Original class distribution in trainset")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

# Use max/min instead of indexing the counts by the labels 0 and 1, so the
# figure matches the "majority/minority" wording whatever the labels are and
# whichever class happens to be the larger one.
print("Class balance ratio (majority/minority):",
      class_counts.max() / class_counts.min())

In [None]:
# SMOTE (oversampling): resample the training split with synthetic
# minority-class samples. Only the training data is resampled here.

smote = SMOTE(random_state=42)  # fixed seed for reproducible resampling

X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Flatten the resampled labels before counting: y_train is a DataFrame, and
# the resampled labels are expected to come back as a single-column frame,
# which pd.Series(...) cannot wrap directly (it requires 1-D data).
# np.ravel works whether the result is a frame, a Series or an array.
print("After SMOTE class distribution:\n",
      pd.Series(np.ravel(y_train_smote)).value_counts())

In [None]:
# Random undersampling: resample the training split by randomly discarding
# samples from the over-represented class.

rus = RandomUnderSampler(random_state=42)  # fixed seed for reproducibility

X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

# Flatten the resampled labels before counting: they are expected to come
# back as a single-column frame (y_train is a DataFrame), which
# pd.Series(...) cannot wrap directly because it requires 1-D data.
print("After Random UnderSampling:\n",
      pd.Series(np.ravel(y_train_rus)).value_counts())

In [None]:
# Combined resampling (SMOTETomek): SMOTE oversampling followed by Tomek-link
# cleaning, applied to the training split.

smt = SMOTETomek(random_state=42)  # fixed seed for reproducibility

X_train_smt, y_train_smt = smt.fit_resample(X_train_scaled, y_train)

# Flatten the resampled labels before counting: they are expected to come
# back as a single-column frame (y_train is a DataFrame), which
# pd.Series(...) cannot wrap directly because it requires 1-D data.
print("After SMOTE + Tomek:\n",
      pd.Series(np.ravel(y_train_smt)).value_counts())

In [None]:
# Visualise the class distribution of the SMOTE-resampled training labels.

# Pass countplot a flat 1-D Series of labels: y_train_smote is expected to be
# a single-column DataFrame (it mirrors y_train), which is not a valid `x`
# vector. np.ravel also accepts a Series or array unchanged in meaning.
smote_labels = pd.Series(np.ravel(y_train_smote))

sns.countplot(x=smote_labels)
plt.title("Balanced Training Set after SMOTE")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
# Save the SMOTE-balanced training set into the processed-data directory.

# Path(...) makes the `/` joins valid even when DATA_DIR is a plain string
# (str / str raises TypeError); it is harmless if DATA_DIR is already a Path.
output_dir = Path(DATA_DIR)

# index=False: the row index is not written to the Parquet files.
X_train_smote.to_parquet(output_dir / 'X_train_balanced_smote.parquet', index=False)
y_train_smote.to_parquet(output_dir / 'y_train_balanced_smote.parquet', index=False)
