# Assignment 3
Objective - To balance an imbalanced dataset and evaluate the performance of various techniques.

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTEENN
from sklearn.utils.class_weight import compute_class_weight
from sklearn import datasets
from collections import Counter

In [None]:
X,y = datasets.make_classification(n_samples=10000, n_features=20, n_informative=15, n_redundant=5, random_state=42,weights=[0.8,0.2])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random Under Sampling
print("Original class distribution:", Counter(y_train))
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_rus))

Original class distribution: Counter({0: 6409, 1: 1591})
Original class distribution: Counter({0: 1591, 1: 1591})


In [None]:
# Random Over Sampling
print("Original class distribution:", Counter(y_train))
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_ros))

Original class distribution: Counter({0: 6409, 1: 1591})
Original class distribution: Counter({0: 6409, 1: 6409})


In [None]:
# SMOTE(Synthetic Minority Oversampling Techniqu)
print("Original class distribution:", Counter(y_train))
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_smote))

Original class distribution: Counter({0: 6409, 1: 1591})
Original class distribution: Counter({0: 6409, 1: 6409})


In [None]:
# Tomek Links
print("Original class distribution:", Counter(y_train))
tl = TomekLinks()
X_tl, y_tl = tl.fit_resample(X_train, y_train)
print("Original class distribution:", Counter(y_tl))

Original class distribution: Counter({0: 6409, 1: 1591})
Original class distribution: Counter({0: 6366, 1: 1591})


In [None]:
# Class Weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))
class_weights

array([0.62412233, 2.51414205])

In [None]:
# Function to evaluate model performance
def evaluate_model(X_train, y_train, X_test, y_test, class_weights=None):
    model = RandomForestClassifier(class_weight=class_weights, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob))

In [None]:
# Evaluate each sampling technique
print("Original:")
evaluate_model(X_train, y_train, X_test, y_test)
print("\nRandom Undersampling:")
evaluate_model(X_rus, y_rus, X_test, y_test)

print("\nRandom Oversampling:")
evaluate_model(X_ros, y_ros, X_test, y_test)

print("\nSMOTE:")
evaluate_model(X_smote, y_smote, X_test, y_test)

print("\nTomek Links:")
evaluate_model(X_tl, y_tl, X_test, y_test)

print("\nClass Weights:")
evaluate_model(X_train, y_train, X_test, y_test, class_weights_dict)

Original:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1571
           1       0.98      0.75      0.85       429

    accuracy                           0.94      2000
   macro avg       0.96      0.87      0.91      2000
weighted avg       0.95      0.94      0.94      2000

AUC: 0.9725702898841028

Random Undersampling:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1571
           1       0.83      0.88      0.86       429

    accuracy                           0.94      2000
   macro avg       0.90      0.92      0.91      2000
weighted avg       0.94      0.94      0.94      2000

AUC: 0.9724834893517262

Random Oversampling:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1571
           1       0.94      0.79      0.86       429

    accuracy                           0.94      2000
   macro avg       0.94  