# Bias Mitigation using SMOTE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE

In [None]:
# Build a synthetic, heavily imbalanced binary dataset: two informative
# features and a 95/5 class weighting (class 1 = defaulters).
X, y = make_classification(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    weights=[0.95, 0.05],  # 5% defaulters
    random_state=42,
)

df = pd.DataFrame(X, columns=["Credit_Utilization", "Income_Risk"])
df["Default"] = y

# Print the class proportions explicitly: a bare expression is only echoed
# when it is the last statement of a notebook cell, so without print() this
# result was silently discarded.
print(df["Default"].value_counts(normalize=True))
df.info()

In [None]:
# Scatter the original data, colouring each point by its Default label, to
# show how the two classes are distributed before any resampling.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    df["Credit_Utilization"],
    df["Income_Risk"],
    c=df["Default"],
    cmap="coolwarm",
    alpha=0.6,
)
ax.set_title("Imbalanced Dataset (Before SMOTE)")
ax.set_xlabel("Credit Utilization")
ax.set_ylabel("Income Risk")
plt.show()

### How to interpret: Dominant rule learned by the model. Because most of the plot is blue, the model learns a rule like: “If a point lies anywhere along this diagonal cloud → classify as BLUE (safe).”
### This rule alone already classifies most points correctly. Many red points lie inside the blue diagonal. They do not form a clean red-only region, so when the model evaluates a red point whose neighbours are mostly blue, it assigns that point to the blue class.

### Result: Red → predicted as blue (risky person → predicted safe). Note that these are false negatives.

In [None]:
# Hold out 30% of the rows for evaluation; stratifying on the target keeps
# the class ratio the same in the train and test partitions.
features = df[["Credit_Utilization", "Income_Risk"]]
target = df["Default"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=42,
)

# With 5000 rows and test_size=0.3, 3500 rows are used for training and 1500
# rows are held out. Predictions are made for those 1500 rows, so the
# classification report and confusion matrix describe the 1500 test rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))



In [None]:
# Confusion matrix for the baseline model on the test split.
cm = confusion_matrix(y_test, y_pred)

# Wrap the raw matrix in a DataFrame, then label rows with the actual class
# and columns with the predicted class so the printout is self-explanatory.
cm_df = pd.DataFrame(cm)
cm_df.index = ["Actual SAFE (Blue)", "Actual RISKY (Red)"]
cm_df.columns = ["Predicted SAFE (Blue)", "Predicted RISKY (Red)"]

print(cm_df)

In [None]:
# Resample the TRAINING split only with SMOTE; the test split is not passed
# in, so evaluation is still done on the original, un-resampled test data.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Print the resampled class counts explicitly: as a bare expression in the
# middle of the cell, the result was never displayed.
print(pd.Series(y_train_sm).value_counts())
X_train_sm.info()

#### After SMOTE, the training dataset grows (synthetic minority-class samples are added), but the test data remains the same. This helps the model learn better, because it now sees many more examples of the minority class that was under-represented in the imbalanced dataset.

In [None]:
# Scatter the SMOTE-resampled training data, coloured by the resampled
# labels, for comparison with the plot of the original dataset.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    X_train_sm["Credit_Utilization"],
    X_train_sm["Income_Risk"],
    c=y_train_sm,
    cmap="coolwarm",
    alpha=0.5,
)
ax.set_title("Balanced Dataset (After SMOTE)")
ax.set_xlabel("Credit Utilization")
ax.set_ylabel("Income Risk")
plt.show()



### Here the red (default) samples are far more dominant than before due to SMOTE, so the model learns more about the default class. Hence it classifies red points better, helping the bank avoid giving credit to people who may default.

In [None]:
# Fit a second logistic regression on the SMOTE-resampled training data and
# evaluate it on the untouched test split (X_test / y_test).
model_smote = LogisticRegression().fit(X_train_sm, y_train_sm)
y_pred_smote = model_smote.predict(X_test)

smote_report = classification_report(y_test, y_pred_smote)
print(smote_report)


#### We see a significant improvement in the recall and F1-score for the minority class (class 1) when SMOTE is used, compared to the model trained without it. The model without SMOTE might have a recall of 0 for the minority class, indicating that it failed to identify any positive cases.

In [None]:
# Confusion matrix for the SMOTE-trained model on the test split.
cm = confusion_matrix(y_test, y_pred_smote)

# Wrap the raw matrix in a DataFrame, then label rows with the actual class
# and columns with the predicted class so the printout is self-explanatory.
cm_df = pd.DataFrame(cm)
cm_df.index = ["Actual SAFE (Blue)", "Actual RISKY (Red)"]
cm_df.columns = ["Predicted SAFE (Blue)", "Predicted RISKY (Red)"]

print(cm_df)