In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
from collections import Counter

In [None]:
from scipy.stats.mstats import winsorize

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# Step 2: Load Data
# Read the churn dataset from the notebook's working directory.
churn_csv = "Churn_Modelling.csv"
df = pd.read_csv(churn_csv)

# Preview the first rows as plain text to confirm the file parsed as expected.
first_rows = df.head()
print(first_rows)


In [None]:
# Check class imbalance: fraction of rows in each `Exited` class.
exited_share = df['Exited'].value_counts(normalize=True)
print(exited_share)

In [None]:
# Step 3: Select Features & Target
# Drop non-useful identifier columns.
# errors="ignore" keeps this cell safe to re-run: `df` is reassigned here, so on
# a second execution the columns are already gone and a plain drop would raise
# KeyError.
ID_COLUMNS = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=ID_COLUMNS, errors="ignore")

In [None]:
# One-hot encode the remaining categorical columns.
# drop_first=True omits the first level of each encoded column.
encoded = pd.get_dummies(data=df, drop_first=True)
df = encoded
df.head()

In [None]:
# Target is the `Exited` column; every other column is a feature.
y = df["Exited"]
X = df.drop(columns="Exited")

feature_shape = X.shape
print("Feature shape:", feature_shape)

### Winsorizing

* Problem it solves:

  Sometimes outliers are too extreme and may harm the   model even after scaling.

  Example: a few customers with unrealistic Balance   values (like 10× larger than typical).

* What it does:

  Instead of removing rows, caps extreme values at  certain percentiles.

  Example: 1st percentile → values below this are set   to that percentile value.

  99th percentile → values above this are capped at   that level.

* When to use:

  When you don’t want to drop records but still need  to reduce outlier impact.
  
  Safer than deleting rows since you keep all data.

In [None]:
# Step 4: Winsorizing
# `winsorize` is already imported at the top of the notebook, so it is not
# re-imported in this cell.

# Columns to cap, and the fraction of values to clip on the (lower, upper) tail.
WINSOR_COLUMNS = ['CreditScore', 'Balance', 'EstimatedSalary']
WINSOR_LIMITS = [0.01, 0.01]

print("Before winsorization:")
print(X[WINSOR_COLUMNS].describe())

for col in WINSOR_COLUMNS:
    # winsorize returns a masked array; .data extracts the plain ndarray so the
    # column is stored as an ordinary numeric array.
    X[col] = winsorize(X[col], limits=WINSOR_LIMITS).data

# Count missing values across all features after the transformation.
nan_total = int(X.isna().sum().sum())
print("NaN count before filling:", nan_total)

# Fallback: fill only when something is actually missing, and say so, so that a
# zero-fill never happens silently.
if nan_total > 0:
    print("Warning: filling", nan_total, "missing values with 0")
    X = X.fillna(0)

print("After winsorization:")
print(X[WINSOR_COLUMNS].describe())


### Robust Scaling

* Problem it solves:

  Features like Balance or Salary can have extreme   values (outliers).
  
  Standard scaling (z-score) and MinMax scaling are heavily affected by outliers: a few extreme values stretch the scale and compress the remaining data into a narrow range.
  
* What it does:

  Instead of using mean and standard deviation  (sensitive to outliers), RobustScaler uses:

  Median (central point)

  Interquartile range (IQR = Q3 − Q1) (spread of  middle 50%)

  So outliers don’t distort the scaling.

* Robust Scaling Formula

  For a feature $x$:

  $$
  x' = \frac{x - \text{median}(x)}{\text{IQR}(x)}
  $$

  where:

  - $\text{median}(x)$ = 50th percentile (middle value)  
  - $\text{IQR}(x) = Q_3 - Q_1$ = difference between 75th percentile (Q3) and 25th percentile (Q1)

* When to use:

  When dataset has skewed distributions or many  outliers.

  E.g., in churn data, one customer may have an   extremely high balance compared to others.

In [None]:
# Step 5: Robust Scaling

# NOTE(review): the scaler is fitted on the full feature matrix, before the
# train/test split in Step 6, so test rows contribute to the fitted statistics.
# Consider fitting on the training split only.
scaler = RobustScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep row labels aligned with X (and therefore with y)
)

In [None]:
# Show the first rows of the scaled feature table.
print("Scaled data sample:")
scaled_preview = X_scaled.head()
print(scaled_preview)

In [None]:
# Toy example: how RobustScaler treats a single extreme value (100).
# The demo uses its own `demo_scaler` so that the `scaler` fitted on the churn
# features in Step 5 is not overwritten. numpy and RobustScaler are already
# imported at the top of the notebook.
data = np.array([[1], [2], [3], [4], [100]])

demo_scaler = RobustScaler()
scaled_data = demo_scaler.fit_transform(data)

print("Original:\n", data.flatten())
print("Robust Scaled:\n", scaled_data.flatten())

In [None]:
# Step 6: Train-Test Split
# Hold out 30% of the rows for testing, stratified on y, with a fixed seed so
# the split is repeatable.
split_parts = train_test_split(
    X_scaled,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)
X_train, X_test, y_train, y_test = split_parts





In [None]:
# Step 7: Baseline Model (Without Resampling)
# Class counts in the untouched training labels.
train_class_counts = np.bincount(y_train)
print("Class distribution", train_class_counts)

# Fit logistic regression on the original training split (fit returns the
# estimator itself), then predict on the held-out test split.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Baseline Results:")
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

**Handling Class Imbalance (Undersampling / OverSampling)**

* Problem it solves:

  In churn prediction (like our dataset), usually:
  
  80–90% customers stay (Exited=0)
  
  10–20% churn (Exited=1)
  
  A model trained on this will just predict “No   churn” most of the time → high accuracy, but  useless for detecting churners.




* **Undersampling**: Reduce the majority class (e.g., keep only 2k “No churn” customers to match the 2k “Churn” ones).

  Fast, simple
  
  Risk of losing important majority data

In [None]:
# Random Undersampling
# Resample the training split only; the test split is left untouched.
undersample = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

under_counts = np.bincount(y_train_under)
print("Class distribution after random undersampling:", under_counts)

# Refit the shared classifier on the resampled data, then predict on the test split.
y_pred_under = clf.fit(X_train_under, y_train_under).predict(X_test)

print("Undersampling Results:")
under_report = classification_report(y_test, y_pred_under)
print(under_report)


### Tomek Links (Undersampling)

- A **Tomek Link** is a pair of samples:
  - They belong to **different classes** (e.g., churned vs not churned).
  - They are **each other’s nearest neighbor**.

- These pairs usually occur at the **class boundary** where overlap or confusion exists.

- **How it works:**
  - Identify Tomek Link pairs.
  - Remove the **majority class sample** from each pair.
  - This cleans the boundary and reduces class overlap.

Benefit: Unlike random undersampling, Tomek Links remove only the **problematic majority samples** that confuse the classifier.


In [None]:
# TomekLinks Undersampling
# Resample the training split only; the test split is left untouched.
TL_undersample = TomekLinks(sampling_strategy="auto")
X_train_TL, y_train_TL = TL_undersample.fit_resample(X_train, y_train)

print("Class distribution after TomekLinks undersampling:", np.bincount(y_train_TL))

# Refit the shared classifier on the cleaned training data.
clf.fit(X_train_TL, y_train_TL)
y_pred_TL = clf.predict(X_test)

# Label the report with the technique name so its output can be told apart
# from the reports printed by the other resampling cells.
print("TomekLinks Undersampling Results:")
print(classification_report(y_test, y_pred_TL))


### Edited Nearest Neighbors (ENN) Undersampling

- **Idea:** ENN removes **noisy samples** that don’t agree with their neighbors.  
- For each data point, check its *k nearest neighbors* (commonly k=3).  
- If the point’s class label is **different from the majority of its neighbors**, it is considered noise and removed.  

**Benefit:**  
- Cleans the dataset by removing mislabeled or out-of-place points.  
- Helps classifiers build clearer decision boundaries.  

**Note:**  
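- ENN can remove **both majority and minority samples**, unlike Tomek Links which usually remove only majority samples. Note that imbalanced-learn's `EditedNearestNeighbours` defaults to `sampling_strategy="auto"`, which cleans every class except the minority, so the cell below removes majority samples only.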


In [None]:
# ENN Undersampling
# Resample the training split only; the test split is left untouched.
ENN_undersample = EditedNearestNeighbours()
X_train_ENN, y_train_ENN = ENN_undersample.fit_resample(X_train, y_train)

print("Class distribution after ENN undersampling:", np.bincount(y_train_ENN))

# Refit the shared classifier on the cleaned training data.
clf.fit(X_train_ENN, y_train_ENN)
y_pred_ENN = clf.predict(X_test)

# Label the report with the technique name so its output can be told apart
# from the reports printed by the other resampling cells.
print("ENN Undersampling Results:")
print(classification_report(y_test, y_pred_ENN))

**Random Over Sampler:** Random oversampling randomly chooses minority samples (with replacement) and adds duplicates until class counts match.

In [None]:
# Random Oversampling
# Resample the training split only; the test split is left untouched.
ROS = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ROS.fit_resample(X_train, y_train)

ros_counts = np.bincount(y_train_ros)
print("Class distribution after Random Over Sampler:", ros_counts)

# Refit the shared classifier on the resampled data, then predict on the test split.
y_pred_ros = clf.fit(X_train_ros, y_train_ros).predict(X_test)

print("Random Over Sampler Results:")
ros_report = classification_report(y_test, y_pred_ros)
print(ros_report)

### SMOTE (Synthetic Minority Oversampling Technique)

- **Idea:** Instead of duplicating minority samples (like Random Oversampling), SMOTE creates **synthetic samples**.
- **How it works:**
  1. For each minority sample, find its *k nearest minority neighbors* (default k=5).
  2. Randomly choose one neighbor.
  3. Create a synthetic sample **between the two points** by interpolation.
- **Result:** The minority class grows with *new, artificial points* that are not exact copies.

 **Benefit:** Reduces overfitting (compared to simple duplication).  
 **Limitation:** Can generate samples in regions where classes overlap → may introduce noise.

---

In [None]:
# SMOTE Oversampling
# Resample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

smote_counts = np.bincount(y_train_smote)
print("Class distribution after SMOTE:", smote_counts)

# Refit the shared classifier on the resampled data, then predict on the test split.
y_pred_smote = clf.fit(X_train_smote, y_train_smote).predict(X_test)

print("SMOTE Results:")
smote_report = classification_report(y_test, y_pred_smote)
print(smote_report)


### Borderline-SMOTE

- **Idea:** A smarter version of SMOTE that only generates synthetic samples for **minority points near the decision boundary**.
- **How it works:**
  1. Identify minority samples whose neighbors are **mostly majority class** → these are "borderline" points.
  2. Generate synthetic samples **around these borderline cases**.
- **Result:** Focuses oversampling where it matters most: the **class boundary**.

**Benefit:** Strengthens the classifier in the hardest-to-learn region (the boundary).  
**Limitation:** May overfit borderline noise if the boundary is very fuzzy.

In [None]:
# BorderlineSMOTE
# Resample the training split only; the test split is left untouched.
BLsmote = BorderlineSMOTE(random_state=42)
X_train_blsmote, y_train_blsmote = BLsmote.fit_resample(X_train, y_train)

blsmote_counts = np.bincount(y_train_blsmote)
print("Class distribution after BL SMOTE:", blsmote_counts)

# Refit the shared classifier on the resampled data, then predict on the test split.
y_pred_blsmote = clf.fit(X_train_blsmote, y_train_blsmote).predict(X_test)

print("BL SMOTE Results:")
blsmote_report = classification_report(y_test, y_pred_blsmote)
print(blsmote_report)


* When to use class-imbalance handling (resampling):

  Whenever target variable distribution is skewed   (imbalanced).
  
  Critical in classification tasks like fraud   detection, churn prediction, medical diagnosis.