In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# ⚖️ Handling Imbalanced Data
from imblearn.combine import SMOTEENN                           # Combines SMOTE and ENN for class balancing
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.model_selection import train_test_split 

from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             roc_curve, ConfusionMatrixDisplay)

In [3]:
df = pd.read_csv(r"C:\Users\siddh\OneDrive\Desktop\project\Customer Chun Prediction\WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [4]:
df.columns.tolist()

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [6]:
print("Shape:",df.shape)

Shape: (7043, 21)


In [7]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 2. Exploratory Data Analysis

### 2.1 Basic Statistics

In [20]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


In [21]:
# Categorical summary
df.describe(include=['O'])

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
count,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043
unique,7043,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4,2
top,7590-VHVEG,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
freq,1,3555,3641,4933,6361,3390,3096,3498,3088,3095,3473,2810,2785,3875,4171,2365,5174


In [22]:
df.isnull().any().any()

True

In [23]:
df['TotalCharges'].unique()

array([  29.85, 1889.5 ,  108.15, ...,  346.45,  306.6 , 6844.5 ])

In [24]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

We observe that the `TotalCharges` column contains blank values, so we need to convert it from a string to a numeric type to properly represent the blanks as `NaN`.

### 2.2 Missing Values

In [25]:
# convert TotalCharges which is object to numerical 
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

#Count missing values per column
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### 2.3 Target Distribution

In [27]:
# check the class balance for churn

counts = df['Churn'].value_counts()
percentage = df['Churn'].value_counts(normalize=True)*100

imbalance_table = pd.DataFrame({'Counts':counts,"Percentage (%)":percentage.round(2)})
imbalance_table

Unnamed: 0_level_0,Counts,Percentage (%)
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
No,5174,73.46
Yes,1869,26.54


## 3. Data Cleaning & Preprocessing

### 3.1 Drop rows where TotalCharges is Missing

In [28]:
df = df.dropna(subset=['TotalCharges'])

df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### 3.2 Separate features (X) and target (y)

In [29]:
X = df.drop(['customerID','Churn'],axis = 1)
y = df['Churn'].map({'No':0,'Yes':1})

In [30]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols   = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

### 3.3 Building a pipeline

In [31]:
# -  Scale numerical feature
numerical_transformer = Pipeline(steps=[('scaler',StandardScaler())])

# -  One hot encoder categorical feature 
categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore'
))])

#  - Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


### 3.5 Handle class imbalance using SMOTE + Edited Nearest Neighbors



In [33]:
# Handle class imbalance using SMOTE + Edited Nearest Neighbors
smote_enn = SMOTEENN(random_state=42)

# Full pipeline: preprocessing → resampling
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampler', smote_enn)
])

# Apply transformations
X_resampled, y_resampled = pipeline.fit_resample(X, y)


## 4. Train Test Split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)

## 5. Model Training
Now that our data is preprocessed and split, we’ll train a baseline Random Forest classifier.

In [35]:
rf = RandomForestClassifier(n_estimators=10 , random_state=42,n_jobs=-1)

rf.fit(X_train,y_train)

## 6. Evaluation on Test Set
We’ll evaluate on the hold-out test set using `F1-score` (suitable for imbalanced classes) and `AUC-ROC`. Then we’ll visualize:
* ROC curve
* Confusion matrix
* Learning curve (F1 over increasing train set sizes)

In [36]:
# Generate predictions and probabilities
y_pred  = rf.predict(X_test)              # class labels
y_proba = rf.predict_proba(X_test)[:, 1]  # probability of positive class (Churn)

# Print key metrics
print("Default Random Forest Performance")
print(f"F1-score : {f1_score(y_test, y_pred):.4f}")
print(f"AUC-ROC  : {roc_auc_score(y_test, y_proba):.4f}\n")
print(classification_report(y_test, y_pred))

Default Random Forest Performance
F1-score : 0.9604
AUC-ROC  : 0.9876

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       557
           1       0.96      0.96      0.96       744

    accuracy                           0.95      1301
   macro avg       0.95      0.95      0.95      1301
weighted avg       0.95      0.95      0.95      1301

