In [129]:
# What is customer churn? - rate at which customers stop using a product or service or cancel subscirptions, over time
#churn=yes =>cutomer terminated their relationship

In [98]:
import numpy as np
import pandas as pd

In [99]:
df=pd.read_csv("churn.csv")

In [100]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [101]:
df.shape

(7043, 21)

In [102]:
#see missing values
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [103]:
# keeping important columns 
columns_to_keep = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'TechSupport',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
    'Churn'
]
df=df[columns_to_keep]

In [107]:
df.shape
df.dtypes
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,Yes,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [108]:
#supervised or unsupervised? - supervised data - contains labels 
# label/output - Churn 
#other cols - features/inputs
#classification or regression? - classification (label is discrete variable - yes/no)
# Using Logistic Regression

In [109]:
# Encode target
df['Churn']=df['Churn'].map({'Yes':1,'No':0})

In [111]:
df['Churn'].value_counts()
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
TechSupport          object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

In [113]:
#Encode categorical features
cat_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'TechSupport',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod'
]

#pd.get_dummies()=>
# Works directly on DataFrames
# Keeps column names readable
# Faster to debug
# Less boilerplate code
# Perfect for EDA + learning phase

# OneHotEncoder=>
# Returns a NumPy array
# Column names are lost (unless extra steps)
# Must be used inside a Pipeline
# Better for production / deployment

df=pd.get_dummies(df,columns=cat_cols,drop_first=True)

In [115]:
df.shape
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,OnlineSecurity_No internet service,OnlineSecurity_Yes,TechSupport_No internet service,TechSupport_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,True,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,True,False,True,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [116]:
df.dtypes.value_counts()

bool       18
int64       3
float64     1
object      1
Name: count, dtype: int64

In [117]:
df.select_dtypes(include='object').columns

Index(['TotalCharges'], dtype='object')

In [118]:
#'TotalCharges' - obejct? - it contains empty strings so pandas reads it as object
#Convert 'TotalCharges' column to float, and handle missing error = 'coerce' to replace with non-numeric values with NaN
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')

In [119]:
#check missing values
df.isnull().sum()

SeniorCitizen                             0
tenure                                    0
MonthlyCharges                            0
TotalCharges                             11
Churn                                     0
gender_Male                               0
Partner_Yes                               0
Dependents_Yes                            0
PhoneService_Yes                          0
MultipleLines_No phone service            0
MultipleLines_Yes                         0
InternetService_Fiber optic               0
InternetService_No                        0
OnlineSecurity_No internet service        0
OnlineSecurity_Yes                        0
TechSupport_No internet service           0
TechSupport_Yes                           0
Contract_One year                         0
Contract_Two year                         0
PaperlessBilling_Yes                      0
PaymentMethod_Credit card (automatic)     0
PaymentMethod_Electronic check            0
PaymentMethod_Mailed check      

In [120]:
#fill missing values with mean of col
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [122]:
df.dtypes.value_counts()

bool       18
int64       3
float64     2
Name: count, dtype: int64

In [124]:
# Train Test Split
from sklearn.model_selection import train_test_split
X=df.drop('Churn',axis=1)
y=df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42,stratify=y)

In [None]:
# Stratify?? - keep the same proportion of classes in train and test
# why stratify important for churn => 
# Churn datasets are imbalanced:
# More people stay
# Fewer people leave

# Without stratify:
# Model may barely see churn cases
# Model learns “predict No Churn always”
# Accuracy looks high but model is useless

# How do you decide WHAT to stratify on?
# Always stratify on the target variable (y)
# Because:
# That’s what you’re predicting
# That’s where imbalance exists

In [130]:
# Quick rule (remember this)
# Classification + imbalance → stratify
# Regression → no stratify needed

In [126]:
X_train.shape
X_test.shape

(1409, 22)

In [127]:
y_train.value_counts(normalize=True)

Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64

In [128]:
y_test.value_counts(normalize=True)

Churn
0    0.734564
1    0.265436
Name: proportion, dtype: float64

In [131]:
X_train

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,OnlineSecurity_No internet service,OnlineSecurity_Yes,TechSupport_No internet service,TechSupport_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3738,0,35,49.20,1701.65,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3151,0,15,75.10,1151.55,True,True,True,True,False,False,...,False,True,False,False,False,False,False,False,False,True
4860,0,13,40.55,590.35,True,True,True,False,True,False,...,False,True,False,True,False,True,False,False,False,True
3867,0,26,73.50,1905.70,False,True,False,True,False,False,...,False,False,False,False,False,True,True,True,False,False
3810,0,1,44.55,44.55,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6303,0,71,109.25,7707.70,False,True,False,True,False,True,...,False,False,False,True,False,True,False,False,True,False
6227,0,2,46.05,80.35,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4673,1,25,102.80,2660.20,False,False,False,True,False,True,...,False,True,False,False,False,False,True,False,False,True
2710,0,24,20.40,482.80,False,True,False,True,False,False,...,True,False,True,False,True,False,False,True,False,False


In [133]:
# Scale numeric columns
num_cols=['tenure','MonthlyCharges','TotalCharges']

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

X_train[num_cols]=scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [135]:
X_train[num_cols].describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,5634.0,5634.0,5634.0
mean,-1.0089350000000001e-17,-2.402527e-16,-5.044677e-18
std,1.000089,1.000089,1.000089
min,-1.322329,-1.544028,-1.002799
25%,-0.9559779,-0.9711977,-0.8315477
50%,-0.1418632,0.1848336,-0.3955259
75%,0.9164859,0.8319124,0.6732591
max,1.608483,1.785939,2.802475


In [136]:
# Why we don’t scale everything
# One-hot columns are already 0/1
# Scaling them adds noise
# Hurts interpretability

In [138]:
# Train Logistic Regression

# 1.max_iter=2000
# What it means
# Logistic Regression learns by iterative optimization
# max_iter = maximum number of iterations allowed to converge

# Why we increased it
# Default is 100 → often not enough for:
# many features (after one-hot encoding)
# scaled + unscaled mix

# If the model doesn’t converge:
# coefficients are unstable
# accuracy suffers silently

# increase max_iter to ensure proper convergence after one-hot encoding


# 2.class_weight='balanced'
# What it means
# It tells the model:
# “Hey, churn cases are fewer — give them more importance.”
# Internally:
# Weight ∝ 1 / class frequency

# So:
# Minority class (Churn = 1) → higher penalty if misclassified
# Majority class → lower penalty

# Why we used it
# Churn data is imbalanced
# Without this, model predicts “No Churn” too often
# Accuracy may look fine but churn recall is terrible

# What it improves
# Recall for churn
# F1-score
# Real-world usefulness

# used class_weight=balanced to handle class imbalance without resampling


# 3.solver='liblinear'
# What is a solver?
# A solver is the math algorithm that finds the best coefficients.

# Why liblinear?
# Works well for small–medium datasets
# Very stable for binary classification
# Handles class_weight properly
# Converges reliably

# When liblinear is preferred
# Binary target
# Not millions of rows
# No multinomial classes

# chose liblinear because it’s stable and well-suited for binary classification with class weighting.

In [140]:
# TRAIN MODEL
from sklearn.linear_model import LogisticRegression
lg=LogisticRegression(max_iter=2000,class_weight='balanced',solver='liblinear')
lg.fit(X_train,y_train)

#Make Prediction
y_pred=lg.predict(X_test)


In [142]:
#Accuracy Score
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7352732434350603

In [143]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.50      0.78      0.61       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.70      1409
weighted avg       0.79      0.74      0.75      1409



In [144]:
#The model prioritizes recall for churn customers, ensuring most potential churners are flagged, even at the cost of some false positives
# Accuracy is 74%, but:
# Churn recall = 78%
# That’s more valuable than an 85% accuracy model with 40% recall

In [152]:
#Improve Accuracy without restraining
# rn log regression usses threshold = 0.5 - use 0.6

In [146]:
y_prob=lg.predict_proba(X_test)[:,1]

In [147]:
y_pred_new=(y_prob>=0.6).astype(int)

In [148]:
print(classification_report(y_test, y_pred_new))
accuracy_score(y_test, y_pred_new)

              precision    recall  f1-score   support

           0       0.88      0.79      0.83      1035
           1       0.55      0.71      0.62       374

    accuracy                           0.77      1409
   macro avg       0.72      0.75      0.73      1409
weighted avg       0.79      0.77      0.78      1409



0.7679205110007097

In [149]:
#When threshold ↑ → recall ↓ and precision ↑
#When threshold ↓ → recall ↑ and precision ↓

In [150]:
#chose a higher decision threshold to improve the reliability of churn predictions.

In [158]:
import pickle
pickle.dump(lg, open("churn_model.pkl", "wb"))
model_bundle = {
    "model": lg,
    "scaler": scaler,
    "columns": X_train.columns,
    "threshold": 0.6
}

pickle.dump(model_bundle, open("churn_model_bundle.pkl", "wb"))
