In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the cleaned Telco churn extract and preview its first rows.
cleaned_csv_path = "../data/processed/telco_churn_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,MonthlyRevenue,AnnualRevenue,RevenueLost,TenureGroup,HighValueCustomer
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,Yes,Electronic check,29.85,29.85,0,29.85,358.2,0.0,0-1 Year,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,Mailed check,56.95,1889.5,0,56.95,683.4,0.0,2-4 Years,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,Yes,Mailed check,53.85,108.15,1,53.85,646.2,53.85,0-1 Year,0
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,No,Bank transfer (automatic),42.3,1840.75,0,42.3,507.6,0.0,2-4 Years,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,Yes,Electronic check,70.7,151.65,1,70.7,848.4,70.7,0-1 Year,0


In [3]:
# Target vector: the Churn column of the cleaned frame.
y = df.loc[:, 'Churn']


In [4]:
# Feature matrix: every column of df except the ones listed below.
# NOTE(review): the df preview shows an 'Unnamed: 0' column that looks like a
# saved row index; it is NOT excluded here and therefore becomes a feature.
# 'MonthlyRevenue' also appears identical to 'MonthlyCharges' in the preview.
# Confirm both, and if they are dropped, drop them in the scoring cell too.
excluded_columns = [
    'Churn',          # prediction target (kept separately as y)
    'RevenueLost',    # leakage (post-churn info)
    'AnnualRevenue',  # derived from MonthlyCharges
]
X = df.drop(columns=excluded_columns)


In [5]:
# One-hot encode the categorical columns of X; drop_first=True omits the first
# level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the split is stratified on y and uses
# a fixed seed so it can be reproduced.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression fitted directly on the training features,
# with no scaling step in front of it.
lr = LogisticRegression(max_iter=1000)
lr.fit(X=X_train, y=y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Names of the continuous columns.
# NOTE(review): no cell visible in this notebook reads this list; the pipelines
# below pass every column through StandardScaler.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'MonthlyRevenue']


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [10]:
# Standardise the features, then fit a logistic regression on the scaled values.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('model', LogisticRegression(max_iter=3000))
lr_pipeline = Pipeline(steps=[scaling_step, classifier_step])


In [11]:
# Train on the training split, then score the hold-out split.
lr_pipeline.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (churn).
y_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]
y_pred_lr = lr_pipeline.predict(X_test)


In [12]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision/recall/F1 and ROC-AUC of the scaled logistic regression
# on the hold-out split.
lr_report = classification_report(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_prob_lr)

print(lr_report)
print("ROC-AUC:", lr_auc)


              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1291
           1       0.64      0.52      0.58       467

    accuracy                           0.80      1758
   macro avg       0.74      0.71      0.72      1758
weighted avg       0.79      0.80      0.79      1758

ROC-AUC: 0.8389749824596905


In [13]:
import numpy as np
from sklearn.metrics import classification_report


In [14]:
# Re-label the hold-out predictions with a lower probability cutoff: a customer
# is flagged as churn when the predicted probability is at least `threshold`.
# NOTE(review): the cutoff is being judged on the test split itself; consider
# choosing it on a validation split instead.
threshold = 0.35

is_flagged = y_prob_lr >= threshold
y_pred_custom = is_flagged.astype(int)

custom_report = classification_report(y_test, y_pred_custom)
print(custom_report)


              precision    recall  f1-score   support

           0       0.89      0.79      0.84      1291
           1       0.56      0.73      0.63       467

    accuracy                           0.77      1758
   macro avg       0.72      0.76      0.73      1758
weighted avg       0.80      0.77      0.78      1758



In [15]:
# Same scaler + logistic regression pipeline, but class 1 is given twice the
# weight of class 0 during fitting.
# NOTE(review): this rebinds `lr_pipeline`, replacing the unweighted pipeline
# defined earlier in the notebook.
weighted_lr = LogisticRegression(max_iter=3000, class_weight={0: 1, 1: 2})
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', weighted_lr),
])


In [16]:
# Fit the current `lr_pipeline` and report its hold-out performance.
# This overwrites y_pred_lr / y_prob_lr from the earlier fit.
lr_pipeline.fit(X_train, y_train)

y_pred_lr = lr_pipeline.predict(X_test)
y_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]

weighted_report = classification_report(y_test, y_pred_lr)
print(weighted_report)


              precision    recall  f1-score   support

           0       0.89      0.77      0.83      1291
           1       0.54      0.74      0.63       467

    accuracy                           0.77      1758
   macro avg       0.72      0.76      0.73      1758
weighted avg       0.80      0.77      0.78      1758



In [17]:
from sklearn.ensemble import GradientBoostingClassifier


In [18]:
# Gradient boosting on the same split: fit, score the hold-out set, and print
# the classification report and ROC-AUC.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "random_state": 42,
}
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
y_prob_gb = gb.predict_proba(X_test)[:, 1]  # probability of class 1

gb_report = classification_report(y_test, y_pred_gb)
gb_auc = roc_auc_score(y_test, y_prob_gb)
print(gb_report)
print("ROC-AUC:", gb_auc)


              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1291
           1       0.63      0.54      0.58       467

    accuracy                           0.79      1758
   macro avg       0.73      0.71      0.72      1758
weighted avg       0.78      0.79      0.79      1758

ROC-AUC: 0.8339550536824698


In [22]:
import os

# Show what is currently stored in the model output directory.
models_dir = "../models"
os.listdir(models_dir)


[]

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Random forest with balanced class weights. This is the model that gets
# persisted and used for scoring later in the notebook, so it is evaluated on
# the hold-out split in the same way as the logistic regression and
# gradient-boosting models.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train, y_train)

# Hold-out evaluation; column 1 of predict_proba is the probability of class 1.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


In [25]:
# Display the random forest estimator as this cell's output.
rf

In [26]:
import os

import joblib

# Persist the fitted random forest for the scoring step.
# The target directory is created first: the earlier listing showed it empty,
# and empty directories are typically absent from a fresh checkout, in which
# case the dump would fail because the parent directory does not exist.
os.makedirs("../models", exist_ok=True)

joblib.dump(rf, "../models/churn_model.pkl")


['../models/churn_model.pkl']

In [27]:
import pandas as pd
import joblib

# Score every customer in the cleaned dataset with the persisted model and
# write the churn probability alongside the original columns.
# NOTE(review): this is the same file the model was trained on, so most of
# these probabilities are in-sample; use them for reporting, not for judging
# model quality.

# load cleaned data
df = pd.read_csv("../data/processed/telco_churn_cleaned.csv")

# prepare features (same column drops and one-hot encoding as at training time)
X = df.drop(columns=['Churn', 'RevenueLost', 'AnnualRevenue'])
X = pd.get_dummies(X, drop_first=True)

# load trained model
model = joblib.load("../models/churn_model.pkl")

# Align the scoring matrix with the columns the model was fitted on, in the
# same order. A one-hot column missing from this data (category not present)
# becomes all zeros; columns the model never saw are dropped. Estimators that
# do not expose `feature_names_in_` are scored on X unchanged.
expected_columns = getattr(model, "feature_names_in_", None)
if expected_columns is not None:
    X = X.reindex(columns=expected_columns, fill_value=0)

# predict probabilities (column 1 = probability of class 1, i.e. churn)
df['ChurnProbability'] = model.predict_proba(X)[:, 1]

# save new file
df.to_csv("../data/processed/telco_churn_with_predictions.csv", index=False)
