In [94]:
import pandas as pd

In [95]:
import os
from pathlib import Path

# Location of the raw Telco churn CSV. The default is the original
# machine-specific path, so existing runs behave exactly as before; set the
# CHURN_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "CHURN_DATA_PATH",
        r"C:\Users\Mada Samhitha\customer_churn_ml\data\raw\telco_customer_churn.csv",
    )
)
df = pd.read_csv(DATA_PATH)

In [96]:
# (rows, columns) of the raw dataset
df.shape

(7043, 21)

In [97]:
# Preview the first five rows
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [98]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [99]:
# Absolute number of rows per target class
df["Churn"].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [100]:
# Share of rows per target class (normalize=True returns proportions)
df["Churn"].value_counts(normalize=True)

Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

The target variable Churn is moderately imbalanced, with a higher proportion of non-churning customers. Therefore, accuracy alone may be misleading, and metrics such as precision, recall, F1-score, and ROC–AUC will be more appropriate for model evaluation.

In [101]:
# Inspect how pandas parsed the TotalCharges column
df["TotalCharges"].dtype

dtype('O')

In [102]:
# TotalCharges was read as dtype object; coerce it to numbers.
# errors="coerce" turns every entry that cannot be parsed into NaN.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric

# Number of NaNs present after the coercion
total_charges_numeric.isna().sum()

np.int64(11)

In [103]:
# Replace the NaNs in TotalCharges with 0
missing_total_charges = df["TotalCharges"].isna()
df.loc[missing_total_charges, "TotalCharges"] = 0

In [104]:
# Target column (still the raw labels at this point)
y = df["Churn"]
# Feature matrix: every column except the target and the customerID identifier
X = df.drop(columns=["Churn", "customerID"])

The customerID column was dropped from the feature set because it is a unique identifier and does not contain predictive information. Including such identifiers can lead to memorization and misleading feature importance without improving generalization.

In [105]:
# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, int64/float64 columns as numerical.
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

(categorical_cols, numerical_cols)

(Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod'],
       dtype='object'),
 Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object'))

The dataset contains a mix of categorical and numerical features. Categorical variables will require encoding, while numerical features may benefit from scaling. To ensure clean preprocessing and avoid data leakage, these transformations will be handled using a scikit-learn pipeline.

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
# Encode the target as 0/1 with churn ("Yes") as the positive class.
# Series.map turns any label that is missing from the mapping into NaN, so
# fail loudly on unexpected values instead of passing NaNs on to the
# train/test split and the models.
churn_label_codes = {"No": 0, "Yes": 1}
y = df["Churn"].map(churn_label_codes)
assert y.notna().all(), "Unexpected Churn label: expected only 'No' or 'Yes'"

In [108]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions approximately equal in both partitions, and the fixed
# random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [109]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [110]:
# Standardize the numerical columns and one-hot encode the categorical ones.
# handle_unknown="ignore" makes the encoder emit all-zero dummies for a
# category it did not see during fit, instead of raising.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [111]:
# Confirm the sizes of the training and test partitions
X_train.shape, X_test.shape

((5634, 19), (1409, 19))

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [113]:
# Baseline model: preprocessing followed by logistic regression, bundled in a
# single pipeline so the transformers are fitted only on the data passed to fit().
logistic_classifier = LogisticRegression(solver="lbfgs", max_iter=1000)

log_reg_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", logistic_classifier),
    ]
)

In [114]:
# Fit the preprocessing steps and the classifier on the training partition only
log_reg_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [115]:
# Hard 0/1 class predictions for the test partition
y_pred = log_reg_model.predict(X_test)

In [116]:
# Predicted probability of the positive class (column 1 = churn)
y_prob = log_reg_model.predict_proba(X_test)[:, 1]

In [117]:
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score
)

In [118]:
# Rows are the true classes, columns the predicted classes (0 = no churn, 1 = churn)
confusion_matrix(y_test, y_pred)

array([[926, 109],
       [165, 209]])

In [119]:
# Per-class precision, recall and F1 on the test partition
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [120]:
# ROC-AUC is computed from the predicted probabilities, not the hard labels
roc_auc = roc_auc_score(y_test, y_prob)
roc_auc

0.8421349040274871

Logistic Regression provides a strong and interpretable baseline for churn prediction. While overall accuracy is reasonable, recall for the churn class is more critical, as failing to identify churners has higher business cost. ROC–AUC is used to assess the model’s ability to distinguish between churn and non-churn customers.

In [121]:
# Churn probabilities for the test partition (same expression as the earlier y_prob cell)
y_prob = log_reg_model.predict_proba(X_test)[:, 1]

In [122]:
import numpy as np
from sklearn.metrics import classification_report

# Compare classification reports at progressively lower decision thresholds
thresholds = [0.5, 0.4, 0.3]

for t in thresholds:
    # Predict churn (1) when the predicted probability reaches the threshold
    y_pred_t = np.where(y_prob >= t, 1, 0)
    print(f"\nThreshold = {t}", classification_report(y_test, y_pred_t), sep="\n")


Threshold = 0.5
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409


Threshold = 0.4
              precision    recall  f1-score   support

           0       0.87      0.82      0.84      1035
           1       0.57      0.67      0.61       374

    accuracy                           0.78      1409
   macro avg       0.72      0.74      0.73      1409
weighted avg       0.79      0.78      0.78      1409


Threshold = 0.3
              precision    recall  f1-score   support

           0       0.89      0.75      0.81      1035
           1       0.52      0.75      0.62       374

    accuracy                           0.75      1409
   macro avg       0.71      0.75      0.71      1409
weighted avg       0.79

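Threshold tuning showed that lowering the decision threshold from 0.5 to 0.4 raised recall for churners from 56% to 67%, at the cost of precision falling from 66% to 57%. Given the higher cost of missing churners compared to contacting non-churners, a threshold of 0.4 was selected as the final operating point. Note that the pipeline's `predict()` method still applies the default 0.5 cutoff, so the 0.4 threshold must be applied to the `predict_proba` output at inference time. Because the threshold was chosen on the test set, the metrics reported at 0.4 are likely somewhat optimistic.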

In [123]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [124]:
# Five shuffled, stratified folds; the fixed seed keeps fold membership repeatable
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [125]:
# ROC-AUC of the logistic-regression pipeline on each stratified fold.
# The whole pipeline is passed in, so preprocessing is refitted inside every fold.
cv_scores = cross_val_score(estimator=log_reg_model, X=X, y=y, scoring="roc_auc", cv=cv)

In [126]:
# Per-fold ROC-AUC scores
cv_scores

array([0.85449508, 0.84547521, 0.86359761, 0.82536038, 0.83641354])

In [127]:
# Mean and standard deviation of ROC-AUC across the folds
cv_scores.mean(), cv_scores.std()

(np.float64(0.8450683648012882), np.float64(0.013384111785724738))

Stratified 5-fold cross-validation using ROC–AUC produced a mean score of approximately 0.845 with a standard deviation of about 0.013, indicating stable performance across folds. This is consistent with the held-out test ROC–AUC of 0.842 and suggests that the Logistic Regression model captures meaningful churn-related patterns without substantial overfitting.

In [128]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [129]:
from sklearn.base import clone

# Random-forest challenger built on the same preprocessing recipe.
# A Pipeline fits its steps in place, so passing the very `preprocessor`
# object that log_reg_model already holds would make both pipelines share
# (and refit) a single transformer instance. clone() gives this pipeline its
# own unfitted copy with identical settings.
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
        # Reweight classes inversely to their frequency to offset the imbalance
        class_weight="balanced"
    ))
])

In [130]:
# Fit the random-forest pipeline on the training partition only
rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [131]:
from sklearn.metrics import roc_auc_score

# Test-set ROC-AUC of the random forest, scored on its predicted churn probabilities
rf_proba = rf_model.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_true=y_test, y_score=rf_proba)

rf_roc_auc

0.823214239582526

In [132]:
# Same cross-validation protocol as for logistic regression (same folds, same metric)
rf_cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring="roc_auc", cv=cv)

(rf_cv_scores, rf_cv_scores.mean(), rf_cv_scores.std())

(array([0.83246661, 0.82108424, 0.83376088, 0.80406678, 0.81456935]),
 np.float64(0.821189572477435),
 np.float64(0.01115560810830497))

A Random Forest classifier was evaluated to assess whether non-linear modeling could improve churn prediction. However, the Random Forest achieved lower ROC–AUC on both the test set and during cross-validation compared to Logistic Regression, while also introducing additional complexity. Therefore, Logistic Regression was selected as the final model due to its superior generalization performance, stability, and interpretability.

In [133]:
# Names of the columns produced by the fitted preprocessing step
fitted_preprocessor = log_reg_model.named_steps["preprocessor"]
feature_names = fitted_preprocessor.get_feature_names_out()

len(feature_names)

45

In [134]:
import pandas as pd
import numpy as np

# First (and, for this binary model, only) row of coef_: one weight per transformed feature
coefficients = log_reg_model.named_steps["classifier"].coef_[0]

feature_importance = pd.DataFrame(
    {"feature": feature_names, "coefficient": coefficients}
)
# Absolute value is used for ranking; the sign gives the direction of the effect
feature_importance["abs_coefficient"] = np.abs(coefficients)

ranked_importance = feature_importance.sort_values("abs_coefficient", ascending=False)
ranked_importance.head(15)

Unnamed: 0,feature,coefficient,abs_coefficient
1,num__tenure,-1.257539,1.257539
38,cat__Contract_Two year,-0.776594,0.776594
15,cat__InternetService_DSL,-0.648646,0.648646
16,cat__InternetService_Fiber optic,0.634195,0.634195
2,num__MonthlyCharges,-0.591863,0.591863
36,cat__Contract_Month-to-month,0.582883,0.582883
3,num__TotalCharges,0.536253,0.536253
39,cat__PaperlessBilling_No,-0.343227,0.343227
25,cat__DeviceProtection_No internet service,-0.300104,0.300104
17,cat__InternetService_No,-0.300104,0.300104


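Feature importance analysis using the Logistic Regression coefficients indicates that customer tenure and contract type are the strongest drivers of churn. Customers with shorter tenure and month-to-month contracts exhibit higher churn risk, while two-year contracts and longer tenure reduce churn probability. Internet service type (fiber optic raises risk, DSL lowers it) and billing behavior further influence churn, highlighting the role of service expectations and customer stability. Because numerical features were standardized while categorical features were one-hot encoded, coefficient magnitudes are only roughly comparable across the two groups. The opposite signs on MonthlyCharges and TotalCharges should also be read with care, as these features are likely correlated with each other and with tenure.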

In [135]:
import os
from pathlib import Path

import joblib

# Destination of the serialized pipeline. The default is the original
# machine-specific location, so existing runs are unaffected; set the
# CHURN_MODEL_PATH environment variable to save somewhere else without
# editing this cell.
MODEL_PATH = Path(
    os.environ.get(
        "CHURN_MODEL_PATH",
        "C:/Users/Mada Samhitha/customer_churn_ml/models/customer_churn_logistic_pipeline.pkl",
    )
)
# joblib.dump does not create missing directories, so make sure the target folder exists
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Save final logistic regression pipeline (preprocessing + classifier in one artifact)
joblib.dump(log_reg_model, MODEL_PATH)

['C:/Users/Mada Samhitha/customer_churn_ml/models/customer_churn_logistic_pipeline.pkl']