In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Loading Packages

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import statsmodels.api as sm
from statsmodels.tools import add_constant

# Sklearn modules
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve

In [3]:
# Absolute path of the parent of the current working directory; the project
# packages imported below are expected to live there.
project_root = os.path.abspath(os.pardir)

# Register it on sys.path once, so re-running this cell does not add duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local helpers; these imports only resolve after the path tweak above.
from src.data_loader import telco_data_loader
from src.data_processor import telcoDataCleaner

  from .autonotebook import tqdm as notebook_tqdm


#### Loading and cleaning data

In [4]:
# Fetch the raw Telco data through the project loader.
raw_telco_data = telco_data_loader()

# Project cleaning helper.
cleaner = telcoDataCleaner()

# Cleaned result; the cells below index it with the keys
# 'predictors', 'target' and 'masks'.
cleaned_data = cleaner.clean_data(raw_telco_data)

src.data_loader - Starting download of telco data
src.data_loader - Telco data download successful
src.data_loader - CSV data downloaded successfully
src.data_processor - Binary columns converted
src.data_processor - Converting gender
src.data_processor - Converting Internet Service
src.data_processor - Converting Multiple Lines
src.data_processor - Created dummy columns for Contract and Payment Method
src.data_processor - Data types successfully converted
src.data_processor - Outlier Summary: {}
src.data_processor - Columns with missing values: {'OnlineSecurity': np.int64(1526), 'OnlineBackup': np.int64(1526), 'DeviceProtection': np.int64(1526), 'TechSupport': np.int64(1526), 'StreamingTV': np.int64(1526), 'StreamingMovies': np.int64(1526), 'TotalCharges': np.int64(11)}
src.data_processor - Filled missing values in TotalCharges with medians
src.data_processor - Filled missing values in Churn with medians


## Testing Models

Data prep

In [5]:
# Identify the predictor columns that contain no missing values.
non_na_cols = [
    name
    for name in cleaned_data['predictors']
    if cleaned_data['predictors'][name].isna().sum() == 0
]

# Design matrix for the statsmodels fits: a copy of ALL predictors (not only
# the NA-free ones) with an intercept column prepended. has_constant='raise'
# makes add_constant fail if the frame already contains a constant column.
reg_predictors = add_constant(
    cleaned_data['predictors'].copy(),
    prepend = True,
    has_constant = 'raise',
)

##### Statistical Modeling

Fitting logistic regressions with statsmodels first, to understand the impact of each predictor before switching to scikit-learn.

In [6]:
# Logit of the target on the constant plus every predictor, restricted to the
# rows selected by cleaned_data["masks"].
online_log_reg_model = sm.Logit(
    cleaned_data['target'][cleaned_data["masks"]],
    reg_predictors.loc[cleaned_data["masks"]],
).fit()

Optimization terminated successfully.
         Current function value: 0.470717
         Iterations 7


In [7]:
# Plain-text summary table of the logit fitted on the rows selected by the mask.
print(online_log_reg_model.summary())

                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 5517
Model:                          Logit   Df Residuals:                     5494
Method:                           MLE   Df Model:                           22
Date:                Tue, 02 Sep 2025   Pseudo R-squ.:                  0.2475
Time:                        20:41:46   Log-Likelihood:                -2596.9
converged:                       True   LL-Null:                       -3451.3
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
const                                    -0.6645      0.217     -3.061      0.002      -1.090      -0.239
gender                                   -0.0106  

In [None]:
# Regressors for the complementary group of customers: start from the
# predictor columns without missing values and drop "InternetService".
# NOTE: despite its name, `cols_to_exclude` lists the columns that are KEPT
# as regressors in the model below.
cols_to_exclude = list(non_na_cols)
cols_to_exclude.remove("InternetService")

# Invert the mask once and reuse it for both the target and the design matrix.
no_internet_rows = ~np.array(cleaned_data["masks"])

# `non_na_cols` was collected before the constant was added, so `const` is
# not among the selected columns and this model is fitted without it.
nol_log_reg_model = sm.Logit(
    cleaned_data['target'][no_internet_rows],
    reg_predictors.loc[no_internet_rows, cols_to_exclude],
).fit()

Optimization terminated successfully.
         Current function value: 0.203968
         Iterations 9


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,PaperlessBilling,MonthlyCharges,TotalCharges,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
11,1,0,0,0,16,1,0,0,0,18.95,326.80,0.0,1.0,1.0,0.0,0.0
16,0,0,0,0,52,1,0,0,0,20.65,1022.95,1.0,0.0,0.0,0.0,1.0
21,1,0,1,0,12,1,0,0,0,19.80,202.25,1.0,0.0,0.0,0.0,0.0
22,1,0,0,0,1,1,0,0,0,20.15,20.15,0.0,0.0,0.0,0.0,1.0
33,1,0,0,0,1,1,0,0,0,20.20,20.20,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7017,0,0,0,0,51,1,0,0,0,20.65,1020.75,0.0,1.0,0.0,0.0,0.0
7019,0,0,0,0,39,1,0,0,0,20.15,826.00,0.0,1.0,0.0,0.0,1.0
7020,1,0,1,1,12,1,0,0,1,19.20,239.00,0.0,0.0,0.0,1.0,0.0
7030,0,0,0,0,2,1,0,0,1,20.05,39.25,0.0,0.0,0.0,0.0,1.0


In [9]:
# Plain-text summary table of the logit fitted on the rows outside the mask.
print(nol_log_reg_model.summary())

                           Logit Regression Results                           
Dep. Variable:                  Churn   No. Observations:                 1526
Model:                          Logit   Df Residuals:                     1511
Method:                           MLE   Df Model:                           14
Date:                Tue, 02 Sep 2025   Pseudo R-squ.:                  0.2274
Time:                        20:41:46   Log-Likelihood:                -311.26
converged:                       True   LL-Null:                       -402.85
Covariance Type:            nonrobust   LLR p-value:                 1.459e-31
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
gender                                   -0.0860      0.213     -0.404      0.686      -0.503       0.331
SeniorCitizen                             0.7025  

## Switching to ML

### Internet Customers Model

In [10]:
# Design matrix (still including the `const` column of reg_predictors) and
# target for the rows selected by cleaned_data["masks"].
internet_preds = reg_predictors.loc[cleaned_data["masks"]]
internet_target = cleaned_data['target'][cleaned_data["masks"]]

In [None]:
# Standardise the predictors so the L2 penalty used by LogisticRegressionCV
# acts on features that share a common scale.
# NOTE: `internet_preds` still carries the `const` column of reg_predictors;
# it has zero variance, so it is scaled to all zeros and contributes nothing
# (the estimator fits its own intercept).
# NOTE(review): the scaler is fitted on all rows before the internal CV split,
# so fold statistics leak slightly into validation; confirm that is acceptable.
scaler = StandardScaler()

internet_preds_for_model = scaler.fit_transform(internet_preds)

# 5-fold cross-validated logistic regression with a generous iteration cap.
log_reg_cv = LogisticRegressionCV(cv = 5,
                                  max_iter = 10000,
                                  random_state=42)

# Fit on the SCALED matrix. The unscaled frame was passed here before, which
# left the scaler output unused. Later predict / predict_proba calls must
# feed scaler.transform(...) of their inputs, not the raw frame.
log_reg_cv.fit(X = internet_preds_for_model,
               y = internet_target)

0,1,2
,Cs,10
,fit_intercept,True
,cv,5
,dual,False
,penalty,'l2'
,scoring,
,solver,'lbfgs'
,tol,0.0001
,max_iter,10000
,class_weight,
