In [1]:
pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


Import Libraries

In [2]:
import pandas as pd
import sklearn
import kagglehub
import numpy as np
import matplotlib as mp
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [3]:
# Optional: download the latest version of the dataset from Kaggle.
# Left disabled on purpose; kept as comments (not a string literal) so the
# cell does not echo the dead code as its output.
# path = kagglehub.dataset_download("blastchar/telco-customer-churn")
# print("Path to dataset files:", path)

'path = kagglehub.dataset_download("blastchar/telco-customer-churn")\n\nprint("Path to dataset files:", path)'

Load Dataset using pandas

In [4]:
# Read the churn dataset from a CSV file; the relative path is resolved
# against the current working directory.
df = pd.read_csv("churndata.csv")

Sample Data

In [5]:
# Preview the first four rows of the raw data.
df.head(4)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No


Looking at the shape of data

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(7043, 21)

Column Names

In [7]:
# Column labels available for analysis.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Churn Counts

In [8]:
# Count each Churn label once and reuse the result for both figures.
churn_label_counts = df['Churn'].value_counts()
print(f"Total count of Churn Customers: {churn_label_counts['Yes']}, Total Count of non-churn customers: {churn_label_counts['No']}")

Total count of Churn Customers: 1869, Total Count of non-churn customers: 5174


Information

In [9]:
# Summarise column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Converting TotalCharges to a Numeric Column (unparseable values become NaN)

In [10]:
# Cast TotalCharges to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed as a number into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [11]:
# Re-inspect dtypes and non-null counts after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Filtering and Populating Rows with Missing TotalCharges

In [12]:
# Keep only the rows whose TotalCharges is NaN.
filtered_df = df[df['TotalCharges'].isna()]

In [13]:
# Display the rows with missing TotalCharges.
filtered_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [14]:
# Which contract types occur among the rows with missing TotalCharges?
filtered_df['Contract'].unique()

array(['Two year', 'One year'], dtype=object)

In [15]:
def fill_missing_total_charges(row):
    """Return the TotalCharges value for one row, estimating it when missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'TotalCharges'; 'Contract' and 'MonthlyCharges' are read
        only when 'TotalCharges' is missing.

    Returns
    -------
    The existing 'TotalCharges' when it is present. When it is missing, the
    monthly charge multiplied by the contract length in months (12 for
    "One year", 24 for "Two year"). For any other contract type the missing
    value is returned unchanged.

    NOTE(review): the rows with missing TotalCharges shown in this notebook all
    have tenure 0, so a full contract term may overstate what has actually
    been billed -- confirm this estimate is intended.
    """
    current_total = row['TotalCharges']

    # Nothing to estimate when a value is already present.
    if not pd.isna(current_total):
        return current_total

    contract_type = row['Contract']
    if contract_type == "One year":
        contract_months = 12
    elif contract_type == "Two year":
        contract_months = 24
    else:
        # No estimate is defined for other contract types; keep the gap.
        return current_total

    return row['MonthlyCharges'] * contract_months

In [16]:
# Apply the fill row by row; axis=1 passes each row to the function.
df["TotalCharges"]= df.apply(fill_missing_total_charges, axis=1)

In [17]:
# Re-check dtypes and non-null counts after filling TotalCharges.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [18]:
# Recompute the subset of rows with missing TotalCharges to verify the fill.
filtered_df = df[df['TotalCharges'].isna()]

In [19]:
# Rows still missing TotalCharges (empty when every gap was filled).
filtered_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


Exploring how each candidate input relates to Churn (group means and churn rates) to decide which features to use

In [20]:
# Average tenure for each Churn group.
df.groupby('Churn')['tenure'].mean()

Churn
No     37.569965
Yes    17.979133
Name: tenure, dtype: float64

In [21]:
# Churn split within each contract type; rows are normalised and scaled to percentages.
pd.crosstab(df['Contract'], df['Churn'], normalize='index') * 100

Churn,No,Yes
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1
Month-to-month,57.290323,42.709677
One year,88.730482,11.269518
Two year,97.168142,2.831858


In [22]:
# Average monthly charge for each Churn group.
df.groupby('Churn')['MonthlyCharges'].mean()

Churn
No     61.265124
Yes    74.441332
Name: MonthlyCharges, dtype: float64

In [23]:
# Distinct payment methods present in the data.
df['PaymentMethod'].unique()

array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
       'Credit card (automatic)'], dtype=object)

In [24]:
# Churn split within each payment method (row-normalised fractions, not percentages).
pd.crosstab(df['PaymentMethod'], df['Churn'], normalize = "index")

Churn,No,Yes
PaymentMethod,Unnamed: 1_level_1,Unnamed: 2_level_1
Bank transfer (automatic),0.832902,0.167098
Credit card (automatic),0.847569,0.152431
Electronic check,0.547146,0.452854
Mailed check,0.808933,0.191067


In [25]:
# Average monthly charge per contract type.
df.groupby('Contract')['MonthlyCharges'].mean()

Contract
Month-to-month    66.398490
One year          65.048608
Two year          60.770413
Name: MonthlyCharges, dtype: float64

In [26]:
# Churn split within each TechSupport category (row-normalised fractions).
pd.crosstab(df['TechSupport'], df['Churn'], normalize = "index")

Churn,No,Yes
TechSupport,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.583645,0.416355
No internet service,0.92595,0.07405
Yes,0.848337,0.151663


In [27]:
# Churn split within each InternetService category (row-normalised fractions).
pd.crosstab(df['InternetService'], df['Churn'], normalize = "index")

Churn,No,Yes
InternetService,Unnamed: 1_level_1,Unnamed: 2_level_1
DSL,0.810409,0.189591
Fiber optic,0.581072,0.418928
No,0.92595,0.07405


Converting Churn into Binary & One Hot Encoding

In [28]:
# Encode the target as 0/1. Mapping both the raw labels and the already
# encoded values keeps this cell idempotent: a plain `== "Yes"` comparison
# would turn every label into 0 if the cell were run a second time.
# Any unexpected label maps to NaN, so astype(int) raises instead of silently
# counting it as "no churn".
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0, 1: 1, 0: 0}).astype(int)

In [29]:
# Input columns used by the models: two numeric features followed by the
# categorical ones.
features = [
    'tenure',
    'MonthlyCharges',
    'Contract',
    'InternetService',
    'TechSupport',
    'PaymentMethod',
]

In [30]:
# Feature matrix (the selected input columns) and target vector.
X = df[features]
y = df['Churn']

In [31]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# One-hot encode the categorical features once on the full feature frame and
# then pick the rows of each split by index label. Calling get_dummies
# separately on train and test derives the dummy columns (and the category
# removed by drop_first) from whatever categories happen to be present in each
# split, so the two matrices could end up with different columns. Encoding
# once guarantees identical columns in identical order.
X_encoded = pd.get_dummies(X, drop_first=True)
X_train_encoded = X_encoded.loc[X_train.index]
X_test_encoded = X_encoded.loc[X_test.index]

'# One-hot encode categorical columns\ndf = pd.get_dummies(df, columns=["InternetService", "Contract", "PaymentMethod"], \n                    drop_first=False)'

In [33]:
# Preview the first four encoded training rows.
X_train_encoded.head(4)

Unnamed: 0,tenure,MonthlyCharges,Contract_One year,Contract_Two year,InternetService_Fiber optic,InternetService_No,TechSupport_No internet service,TechSupport_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2142,21,64.85,True,False,False,False,False,False,False,False,True
1623,54,97.2,False,True,True,False,False,False,False,False,False
6074,1,23.45,False,False,False,False,False,False,False,True,False
1362,4,70.2,False,False,True,False,False,False,False,True,False


In [34]:
# 1. Split sizes as (rows, columns)
train_shape = X_train_encoded.shape
test_shape = X_test_encoded.shape
print(f"Training set: {train_shape}")
print(f"Test set: {test_shape}")

# 2. Class balance of the training target
train_churn_counts = y_train.value_counts()
print(f"\nChurn distribution in training:\n{train_churn_counts}")

# 3. Compare the column names (and their order) of the two encoded frames
train_columns = list(X_train_encoded.columns)
test_columns = list(X_test_encoded.columns)
print(f"\nSame columns? {train_columns == test_columns}")

Training set: (5634, 11)
Test set: (1409, 11)

Churn distribution in training:
Churn
0    4138
1    1496
Name: count, dtype: int64

Same columns? True


Training Logistic Regression Model

In [35]:
from sklearn.linear_model import LogisticRegression

In [38]:
# Inspect the encoded training features before fitting.
X_train_encoded

Unnamed: 0,tenure,MonthlyCharges,Contract_One year,Contract_Two year,InternetService_Fiber optic,InternetService_No,TechSupport_No internet service,TechSupport_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2142,21,64.85,True,False,False,False,False,False,False,False,True
1623,54,97.20,False,True,True,False,False,False,False,False,False
6074,1,23.45,False,False,False,False,False,False,False,True,False
1362,4,70.20,False,False,True,False,False,False,False,True,False
6754,0,61.90,False,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
3772,1,95.00,False,False,True,False,False,False,False,True,False
5191,23,91.10,False,True,False,False,False,True,True,False,False
5226,12,21.15,False,False,False,True,True,False,False,True,False
5390,12,99.45,False,False,True,False,False,False,False,True,False


In [39]:
# Fit a logistic-regression classifier on the encoded training data.
# max_iter is set to 10000 iterations; random_state fixes the seed.
reg = LogisticRegression(max_iter=10000, random_state=42)
reg.fit(X_train_encoded, y_train)

In [40]:
# Predict class labels for the held-out test set.
y_pred = reg.predict(X_test_encoded)

In [41]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Compute each metric once, then print them in the same order and format.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {test_accuracy}")
print(f"\nClassification Report:\n{test_report}")
print(f"\nConfusion Matrix:\n{test_confusion}")

Accuracy: 0.8055358410220014

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1036
           1       0.66      0.55      0.60       373

    accuracy                           0.81      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409


Confusion Matrix:
[[931 105]
 [169 204]]


In [42]:
# Predicted probabilities for the test set; column index 1 is kept as the
# churn probability.
y_proba = reg.predict_proba(X_test_encoded)[:, 1]  # probability of churn (class 1)

# Print the first 10 probabilities as a quick sanity check
print(y_proba[:10])

[0.46297214 0.07106788 0.0107349  0.72824121 0.01349834 0.20284868
 0.06004003 0.01284194 0.03550772 0.18755185]


In [44]:
# Compare classification reports at several decision thresholds: a row is
# labelled 1 (churn) when its predicted probability is >= the threshold.
# NOTE(review): the thresholds are compared on the test set itself; if one is
# chosen from these reports, its test metrics are optimistic -- consider a
# separate validation split.
for threshold in [0.3, 0.4, 0.5]:
    y_pred_threshold = (y_proba >= threshold).astype(int)
    print(f"\n=== Threshold: {threshold} ===")
    print(classification_report(y_test, y_pred_threshold))


=== Threshold: 0.3 ===
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1036
           1       0.54      0.80      0.65       373

    accuracy                           0.77      1409
   macro avg       0.73      0.78      0.74      1409
weighted avg       0.82      0.77      0.78      1409


=== Threshold: 0.4 ===
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      1036
           1       0.62      0.67      0.65       373

    accuracy                           0.80      1409
   macro avg       0.75      0.76      0.75      1409
weighted avg       0.81      0.80      0.81      1409


=== Threshold: 0.5 ===
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1036
           1       0.66      0.55      0.60       373

    accuracy                           0.81      1409
   macro avg       0.75      0.72      0.73      1409
we

Let's train a Decision Tree and see whether it performs differently

In [45]:
from sklearn.tree import DecisionTreeClassifier

# Build a depth-limited decision tree (max_depth=5, fixed seed) and train it
# in one step; fit() returns the fitted estimator itself.
dt = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train_encoded, y_train)

# Class-label predictions for the held-out test set
y_pred_dt = dt.predict(X_test_encoded)

In [46]:
# Precision / recall / F1 of the decision tree's predictions on the test set.
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1036
           1       0.68      0.46      0.55       373

    accuracy                           0.80      1409
   macro avg       0.75      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409



In [47]:
# Re-evaluate the decision tree with a 0.3 decision threshold:
# column index 1 of predict_proba is kept as the churn probability, and a row
# is labelled 1 when that probability is >= 0.3.
y_proba_dt = dt.predict_proba(X_test_encoded)[:, 1]
y_pred_dt_30 = (y_proba_dt >= 0.3).astype(int)
print(classification_report(y_test, y_pred_dt_30))

              precision    recall  f1-score   support

           0       0.91      0.74      0.82      1036
           1       0.52      0.80      0.63       373

    accuracy                           0.76      1409
   macro avg       0.72      0.77      0.72      1409
weighted avg       0.81      0.76      0.77      1409

