# Customer Churn - Classic ML Project
## This file contains:
1. Problem Definition: we are predicting customer churn from tabular data
2. Data Loading & Sanity Checks
3. Train/Validation/Test Splits
4. Baseline Model
5. Evaluation
6. Conclusions

In [1]:
# Data loading: read the Telco customer-churn CSV directly from IBM's GitHub repo
import pandas as pd

# Raw-file URL; it can be read without logging in
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (rendered as the cell output)
df.head(n=5)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# Sanity checks: size, a reproducible random sample, schema, missingness, target balance

print("Shape:", df.shape)
display(df.sample(5, random_state=42))

# Quick schema + missingness (this counts real NaN/None values only)
display(df.dtypes)
display(df.isna().sum().sort_values(ascending=False).head(15))

# isna() does not treat empty or whitespace-only strings as missing, so a text
# column can hide blanks behind an all-zero NaN count. Count those separately
# and show only the columns that contain at least one such value.
blank_counts = df.apply(lambda col: col.astype(str).str.strip().eq("").sum())
display(blank_counts[blank_counts > 0])

# Target distribution: absolute counts (NaN included), then proportions
display(df["Churn"].value_counts(dropna=False))
display(df["Churn"].value_counts(normalize=True))


Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
185,1024-GUALD,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,24.8,24.8,Yes
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),25.25,996.45,No
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.35,1031.7,No
1807,6910-HADCM,Female,0,No,No,1,Yes,No,Fiber optic,No,...,Yes,No,No,No,Month-to-month,No,Electronic check,76.35,76.35,Yes
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,...,No,Yes,No,No,Two year,No,Bank transfer (automatic),50.55,3260.1,No


Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,5174
Yes,1869


Unnamed: 0_level_0,proportion
Churn,Unnamed: 1_level_1
No,0.73463
Yes,0.26537


In [3]:
# Cleaning one common issue


# Parse TotalCharges as numbers; anything that cannot be parsed (e.g. blanks) becomes NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Report how many values turned into NaN
print("TotalCharges NaNs:", df["TotalCharges"].isna().sum())

# Simple handling: keep only rows that have a TotalCharges value (few rows are lost),
# then renumber the index from 0
df = df.loc[df["TotalCharges"].notna()].reset_index(drop=True)

print("Shape after dropping missing TotalCharges:", df.shape)


TotalCharges NaNs: 11
Shape after dropping missing TotalCharges: (7032, 21)


In [4]:
# Separate features and target

# Features: every column except the label.
# NOTE(review): X still contains customerID, which looks like a per-customer
# identifier rather than a predictive feature — confirm it is removed before modelling.
X = df.drop(columns=["Churn"])

# Binary target: 1 = churned ("Yes"), 0 = stayed ("No")
y = df["Churn"].map({"Yes": 1, "No": 0})

# Series.map turns every label that is not a key of the dict into NaN.
# Fail loudly here instead of passing NaN targets on to the stratified splits.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), "Churn"].astype(str).unique())
    raise ValueError(f"Unexpected Churn labels (expected 'Yes'/'No'): {unexpected}")

print(X.shape, y.shape)


(7032, 20) (7032,)


In [5]:
# First split: training set vs. a temporary hold-out set

from sklearn.model_selection import train_test_split

# 70 % of the rows go to training and 30 % are held out; stratifying on y and
# fixing the seed makes the split class-balanced and repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Feature-matrix shape followed by target shape for each part
print("Train:", *(part.shape for part in (X_train, y_train)))
print("Temp :", *(part.shape for part in (X_temp, y_temp)))


Train: (4922, 20) (4922,)
Temp : (2110, 20) (2110,)


In [6]:
# Second split: divide the temporary hold-out set into validation and test

# The hold-out set is cut in half, again stratified on its labels and with a fixed seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Feature-matrix shape followed by target shape for each part
print("Validation:", *(part.shape for part in (X_val, y_val)))
print("Test      :", *(part.shape for part in (X_test, y_test)))


Validation: (1055, 20) (1055,)
Test      : (1055, 20) (1055,)


In [7]:
# Sanity check: the churn rate (class balance) should be similar across all splits

def churn_rate(y, name):
    """Print the mean of ``y`` labelled with the split name.

    Args:
        y: Object exposing ``.mean()``. For 0/1 labels the mean is the
            fraction of positive (churned) rows.
        name: Label of the split, used as the prefix of the printed line.

    Returns:
        None. The result is only printed.
    """
    label = f"{name} churn rate:"
    rate = y.mean()
    print(label, rate)

# Report the churn rate of each split, in the order train -> validation -> test
churn_rate(name="Train", y=y_train)
churn_rate(name="Validation", y=y_val)
churn_rate(name="Test", y=y_test)


Train churn rate: 0.2657456318569687
Validation churn rate: 0.2663507109004739
Test churn rate: 0.26540284360189575
