In [55]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [56]:
# =========================================================
# 1) LOAD DATA
# =========================================================
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [58]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [59]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [60]:
#cleaning
df = df.drop(columns=["customerID"])
    
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

df = df.dropna(subset=["Churn"])

df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1}).astype(int)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [61]:
# =========================================================
# 2) FEATURE ENGINEERING (4 features)
# =========================================================

# (A) Tenure buckets (captures early churn pattern)
df["tenure_bucket"] = pd.cut(
    df["tenure"],
    bins=[-1, 12, 24, 48, 1_000_000],
    labels=["0-1y", "1-2y", "2-4y", "4y+"]
)
df["tenure_bucket"] = df["tenure_bucket"].astype("object")

# (B) Average charges per month (pricing pressure signal)
# Use (tenure + 1) to avoid division by zero
df["avg_charges_per_month"] = df["TotalCharges"] / (df["tenure"] + 1)

# (C) High-risk contract flag
df["high_risk_contract"] = (df["Contract"] == "Month-to-month").astype(int)

# (D) Number of active services (customer engagement)
service_cols = [
    "PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

# InternetService is special: it's not Yes/No, so handle it separately
# We'll count it as "active" if InternetService != "No"
df["internet_active"] = (df["InternetService"] != "No").astype(int)

# Count Yes across service columns that are typically Yes/No
# If some columns are missing (depending on dataset version), filter safely:
existing_service_cols = [c for c in service_cols if c in df.columns]
df["num_services"] = (df[existing_service_cols] == "Yes").sum(axis=1) + df["internet_active"]


In [62]:
# =========================================================
# 3) DEFINE X / y
# =========================================================
X = df.drop(columns=["Churn"])
y = df["Churn"]


# =========================================================
# 4) TRAIN/TEST SPLIT (ONCE)
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# =========================================================
# 5) PREPROCESSOR (ONCE, SHARED)
# =========================================================
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()


numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ]
)

In [63]:
# =========================================================
# 6) BUSINESS THRESHOLD (OPTION B)
# =========================================================
def evaluate_business_threshold(model, X_test, y_test, recall_min=0.75, step=0.01):
    """
    Option B:
    - Keep Recall (Churn=1) >= recall_min
    - Maximize Precision under that constraint
    """
    proba = model.predict_proba(X_test)[:, 1]
    thresholds = np.arange(0.10, 0.91, step)

    rows = []
    for t in thresholds:
        pred = (proba >= t).astype(int)
        p = precision_score(y_test, pred, zero_division=0)
        r = recall_score(y_test, pred, zero_division=0)
        f = f1_score(y_test, pred, zero_division=0)
        rows.append((t, p, r, f))

    candidates = [row for row in rows if row[2] >= recall_min]
    if not candidates:
        return {"chosen_threshold": None, "note": f"No threshold met Recall >= {recall_min}.", "table": rows}

    best_t, best_p, best_r, best_f = max(candidates, key=lambda x: x[1])
    final_pred = (proba >= best_t).astype(int)

    return {
        "chosen_threshold": best_t,
        "precision": best_p,
        "recall": best_r,
        "f1": best_f,
        "confusion_matrix": confusion_matrix(y_test, final_pred),
        "classification_report": classification_report(y_test, final_pred),
        "table": rows
    }

In [64]:
# =========================================================
# 7) MODELS (SAME AS BEFORE, NOW WITH NEW FEATURES)
# =========================================================
lr_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=0))
])

dt_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", DecisionTreeClassifier(max_depth=4, random_state=0))
])

knn_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", KNeighborsClassifier())
])
hgb_pipe= Pipeline(steps=[
    ("prep",preprocessor),
    ("model",HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.05,
        max_iter=300,
        random_state=0
    ))
]
    
)
knn_param_grid = {
    "model__n_neighbors": [3, 5, 7, 9, 11, 15, 21],
    "model__weights": ["uniform", "distance"],
    "model__metric": ["euclidean", "manhattan"]
}

grid_knn = GridSearchCV(
    knn_pipe,
    knn_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)

rf_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestClassifier(random_state=0, class_weight="balanced", n_jobs=-1))
])

rf_param_grid = {
    "model__n_estimators": [200, 300],
    "model__max_depth": [8, 12, 16],
    "model__min_samples_split": [5, 10],
    "model__min_samples_leaf": [2, 5]
}

grid_rf = GridSearchCV(
    rf_pipe,
    rf_param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1
)



In [65]:
# =========================================================
# 8) TRAIN
# =========================================================
lr_pipe.fit(X_train, y_train)
dt_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [66]:
grid_knn.fit(X_train, y_train)
best_knn = grid_knn.best_estimator_

In [67]:
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_

In [68]:
hgb_pipe.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,max_iter,300
,max_leaf_nodes,31
,max_depth,6
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [69]:
print("Best KNN params:", grid_knn.best_params_)
print("Best RF params:", grid_rf.best_params_)

Best KNN params: {'model__metric': 'manhattan', 'model__n_neighbors': 21, 'model__weights': 'uniform'}
Best RF params: {'model__max_depth': 16, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 200}


In [70]:
# =========================================================
# 9) EVALUATE (BUSINESS)
# =========================================================
recall_min =0.75

results = {
    "LogisticRegression": evaluate_business_threshold(lr_pipe, X_test, y_test, recall_min=recall_min),
    "DecisionTree": evaluate_business_threshold(dt_pipe, X_test, y_test, recall_min=recall_min),
    "KNN (GridSearch)": evaluate_business_threshold(best_knn, X_test, y_test, recall_min=recall_min),
    "RandomForest (GridSearch)": evaluate_business_threshold(best_rf, X_test, y_test, recall_min=recall_min),
    "HistGradientBoostingClassifier":evaluate_business_threshold(hgb_pipe,X_test, y_test),
}

print("\n=== BUSINESS SUMMARY (Recall >= 0.75) ===")
print("Model | Threshold | Precision | Recall | F1")
print("-"*55)

for name, res in results.items():
    if res["chosen_threshold"] is None:
        print(f"{name} | None | - | - | -  ({res['note']})")
    else:
        print(f"{name} | {res['chosen_threshold']:.2f} | {res['precision']:.3f} | {res['recall']:.3f} | {res['f1']:.3f}")


=== BUSINESS SUMMARY (Recall >= 0.75) ===
Model | Threshold | Precision | Recall | F1
-------------------------------------------------------
LogisticRegression | 0.57 | 0.564 | 0.757 | 0.646
DecisionTree | 0.23 | 0.508 | 0.786 | 0.617
KNN (GridSearch) | 0.29 | 0.517 | 0.789 | 0.624
RandomForest (GridSearch) | 0.38 | 0.532 | 0.757 | 0.625
HistGradientBoostingClassifier | 0.25 | 0.515 | 0.754 | 0.612


In [71]:
#detailed outputs
for name, res in results.items():
    print(f"\n\n===== {name} =====")
    if res["chosen_threshold"] is None:
        print(res["note"])
        continue
    print("Chosen threshold:", round(res["chosen_threshold"], 3))
    print("Confusion Matrix:\n", res["confusion_matrix"])
    print("\nClassification Report:\n", res["classification_report"])




===== LogisticRegression =====
Chosen threshold: 0.57
Confusion Matrix:
 [[816 219]
 [ 91 283]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84      1035
           1       0.56      0.76      0.65       374

    accuracy                           0.78      1409
   macro avg       0.73      0.77      0.74      1409
weighted avg       0.81      0.78      0.79      1409



===== DecisionTree =====
Chosen threshold: 0.23
Confusion Matrix:
 [[750 285]
 [ 80 294]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.51      0.79      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409



===== KNN (GridSearch) =====
Chosen threshold: 0.29
Confusion Matrix:
 [[759 276]
 [ 79 295]]

Classific