In [1]:
import sys, os
sys.path.append(os.path.abspath(".."))

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from src.features.build_features import get_feature_lists, build_preprocessor
from src.models.evaluate import evaluate_at_threshold, compute_roc_auc


In [None]:
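# Baseline configuration (the GradientBoostingClassifier defaults):
# learning_rate = 0.1
# n_estimators  = 100
# max_depth     = 3

# Baseline results on the test split:
# Recall ≈ 0.77
# AUC    ≈ 0.834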

In [2]:
# Read the processed dataset, then separate the "Churn" target column from
# the remaining feature columns.
df = pd.read_csv("../data/processed/data.csv")

y = df["Churn"]
X = df.drop(columns="Churn")

# Preview the first feature rows.
X.head()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,InternetService,SeniorCitizen,Partner,Dependents,PaperlessBilling
0,1,29.85,29.85,Month-to-month,Electronic check,DSL,0,Yes,No,Yes
1,34,56.95,1889.5,One year,Mailed check,DSL,0,No,No,No
2,2,53.85,108.15,Month-to-month,Mailed check,DSL,0,No,No,Yes
3,45,42.3,1840.75,One year,Bank transfer (automatic),DSL,0,No,No,No
4,2,70.7,151.65,Month-to-month,Electronic check,Fiber optic,0,No,No,Yes


In [3]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Feature-name groups come from the project helper (presumably numeric,
# categorical and binary columns, judging by the names -- confirm in
# src.features.build_features).
num_features, cat_features, bin_features = get_feature_lists()

# Preprocessing step used as the first stage of every model pipeline below.
preprocessor = build_preprocessor(num_features, cat_features, bin_features)


In [5]:
def train_and_evaluate_gb(
    learning_rate,
    n_estimators,
    max_depth,
    threshold=0.3
):
    """Train one gradient-boosting pipeline and report its test-set metrics.

    Relies on notebook-level state: ``preprocessor``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` must already be defined.

    Parameters
    ----------
    learning_rate, n_estimators, max_depth
        Forwarded unchanged to ``GradientBoostingClassifier``.
    threshold : float, default 0.3
        Forwarded to ``evaluate_at_threshold``; presumably the probability
        cutoff for predicting the positive class (confirm in
        ``src.models.evaluate``).

    Returns
    -------
    dict
        The three hyperparameters plus ``recall``, ``precision`` and ``f1``
        (read from the ``"1"`` entry of the classification report) and
        ``auc``.
    """
    gb = Pipeline(
        steps=[
            # Pipeline.fit fits its steps in place, so give every run its own
            # unfitted copy instead of re-fitting the shared notebook-level
            # preprocessor on each call.
            ("preprocessor", clone(preprocessor)),
            ("model", GradientBoostingClassifier(
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42  # fixed seed so runs are comparable
            ))
        ]
    )

    gb.fit(X_train, y_train)

    results = evaluate_at_threshold(
        gb, X_test, y_test, threshold=threshold
    )

    auc = compute_roc_auc(gb, X_test, y_test)["auc"]

    # Only the metrics for the class labelled "1" are reported.
    churn_metrics = results["classification_report"]["1"]

    return {
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "recall": churn_metrics["recall"],
        "precision": churn_metrics["precision"],
        "f1": churn_metrics["f1-score"],
        "auc": auc
    }


In [6]:
# Hyperparameter grid. Each tuple is (learning_rate, n_estimators, max_depth),
# unpacked in that order by the experiment loop.
experiments = [
    # max_depth fixed at 3: vary learning rate and number of trees
    (0.1, 100, 3),
    (0.05, 200, 3),
    (0.05, 300, 3),
    # shallower and deeper trees
    (0.1, 100, 2),
    (0.1, 100, 4),
    (0.05, 200, 4),
]

In [7]:
# Train and score one model per configuration, in the order listed in
# `experiments`; each call returns one row of metrics as a dict.
results = [
    train_and_evaluate_gb(
        learning_rate=lr,
        n_estimators=n_est,
        max_depth=depth,
    )
    for lr, n_est, depth in experiments
]

# One row per experiment, for side-by-side comparison.
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,learning_rate,n_estimators,max_depth,recall,precision,f1,auc
0,0.1,100,3,0.76738,0.513417,0.61522,0.834661
1,0.05,200,3,0.770053,0.518919,0.620022,0.837301
2,0.05,300,3,0.745989,0.521495,0.613861,0.835299
3,0.1,100,2,0.772727,0.516071,0.618844,0.836564
4,0.1,100,4,0.754011,0.522222,0.617068,0.833289
5,0.05,200,4,0.759358,0.530841,0.624862,0.837109


In [None]:
# learning_rate = 0.05
# n_estimators  = 200
# max_depth     = 3
# Best configuration: highest AUC in the results table above (≈ 0.837), with recall ≈ 0.77.