In [31]:

import sys, os
sys.path.append(os.path.abspath(".."))

import pandas as pd
from sklearn.model_selection import train_test_split

from src.features.build_features import get_feature_lists, build_preprocessor
from src.models.evaluate import evaluate_at_threshold, compute_roc_auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier




In [23]:
from src.features.build_features import get_feature_lists, build_preprocessor
from src.models.evaluate import evaluate_at_threshold, compute_roc_auc


In [24]:
df = pd.read_csv("../data/processed/data.csv")

X = df.drop("Churn", axis=1)
y = df["Churn"]

X.head()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Contract,PaymentMethod,InternetService,SeniorCitizen,Partner,Dependents,PaperlessBilling
0,1,29.85,29.85,Month-to-month,Electronic check,DSL,0,Yes,No,Yes
1,34,56.95,1889.5,One year,Mailed check,DSL,0,No,No,No
2,2,53.85,108.15,Month-to-month,Mailed check,DSL,0,No,No,Yes
3,45,42.3,1840.75,One year,Bank transfer (automatic),DSL,0,No,No,No
4,2,70.7,151.65,Month-to-month,Electronic check,Fiber optic,0,No,No,Yes


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [26]:
num_features, cat_features, bin_features = get_feature_lists()

preprocessor = build_preprocessor(
    num_features,
    cat_features,
    bin_features
)


In [27]:

log_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

log_reg.fit(X_train, y_train)

In [28]:
BASELINE_THRESHOLD = 0.3

log_results = evaluate_at_threshold(
    log_reg,
    X_test,
    y_test,
    threshold=BASELINE_THRESHOLD
)

log_auc = compute_roc_auc(log_reg, X_test, y_test)["auc"]

log_results["classification_report"]["1"], log_auc


({'precision': 0.49568221070811747,
  'recall': 0.767379679144385,
  'f1-score': 0.602308499475341,
  'support': 374.0},
 0.8344756718140921)

In [29]:
rf=Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            n_jobs=-1
        ))
    ]
)
rf.fit(X_train,y_train)

In [30]:
rf_results = evaluate_at_threshold(rf, X_test,y_test,threshold=0.3)

rf_auc=compute_roc_auc(rf, X_test,y_test)
rf_results["classification_report"]["1"], rf_auc

({'precision': 0.5112781954887218,
  'recall': 0.7272727272727273,
  'f1-score': 0.6004415011037527,
  'support': 374.0},
 {'auc': 0.8122194842911202,
  'fpr': array([0.00000000e+00, 9.68054211e-04, 1.93610842e-03, 1.93610842e-03,
         2.90416263e-03, 2.90416263e-03, 2.90416263e-03, 2.90416263e-03,
         2.90416263e-03, 3.87221684e-03, 3.87221684e-03, 5.80832527e-03,
         6.77637948e-03, 7.74443369e-03, 7.74443369e-03, 8.71248790e-03,
         8.71248790e-03, 9.68054211e-03, 1.06485963e-02, 1.06485963e-02,
         1.06485963e-02, 1.16166505e-02, 1.16166505e-02, 1.35527590e-02,
         1.35527590e-02, 1.45208132e-02, 1.45208132e-02, 1.54888674e-02,
         1.54888674e-02, 1.54888674e-02, 1.54888674e-02, 1.64569216e-02,
         1.64569216e-02, 1.64569216e-02, 1.74249758e-02, 1.83930300e-02,
         1.83930300e-02, 2.03291384e-02, 2.12971926e-02, 2.12971926e-02,
         2.22652469e-02, 2.42013553e-02, 2.51694095e-02, 2.71055179e-02,
         2.80735721e-02, 2.90416263e-02

In [None]:
# Random Forest is not ideal

In [33]:
gb= Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model",GradientBoostingClassifier(random_state=42))
    ]
)
gb.fit(X_train, y_train)

In [34]:
gb_results = evaluate_at_threshold(gb, X_test, y_test, threshold=BASELINE_THRESHOLD)

gb_auc = compute_roc_auc(gb, X_test, y_test)["auc"]

gb_results["classification_report"]["1"], gb_auc

({'precision': 0.5134168157423972,
  'recall': 0.767379679144385,
  'f1-score': 0.6152197213290461,
  'support': 374.0},
 0.8346607410014962)

In [None]:
# Gradient boosting is ideal for classification