In [13]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(".."))

from src.models.evaluate import (
    evaluate_at_threshold,
    compute_roc_auc,
    evaluate_multiple_thresholds
)


sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from src.features.build_features import get_feature_lists, build_preprocessor
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt


In [14]:
df = pd.read_csv("../data/processed/data.csv")

X = df.drop("Churn", axis=1)
y = df["Churn"]

num_features, cat_features, bin_features = get_feature_lists()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

preprocessor = build_preprocessor(num_features, cat_features, bin_features)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

clf.fit(X_train, y_train)

In [15]:
y.value_counts(normalize=True)


0    0.734215
1    0.265785
Name: Churn, dtype: float64

In [16]:
y_pred = clf.predict(X_test)

In [17]:
results_05 = evaluate_at_threshold(clf, X_test, y_test, threshold=0.5)
results_05["confusion_matrix"]

array([[922, 111],
       [168, 206]])

In [18]:
results_05["classification_report"]["1"]


{'precision': 0.6498422712933754,
 'recall': 0.5508021390374331,
 'f1-score': 0.5962373371924746,
 'support': 374.0}

In [19]:
results_03 = evaluate_at_threshold(clf, X_test, y_test, threshold=0.3)

results_03["confusion_matrix"]
results_03["classification_report"]["1"]

{'precision': 0.49568221070811747,
 'recall': 0.767379679144385,
 'f1-score': 0.602308499475341,
 'support': 374.0}

In [20]:
roc_results = compute_roc_auc(clf, X_test, y_test)
roc_results["auc"]


0.8344756718140921