In [18]:
import pandas as pd
import joblib
import optuna
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import fbeta_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [19]:
df = pd.read_csv("preprocessed_data.csv")

In [20]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [23]:
def objective(trial):
    max_iter = trial.suggest_int("max_iter", 100, 1000)
    tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)
    C = trial.suggest_float("C", 0.01, 10, log=True)
    threshold = trial.suggest_float("threshold", 0.25, 0.45)

    model = LogisticRegression(
        max_iter=max_iter,
        tol=tol,
        C=C,
        solver="lbfgs"
    )

    model.fit(X_train_smote, y_train_smote)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)

    return fbeta_score(y_test, preds, beta=2)

In [24]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2026-01-06 02:37:45,057] A new study created in memory with name: no-name-ebb4fcbd-8d8d-4651-91fe-2eeb31ca42e3
[I 2026-01-06 02:37:45,124] Trial 0 finished with value: 0.734171531207903 and parameters: {'max_iter': 200, 'tol': 1.3002965822316483e-05, 'C': 8.468037012249697, 'threshold': 0.37742991617619887}. Best is trial 0 with value: 0.734171531207903.
[I 2026-01-06 02:37:45,154] Trial 1 finished with value: 0.7350230414746544 and parameters: {'max_iter': 304, 'tol': 0.00021119622321000568, 'C': 3.2360872243257797, 'threshold': 0.42309465665259083}. Best is trial 1 with value: 0.7350230414746544.
[I 2026-01-06 02:37:45,181] Trial 2 finished with value: 0.7440978077571669 and parameters: {'max_iter': 967, 'tol': 0.0001325166261698214, 'C': 0.013190605687321253, 'threshold': 0.25992422777302765}. Best is trial 2 with value: 0.7440978077571669.
[I 2026-01-06 02:37:45,196] Trial 3 finished with value: 0.7437661220980224 and parameters: {'max_iter': 685, 'tol': 0.03164899045128713, 'C'

In [25]:
best_params = study.best_params.copy()
best_threshold = best_params.pop("threshold")

In [26]:
final_pipeline = Pipeline([
    ("model", LogisticRegression(
        **best_params,
        solver="lbfgs"
    ))
])

In [27]:
final_pipeline.fit(X_train_smote, y_train_smote)

0,1,2
,steps,"[('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.03117694474033276
,C,0.021319750730462342
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,862


In [28]:
final_probs = final_pipeline.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

In [29]:
print(f"Accuracy: {accuracy_score(y_test, final_preds):.4f}")
print(f"Optimal Threshold: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6525
Optimal Threshold: 0.3137
              precision    recall  f1-score   support

         0.0       0.95      0.56      0.70      1033
         1.0       0.43      0.92      0.58       374

    accuracy                           0.65      1407
   macro avg       0.69      0.74      0.64      1407
weighted avg       0.81      0.65      0.67      1407



### SAVE MODEL (PICKLE)

In [30]:
artifact = {
    "model": final_pipeline,
    "threshold": best_threshold,
    "features": list(X.columns)
}

In [31]:
joblib.dump(artifact, "churn_lr_model.pkl")

print("\n✅ Model saved as churn_lr_model.pkl")


✅ Model saved as churn_lr_model.pkl
