In [26]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output="pandas") 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, fbeta_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


In [27]:
df = pd.read_csv("preprocessed_data.csv")
df.head(10)

Unnamed: 0,Partner,Dependents,PaperlessBilling,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,InternetService_No,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Contract_One year,Contract_Two year,MonthlyCharges,TotalCharges,tenure,SeniorCitizen
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-1.161694,-0.994194,-1.280248,0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,-0.260878,-0.17374,0.064303,0
2,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.363923,-0.959649,-1.239504,0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.74785,-0.195248,0.512486,0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.196178,-0.940457,-1.239504,0
5,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.158489,-0.645369,-0.99504,0
6,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.807802,-0.147313,-0.424625,0
7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-1.165018,-0.874169,-0.913552,0
8,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.329677,0.336516,-0.180161,0
9,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.28747,0.531476,1.205134,0


In [28]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
smote = SMOTE(random_state=42)

In [31]:
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)

### Logistic Regression

In [32]:
# --- Optuna Optimization Including Threshold Tuning ---

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Standard Hyperparameters
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    tol = trial.suggest_float('tol', 1e-5, 1e-1, log=True)
    C = trial.suggest_float('C', 0.01, 10, log=True)
    
    threshold = trial.suggest_float('threshold', 0.25, 0.45)

    model = LogisticRegression(max_iter=max_iter, tol=tol, C=C)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2026-01-06 02:27:43,878] A new study created in memory with name: no-name-b2cf65d6-caa2-44bb-bc5b-cf4d2b8da78c
[I 2026-01-06 02:27:43,954] Trial 0 finished with value: 0.5963791267305645 and parameters: {'max_iter': 148, 'tol': 4.1988436351509814e-05, 'C': 0.11496861327018779, 'threshold': 0.4253928232677371}. Best is trial 0 with value: 0.5963791267305645.
[I 2026-01-06 02:27:43,999] Trial 1 finished with value: 0.7111861137897783 and parameters: {'max_iter': 633, 'tol': 0.00021683933934927421, 'C': 0.25184087980657205, 'threshold': 0.2693062540019642}. Best is trial 1 with value: 0.7111861137897783.
[I 2026-01-06 02:27:44,014] Trial 2 finished with value: 0.6777557100297915 and parameters: {'max_iter': 441, 'tol': 0.04547571851576391, 'C': 0.5249623114688762, 'threshold': 0.362861680085882}. Best is trial 1 with value: 0.7111861137897783.
[I 2026-01-06 02:27:44,033] Trial 3 finished with value: 0.638353309015112 and parameters: {'max_iter': 430, 'tol': 0.007480686209182035, 'C': 2

In [33]:
params = study.best_params.copy()
best_threshold = params.pop('threshold') # Removes 'threshold' from dict and saves it

best_model = LogisticRegression(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.7363
Optimal Threshold Found by Optuna: 0.2583
              precision    recall  f1-score   support

         0.0       0.91      0.71      0.80      1033
         1.0       0.50      0.82      0.62       374

    accuracy                           0.74      1407
   macro avg       0.71      0.76      0.71      1407
weighted avg       0.80      0.74      0.75      1407



### Logistic Regression SMOTE

In [34]:
X_train = X_resampled_smote
y_train = y_resampled_smote

# --- Optuna Optimization Including Threshold Tuning With SMOTE Data ---

def objective(trial):
    # Standard Hyperparameters
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    tol = trial.suggest_float('tol', 1e-5, 1e-1, log=True)
    C = trial.suggest_float('C', 0.01, 10, log=True)
    
    threshold = trial.suggest_float('threshold', 0.25, 0.45)

    model = LogisticRegression(max_iter=max_iter, tol=tol, C=C)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

params = study.best_params.copy()
best_threshold = params.pop('threshold') # Removes 'threshold' from dict and saves it

best_model = LogisticRegression(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

[I 2026-01-06 02:27:46,148] A new study created in memory with name: no-name-9268d042-24e6-486f-9b5c-83edf1dbc28b
[I 2026-01-06 02:27:46,231] Trial 0 finished with value: 0.7333032490974729 and parameters: {'max_iter': 420, 'tol': 2.8488139458570518e-05, 'C': 2.3405427020118426, 'threshold': 0.3959469056932953}. Best is trial 0 with value: 0.7333032490974729.
[I 2026-01-06 02:27:46,270] Trial 1 finished with value: 0.7219626168224299 and parameters: {'max_iter': 196, 'tol': 0.00026316775511695564, 'C': 0.035608215152460605, 'threshold': 0.4452303298277518}. Best is trial 0 with value: 0.7333032490974729.
[I 2026-01-06 02:27:46,296] Trial 2 finished with value: 0.7229195722919572 and parameters: {'max_iter': 754, 'tol': 0.0033717335274033025, 'C': 0.0909983317699799, 'threshold': 0.4300680079989958}. Best is trial 0 with value: 0.7333032490974729.
[I 2026-01-06 02:27:46,342] Trial 3 finished with value: 0.736235595390525 and parameters: {'max_iter': 475, 'tol': 1.2605237846477201e-05, '

In [35]:
# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6731
Optimal Threshold Found by Optuna: 0.3526
              precision    recall  f1-score   support

         0.0       0.94      0.59      0.73      1033
         1.0       0.44      0.90      0.59       374

    accuracy                           0.67      1407
   macro avg       0.69      0.74      0.66      1407
weighted avg       0.81      0.67      0.69      1407



### Random Forest

In [36]:
# --- Optuna Optimization Including Threshold Tuning ---

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Standard Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    
    threshold = trial.suggest_float('threshold', 0.2, 0.7)

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_jobs=-1, class_weight=class_weight)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)



[I 2026-01-06 02:27:48,869] A new study created in memory with name: no-name-71074fa2-21c5-4452-9796-b2a77e999b5a
[I 2026-01-06 02:27:50,459] Trial 0 finished with value: 0.5979711692471971 and parameters: {'n_estimators': 537, 'max_depth': 18, 'min_samples_leaf': 7, 'class_weight': 'balanced', 'threshold': 0.6112709608331536}. Best is trial 0 with value: 0.5979711692471971.
[I 2026-01-06 02:27:51,772] Trial 1 finished with value: 0.5515719801434087 and parameters: {'n_estimators': 432, 'max_depth': 15, 'min_samples_leaf': 3, 'class_weight': None, 'threshold': 0.4597567708224494}. Best is trial 0 with value: 0.5979711692471971.
[I 2026-01-06 02:27:52,307] Trial 2 finished with value: 0.651844365841334 and parameters: {'n_estimators': 167, 'max_depth': 13, 'min_samples_leaf': 4, 'class_weight': None, 'threshold': 0.34319978985844013}. Best is trial 2 with value: 0.651844365841334.
[I 2026-01-06 02:27:52,545] Trial 3 finished with value: 0.5416666666666666 and parameters: {'n_estimators'

In [37]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7432141318397243
Best hyperparameters: {'n_estimators': 457, 'max_depth': 7, 'min_samples_leaf': 6, 'class_weight': 'balanced', 'threshold': 0.300056943862026}


In [38]:
params = study.best_params.copy()
best_threshold = params.pop('threshold')

best_model = RandomForestClassifier(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6347
Optimal Threshold Found by Optuna: 0.3001
              precision    recall  f1-score   support

         0.0       0.95      0.53      0.68      1033
         1.0       0.42      0.92      0.57       374

    accuracy                           0.63      1407
   macro avg       0.68      0.73      0.63      1407
weighted avg       0.81      0.63      0.65      1407



### Random Forest Smote

In [39]:
# --- Optuna Optimization Including Threshold Tuning ---

X_train = X_resampled_smote
y_train = y_resampled_smote

def objective(trial):
    # Standard Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    
    threshold = trial.suggest_float('threshold', 0.2, 0.7)

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_leaf=min_samples_leaf, n_jobs=-1, class_weight=class_weight)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)



[I 2026-01-06 02:29:09,205] A new study created in memory with name: no-name-a586bc21-a9e1-46e4-89ce-e4a1e6acef3c
[I 2026-01-06 02:29:10,359] Trial 0 finished with value: 0.7309688581314879 and parameters: {'n_estimators': 341, 'max_depth': 13, 'min_samples_leaf': 2, 'class_weight': 'balanced', 'threshold': 0.2120559799852234}. Best is trial 0 with value: 0.7309688581314879.
[I 2026-01-06 02:29:13,971] Trial 1 finished with value: 0.738585496866607 and parameters: {'n_estimators': 877, 'max_depth': 16, 'min_samples_leaf': 7, 'class_weight': None, 'threshold': 0.2892230938779778}. Best is trial 1 with value: 0.738585496866607.
[I 2026-01-06 02:29:14,743] Trial 2 finished with value: 0.7342361404993653 and parameters: {'n_estimators': 200, 'max_depth': 13, 'min_samples_leaf': 8, 'class_weight': None, 'threshold': 0.21067011318968465}. Best is trial 1 with value: 0.738585496866607.
[I 2026-01-06 02:29:16,063] Trial 3 finished with value: 0.7363370880267001 and parameters: {'n_estimators':

In [40]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7459677419354839
Best hyperparameters: {'n_estimators': 583, 'max_depth': 6, 'min_samples_leaf': 1, 'class_weight': 'balanced', 'threshold': 0.36790142925695235}


In [41]:
params = study.best_params.copy()
best_threshold = params.pop('threshold')

best_model = RandomForestClassifier(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6823
Optimal Threshold Found by Optuna: 0.3679
              precision    recall  f1-score   support

         0.0       0.94      0.61      0.74      1033
         1.0       0.45      0.89      0.60       374

    accuracy                           0.68      1407
   macro avg       0.69      0.75      0.67      1407
weighted avg       0.81      0.68      0.70      1407



### XG Boost

In [42]:
# --- Optuna Optimization Including Threshold Tuning ---

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Standard Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    subsample = trial.suggest_float('subsample', 0, 1.0)
    
    threshold = trial.suggest_float('threshold', 0.2, 0.8)

    model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, subsample=subsample, n_jobs=-1)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)


[I 2026-01-06 02:31:06,838] A new study created in memory with name: no-name-222547f9-151a-479e-8497-eafa1e9711f6
[I 2026-01-06 02:31:07,295] Trial 0 finished with value: 0.48324022346368717 and parameters: {'n_estimators': 589, 'max_depth': 3, 'subsample': 0.1332971515778666, 'threshold': 0.6099819235255146}. Best is trial 0 with value: 0.48324022346368717.
[I 2026-01-06 02:31:07,909] Trial 1 finished with value: 0.35883748517200476 and parameters: {'n_estimators': 580, 'max_depth': 4, 'subsample': 0.8753037582229941, 'threshold': 0.7626991591912464}. Best is trial 0 with value: 0.48324022346368717.
[I 2026-01-06 02:31:09,019] Trial 2 finished with value: 0.496996176952485 and parameters: {'n_estimators': 404, 'max_depth': 18, 'subsample': 0.22737589108639356, 'threshold': 0.5732099765786347}. Best is trial 2 with value: 0.496996176952485.
[I 2026-01-06 02:31:09,129] Trial 3 finished with value: 0.5274725274725275 and parameters: {'n_estimators': 31, 'max_depth': 10, 'subsample': 0.72

In [43]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.6679298910468972
Best hyperparameters: {'n_estimators': 14, 'max_depth': 14, 'subsample': 0.4227090248963321, 'threshold': 0.2057452886775454}


In [44]:
params = study.best_params.copy()
best_threshold = params.pop('threshold')

best_model = XGBClassifier(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6979
Optimal Threshold Found by Optuna: 0.2057
              precision    recall  f1-score   support

         0.0       0.88      0.68      0.77      1033
         1.0       0.46      0.75      0.57       374

    accuracy                           0.70      1407
   macro avg       0.67      0.72      0.67      1407
weighted avg       0.77      0.70      0.71      1407



### XG Boost SMOTE

In [45]:
# --- Optuna Optimization Including Threshold Tuning ---

X_train = X_resampled_smote
y_train = y_resampled_smote

def objective(trial):
    # Standard Hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    subsample = trial.suggest_float('subsample', 0, 1.0)
    
    threshold = trial.suggest_float('threshold', 0.2, 0.8)

    model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, subsample=subsample, n_jobs=-1)
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    return fbeta_score(y_test, preds, beta=2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)


[I 2026-01-06 02:31:37,552] A new study created in memory with name: no-name-23e40622-bd51-451b-90eb-f9acdab6b311
[I 2026-01-06 02:31:38,483] Trial 0 finished with value: 0.4849315068493151 and parameters: {'n_estimators': 320, 'max_depth': 8, 'subsample': 0.15831416350846694, 'threshold': 0.6536798726100779}. Best is trial 0 with value: 0.4849315068493151.
[I 2026-01-06 02:31:39,713] Trial 1 finished with value: 0.5423907319641916 and parameters: {'n_estimators': 937, 'max_depth': 17, 'subsample': 0.031246105814891068, 'threshold': 0.5402862084363118}. Best is trial 1 with value: 0.5423907319641916.
[I 2026-01-06 02:31:41,380] Trial 2 finished with value: 0.47686230248307 and parameters: {'n_estimators': 421, 'max_depth': 12, 'subsample': 0.6706259901665417, 'threshold': 0.7784863988008812}. Best is trial 1 with value: 0.5423907319641916.
[I 2026-01-06 02:31:41,463] Trial 3 finished with value: 0.4354178842781999 and parameters: {'n_estimators': 67, 'max_depth': 3, 'subsample': 0.2381

In [46]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7139756944444444
Best hyperparameters: {'n_estimators': 41, 'max_depth': 18, 'subsample': 0.0029985521568688706, 'threshold': 0.4185682863880078}


In [47]:
params = study.best_params.copy()
best_threshold = params.pop('threshold')

best_model = XGBClassifier(**params)
best_model.fit(X_train, y_train)

final_probs = best_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_threshold).astype(int)

# --- Final Results ---
accuracy = accuracy_score(y_test, final_preds)
print(f"Accuracy: {accuracy:.4f}")
print(f"Optimal Threshold Found by Optuna: {best_threshold:.4f}")
print(classification_report(y_test, final_preds))

Accuracy: 0.6276
Optimal Threshold Found by Optuna: 0.4186
              precision    recall  f1-score   support

         0.0       0.92      0.54      0.68      1033
         1.0       0.41      0.88      0.56       374

    accuracy                           0.63      1407
   macro avg       0.67      0.71      0.62      1407
weighted avg       0.79      0.63      0.65      1407

