In [152]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, make_scorer
import mlflow
import mlflow.sklearn

import optuna

import json

import shap

# Logistic regression (baseline)

In [129]:
# Load the preprocessed dataset. Forward slashes keep the relative path valid
# on Linux/macOS as well as Windows: on POSIX systems a backslash is an
# ordinary filename character, so r"..\data/..." only resolves on Windows.
data = pd.read_csv("../data/preprocessed_dummy_Data.csv")
# Features are every column except the 'Churn' target.
X, y = data.drop(['Churn'], axis=1), data['Churn']

## Simple Solution

In [130]:
# Baseline run: standardised features feeding a class-balanced logistic
# regression, scored with stratified 5-fold cross-validation and tracked in
# the "Customer_Churn" MLflow experiment.
mlflow.set_experiment("Customer_Churn")

scoring = ['roc_auc', 'f1', 'precision', 'recall']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(class_weight='balanced', random_state=42)),
])

with mlflow.start_run(run_name="basicRegression"):
    # Describe the configuration of this run.
    for param_name, param_value in (
        ("model", "base_LogisticRegression"),
        ("class_weight", "balanced"),
        ("solver", "lbfgs"),
    ):
        mlflow.log_param(param_name, param_value)

    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring, return_train_score=False)

    # Log and print the mean of every requested metric over the folds.
    cv_means = {name: scores[f'test_{name}'].mean() for name in scoring}
    for metric, mean_score in cv_means.items():
        mlflow.log_metric(f"CV_{metric}_mean", mean_score)
        print(f"{metric}: mean={mean_score:.3f}")

    # Refit on the full dataset and store that model as the run artifact.
    pipeline.fit(X, y)
    mlflow.sklearn.log_model(pipeline, "logreg_pipeline")

# Per-fold timings and scores, displayed as the cell output.
pd.DataFrame(scores)



roc_auc: mean=0.843
f1: mean=0.627
precision: mean=0.518
recall: mean=0.795




Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_precision,test_recall
0,0.020065,0.011959,0.852043,0.637712,0.52807,0.804813
1,0.02074,0.013401,0.843039,0.632885,0.51602,0.818182
2,0.020372,0.012727,0.861283,0.643312,0.533451,0.81016
3,0.014724,0.013213,0.823568,0.604255,0.500882,0.761394
4,0.022254,0.014383,0.833918,0.618491,0.513228,0.778075


## Tuned logistic regression

In [131]:
def objective(trial):
    """Optuna objective for the logistic-regression pipeline.

    Samples ``C`` (log scale, 1 to 100), the penalty ('l1' or 'l2') and a
    class-weighting scheme from ``trial``, then cross-validates a
    StandardScaler + LogisticRegression pipeline on the notebook-level
    ``X`` and ``y``.

    Returns:
        Mean F1 score over the 5 stratified folds.
    """
    reg_strength = trial.suggest_float('C', 1, 100, log=True)
    penalty_kind = trial.suggest_categorical('penalty', ['l1', 'l2'])

    # Class-weight search: no weighting, 'balanced', or a custom weight.
    weighting = trial.suggest_categorical('class_weight_option', ['none', 'balanced', 'custom'])
    if weighting == 'custom':
        # Example: class 0 fixed at 1.0, class 1 weight sampled from 1 to 5.
        positive_weight = trial.suggest_float('custom_weight_1', 1.0, 5.0)
        class_weight = {0: 1.0, 1: positive_weight}
    elif weighting == 'balanced':
        class_weight = 'balanced'
    else:
        class_weight = None

    # L1 trials run on liblinear; every other trial runs on lbfgs.
    solver = {'l1': 'liblinear'}.get(penalty_kind, 'lbfgs')

    classifier = LogisticRegression(
        C=reg_strength,
        penalty=penalty_kind,
        solver=solver,
        max_iter=1000,
        random_state=42,
        class_weight=class_weight,
    )
    model = Pipeline(steps=[('scaler', StandardScaler()), ('logreg', classifier)])

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_f1 = cross_val_score(model, X, y, cv=folds, scoring='f1')
    return fold_f1.mean()


In [134]:
# Seed the sampler so the hyper-parameter search yields the same trials on
# every Restart & Run All; without a seed the sampled parameters (and the
# "best" trial) change between runs.
# NOTE: run this right after the logistic-regression `objective` cell; the
# Random Forest section later redefines `objective`.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-11-22 00:02:57,029] A new study created in memory with name: no-name-ac8e653e-3a82-4ac1-9927-81d9d84b9a6b
[I 2025-11-22 00:02:57,157] Trial 0 finished with value: 0.6290853121290508 and parameters: {'C': 13.345390679424492, 'penalty': 'l2', 'class_weight_option': 'custom', 'custom_weight_1': 2.2733392861708563}. Best is trial 0 with value: 0.6290853121290508.
[I 2025-11-22 00:02:57,284] Trial 1 finished with value: 0.626523267087044 and parameters: {'C': 2.617578009107284, 'penalty': 'l2', 'class_weight_option': 'balanced'}. Best is trial 0 with value: 0.6290853121290508.
[I 2025-11-22 00:02:57,424] Trial 2 finished with value: 0.6274086607587737 and parameters: {'C': 64.11682505892446, 'penalty': 'l2', 'class_weight_option': 'balanced'}. Best is trial 0 with value: 0.6290853121290508.
[I 2025-11-22 00:02:57,547] Trial 3 finished with value: 0.5948594219389023 and parameters: {'C': 40.65587062266152, 'penalty': 'l2', 'class_weight_option': 'none'}. Best is trial 0 with value: 0

In [None]:
# Report the best logistic-regression trial: its F1 value and parameters.
print("Best trial:")
trial = study.best_trial
print(f"  F1: {trial.value:.4f}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

Best trial:
  F1: 0.6301
  Params: 
    C: 1.7330413797167055
    penalty: l1
    class_weight_option: custom
    custom_weight_1: 2.0944938567091924


In [136]:
# Tuned run: re-evaluate the best logistic-regression configuration from the
# Optuna search (values rounded from the best trial) and track it in MLflow.
mlflow.set_experiment("Customer_Churn")

scoring = ['roc_auc', 'f1', 'precision', 'recall']

# Single source of truth: the same dict configures the model and is logged,
# so the MLflow record cannot drift from what was actually trained.
tuned_params = {
    "C": 1.73,
    "penalty": "l1",
    "solver": "liblinear",
    # Class 0 keeps weight 1.0; class 1 (the tuned 'custom_weight_1') gets 2.1.
    "class_weight": {0: 1.0, 1: 2.1},
    # Same iteration budget as the tuning objective, so the evaluated model
    # matches the one that was tuned (scikit-learn's default is 100).
    "max_iter": 1000,
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=42, **tuned_params))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

with mlflow.start_run(run_name='tunedRegression'):
    mlflow.log_param("model", "tuned_LogisticRegression")
    for param_name, param_value in tuned_params.items():
        mlflow.log_param(param_name, param_value)

    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring, return_train_score=False)

    # Log and print the mean of every requested metric over the folds.
    for metric in scoring:
        mean_score = scores[f'test_{metric}'].mean()
        mlflow.log_metric(f"CV_{metric}_mean", mean_score)
        print(f"{metric}: mean={mean_score:.3f}")

    # Refit on the full dataset and store that model as the run artifact.
    pipeline.fit(X, y)
    mlflow.sklearn.log_model(pipeline, "logreg_pipeline_tuned")

# Per-fold timings and scores, displayed as the cell output.
pd.DataFrame(scores)

roc_auc: mean=0.843
f1: mean=0.629
precision: mean=0.552
recall: mean=0.732




Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_precision,test_recall
0,0.290849,0.012814,0.851715,0.651217,0.574642,0.751337
1,0.268429,0.011559,0.842871,0.62754,0.542969,0.743316
2,0.461974,0.01615,0.861562,0.655963,0.574297,0.764706
3,0.366043,0.016032,0.823892,0.604706,0.538784,0.689008
4,0.295056,0.013752,0.83397,0.606613,0.528827,0.71123


# Random Forest

In [159]:
def objective(trial):
    """Optuna objective for the random-forest classifier.

    Samples the forest size, tree depth, leaf/split minimums and the
    class-weighting mode from ``trial``, then cross-validates the classifier
    on the notebook-level ``X`` and ``y``.

    NOTE: this redefines the ``objective`` name used by the
    logistic-regression study earlier in the notebook.

    Returns:
        Mean F1 score over the 5 stratified folds.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 1000, log=True)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 50, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 50, log=True)
    cw_option = trial.suggest_categorical('class_weight_option', [None, 'balanced', 'balanced_subsample'])

    RF_clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        class_weight=cw_option,
        # Fixed seed: without it the same hyper-parameters score differently
        # on each call, which adds noise to the search and makes the best
        # trial irreproducible.
        random_state=42,
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(RF_clf, X, y, cv=skf, scoring='f1')

    return scores.mean()


In [160]:
# Seed the sampler so the random-forest search yields the same trials on
# every Restart & Run All; without a seed the sampled parameters (and the
# "best" trial) change between runs.
RF_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
RF_study.optimize(objective, n_trials=50)

[I 2025-11-22 01:01:48,778] A new study created in memory with name: no-name-cc5969b1-4647-4f35-aa40-d9bac355986a
[I 2025-11-22 01:01:53,881] Trial 0 finished with value: 0.5763658037403031 and parameters: {'n_estimators': 252, 'max_depth': 9, 'min_samples_leaf': 13, 'min_samples_split': 16, 'class_weight_option': None}. Best is trial 0 with value: 0.5763658037403031.
[I 2025-11-22 01:01:57,951] Trial 1 finished with value: 0.6338903691114053 and parameters: {'n_estimators': 158, 'max_depth': 29, 'min_samples_leaf': 27, 'min_samples_split': 3, 'class_weight_option': 'balanced_subsample'}. Best is trial 1 with value: 0.6338903691114053.
[I 2025-11-22 01:02:18,741] Trial 2 finished with value: 0.6220953836156977 and parameters: {'n_estimators': 581, 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 3, 'class_weight_option': 'balanced_subsample'}. Best is trial 1 with value: 0.6338903691114053.
[I 2025-11-22 01:02:26,711] Trial 3 finished with value: 0.6348067869336094 and para

In [162]:
# Report the best random-forest trial: its F1 value and parameters.
print("Best trial:")
trial = RF_study.best_trial
print(f"  F1: {trial.value:.4f}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

Best trial:
  F1: 0.6359
  Params: 
    n_estimators: 307
    max_depth: 22
    min_samples_leaf: 28
    min_samples_split: 11
    class_weight_option: balanced_subsample


In [164]:
# Random-forest run: re-evaluate the best configuration from the Optuna
# search and track it in MLflow.
mlflow.set_experiment("Customer_Churn")

scoring = ['roc_auc', 'f1', 'precision', 'recall']

# Single source of truth: the same dict configures the model and is logged.
# Previously max_depth was logged as 307 (the n_estimators value) and
# n_estimators was not logged at all.
rf_params = {
    "n_estimators": 307,
    "max_depth": 22,
    "min_samples_leaf": 28,
    "min_samples_split": 11,
    "class_weight": "balanced_subsample",
}

# Fixed seed so the reported CV scores and the logged model are reproducible.
RF_clf = RandomForestClassifier(random_state=42, **rf_params)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

with mlflow.start_run(run_name='RandomForest'):
    # Model name spelled correctly (was "RadomForest") so runs can be
    # filtered by it reliably.
    mlflow.log_param("model", "RandomForest")
    for param_name, param_value in rf_params.items():
        mlflow.log_param(param_name, param_value)

    scores = cross_validate(RF_clf, X, y, cv=skf, scoring=scoring, return_train_score=False)

    # Log and print the mean of every requested metric over the folds.
    for metric in scoring:
        mean_score = scores[f'test_{metric}'].mean()
        mlflow.log_metric(f"CV_{metric}_mean", mean_score)
        print(f"{metric}: mean={mean_score:.3f}")

    # Refit on the full dataset and store that model as the run artifact.
    RF_clf.fit(X, y)
    mlflow.sklearn.log_model(RF_clf, "RandomForest")

# Per-fold timings and scores, displayed as the cell output.
pd.DataFrame(scores)

roc_auc: mean=0.846
f1: mean=0.634
precision: mean=0.528
recall: mean=0.796




Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_precision,test_recall
0,1.903985,0.106137,0.858489,0.648765,0.54219,0.807487
1,1.559465,0.095682,0.844105,0.632231,0.515152,0.818182
2,1.520422,0.092243,0.856079,0.646617,0.540395,0.804813
3,1.523029,0.097767,0.828957,0.613537,0.517495,0.753351
4,1.491919,0.101302,0.84233,0.631243,0.52381,0.794118
