In [77]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, make_scorer
import mlflow
import mlflow.sklearn

import optuna


# Logistic regression (baseline)

In [76]:
# Load the preprocessed dataset. Forward slashes are used throughout so the
# relative path resolves on both Windows and POSIX systems (a backslash is
# only a path separator on Windows).
data = pd.read_csv("../data/preprocessed_dummy_Data.csv")

# Split into the feature matrix (every column except 'Churn') and the target.
X, y = data.drop(['Churn'], axis=1), data['Churn']

## Simple Solution

In [106]:
# Baseline: standardised features + class-balanced logistic regression,
# scored with 5-fold stratified cross-validation and tracked in MLflow.
mlflow.set_experiment("Customer_Churn")

scoring = ['roc_auc', 'f1', 'precision', 'recall']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

baseline_clf = LogisticRegression(class_weight='balanced', random_state=42)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', baseline_clf),
])

# Run description recorded in MLflow. These values are written by hand and are
# not read back from the estimator (the solver is not set explicitly above).
run_params = {
    "model": "base_LogisticRegression",
    "class_weight": "balanced",
    "solver": "lbfgs",
}

with mlflow.start_run(run_name="basicRegression"):
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring, return_train_score=False)

    # Log and print the across-fold mean of every requested metric.
    for metric_name in scoring:
        fold_mean = scores[f'test_{metric_name}'].mean()
        mlflow.log_metric(f"CV_{metric_name}_mean", fold_mean)
        print(f"{metric_name}: mean={fold_mean:.3f}")

    # Refit on all rows and store the fitted pipeline as a run artifact.
    pipeline.fit(X, y)
    mlflow.sklearn.log_model(pipeline, "logreg_pipeline")

# Per-fold timings and scores, displayed as the cell output.
pd.DataFrame(scores)

2025/11/21 01:29:31 INFO mlflow.tracking.fluent: Experiment with name 'Customer_Churn' does not exist. Creating a new experiment.


roc_auc: mean=0.842
f1: mean=0.625
precision: mean=0.515
recall: mean=0.796




Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_precision,test_recall
0,0.018234,0.015012,0.851576,0.638478,0.527972,0.807487
1,0.025866,0.013799,0.842649,0.635802,0.516722,0.826203
2,0.02319,0.013704,0.857488,0.62803,0.518261,0.796791
3,0.020062,0.016475,0.82471,0.605096,0.500879,0.764075
4,0.022173,0.014248,0.835542,0.6196,0.511304,0.786096


## Tuned logistic regression

In [96]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV F1 of a scaled logistic regression.

    Samples the inverse regularisation strength ``C`` (log scale, 1 to 100),
    the penalty type (``l1`` or ``l2``) and a class-weighting strategy, builds
    a ``StandardScaler`` + ``LogisticRegression`` pipeline, and evaluates it on
    the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    The mean of the per-fold F1 scores.
    """
    reg_strength = trial.suggest_float('C', 1, 100, log=True)
    reg_type = trial.suggest_categorical('penalty', ['l1', 'l2'])

    # Class-weight search: no weighting, sklearn's 'balanced' mode, or a
    # custom weight for class 1 (class 0 stays at 1.0).
    weight_mode = trial.suggest_categorical('class_weight_option', ['none', 'balanced', 'custom'])
    if weight_mode == 'custom':
        # Example: w0 = 1, w1 sampled between 1 and 5.
        positive_weight = trial.suggest_float('custom_weight_1', 1.0, 5.0)
        weights = {0: 1.0, 1: positive_weight}
    elif weight_mode == 'balanced':
        weights = 'balanced'
    else:
        weights = None

    # The l1 penalty is paired with liblinear; l2 uses lbfgs.
    chosen_solver = 'lbfgs'
    if reg_type == 'l1':
        chosen_solver = 'liblinear'

    classifier = LogisticRegression(
        C=reg_strength,
        penalty=reg_type,
        solver=chosen_solver,
        max_iter=1000,
        random_state=42,
        class_weight=weights,
    )
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', classifier),
    ])

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_f1 = cross_val_score(model, X, y, cv=folds, scoring='f1')
    return fold_f1.mean()


In [99]:
# Seed the sampler so the hyperparameter search gives the same sequence of
# trials on every Restart & Run All. TPESampler is the sampler Optuna uses by
# default, so only the seeding changes. Trials run sequentially here, which is
# required for the seed to make the search reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2025-11-21 01:20:17,539] A new study created in memory with name: no-name-17405976-5fda-4e65-b11c-81deb0975029
[I 2025-11-21 01:20:21,644] Trial 0 finished with value: 0.6157540130670005 and parameters: {'C': 64.00464255804783, 'penalty': 'l1', 'class_weight_option': 'custom', 'custom_weight_1': 3.634901170414534}. Best is trial 0 with value: 0.6157540130670005.
[I 2025-11-21 01:20:24,859] Trial 1 finished with value: 0.5735322001046615 and parameters: {'C': 20.178919587228144, 'penalty': 'l1', 'class_weight_option': 'none'}. Best is trial 0 with value: 0.6157540130670005.
[I 2025-11-21 01:20:26,737] Trial 2 finished with value: 0.5730383889625864 and parameters: {'C': 3.7589958530585266, 'penalty': 'l1', 'class_weight_option': 'none'}. Best is trial 0 with value: 0.6157540130670005.
[I 2025-11-21 01:20:26,854] Trial 3 finished with value: 0.5732755179719893 and parameters: {'C': 14.752164643576586, 'penalty': 'l2', 'class_weight_option': 'none'}. Best is trial 0 with value: 0.61575

In [100]:
# Summarise the best trial of the study: its objective value (mean CV F1)
# followed by every sampled hyperparameter.
print("Best trial:")
trial = study.best_trial
print(f"  F1: {trial.value:.4f}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

Best trial:
  F1: 0.6337
  Params: 
    C: 32.514063443302604
    penalty: l2
    class_weight_option: custom
    custom_weight_1: 2.0601019447356177


In [107]:
# Tuned model: same pipeline as the baseline, with hyperparameters taken from
# the Optuna search, scored with 5-fold stratified CV and tracked in MLflow.
mlflow.set_experiment("Customer_Churn")

scoring = ['roc_auc', 'f1', 'precision', 'recall']

# Hyperparameters rounded from the best Optuna trial
# (C ~ 32.51, penalty = l2, custom weight for class 1 ~ 2.06).
# A single dict feeds both the estimator and the MLflow log, so the recorded
# values can never drift from the model that was actually fitted.
tuned_params = {
    'C': 32,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'class_weight': {0: 1.0, 1: 2},
    # Same iteration budget as the tuning objective, so the final model is
    # fitted under the conditions the hyperparameters were selected for.
    'max_iter': 1000,
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=42, **tuned_params))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

with mlflow.start_run(run_name='tunedRegression'):
    mlflow.log_param("model", "tuned_LogisticRegression")
    # Log exactly what the estimator was built with.
    for param_name, param_value in tuned_params.items():
        mlflow.log_param(param_name, param_value)

    scores = cross_validate(pipeline, X, y, cv=skf, scoring=scoring, return_train_score=False)

    # Log and print the across-fold mean of every requested metric.
    for metric in scoring:
        mean_score = scores[f'test_{metric}'].mean()
        mlflow.log_metric(f"CV_{metric}_mean", mean_score)
        print(f"{metric}: mean={mean_score:.3f}")

    # Refit on all rows and store the fitted pipeline as a run artifact.
    pipeline.fit(X, y)
    mlflow.sklearn.log_model(pipeline, "logreg_pipeline_tuned")

# Per-fold timings and scores, displayed as the cell output.
pd.DataFrame(scores)



roc_auc: mean=0.842
f1: mean=0.630
precision: mean=0.555
recall: mean=0.728




Unnamed: 0,fit_time,score_time,test_roc_auc,test_f1,test_precision,test_recall
0,0.020784,0.012567,0.851431,0.646512,0.572016,0.743316
1,0.019981,0.015765,0.842644,0.645455,0.561265,0.759358
2,0.029173,0.014781,0.857359,0.646651,0.569106,0.748663
3,0.018884,0.01549,0.824847,0.599526,0.537155,0.678284
4,0.024504,0.014855,0.835449,0.609896,0.535354,0.708556
