In [41]:
import os
import optuna

import psycopg
import pandas as pd
import numpy as np
import mlflow
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix

TABLE_NAME = "users_churn" # таблица с данными в postgres 
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "bvv_43"
RUN_NAME = "model_bayesian_search"
REGISTRY_MODEL_NAME = 'churn_model_bvv43'

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

# загружаем данные
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
df.head(2) 


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0


In [42]:
stratify_column = 'type'
split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [None]:
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict

# определяем основные credentials, которые нужны для подключения к MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env


mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

EXPERIMENT_NAME = "bvv_43" # ваш код здесь
RUN_NAME = "model_bayesian_search"

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    model = CatBoostClassifier(**param)
    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)

    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):


        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
        
        model = CatBoostClassifier(**param)
        model.fit(train_x,train_y)        
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = np.median(np.array(metrics['err1']))
    err_2 = np.median(np.array(metrics['err2']))
    auc = np.median(np.array(metrics['auc']))
    precision = np.median(np.array(metrics['precision']))
    recall = np.median(np.array(metrics['recall']))
    f1 = np.median(np.array(metrics['f1']))
    logloss = np.median(np.array(metrics['logloss']))
    model.fit(X_train, y_train)

    return auc


experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(skf, artifact_path='cv')


mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
)

study = optuna.create_study(sampler=optuna.samplers.TPESampler(), 
#                            storage=STUDY_DB_NAME,
#                            study_name=STUDY_NAME,
                            direction='maximize'
                            )
study.optimize(objective, n_trials=10, callbacks=[mlflc])
best_params = study.best_params
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")


run_id

  mlflc = MLflowCallback(
[I 2026-01-08 18:21:22,337] A new study created in memory with name: no-name-d80162a0-b95f-4046-91e8-629054d47292
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:21:26,298] Trial 0 finished with value: 0.788374779877326 and parameters: {'learning_rate': 0.09576166456851086, 'depth': 9, 'l2_leaf_reg': 3.8526123246460084, 'random_strength': 4.252528855223732}. Best is trial 0 with value: 0.788374779877326.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:21:28,688] Trial 1 finished with value: 0.7758438068855389 and parameters: {'learning_rate': 0.03696851652704129, 'depth': 8, 'l2_leaf_reg': 3.600932840522076, 'random_strength': 1.1640308415499183}. Best is trial 0 with value: 0.788374779877326.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:21:30,642] Trial 2 finished with value: 0.7955958322496877 and parameters: {'learning_rate': 0.04

Number of finished trials: 10
Best params: {'learning_rate': 0.04724615596380259, 'depth': 1, 'l2_leaf_reg': 0.7498468967563092, 'random_strength': 4.490029233069505}


'19ac6f2ad26e402dae0b2fb17bb211fd'

In [49]:
skf = StratifiedKFold(n_splits=2)
model_best = CatBoostClassifier(**best_params)
model_best.fit(X_train, y_train)

# оцениваем модель на тестовом наборе
test_score = model_best.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score)

# считаем метрики
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)

metrics = {}

confusion_matrix_data = confusion_matrix(y_test, prediction, normalize='all')
_, err1, _, err2 = confusion_matrix_data.ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss
metrics

0:	learn: 0.6743889	total: 1.24ms	remaining: 1.24s
1:	learn: 0.6575127	total: 2.46ms	remaining: 1.23s
2:	learn: 0.6415928	total: 3.62ms	remaining: 1.2s
3:	learn: 0.6282290	total: 4.71ms	remaining: 1.17s
4:	learn: 0.6165609	total: 5.82ms	remaining: 1.16s
5:	learn: 0.6031964	total: 7.06ms	remaining: 1.17s
6:	learn: 0.5912786	total: 7.99ms	remaining: 1.13s
7:	learn: 0.5825358	total: 8.93ms	remaining: 1.11s
8:	learn: 0.5742890	total: 9.86ms	remaining: 1.08s
9:	learn: 0.5652670	total: 10.8ms	remaining: 1.07s
10:	learn: 0.5555826	total: 11.7ms	remaining: 1.05s
11:	learn: 0.5499455	total: 12.5ms	remaining: 1.03s
12:	learn: 0.5448199	total: 13.4ms	remaining: 1.02s
13:	learn: 0.5401003	total: 14.6ms	remaining: 1.03s
14:	learn: 0.5355376	total: 15.6ms	remaining: 1.03s
15:	learn: 0.5314333	total: 16.7ms	remaining: 1.03s
16:	learn: 0.5279106	total: 17.8ms	remaining: 1.03s
17:	learn: 0.5219492	total: 19ms	remaining: 1.03s
18:	learn: 0.5176985	total: 20ms	remaining: 1.03s
19:	learn: 0.5150247	total:

{'err1': 0.31156848828956707,
 'err2': 0.41589779985805536,
 'auc': 0.7534778744976068,
 'precision': 0.5717073170731707,
 'recall': 0.8746268656716418,
 'f1': 0.6914454277286136,
 'logloss': 13.378872052880247}

In [47]:
pip_requirements = "./requirements.txt"
signature =  mlflow.models.infer_signature(X_test, prediction) #mlflow.models.infer_signature(X_test.head(2), model_best.predict(X_test.head(2)))
input_example = X_test.head(2)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(skf, artifact_path='cv')
#    model_info = mlflow.sklearn.log_model( 
    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        cb_model=model_best,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)
run_id

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 18:19:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 15
Created version '15' of model 'churn_model_bvv43'.


'658c002c53f24265aa3a9b9b3bfc8d38'

In [50]:
pip_requirements = "./requirements.txt"
signature =  mlflow.models.infer_signature(X_test, prediction) #mlflow.models.infer_signature(X_test.head(2), model_best.predict(X_test.head(2)))
input_example = X_test.head(2)


mlflow.log_metrics(metrics)
mlflow.log_params(best_params)
cv_info = mlflow.sklearn.log_model(skf, artifact_path='cv')
#    model_info = mlflow.sklearn.log_model( 
model_info = mlflow.catboost.log_model( 
    artifact_path='models',
    cb_model=model_best,
    registered_model_name=REGISTRY_MODEL_NAME,
    pip_requirements=pip_requirements,
    signature=signature,
    input_example=input_example
)
run_id

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 18:24:40 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 16
Created version '16' of model 'churn_model_bvv43'.


'19ac6f2ad26e402dae0b2fb17bb211fd'

In [40]:
skf

NameError: name 'skf' is not defined