In [1]:
import os
import optuna

import psycopg
import pandas as pd
import numpy as np
import mlflow
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix

TABLE_NAME = "users_churn" # таблица с данными в postgres 
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "bvv_43"
RUN_NAME = "model_bayesian_search"
REGISTRY_MODEL_NAME = 'churn_model_bvv43'

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

# загружаем данные
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
df.head(2) 


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0


In [2]:
stratify_column = 'type'
split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


In [None]:
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict

# определяем основные credentials, которые нужны для подключения к MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env


mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

EXPERIMENT_NAME = "bvv_43" # ваш код здесь
RUN_NAME = "model_bayesian_search"

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

skf = StratifiedKFold(n_splits=2)

def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    model = CatBoostClassifier(**param)
    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)

    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):


        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
        
        model = CatBoostClassifier(**param)
        model.fit(train_x,train_y)        
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = np.median(np.array(metrics['err1']))
    err_2 = np.median(np.array(metrics['err2']))
    auc = np.median(np.array(metrics['auc']))
    precision = np.median(np.array(metrics['precision']))
    recall = np.median(np.array(metrics['recall']))
    f1 = np.median(np.array(metrics['f1']))
    logloss = np.median(np.array(metrics['logloss']))
    model.fit(X_train, y_train)

    return auc


experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    
mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
)

study = optuna.create_study(sampler=optuna.samplers.TPESampler(), 
#                            storage=STUDY_DB_NAME,
#                            study_name=STUDY_NAME,
                            direction='maximize'
                            )
study.optimize(objective, n_trials=10, callbacks=[mlflc])
best_params = study.best_params
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

mlflow.sklearn.log_model(skf, artifact_path='cv')
mlflow.log_params(best_params)

run_id

  mlflc = MLflowCallback(
[I 2026-01-08 18:40:25,703] A new study created in memory with name: no-name-bff9d921-6a18-4ac8-8851-06ac983fe34a
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:40:27,280] Trial 0 finished with value: 0.7613161180629465 and parameters: {'learning_rate': 0.0052314290234245844, 'depth': 7, 'l2_leaf_reg': 3.659186742930368, 'random_strength': 4.722255184331505}. Best is trial 0 with value: 0.7613161180629465.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:40:28,343] Trial 1 finished with value: 0.7216896893685859 and parameters: {'learning_rate': 0.011227320676851062, 'depth': 1, 'l2_leaf_reg': 1.7701097855028283, 'random_strength': 2.0494920966195047}. Best is trial 0 with value: 0.7613161180629465.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2026-01-08 18:40:33,156] Trial 2 finished with value: 0.7252947105325118 and parameters: {'learning_rate'

Number of finished trials: 10
Best params: {'learning_rate': 0.07677592607700714, 'depth': 5, 'l2_leaf_reg': 3.564660264414854, 'random_strength': 2.014838535216445}


'ccdebb93b5f94e41b3ca0ea98a3de4dc'

In [23]:
skf = StratifiedKFold(n_splits=2)
model_best = CatBoostClassifier(**best_params)
model_best.fit(X_train, y_train)

# оцениваем модель на тестовом наборе
test_score = model_best.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score)

# считаем метрики
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)

metrics = {}

confusion_matrix_data = confusion_matrix(y_test, prediction, normalize='all')
_, err1, _, err2 = confusion_matrix_data.ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss
metrics

0:	learn: 0.6632102	total: 1.43ms	remaining: 1.42s
1:	learn: 0.6239074	total: 4.21ms	remaining: 2.1s
2:	learn: 0.5918118	total: 6.97ms	remaining: 2.32s
3:	learn: 0.5638412	total: 8.67ms	remaining: 2.16s
4:	learn: 0.5501207	total: 9.77ms	remaining: 1.94s
5:	learn: 0.5321702	total: 11.4ms	remaining: 1.89s
6:	learn: 0.5130348	total: 13.3ms	remaining: 1.89s
7:	learn: 0.4964164	total: 15.1ms	remaining: 1.87s
8:	learn: 0.4821786	total: 16.8ms	remaining: 1.84s
9:	learn: 0.4705772	total: 18.5ms	remaining: 1.83s
10:	learn: 0.4665462	total: 19.9ms	remaining: 1.78s
11:	learn: 0.4561796	total: 21.8ms	remaining: 1.79s
12:	learn: 0.4504036	total: 24.3ms	remaining: 1.84s
13:	learn: 0.4433922	total: 26.9ms	remaining: 1.89s
14:	learn: 0.4392224	total: 29.3ms	remaining: 1.93s
15:	learn: 0.4373353	total: 30.7ms	remaining: 1.89s
16:	learn: 0.4314454	total: 32.4ms	remaining: 1.88s
17:	learn: 0.4282440	total: 34.1ms	remaining: 1.86s
18:	learn: 0.4242110	total: 35.9ms	remaining: 1.85s
19:	learn: 0.4206536	to

{'err1': 0.34918381831085876,
 'err2': 0.42583392476933996,
 'auc': 0.7373588754468524,
 'precision': 0.5494505494505495,
 'recall': 0.8955223880597015,
 'f1': 0.681044267877412,
 'logloss': 14.376531727951624}

In [28]:
pip_requirements = "./requirements.txt"
signature =  mlflow.models.infer_signature(X_test, prediction) #mlflow.models.infer_signature(X_test.head(2), model_best.predict(X_test.head(2)))
input_example = X_test.head(2)

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_id=run_id, nested=True):
    #run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(skf, artifact_path='cv')
#    model_info = mlflow.sklearn.log_model( 
    model_info = mlflow.catboost.log_model( 
        artifact_path='cv',
        cb_model=model_best,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)
run_id

Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 19:04:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 18
Created version '18' of model 'churn_model_bvv43'.


'ccdebb93b5f94e41b3ca0ea98a3de4dc'