In [39]:
import os
from collections import defaultdict
from statistics import median

import mlflow
import numpy as np
import optuna
import pandas as pd
import psycopg
from catboost import CatBoostClassifier
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from optuna.integration.mlflow import MLflowCallback
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)

In [31]:
# S3 endpoint override for the MLflow artifact storage.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The AWS credentials have to be exported before the kernel starts.
# Re-assigning os.environ[name] = os.getenv(name) is a no-op when the variable
# is set and fails with "TypeError: str expected, not NoneType" when it is not,
# so check for presence explicitly and report which variable is missing.
missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if name not in os.environ
]
if missing_aws_vars:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(missing_aws_vars)}"
    )

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and model registry are served by the same MLflow server.
TRACKING_URI = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_registry_uri(TRACKING_URI)

In [32]:
# Optuna study name and the SQLite storage it is persisted in.
STUDY_NAME = "churn_model"
STUDY_DB_NAME = "sqlite:///local.study.db"

# MLflow run name and the experiment the run belongs to.
RUN_NAME = "model_bayesian_search"
EXPERIMENT_NAME = "test_connection_experiment_vadim_shakula"

In [33]:
TABLE_NAME = "clean_users_churn"

# Fixed connection options; the credentials are merged into this dict below.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}

# Every credential comes from the environment; os.environ[...] raises KeyError
# for a variable that is not set at all.
postgres_credentials = {
    "host": os.environ["DB_DESTINATION_HOST"],
    "port": os.environ["DB_DESTINATION_PORT"],
    "dbname": os.environ["DB_DESTINATION_NAME"],
    "user": os.environ["DB_DESTINATION_USER"],
    "password": os.environ["DB_DESTINATION_PASSWORD"],
}

# A variable that is set but empty is rejected as well.
assert "" not in postgres_credentials.values()

connection = {**connection, **postgres_credentials}

In [34]:
# Load the whole source table into a DataFrame.
# The `with` block closes the connection once the query is done, even if an
# error is raised, so no connection is leaked.
with psycopg.connect(**connection) as conn:
    # The cursor executes the query and is released when its `with` block exits.
    with conn.cursor() as cur:
        # TABLE_NAME is a constant defined in this notebook, not user input,
        # so interpolating it into the statement is safe here.
        cur.execute(f"SELECT * FROM {TABLE_NAME}")

        # Fetch every row returned by the query.
        data = cur.fetchall()

        # Column names are the first item of each cursor.description entry.
        columns = [col[0] for col in cur.description]

# Build the DataFrame from the fetched rows and the column names.
df = pd.DataFrame(data, columns=columns)

# Fail early with a clear message instead of a confusing error further down.
assert not df.empty, f"Table {TABLE_NAME} returned no rows"

In [35]:
# Label column and the model inputs.
target = "target"
features = ["monthly_charges", "total_charges", "senior_citizen"]

# Hold-out settings: rows are ordered by `split_column` and split without
# shuffling, so the test part is the tail of the sorted frame.
test_size = 0.2
split_column = "begin_date"

df = df.sort_values(by=split_column)
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], shuffle=False, test_size=test_size
)

In [44]:
def _fold_metrics(y_true, y_pred, y_proba) -> dict:
    """Compute the validation metrics of a single cross-validation fold.

    Args:
        y_true: true labels of the validation fold (binary target expected,
            since the confusion matrix is unpacked into four cells).
        y_pred: predicted class labels for the fold.
        y_proba: predicted probability of the positive class for the fold.

    Returns:
        Mapping from metric name to its value on this fold.
    """
    # ravel() flattens the 2x2 matrix row by row: (tn, fp, fn, tp).
    # normalize="all" turns the counts into shares of all validation samples.
    _, fp_share, fn_share, _ = confusion_matrix(
        y_true, y_pred, normalize="all"
    ).ravel()

    return {
        "err1": fp_share,  # false-positive share (presumably the type I error)
        "err2": fn_share,  # false-negative share (presumably the type II error)
        "auc": roc_auc_score(y_true, y_proba),
        # zero_division=0 yields the same 0.0 that scikit-learn falls back to for
        # an ill-defined score, but without emitting a warning on every fold.
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "logloss": log_loss(y_true, y_proba),
    }


def objective(trial: optuna.Trial) -> float:
    """Optuna objective: cross-validated ROC AUC of a CatBoost classifier.

    Samples a set of hyperparameters from the trial, fits the model on every
    fold of a 2-fold stratified split of the training data and aggregates the
    per-fold metrics with the median.

    Args:
        trial: Optuna trial used to sample hyperparameters. The aggregated
            secondary metrics are stored on it as user attributes.

    Returns:
        Median ROC AUC over the validation folds (the value being optimized).
    """
    param = {
        # Tuned hyperparameters.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        # Fixed settings shared by every trial.
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for train_index, val_index in skf.split(X_train, y_train):
        train_x, train_y = X_train.iloc[train_index], y_train.iloc[train_index]
        val_x, val_y = X_train.iloc[val_index], y_train.iloc[val_index]

        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        for name, value in _fold_metrics(val_y, prediction, probas).items():
            metrics[name].append(value)

    # Median over folds; plain floats keep the values JSON-serializable.
    aggregated = {name: float(np.median(values)) for name, values in metrics.items()}

    # Keep the secondary metrics on the trial instead of discarding them.
    for name, value in aggregated.items():
        if name != "auc":
            trial.set_user_attr(name, value)

    return aggregated["auc"]

In [45]:
# Look the experiment up by name and create it only when the lookup finds nothing.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

In [46]:
# Open the MLflow run for the hyperparameter search and remember its id; the id
# is used below to tag the per-trial runs.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # End the run that was just opened while still inside its `with` block.
    # NOTE(review): presumably this clears the active run so that MLflowCallback
    # can start its own run for every trial — confirm before changing the order.
    if mlflow.active_run():
        mlflow.end_run()

    # Callback that reports each trial to the tracking server under the metric
    # name 'AUC'; every trial run is tagged with MLFLOW_PARENT_RUN_ID = run_id.
    mlflc = MLflowCallback(
        tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
        metric_name='AUC',
        create_experiment=False,
        mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
    )

    # load_if_exists=True reuses a study with this name if STUDY_DB_NAME already
    # holds one, so trials from earlier executions stay part of the study.
    study = optuna.create_study(
        study_name=STUDY_NAME,
        storage=STUDY_DB_NAME,
        sampler=optuna.samplers.TPESampler(),
        direction="maximize",
        load_if_exists=True
    )
    # Run 10 more trials, maximizing the value returned by `objective`.
    study.optimize(objective, n_trials=10, callbacks=[mlflc])
    best_params = study.best_params

    # len(study.trials) counts every trial stored in the study, not only the
    # ones run by this call.
    print(f"Number of finished trials: {len(study.trials)}")
    print(f"Best params: {best_params}")

  mlflc = MLflowCallback(
[I 2025-09-07 20:36:01,078] Using an existing study with name 'churn_model' instead of creating a new one.


  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-07 20:36:02,121] Trial 16 finished with value: 0.8123816272596209 and parameters: {'learning_rate': 0.031781626725825574, 'depth': 3, 'l2_leaf_reg': 1.0920148895713992, 'random_strength': 1.1994276508009563}. Best is trial 10 with value: 0.8216664878470447.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-07 20:36:03,339] Trial 17 finished with value: 0.813996517557029 and parameters: {'learning_rate': 0.025661769518171302, 'depth': 4, 'l2_leaf_reg': 1.0461285099606914, 'random_strength': 1.2760425106092097}. Best is trial 10 with value: 0.8216664878470447.
  _warn_prf(average, modifier, msg_start, len(result))
[I 2025-09-07 20:36:04,263] Trial 18 finished with value: 0.7900886696725458 and parameters: {'learning_rate': 0.05619812237902952, 'depth': 1, 'l2_leaf_reg': 1.1787749452481888, 'random_strength': 1.2056690760102264}. Best is trial 10 with value: 0.8216664878470447.
  _warn_prf(average, modifier,

Number of finished trials: 26
Best params: {'learning_rate': 0.03409399693879268, 'depth': 2, 'l2_leaf_reg': 0.20176729035797436, 'random_strength': 0.19822463155477843}


In [47]:
REGISTRY_MODEL_NAME = "best_model_bayesian_search"

# study.best_params holds only the values suggested by the trial. The fixed
# settings that objective() used during the search are not part of it, so
# fitting with best_params alone would fall back to CatBoost's defaults (e.g. a
# different number of iterations than the learning rate was tuned for).
# These values must mirror the fixed settings in objective().
final_params = {
    "loss_function": "Logloss",
    "task_type": "CPU",
    "random_seed": 0,
    "iterations": 300,
    **best_params,
}

best_model = CatBoostClassifier(**final_params, verbose=False).fit(X_train, y_train)

mlflow.set_experiment(EXPERIMENT_NAME)

# Resume the search run and attach the final model to it.
with mlflow.start_run(run_id=run_id) as run:
    # Log the full configuration the registered model was trained with.
    mlflow.log_params(final_params)

    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="cv",
        registered_model_name=REGISTRY_MODEL_NAME,
    )

Successfully registered model 'best_model_bayesian_search'.
2025/09/07 20:40:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: best_model_bayesian_search, version 1
Created version '1' of model 'best_model_bayesian_search'.


In [48]:
# Print the id of the MLflow run captured when the search run was started
# (the same run the final model was logged to).
print(run_id)

aeb2ec90707247b1b5d92a8b550d29c1
