In [None]:
import os

import psycopg
import pandas as pd
import numpy as np
import mlflow
from catboost import CatBoostClassifier
#from mlxtend.feature_selection import SequentialFeatureSelector as SFS
#from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
#from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix
#import matplotlib.pyplot as plt

TABLE_NAME = "users_churn" # таблица с данными в postgres 
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "bvv_43"
RUN_NAME = "model_grid_search"
REGISTRY_MODEL_NAME = 'churn_model_bvv43'

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

# загружаем данные
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
df.head(2) 


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0


In [46]:
stratify_column = 'type'
split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 3)
Размер выборки для теста: (1409, 3)


##### GridSearchCV

In [47]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'learning_rate': [0.01, 0.02, 0.05],
    'depth': [6,7,8]
}

model = CatBoostClassifier(loss_function=loss_function, task_type=task_type, random_seed=random_seed, iterations=iterations, verbose=verbose)

cv = GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)

clf = cv.fit(X_train,y_train)

# выводим лучшие параметры и лучший счёт
best_score = clf.best_score_
best_params = clf.best_params_
print("Лучшая точность:", best_score)
print("Лучшие гиперпараметры:", best_params)

# обучаем модель с лучшими параметрами на всём обучающем наборе
model_best = CatBoostClassifier(loss_function=loss_function, task_type=task_type, random_seed=random_seed, iterations=iterations, verbose=verbose, **best_params)
model_best.fit(X_train, y_train)

# оцениваем модель на тестовом наборе
test_score = model_best.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score)

Лучшая точность: 0.6418175363862265
Лучшие гиперпараметры: {'depth': 6, 'learning_rate': 0.01}
Точность на тестовой выборке: 0.6422995031937544


In [61]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix

# считаем метрики
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)

metrics = {}

confusion_matrix_data = confusion_matrix(y_test, prediction, normalize='all')
_, err1, _, err2 = confusion_matrix_data.ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

cv_results = pd.DataFrame(clf.cv_results_)

# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean() # среднее время обучения
metrics["std_fit_time"] =  cv_results['std_fit_time'].mean()  # стандартное отклонение времени обучения
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()# средний результат на тесте
metrics["std_test_score"] = cv_results['std_test_score'].mean() # стандартное отклонение результата на тесте
metrics["best_score"] = best_score    # лучший результат кросс-валидации
metrics


{'err1': 0.262597586941093,
 'err2': 0.38041163946061035,
 'auc': 0.7184113263183407,
 'precision': 0.5916114790286976,
 'recall': 0.8,
 'f1': 0.6802030456852792,
 'logloss': 12.892832723999323,
 'mean_fit_time': 1.280699094136556,
 'std_fit_time': 0.09376695421006945,
 'mean_test_score': 0.6122549599652901,
 'std_test_score': 0.17475249477379406,
 'best_score': 0.6418175363862265}

In [None]:
# настройки для логирования в MLFlow
# определяем основные credentials, которые нужны для подключения к MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# регистрируем модель и логируем метрики
pip_requirements = "./requirements.txt"
signature =  mlflow.models.infer_signature(X_test, prediction) #mlflow.models.infer_signature(X_test.head(2), model_best.predict(X_test.head(2)))
input_example = X_test.head(2)
#input_example = X_test[:10]



  inputs = _infer_schema(model_input) if model_input is not None else None


In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    model_info = mlflow.sklearn.log_model( 
#    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        sk_model=model_best,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)
run_id

Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 08:50:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 11
Created version '11' of model 'churn_model_bvv43'.


'128ca9c7c4314a23b767e662077c4a4e'

In [62]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        cb_model=model_best,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)
run_id

Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 09:18:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 12
Created version '12' of model 'churn_model_bvv43'.


'880cf3391eb943308976a3168cb5a4d4'

##### RUN_NAME = "model_random_search"

In [None]:
RUN_NAME = "model_random_search"
artifact_path="cv"

param_distributions = {
    'learning_rate': np.logspace(-3, -1, 5),
    'depth': np.arange(4,10)
}

model = CatBoostClassifier(loss_function=loss_function, task_type=task_type, random_seed=random_seed, iterations=iterations, verbose=verbose)

cv = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, cv=2, n_jobs=-1, n_iter=20)

clf = cv.fit(X_train,y_train)

# выводим лучшие параметры и лучший счёт
best_score = clf.best_score_
best_params = clf.best_params_
print("Лучшая точность:", best_score)
print("Лучшие гиперпараметры:", best_params)

# обучаем модель с лучшими параметрами на всём обучающем наборе
model_best = CatBoostClassifier(loss_function=loss_function, task_type=task_type, random_seed=random_seed, iterations=iterations, verbose=verbose, **best_params)
model_best.fit(X_train, y_train)

# оцениваем модель на тестовом наборе
test_score = model_best.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score)


Лучшая точность: 0.7305644302449414
Лучшие гиперпараметры: {'learning_rate': 0.001, 'depth': 4}
Точность на тестовой выборке: 0.6224272533711852


In [90]:
# считаем метрики
prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)

metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

cv_results = pd.DataFrame(clf.cv_results_)

# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean() # среднее время обучения
metrics["std_fit_time"] =  cv_results['std_fit_time'].mean()  # стандартное отклонение времени обучения
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()# средний результат на тесте
metrics["std_test_score"] = cv_results['std_test_score'].mean() # стандартное отклонение результата на тесте
metrics["best_score"] = best_score    # лучший результат кросс-валидации
metrics

{'err1': 0.09794180269694819,
 'err2': 0.19588360539389638,
 'auc': 0.7000918950578636,
 'precision': 0.6666666666666666,
 'recall': 0.41194029850746267,
 'f1': 0.5092250922509225,
 'logloss': 13.60910120866595,
 'mean_fit_time': 1.43021702170372,
 'std_fit_time': 0.09258330464363099,
 'mean_test_score': 0.649795882144125,
 'std_test_score': 0.13728257011004616,
 'best_score': 0.7305644302449414}

In [None]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)
    cv_info = mlflow.sklearn.log_model(cv, artifact_path=artifact_path)
    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        cb_model=model_best,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)
    

Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/08 10:06:13 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 13
Created version '13' of model 'churn_model_bvv43'.


In [92]:
run_id

'59c40809603c46bfb7d0125f490d593b'