In [19]:
import os
import numpy as np
import psycopg
import pandas as pd
import mlflow
from catboost import CatBoostClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
psycopg
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix, log_loss

In [20]:
# Settings that must already be present in the process environment (for
# example exported by the shell or loaded from a .env file before the kernel
# starts): destination-database credentials and the keys of the S3 bucket
# that MLflow is connected to.
REQUIRED_ENV_VARS = (
    "DB_DESTINATION_HOST",
    "DB_DESTINATION_PORT",
    "DB_DESTINATION_NAME",
    "DB_DESTINATION_USER",
    "DB_DESTINATION_PASSWORD",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
)

# Re-assigning os.environ[name] = os.getenv(name) changes nothing when the
# variable is set and fails with an opaque "str expected, not NoneType" when it
# is not, so check explicitly and report every missing name at once.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Endpoint of the Yandex Cloud object storage bucket.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

In [21]:
# --- Data source ------------------------------------------------------------
# Name of the database table read by the data-loading cell.
TABLE_NAME = "clean_users_churn"

# --- MLflow tracking server -------------------------------------------------
# Host and port used to build the tracking and registry URIs.
TRACKING_SERVER_HOST, TRACKING_SERVER_PORT = "127.0.0.1", 5000

# --- MLflow naming ----------------------------------------------------------
# Experiment the run is logged to, the run's display name, and the name under
# which the model is registered.
EXPERIMENT_NAME = "test_connection_experiment_vadim_shakula"
RUN_NAME = "model_grid_search"
REGISTRY_MODEL_NAME = "churn_model_vadimshakula"

In [22]:
# Credentials of the destination Postgres database, read from the environment.
# Each connection keyword is paired with the environment variable it comes from.
postgres_credentials = {
    keyword: os.environ[env_name]
    for keyword, env_name in (
        ("host", "DB_DESTINATION_HOST"),
        ("port", "DB_DESTINATION_PORT"),
        ("dbname", "DB_DESTINATION_NAME"),
        ("user", "DB_DESTINATION_USER"),
        ("password", "DB_DESTINATION_PASSWORD"),
    )
}

# Fail early if any credential is an empty string.
assert all(value != "" for value in postgres_credentials.values())

# Full keyword set for the connection: fixed session options first, then the
# credentials.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [23]:
# Open the database connection through a context manager so that it is closed
# once the block finishes, even if an error is raised inside it.
with psycopg.connect(**connection) as conn:
    # The cursor used to run the query is managed the same way.
    with conn.cursor() as cur:
        # Select every row and column of the table named by TABLE_NAME.
        cur.execute(f"SELECT * FROM {TABLE_NAME}")

        # The first item of each entry in the cursor description is the
        # column name.
        columns = [column_info[0] for column_info in cur.description]

        # Pull all rows produced by the query.
        data = cur.fetchall()

# Combine the rows and the column names into a DataFrame for further work
# with pandas.
df = pd.DataFrame(data=data, columns=columns)

In [24]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7795-CFOCW,2016-05-01,2019-11-01,One year,No,Bank transfer (automatic),42.30,1840.75,DSL,...,Yes,Yes,No,No,Male,0,No,No,No,0
1,2,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.70,151.65,Fiber optic,...,No,No,No,No,Female,0,No,No,No,1
2,3,9305-CDSKC,2019-03-01,2019-11-01,Month-to-month,Yes,Electronic check,99.65,820.50,Fiber optic,...,Yes,No,Yes,Yes,Female,0,No,No,Yes,1
3,4,1452-KIOVK,2018-04-01,2019-11-01,Month-to-month,Yes,Credit card (automatic),89.10,1949.40,Fiber optic,...,No,No,Yes,No,Male,0,No,Yes,Yes,0
4,5,6713-OKOMC,2019-04-01,2019-11-01,Month-to-month,No,Mailed check,29.75,301.90,DSL,...,No,No,No,No,Female,0,No,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,7015,0550-DCXLH,2019-01-01,2019-11-01,Month-to-month,No,Mailed check,73.35,931.55,DSL,...,No,Yes,Yes,Yes,Male,0,No,No,No,0
7015,7016,9281-CEDRU,2014-06-01,2019-11-01,Two year,No,Bank transfer (automatic),64.10,4326.25,DSL,...,No,Yes,Yes,No,Female,0,Yes,No,No,0
7016,7017,2235-DWLJU,2019-08-01,2019-11-01,Month-to-month,Yes,Electronic check,44.40,263.05,DSL,...,No,No,Yes,Yes,Female,1,No,No,No,0
7017,7018,0871-OPBXW,2019-12-01,2019-11-01,Month-to-month,Yes,Mailed check,20.05,39.25,Fiber optic,...,No,No,No,No,Female,0,No,No,No,0


In [25]:
# Model inputs and the label column.
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

# Split settings: rows are ordered by split_column and the last test_size
# share of them becomes the test set.
split_column = "begin_date"
stratify_column = "begin_date"  # NOTE(review): defined but not used by the split below
test_size = 0.2

# Order the rows by the split column; with shuffle=False the split below then
# cuts the frame at a single position in that order.
df = df.sort_values(split_column)

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=test_size, shuffle=False
)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5615, 3)
Размер выборки для теста: (1404, 3)


In [26]:
# Fixed CatBoost settings, reused when the best model is rebuilt later.
loss_function = "Logloss"
task_type = "CPU"
random_seed = 0
iterations = 300  # base value; the grid below overrides it for every candidate
verbose = False

# Candidate values of `iterations`: 5, 10, ..., 50.
params = {"iterations": list(range(5, 51, 5))}

model = CatBoostClassifier(
    iterations=iterations,
    loss_function=loss_function,
    task_type=task_type,
    random_seed=random_seed,
    verbose=verbose,
)

# Exhaustive search over the grid: 2-fold cross-validation scored by ROC AUC,
# run on all available cores, with a final refit on the best parameters.
cv = GridSearchCV(
    model,
    param_grid=params,
    scoring="roc_auc",
    cv=2,
    n_jobs=-1,
    refit=True,
)

clf = cv.fit(X_train, y_train)

In [27]:

# The tracking server and the model registry are served from the same address.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(tracking_uri)

# Keep the full cross-validation table and the winning parameter set.
cv_results = pd.DataFrame(clf.cv_results_)
best_params = clf.best_params_

# Rebuild the classifier with the fixed settings plus the best parameters;
# `iterations` is supplied by best_params, so it is not passed explicitly.
model_best = CatBoostClassifier(
    loss_function=loss_function,
    task_type=task_type,
    verbose=verbose,
    random_seed=random_seed,
    **best_params,
)
model_best.fit(X_train, y_train)

# Second column of predict_proba, plus the hard class predictions, for the
# test set.
probas = model_best.predict_proba(X_test)[:, 1]
prediction = model_best.predict(X_test)

In [28]:
# Quality metrics on the test set.

# For a binary target, confusion_matrix(...).ravel() yields (tn, fp, fn, tp).
# With normalize='all' each value is a share of all test rows, so:
#   err1 - type I error, the false-positive share (second value);
#   err2 - type II error, the false-negative share (third value).
# The fourth value is the true-positive share and must not be stored as err2.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()

# ROC AUC is computed from the positive-class probabilities.
auc = roc_auc_score(y_test, probas)

# Precision, recall and F1 are computed from the hard class predictions.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Log loss is defined on predicted probabilities; passing hard 0/1 labels
# would turn every misclassification into a maximal clipped penalty.
logloss = log_loss(y_test, probas)

# Collect the metrics in one dictionary for logging.
metrics = {
    "err1": err1,
    "err2": err2,
    "auc": auc,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "logloss": logloss,
}

In [29]:
# Additional metrics taken from the cross-validation results.
# Each cv_results column holds one value per grid candidate; the value logged
# here is the average of that column over all candidates.
metrics.update(
    {
        column: cv_results[column].mean()
        for column in (
            "mean_fit_time",    # fit time averaged over folds
            "std_fit_time",     # spread of the fit time across folds
            "mean_test_score",  # validation score averaged over folds
            "std_test_score",   # spread of the validation score across folds
        )
    }
)

# Best cross-validation score found by the search.
metrics["best_score"] = clf.best_score_

In [30]:
# Settings used when the model is logged to MLflow.

# File that lists the packages recorded together with the model.
pip_requirements = "requirements.txt"

# First ten test rows, stored with the model as an input example.
input_example = X_test[:10]

# Input/output schema inferred from the test features and the predictions.
signature = mlflow.models.infer_signature(
    model_input=X_test,
    model_output=prediction,
)

  inputs = _infer_schema(model_input) if model_input is not None else None


In [33]:
# Resolve the experiment id. get_experiment_by_name returns None when no
# experiment with this name exists, so create the experiment in that case
# instead of failing with an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the run id so it can be reported after the run is closed.
    run_id = run.info.run_id

    # Log the refitted CatBoost model and register it in the model registry,
    # waiting up to 60 seconds for the new version to be created.
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        signature=signature,
        input_example=input_example,
        await_registration_for=60,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
    )

    # Test-set and cross-validation metrics, plus the winning parameters.
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

    # Store the fitted grid-search object itself as a separate artifact.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')

Registered model 'churn_model_vadimshakula' already exists. Creating a new version of this model...
2025/09/07 13:11:11 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_vadimshakula, version 3
Created version '3' of model 'churn_model_vadimshakula'.


In [34]:
# Print the identifiers needed to find this run and model in MLflow.
for label, value in (
    ("EXPERIMENT_NAME         :", EXPERIMENT_NAME),
    ("run_id                  :", run_id),
    ("REGISTRY_MODEL_NAME     :", REGISTRY_MODEL_NAME),
):
    print(label, value)


EXPERIMENT_NAME         : test_connection_experiment_vadim_shakula
run_id                  : f54054037d5a4880be96545ce2b2847c
REGISTRY_MODEL_NAME     : churn_model_vadimshakula
