In [14]:
import os

import pandas as pd
import psycopg
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)



TABLE_NAME = "users_churn" # таблица с данными в postgres 

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "bvv_43" # напишите название вашего эксперимента
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = 'churn_model_bvv43'


#ASSETS_DIR = "assets"

#os.mkdir(ASSETS_DIR) # или 
#os.makedirs(ASSETS_DIR, exist_ok=True)

pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

# загружаем данные
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
obj_df = df.select_dtypes(include="object") # категориальные признаки
df.head(2) 

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0


In [None]:
from sklearn.model_selection import train_test_split
from autofeat import AutoFeatRegressor, AutoFeatClassifier

cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features

target = ['target'] # колонка с таргетом вашей модели

split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)
X_train.head(5)

Unnamed: 0,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,monthly_charges,total_charges
3430,Yes,Bank transfer (automatic),Fiber optic,No,Yes,Yes,No,Yes,Yes,Male,1,Yes,No,Yes,104.15,7689.95
4468,No,Credit card (automatic),DSL,Yes,Yes,Yes,Yes,Yes,Yes,Female,1,Yes,Yes,Yes,92.45,6440.25
4565,Yes,Bank transfer (automatic),Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Male,0,Yes,No,Yes,117.8,8684.8
70,Yes,Credit card (automatic),Fiber optic,No,Yes,Yes,Yes,Yes,Yes,Male,0,Yes,No,Yes,108.6,7690.9
1061,Yes,Electronic check,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Male,0,No,No,Yes,108.05,7532.15


In [None]:
transformations = ("log", "sqrt", "1/", "abs")

afc = AutoFeatClassifier(categorical_cols=cat_features, transformations=transformations, feateng_steps=1, n_jobs=-1)

X_train_features = afc.fit_transform(X_train, y_train)
X_test_features = afc.transform(X_test)
X_train_features.head(5)

  y = column_or_1d(y, warn=True)


Unnamed: 0,monthly_charges,total_charges,cat_paperless_billing_No,cat_paperless_billing_Yes,cat_payment_method_Bank transfer (automatic),cat_payment_method_Credit card (automatic),cat_payment_method_Electronic check,cat_payment_method_Mailed check,cat_internet_service_DSL,cat_internet_service_Fiber optic,cat_internet_service_None,cat_online_security_No,cat_online_security_Yes,cat_online_security_None,cat_online_backup_No,cat_online_backup_Yes,cat_online_backup_None,cat_device_protection_No,cat_device_protection_Yes,cat_device_protection_None,cat_tech_support_No,cat_tech_support_Yes,cat_tech_support_None,cat_streaming_tv_No,cat_streaming_tv_Yes,cat_streaming_tv_None,cat_streaming_movies_No,cat_streaming_movies_Yes,cat_streaming_movies_None,cat_gender_Female,cat_gender_Male,cat_senior_citizen_0,cat_senior_citizen_1,cat_partner_No,cat_partner_Yes,cat_dependents_No,cat_dependents_Yes,cat_multiple_lines_No,cat_multiple_lines_Yes,cat_multiple_lines_None,1/total_charges
0,104.15,7689.95,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.00013
1,92.45,6440.25,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.000155
2,117.8,8684.8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.000115
3,108.6,7690.9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.00013
4,108.05,7532.15,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.000133


In [None]:
# определяем основные credentials, которые нужны для подключения к MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# логируем AutoFeatClassifier()
artifact_path = "afc"
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name='AutoFeatClassifier', experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path) 

2026-01-04 14:26:22,027 INFO: Found credentials in environment variables.


In [58]:
from catboost import CatBoostClassifier
model = CatBoostClassifier()

# обучение модели
model = model.fit(X_train_features, y_train)

Learning rate set to 0.021554
0:	learn: 0.6765001	total: 59.7ms	remaining: 59.6s
1:	learn: 0.6627651	total: 62ms	remaining: 30.9s
2:	learn: 0.6482932	total: 65.8ms	remaining: 21.9s
3:	learn: 0.6347319	total: 69.1ms	remaining: 17.2s
4:	learn: 0.6217381	total: 72.6ms	remaining: 14.4s
5:	learn: 0.6086249	total: 75.9ms	remaining: 12.6s
6:	learn: 0.5971052	total: 79.1ms	remaining: 11.2s
7:	learn: 0.5852194	total: 82.3ms	remaining: 10.2s
8:	learn: 0.5753097	total: 85.6ms	remaining: 9.43s
9:	learn: 0.5654778	total: 89.1ms	remaining: 8.82s
10:	learn: 0.5558748	total: 92.4ms	remaining: 8.3s
11:	learn: 0.5474446	total: 95.6ms	remaining: 7.87s
12:	learn: 0.5386424	total: 100ms	remaining: 7.61s
13:	learn: 0.5308129	total: 104ms	remaining: 7.31s
14:	learn: 0.5238162	total: 107ms	remaining: 7.05s
15:	learn: 0.5171033	total: 111ms	remaining: 6.81s
16:	learn: 0.5104734	total: 114ms	remaining: 6.59s
17:	learn: 0.5040296	total: 117ms	remaining: 6.39s
18:	learn: 0.4983331	total: 120ms	remaining: 6.21s
19

In [59]:
model

<catboost.core.CatBoostClassifier at 0x7f5455c94eb0>

In [60]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix



prediction = model.predict(X_test_features)
probas = model.predict_proba(X_test_features)

metrics = {}

confusion_matrix_data = confusion_matrix(y_test, prediction, normalize='all')
_, err1, _, err2 = confusion_matrix_data.ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss
metrics

{'err1': 0.3385379701916253,
 'err2': 0.4237047551454933,
 'auc': 0.7465584795912183,
 'precision': 0.5558659217877095,
 'recall': 0.891044776119403,
 'f1': 0.6846330275229358,
 'logloss': 14.069559520237355}

In [63]:
pip_requirements = "./requirements.txt"

signature =  mlflow.models.infer_signature(X_test_features.head(2), model.predict(X_test_features.head(2)))
input_example = X_test_features.head(2)


# создаём эксперимент если его нет и записываем в него тестовую информацию
if not mlflow.get_experiment_by_name(EXPERIMENT_NAME):
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) 
else:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    model_info = mlflow.sklearn.log_model( 
#    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        sk_model=model,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)


Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/04 15:24:32 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 7
Created version '7' of model 'churn_model_bvv43'.


In [64]:
run_id

'2a358126b113488abfbbf16d78663b70'