In [1]:
import os

import pandas as pd
import psycopg
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)



TABLE_NAME = "users_churn" # таблица с данными в postgres 

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "bvv_43" # напишите название вашего эксперимента
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = 'churn_model_bvv43'


#ASSETS_DIR = "assets"

#os.mkdir(ASSETS_DIR) # или 
#os.makedirs(ASSETS_DIR, exist_ok=True)

pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

# загружаем данные
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)
obj_df = df.select_dtypes(include="object") # категориальные признаки
df.head(2) 

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0


In [2]:
# определение категориальных колонок, которые будут преобразованы
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# создание объекта OneHotEncoder для преобразования категориальных переменных
# auto - автоматическое определение категорий
# ignore - игнорировать ошибки, если встречается неизвестная категория
# max_categories - максимальное количество уникальных категорий
# sparse_output - вывод в виде разреженной матрицы, если False, то в виде обычного массива
# drop="first" - удаляет первую категорию, чтобы избежать ловушки мультиколлинеарности
encoder_oh = OneHotEncoder(categories = 'auto', 
                           handle_unknown = 'ignore', 
                           max_categories = 10,
                           sparse_output = False,
                           drop='first')

# применение OneHotEncoder к данным. Преобразование категориальных данных в массив
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# преобразование полученных признаков в DataFrame и установка названий колонок
# get_feature_names_out() - получение имён признаков после преобразования
encoded_df = pd.DataFrame(encoded_features, columns=encoder_oh.get_feature_names_out(cat_columns))

# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
# axis=1 означает конкатенацию по колонкам
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines,type_One year,type_Two year,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,internet_service_Fiber optic,internet_service_None,gender_Male
0,5575-GNVDE,One year,No,Mailed check,DSL,Yes,No,Yes,No,No,No,Male,No,No,No,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,3957-SQXML,Two year,No,Credit card (automatic),,,,,,,,Female,Yes,Yes,Yes,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [3]:
num_columns = ["monthly_charges", "total_charges"]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

# обработка пропущенных знаяений
mean_monthly_charges = df[num_columns].mean()# median / mode
df = df.fillna({'monthly_charges':mean_monthly_charges['monthly_charges']})
df = df.fillna({'total_charges':mean_monthly_charges['total_charges']})

num_df = df[num_columns]
# SplineTransformer
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoded_features = encoder_spl.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_spl.get_feature_names_out(num_columns)
)
num_df = pd.concat([num_df, encoded_df], axis=1)


# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoded_features = encoder_q.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_q.get_feature_names_out(num_columns)
)
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_rb.get_feature_names_out(num_columns)
)
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)


# PolynomialFeatures
encoder_pol = PolynomialFeatures(degree=degree)
encoded_features = encoder_pol.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_pol.get_feature_names_out(num_columns)
)
# get all columns after the intercept and original features
encoded_df = encoded_df.iloc[:, 1 + len(num_columns):]
encoded_df.columns = [col + f"_poly" for col in encoded_df.columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

# KBinsDiscretizer
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)
encoded_features = encoder_kbd.fit_transform(df[num_columns].to_numpy())
encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_kbd.get_feature_names_out(num_columns)
)
encoded_df.columns = [col + f"_bin" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

num_df.head(2)


Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,monthly_charges_sp_5,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,total_charges_sp_3,total_charges_sp_4,total_charges_sp_5,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges^2_poly,monthly_charges total_charges_poly,total_charges^2_poly,monthly_charges^3_poly,monthly_charges^2 total_charges_poly,monthly_charges total_charges^2_poly,total_charges^3_poly,monthly_charges_bin,total_charges_bin
0,56.95,1889.5,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001448,0.0,0.394114,0.580098,-0.24655,0.144473,3243.3025,107607.025,3570210.25,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0
1,24.95,894.3,0.023507,0.388355,0.519449,0.068676,1.3e-05,0.0,0.016892,0.350947,0.545446,0.086646,6.9e-05,0.0,0.19697,0.387206,-0.835327,-0.149584,622.5025,22312.785,799772.49,15531.437375,556704.0,19954320.0,715236500.0,0.0,0.0


In [5]:
numeric_transformer = ColumnTransformer(
    transformers=[
        ('Spl', encoder_spl, num_columns),
        ('q', encoder_q, num_columns),
        ('rb', encoder_rb, num_columns),
        ('pol', encoder_pol, num_columns),
        ('kbd', encoder_kbd, num_columns)
    ])


categorical_transformer = Pipeline(steps=[('encoder', encoder_oh)])

preprocessor = ColumnTransformer(
	transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns)
    ], n_jobs=-1
)

encoded_features = preprocessor.fit_transform(df)

transformed_df = pd.DataFrame(
    encoded_features, 
    columns=preprocessor.get_feature_names_out()
            )

df = pd.concat([df, transformed_df], axis=1)
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target,num__Spl__monthly_charges_sp_0,num__Spl__monthly_charges_sp_1,num__Spl__monthly_charges_sp_2,num__Spl__monthly_charges_sp_3,num__Spl__monthly_charges_sp_4,num__Spl__monthly_charges_sp_5,num__Spl__total_charges_sp_0,num__Spl__total_charges_sp_1,num__Spl__total_charges_sp_2,num__Spl__total_charges_sp_3,num__Spl__total_charges_sp_4,num__Spl__total_charges_sp_5,num__q__monthly_charges,num__q__total_charges,num__rb__monthly_charges,num__rb__total_charges,num__pol__1,num__pol__monthly_charges,num__pol__total_charges,num__pol__monthly_charges^2,num__pol__monthly_charges total_charges,num__pol__total_charges^2,num__pol__monthly_charges^3,num__pol__monthly_charges^2 total_charges,num__pol__monthly_charges total_charges^2,num__pol__total_charges^3,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male,num__Spl__monthly_charges_sp_0.1,num__Spl__monthly_charges_sp_1.1,num__Spl__monthly_charges_sp_2.1,num__Spl__monthly_charges_sp_3.1,num__Spl__monthly_charges_sp_4.1,num__Spl__monthly_charges_sp_5.1,num__Spl__total_charges_sp_0.1,num__Spl__total_charges_sp_1.1,num__Spl__total_charges_sp_2.1,num__Spl__total_charges_sp_3.1,num__Spl__total_charges_sp_4.1,num__Spl__total_charges_sp_5.1,num__q__monthly_charges.1,num__q__total_charges.1,num__rb__monthly_charges.1,num__rb__total_charges.1,num__pol__1.1,num__pol__monthly_charges.1,num__pol__total_charges.1,num__pol__monthly_charges^2.1,num__pol__monthly_charges total_charges.1,num__pol__total_charges^2.1,num__pol__monthly_charges^3.1,num__pol__monthly_charges^2 total_charges.1,num__pol__monthly_charges total_charges^2.1,num__pol__total_charges^3.1,num__kbd__monthly_charges.1,num__kbd__total_charges.1,cat__type_One year.1,cat__type_Two year.1,cat__payment_method_Credit card (automatic).1,cat__payment_method_Electronic check.1,cat__payment_method_Mailed check.1,cat__internet_service_Fiber optic.1,cat__internet_service_None.1,cat__gender_Male.1
0,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001448,0.0,0.394114,0.580098,-0.24655,0.144473,1.0,56.95,1889.5,3243.3025,107607.025,3570210.25,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.000116,0.094742,0.554677,0.335807,0.014658,0.0,0.004345,0.230314,0.596051,0.167842,0.001448,0.0,0.394114,0.580098,-0.24655,0.144473,1.0,56.95,1889.5,3243.3025,107607.025,3570210.25,184706.077375,6128220.0,203323500.0,6745912000.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,59,3957-SQXML,2017-04-01,NaT,Two year,No,Credit card (automatic),24.95,894.3,,,,,,,,Female,0,Yes,Yes,Yes,0,0.023507,0.388355,0.519449,0.068676,1.3e-05,0.0,0.016892,0.350947,0.545446,0.086646,6.9e-05,0.0,0.19697,0.387206,-0.835327,-0.149584,1.0,24.95,894.3,622.5025,22312.785,799772.49,15531.437375,556704.0,19954320.0,715236500.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.023507,0.388355,0.519449,0.068676,1.3e-05,0.0,0.016892,0.350947,0.545446,0.086646,6.9e-05,0.0,0.19697,0.387206,-0.835327,-0.149584,1.0,24.95,894.3,622.5025,22312.785,799772.49,15531.437375,556704.0,19954320.0,715236500.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [20]:
preprocessor

In [7]:
# определяем основные credentials, которые нужны для подключения к MLflow
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer")



In [15]:
import joblib
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

data = pd.read_csv('data/initial_data.csv')  
X_train, X_test, y_train, y_test = train_test_split(data, data['target'], random_state=0)

# создание Pipeline с преобразованиями и моделью
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', CatBoostClassifier())])

# обучение модели
model1 = pipe.fit(X_train, y_train)

Learning rate set to 0.020969
0:	learn: 0.6807907	total: 7.34ms	remaining: 7.34s
1:	learn: 0.6685860	total: 14.5ms	remaining: 7.26s
2:	learn: 0.6550352	total: 23ms	remaining: 7.63s
3:	learn: 0.6431808	total: 29.5ms	remaining: 7.35s
4:	learn: 0.6321021	total: 40.6ms	remaining: 8.07s
5:	learn: 0.6233149	total: 49ms	remaining: 8.12s
6:	learn: 0.6132524	total: 57ms	remaining: 8.09s
7:	learn: 0.6033374	total: 63.7ms	remaining: 7.9s
8:	learn: 0.5938908	total: 70.4ms	remaining: 7.75s
9:	learn: 0.5849477	total: 78ms	remaining: 7.72s
10:	learn: 0.5768736	total: 85.2ms	remaining: 7.66s
11:	learn: 0.5703542	total: 91.8ms	remaining: 7.56s
12:	learn: 0.5620788	total: 98.3ms	remaining: 7.47s
13:	learn: 0.5544854	total: 105ms	remaining: 7.39s
14:	learn: 0.5470661	total: 112ms	remaining: 7.34s
15:	learn: 0.5409084	total: 119ms	remaining: 7.29s
16:	learn: 0.5349563	total: 125ms	remaining: 7.24s
17:	learn: 0.5299363	total: 132ms	remaining: 7.2s
18:	learn: 0.5243740	total: 141ms	remaining: 7.25s
19:	lear

In [21]:
model1

In [19]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score,log_loss, confusion_matrix


y_test= data.target
prediction = model1.predict(data)
probas = model1.predict_proba(data)

metrics = {}

confusion_matrix_data = confusion_matrix(y_test, prediction, normalize='all')
_, err1, _, err2 = confusion_matrix_data.ravel()
auc = roc_auc_score(y_test, probas[:,1])
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss
metrics

{'err1': 0.057787874485304556,
 'err2': 0.15930711344597473,
 'auc': 0.8963987426948299,
 'precision': 0.7338129496402878,
 'recall': 0.6003210272873194,
 'f1': 0.6603884638022366,
 'logloss': 5.905775381377424}

In [None]:
pip_requirements = "./requirements.txt"

signature =  mlflow.models.infer_signature(data.head(2), model1.predict(data.head(2)))
input_example = data.head(2)


# создаём эксперимент если его нет и записываем в него тестовую информацию
if not mlflow.get_experiment_by_name(EXPERIMENT_NAME):
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) 
else:
    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)
    model_info = mlflow.sklearn.log_model( 
#    model_info = mlflow.catboost.log_model( 
        artifact_path='models',
        sk_model=model1,
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example
        #await_registration_for=60
		)



  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'churn_model_bvv43' already exists. Creating a new version of this model...
2026/01/02 17:32:27 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_bvv43, version 6
Created version '6' of model 'churn_model_bvv43'.


In [24]:
run_id

'50aa86dea1854799b3ab46e9da37eb99'

In [29]:
# устанавливаем host, который будет отслеживать наши эксперименты
client = mlflow.MlflowClient()
run = client.get_run(run_id)
artifact_uri = run.info.artifact_uri

print(f"Путь артефакта в S3: '{artifact_uri}'")

Путь артефакта в S3: 's3://s3-student-mle-20251108-d3b615298f-freetrack/8/50aa86dea1854799b3ab46e9da37eb99/artifacts'


In [30]:

models = client.search_model_versions(
    filter_string=f"name = '{REGISTRY_MODEL_NAME}'"
)

In [31]:
models

[<ModelVersion: aliases=[], creation_timestamp=1767375147659, current_stage='None', description='', last_updated_timestamp=1767375147659, name='churn_model_bvv43', run_id='50aa86dea1854799b3ab46e9da37eb99', run_link='', source='s3://s3-student-mle-20251108-d3b615298f-freetrack/8/50aa86dea1854799b3ab46e9da37eb99/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='6'>,
 <ModelVersion: aliases=[], creation_timestamp=1766928383777, current_stage='None', description='', last_updated_timestamp=1766928383777, name='churn_model_bvv43', run_id='56f312fdbf7844cba41574824f3e4566', run_link='', source='s3://s3-student-mle-20251108-d3b615298f-freetrack/8/56f312fdbf7844cba41574824f3e4566/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='5'>,
 <ModelVersion: aliases=[], creation_timestamp=1766864171602, current_stage='None', description='', last_updated_timestamp=1766864171602, name='churn_model_bvv43', run_id='62a3bdb7f0814789ba62eb05