In [13]:
import os

import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
import psycopg

TABLE_NAME = 'clean_users_churn'

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "test_connection_experiment_vadim_shakula"
RUN_NAME = "preprocessing" 
REGISTRY_MODEL_NAME = "churn_model_vadimshakula"

In [14]:
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        sql = """
            select customer_id,
                   begin_date,
                   end_date,
                   type,
                   paperless_billing,
                   payment_method,
                   monthly_charges,
                   total_charges,
                   internet_service,
                   online_security,
                   online_backup,
                   device_protection,
                   tech_support,
                   streaming_tv,
                   streaming_movies,
                   gender,
                   senior_citizen,
                   partner,
                   dependents,
                   multiple_lines,
                   target
            from users_churn
        """
        cur.execute(sql)
        data = cur.fetchall()                       
        columns = [desc.name for desc in cur.description] 

df = pd.DataFrame(data, columns=columns)

In [15]:
#командf, которая выделит нечисловые колонки вашего датасета.
obj_df = df.select_dtypes(include="object")
obj_df

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,partner,dependents,multiple_lines
0,7795-CFOCW,One year,No,Bank transfer (automatic),DSL,Yes,No,Yes,Yes,No,No,Male,No,No,
1,9237-HQITU,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,No,Female,No,No,No
2,9305-CDSKC,Month-to-month,Yes,Electronic check,Fiber optic,No,No,Yes,No,Yes,Yes,Female,No,No,Yes
3,1452-KIOVK,Month-to-month,Yes,Credit card (automatic),Fiber optic,No,Yes,No,No,Yes,No,Male,No,Yes,Yes
4,6713-OKOMC,Month-to-month,No,Mailed check,DSL,Yes,No,No,No,No,No,Female,No,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0550-DCXLH,Month-to-month,No,Mailed check,DSL,No,Yes,No,Yes,Yes,Yes,Male,No,No,No
7039,9281-CEDRU,Two year,No,Bank transfer (automatic),DSL,No,Yes,No,Yes,Yes,No,Female,Yes,No,No
7040,2235-DWLJU,Month-to-month,Yes,Electronic check,DSL,No,No,No,No,Yes,Yes,Female,No,No,
7041,0871-OPBXW,Month-to-month,Yes,Mailed check,,,,,,,,Female,No,No,No


In [16]:
#колонrb категориальные, но не числовые
cat_columns = ["type", "payment_method", "internet_service", "gender"]

In [17]:
from sklearn.preprocessing import OneHotEncoder
encoder_oh = OneHotEncoder(
    categories='auto',          # автоопределение категорий
    handle_unknown='ignore',    # игнорируем неизвестные категории
    max_categories=10,          # максимум 10 уникальных категорий
    sparse_output=False,        # выводим обычный массив, а не разреженную матрицу
    drop='first'                # удаляем первую категорию, чтобы избежать мультиколлинеарности
)
encoded = encoder_oh.fit_transform(df[cat_columns])
encoded_df = pd.DataFrame(encoded, columns=encoder_oh.get_feature_names_out(cat_columns))

In [18]:
encoded_df

Unnamed: 0,type_One year,type_Two year,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,internet_service_Fiber optic,internet_service_None,gender_Male
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
7038,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7039,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7040,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7041,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [21]:
# применение OneHotEncoder к данным. Преобразование категориальных данных в массив
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# преобразование полученных признаков в DataFrame и установка названий колонок
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_oh.get_feature_names_out(cat_columns),
    index=obj_df.index
)

# конкатенация исходного DataFrame с новым DataFrame, содержащим закодированные категориальные признаки
obj_df = pd.concat([obj_df, encoded_df], axis=1)

obj_df.head(2)

Unnamed: 0,customer_id,type,paperless_billing,payment_method,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,...,internet_service_None,gender_Male,type_One year,type_Two year,payment_method_Credit card (automatic),payment_method_Electronic check,payment_method_Mailed check,internet_service_Fiber optic,internet_service_None.1,gender_Male.1
0,7795-CFOCW,One year,No,Bank transfer (automatic),DSL,Yes,No,Yes,Yes,No,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,9237-HQITU,Month-to-month,Yes,Electronic check,Fiber optic,No,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [22]:
num_columns = ["monthly_charges", "total_charges"]
num_df = df[num_columns]

n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

In [30]:
from sklearn.preprocessing import SplineTransformer, QuantileTransformer, RobustScaler, PolynomialFeatures, KBinsDiscretizer

# берем только нужные числовые колонки и заполняем пропуски (можно 'mean'/'median' по выбору)
X = df[num_columns].fillna(df[num_columns].median())

# без bias-столбца, чтобы число новых фич не «пухло»
encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline, include_bias=False)

encoded_features = encoder_spl.fit_transform(X)

encoded_df = pd.DataFrame(
    encoded_features, 
    columns=encoder_spl.get_feature_names_out(num_columns),
    index=X.index                      # важно сохранить индекс
)

num_df = pd.concat([num_df, encoded_df], axis=1)

In [31]:
# QuantileTransformer
encoder_q = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='uniform', random_state=42)
encoded_features = encoder_q.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(encoded_features, index=num_df.index, columns=num_columns)
encoded_df.columns = [col + f"_q_{n_quantiles}" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [32]:
# RobustScaler
encoder_rb = RobustScaler()
encoded_features = encoder_rb.fit_transform(num_df[num_columns])

encoded_df = pd.DataFrame(encoded_features, index=num_df.index, columns=num_columns)
encoded_df.columns = [col + f"_robust" for col in num_columns]
num_df = pd.concat([num_df, encoded_df], axis=1)

In [37]:

# обработка NaN (например, медианой)
X = num_df[num_columns].fillna(num_df[num_columns].median())

# создаём трансформер
encoder_pol = PolynomialFeatures(degree=degree, include_bias=True, interaction_only=False)

# трансформация
encoded_features = encoder_pol.fit_transform(X)

# собираем DataFrame
poly_full = pd.DataFrame(
    encoded_features,
    columns=encoder_pol.get_feature_names_out(num_columns),
    index=num_df.index
)

# выбираем только чистые степени (например, x^2, y^2 и т.д.)
keep_cols = [f"{col}^{degree}" for col in num_columns if f"{col}^{degree}" in poly_full.columns]
encoded_df = poly_full[keep_cols].copy()
encoded_df.columns = [col + f"_poly" for col in num_columns]

In [43]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
# KBinsDiscretizer + заполнение пропусков
encoder_kbd = make_pipeline(
    SimpleImputer(strategy='median'),                 # можно 'mean'
    KBinsDiscretizer(
        n_bins=n_bins,
        encode=encode,                                # 'ordinal' / 'onehot' / 'onehot-dense'
        strategy=strategy,
        subsample=subsample,
        random_state=42
    )
)

encoded_features = encoder_kbd.fit_transform(num_df[num_columns])

# если вернулась sparse-матрица (onehot/onehot-dense) — приводим к ndarray
if hasattr(encoded_features, "toarray"):
    encoded_features = encoded_features.toarray()

encoded_df = pd.DataFrame(encoded_features, index=num_df.index, columns=num_columns)
encoded_df.columns = [f"{c}_bin" for c in num_columns]

num_df = pd.concat([num_df, encoded_df], axis=1)

num_df.head(2)

Unnamed: 0,monthly_charges,total_charges,monthly_charges_sp_0,monthly_charges_sp_1,monthly_charges_sp_2,monthly_charges_sp_3,monthly_charges_sp_4,total_charges_sp_0,total_charges_sp_1,total_charges_sp_2,...,monthly_charges_q_100,total_charges_q_100,monthly_charges_robust,total_charges_robust,monthly_charges_bin,total_charges_bin,monthly_charges_bin.1,total_charges_bin.1,monthly_charges_bin.2,total_charges_bin.2
0,42.3,1840.75,0.003079,0.207835,0.598672,0.188228,0.002186,0.0047,0.235853,0.595016,...,0.268242,0.574104,-0.516099,0.130633,1.0,1.0,1.0,1.0,1.0,1.0
1,70.7,151.65,0.0,0.034835,0.436005,0.479704,0.049456,0.036787,0.442783,0.473414,...,0.507112,0.141397,0.00644,-0.367144,2.0,0.0,2.0,0.0,2.0,0.0


In [44]:

numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', encoder_spl, num_columns),
        ('q', encoder_q, num_columns),
        ('rb', encoder_rb, num_columns),
        ('pol', encoder_pol, num_columns),
        ('kbd', encoder_kbd, num_columns)
    ]
)

In [45]:
categorical_transformer = Pipeline(
    steps=[('encoder', encoder_oh)
    ]
)

In [49]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_columns), ('cat', categorical_transformer, cat_columns)], n_jobs=-1)

In [53]:
# заполним пропуски перед трансформацией (числа — медианой, категории — модой)
X = df.copy()
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
if len(cat_cols):
    X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

encoded_features = preprocessor.fit_transform(X)
transformed_df = pd.DataFrame(
    encoded_features,
    columns=preprocessor.get_feature_names_out(),
    index=df.index
)
df = pd.concat([df, transformed_df], axis=1)

In [54]:
preprocessor

In [56]:
import os
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.sklearn.log_model(preprocessor, "column_transformer")



In [57]:

run_id

'5e2d9ccd929945f3bedf7a229c775ec6'

In [58]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# имя зарегистрированной модели (то, под которым ты её регистрировал)
model_registered_name = os.getenv('REGISTRY_MODEL_NAME')  # или строкой: "churn_model_vadimshakula"

# берем самую свежую версию этой модели
versions = client.search_model_versions(f"name = '{model_registered_name}'")
latest = sorted(versions, key=lambda m: int(m.version))[-1]

model_version_id = latest.version      # номер версии модели в реестре
run_id = latest.run_id                 # run_id, в рамках которого зарегистрирована эта версия

print("model_version_id:", model_version_id)
print("model_registered_name:", model_registered_name)
print("run_id:", run_id)

model_version_id: 1
model_registered_name: churn_model_vadimshakula
run_id: 90e7f8d96c234294a6cd521d6d356331
