In [1]:
import os
import psycopg2 as psycopg
import pandas as pd

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from autofeat import AutoFeatRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import time

In [13]:
EXPERIMENT_NAME = "Baseline model registration"
RUN_NAME = "Nick_projecr _run_1"

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
assert all([var_value != "" for var_value in list(postgres_credentials.values())])

connection.update(postgres_credentials)

TABLE_NAME = "flat_cleaned_churn"

with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]
    
df = pd.DataFrame(data, columns=columns)

In [14]:
df.head(2)

Unnamed: 0,id,build_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,target
0,1,16093,1994-01-01,4,55.834713,37.448383,2.64,204,17,True,16,10.1,44.799999,3,False,False,73.800003,13390000.0
1,2,3104,1960-01-01,1,55.701302,37.738918,2.8,20,5,False,1,6.0,16.5,1,False,False,32.0,5500000.0


In [15]:
columns_list = df.columns.tolist()
columns_text = ",".join(columns_list) 
with open("columns.txt", "w", encoding="utf-8") as fio:
    fio.write(columns_text)
    
df.to_csv("flat_cleaned_churn", index=False)    

In [16]:
# Исключение столбцов с меньше чем 2 уникальными значениями
unique_counts = df.nunique()
columns_to_drop = unique_counts[unique_counts < 2].index
df = df.drop(columns=columns_to_drop)
X = df.drop('target', axis=1).copy()
# Разделение данных на категориальные и числовые
cat_features = X.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2
binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = X.select_dtypes(['float','int'])
date_features = X.select_dtypes(include='datetime64[ns]')

preprocessor = ColumnTransformer(
    [
        ('binary', OneHotEncoder(drop='if_binary'), binary_cat_features.columns.tolist()),
        ('cat', OneHotEncoder(handle_unknown='ignore'), other_cat_features.columns.tolist()),
        ('num', StandardScaler(), num_features.columns.tolist()),
        ('date', StandardScaler(), date_features.columns.tolist())  # Обработка признаков даты    
    ],
    remainder='drop', verbose_feature_names_out=False
    )


model = LinearRegression(fit_intercept=True)  
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
    )

y = df['target'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

prediction = pipeline.predict(X_test)
print(prediction)

[ 9188352. 11099136. 11835392. ...  9725952.  9669632. 13305856.]


In [17]:
metrics = {}
mean = y_test.mean() #среднее значение целевой переменной на тесте
MAE = mean_absolute_error(y_test, prediction)
MSE = mean_squared_error(y_test, prediction)
R2 = r2_score(y_test, prediction)
y_error = y_test - prediction
y_error = y_test - prediction  # рассчитаем вектор ошибок
y_error_abs = abs(y_error)  # рассчитаем вектор модуля ошибок
perc_error_abs = y_error_abs / y_test  # рассчитаем вектор относительных ошибок
mape = perc_error_abs.mean()  # рассчитаем MAPE
metrics['Среднее значение целевой переменной'] = mean.round(2)
metrics['Средний модуль ошибки '] = MAE.round(2)
metrics['Средняя квадратичная ошибка'] = MSE.round(2)
metrics['Коэффициент детерминации'] = R2.round(2) 
metrics['Средняя абсолютная ошибка в процентах '] = mape.round(2)

In [18]:
# Установка переменных окружения для работы с хранилищем
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] =  os.getenv("AWS_SECRET_ACCESS_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Установка URI для tracking и registry
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [19]:
REGISTRY_MODEL_NAME = "baseline_model_nikolaimelnikov"

pip_requirements = '../requirements.txt' # ваш код здесь
signature = mlflow.models.infer_signature(X_test, prediction)# ваш код здесь
input_example = input_example = X_test[:10]# ваш код здесь
metadata = {'model_type': 'monthly'}# ваш код здесь

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

model_path = "mlflow_baseline_model"

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    mlflow.sklearn.log_model(pipeline, model_path, pip_requirements=pip_requirements,
                             input_example=input_example, metadata=metadata, signature=signature,
                             registered_model_name=REGISTRY_MODEL_NAME, await_registration_for=6)

    # Дополнительные логгирования метрик и артефактов
    mlflow.log_metrics(metrics)
    mlflow.log_artifact("columns.txt", artifact_path="dataframe")
    mlflow.log_artifact("flat_cleaned_churn", artifact_path="dataframe")

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'baseline_model_nikolaimelnikov'.
2024/05/06 12:34:49 INFO mlflow.tracking._model_registry.client: Waiting up to 6 seconds for model version to finish creation. Model name: baseline_model_nikolaimelnikov, version 1
Created version '1' of model 'baseline_model_nikolaimelnikov'.
