In [3]:
import os
import mlflow
import yaml
import time
import optuna
import psycopg2 as psycopg
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from optuna import distributions
from optuna.integration import OptunaSearchCV
from catboost import CatBoostRegressor
from sklearn.preprocessing import FunctionTransformer

# Name of the source table; it is interpolated into the SELECT query that loads the data.
TABLE_NAME = "flat_cleaned_churn"
# Seed shared by the train/test split, the CatBoost model and the hyper-parameter search.
RANDOM_STATE = 42

In [4]:
# Set up the connections the notebook needs.

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 credentials must already be present in the process environment.
# Copying os.getenv(...) back into os.environ is a no-op when the variable is
# set and raises an opaque TypeError when it is not, so fail early with a
# message that names the missing variables instead.
_missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.getenv(name)
]
if _missing_aws_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_aws_vars)
    )

# Host and port of the MLflow server used for both tracking and the model registry.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Point the tracking URI and the registry URI at that server
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

# Fixed connection options; the credentials read from the environment are merged in below.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
# os.getenv returns None (not "") for an unset variable, so comparing with ""
# would let a missing credential through. Test truthiness instead, which
# rejects both None and the empty string, and report which keys are missing.
_missing_credentials = [key for key, value in postgres_credentials.items() if not value]
assert not _missing_credentials, (
    "Missing database credentials (check the DB_DESTINATION_* environment variables): "
    + ", ".join(_missing_credentials)
)

connection.update(postgres_credentials)

# Read the whole source table into memory together with its column names.
# In psycopg2 the connection's `with` block only commits or rolls back the
# transaction; it does not close the connection. Close it explicitly so the
# session is not left open for the lifetime of the kernel.
conn = psycopg.connect(**connection)
try:
    with conn, conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        # cursor.description holds one entry per result column; item 0 is the column name
        columns = [col[0] for col in cur.description]
finally:
    conn.close()

In [51]:
# Names used for the MLflow experiment, the run and the registered model.

EXPERIMENT_NAME = "Final improved model_2"
RUN_NAME = "Nick_project_cbr_model_2"
REGISTRY_MODEL_NAME = 'CBR_model_2'

In [5]:
# Build the DataFrame and write the files that are logged to MLflow as artifacts later.

df = pd.DataFrame(data, columns=columns)

# Comma-separated list of the column names, saved as a text file
columns_list = df.columns.tolist()
columns_text = ",".join(columns_list) 
with open("columns.txt", "w", encoding="utf-8") as fio:
    fio.write(columns_text)
    
# CSV dump of the raw table; note that the file name carries no .csv extension
df.to_csv("flat_cleaned_churn", index=False) 


In [6]:
# Preview the first rows of the raw table
df.head()

Unnamed: 0,id,build_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,target
0,1,16093,1994-01-01,4,55.834713,37.448383,2.64,204,17,True,16,10.1,44.799999,3,False,False,73.800003,13390000.0
1,2,3104,1960-01-01,1,55.701302,37.738918,2.8,20,5,False,1,6.0,16.5,1,False,False,32.0,5500000.0
2,3,11876,1977-01-01,0,55.851589,37.416008,2.48,168,12,True,9,10.3,44.0,3,False,False,66.0,9500000.0
3,4,9212,1970-01-01,6,55.876389,37.716415,2.64,98,9,True,5,6.5,28.0,2,False,False,40.0,9950000.0
4,5,22817,2014-01-01,4,55.734455,37.412422,2.64,274,10,True,9,10.08,45.720001,3,False,False,75.099998,18500000.0


In [7]:
# Data preprocessing function.

def transform_data(df):
    """Prepare the raw flats table for modelling.

    Steps, in order:

    1. drop constant columns (fewer than two distinct values);
    2. encode ``has_elevator`` as 0/1 integers;
    3. drop the ``id``, ``build_id`` and ``is_apartment`` columns;
    4. reduce ``build_year`` to the calendar year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Notes
    -----
    Step 1 depends on the rows passed in (a single-row frame has only
    constant columns), so the function is meant for the full data set.
    Columns handled in steps 2-4 that step 1 already removed are skipped
    instead of raising ``KeyError``.
    """
    unique_counts = df.nunique()
    constant_columns = unique_counts[unique_counts < 2].index
    df = df.drop(columns=constant_columns)

    if 'has_elevator' in df.columns:
        # Accept both real booleans and their 'True'/'False' text form.
        # Values outside the mapping are passed through unchanged, so e.g.
        # '0'/'1' strings are still converted by the final astype(int).
        bool_text_to_int = {'false': 0, 'true': 1}
        flags = df['has_elevator'].astype(str).str.lower()
        df['has_elevator'] = flags.map(
            lambda value: bool_text_to_int.get(value, value)
        ).astype(int)

    # errors='ignore': any of these may already be gone after the
    # constant-column step above.
    df = df.drop(columns=['id', 'build_id', 'is_apartment'], errors='ignore')

    if 'build_year' in df.columns:
        # .dt.year is already numeric, so no extra to_numeric pass is needed
        df['build_year'] = pd.to_datetime(df['build_year']).dt.year

    return df

In [8]:
# Apply the preprocessing. `df` is rebound to the transformed frame, so re-running this
# cell on its own is not supported: rebuild `df` from the raw data first.
df = transform_data(df)

  df['has_elevator'] = df['has_elevator'].str.lower().replace({'false': 0, 'true': 1}).astype(int)


In [9]:
# Preview the table after preprocessing
df.head()

Unnamed: 0,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,total_area,target
0,1994,4,55.834713,37.448383,2.64,204,17,1,16,10.1,44.799999,3,73.800003,13390000.0
1,1960,1,55.701302,37.738918,2.8,20,5,0,1,6.0,16.5,1,32.0,5500000.0
2,1977,0,55.851589,37.416008,2.48,168,12,1,9,10.3,44.0,3,66.0,9500000.0
3,1970,6,55.876389,37.716415,2.64,98,9,1,5,6.5,28.0,2,40.0,9950000.0
4,2014,4,55.734455,37.412422,2.64,274,10,1,9,10.08,45.720001,3,75.099998,18500000.0


In [10]:
# Check column dtypes and non-null counts after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17521 entries, 0 to 17520
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   build_year         17521 non-null  int32  
 1   building_type_int  17521 non-null  object 
 2   latitude           17521 non-null  float64
 3   longitude          17521 non-null  float64
 4   ceiling_height     17521 non-null  float64
 5   flats_count        17521 non-null  int64  
 6   floors_total       17521 non-null  int64  
 7   has_elevator       17521 non-null  int64  
 8   floor              17521 non-null  int64  
 9   kitchen_area       17521 non-null  float64
 10  living_area        17521 non-null  float64
 11  rooms              17521 non-null  int64  
 12  total_area         17521 non-null  float64
 13  target             17521 non-null  float64
dtypes: float64(7), int32(1), int64(5), object(1)
memory usage: 1.8+ MB


In [11]:
# Feature builder that describes a flat by its floor position.

def floor_category(X, **kwargs):
    """Derive floor-position features from ``floor`` and ``floors_total``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``floor`` and ``floors_total``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``X`` with three columns: ``is_first_floor`` (1 when
        ``floor == 1``), ``is_last_floor`` (1 when ``floor == floors_total``)
        and ``floor_relative`` (``floor / floors_total``).
    """
    floor = X['floor']
    floors_total = X['floors_total']
    return pd.DataFrame(
        {
            'is_first_floor': (floor == 1).astype(int),
            'is_last_floor': (floor == floors_total).astype('int'),
            'floor_relative': floor / floors_total,
        },
        index=X.index,
    )

# NOTE(review): not referenced in the visible cells (the preprocessor builds its own
# FunctionTransformer(floor_category)); confirm before removing.
custom_features_transformer = FunctionTransformer(floor_category)

In [12]:
# Build a simple preprocessor that fills missing area values and derives the floor-position features.

preprocessor = ColumnTransformer(
    transformers=[
        (
            # Zeros in the area columns are treated as missing and replaced by the median
            'fill_missing',
            SimpleImputer(missing_values=0., strategy='median'),
            ['kitchen_area', 'living_area', 'total_area'],
        ),
        (
            # 'floor' and 'floors_total' are consumed here and replaced by the derived features
            'floor_category',
            FunctionTransformer(floor_category),
            ['floor', 'floors_total'],
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

In [13]:
# Build the pipeline: preprocessing followed by a CatBoost regressor.

cbr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', CatBoostRegressor(random_state=RANDOM_STATE,
                                verbose=False))
])

In [14]:
# Split the data: features vs. target, then 80% train / 20% test with a fixed seed.

X = df.drop('target', axis=1).copy()
y = df['target'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [15]:
# Check the feature matrix: column dtypes and non-null counts
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17521 entries, 0 to 17520
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   build_year         17521 non-null  int32  
 1   building_type_int  17521 non-null  object 
 2   latitude           17521 non-null  float64
 3   longitude          17521 non-null  float64
 4   ceiling_height     17521 non-null  float64
 5   flats_count        17521 non-null  int64  
 6   floors_total       17521 non-null  int64  
 7   has_elevator       17521 non-null  int64  
 8   floor              17521 non-null  int64  
 9   kitchen_area       17521 non-null  float64
 10  living_area        17521 non-null  float64
 11  rooms              17521 non-null  int64  
 12  total_area         17521 non-null  float64
dtypes: float64(6), int32(1), int64(5), object(1)
memory usage: 1.7+ MB


In [16]:
# Run the hyper-parameter search (called "voting" in the original comment).

# Search space. The `model__` prefix addresses parameters of the pipeline step named 'model'.
param_dist = {
    'model__iterations': distributions.IntDistribution(600, 1200),
    'model__learning_rate': distributions.FloatDistribution(.001, 0.1,
                                                            log=True),
    'model__depth': distributions.IntDistribution(4, 10),
}

# refit=False: the search only selects parameters; the pipeline is fitted with them in a later cell.
# NOTE(review): with n_jobs=-1 the trials run in parallel, so the trial order (and possibly the
# selected parameters) may differ between runs despite random_state; confirm if exact
# reproducibility matters.
study = OptunaSearchCV(
    estimator=cbr_model,
    param_distributions=param_dist,
    cv=4,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
    random_state=RANDOM_STATE,
    refit=False,
    n_trials=10
)

study.fit(X_train, y_train)

  study = OptunaSearchCV(
[I 2024-06-07 15:38:43,002] A new study created in memory with name: no-name-6517742e-80ae-4f77-86d8-6826e3e1036f
[I 2024-06-07 15:38:57,198] Trial 0 finished with value: -1838746.853462135 and parameters: {'model__iterations': 746, 'model__learning_rate': 0.026531331196392952, 'model__depth': 5}. Best is trial 0 with value: -1838746.853462135.
[I 2024-06-07 15:39:04,483] Trial 2 finished with value: -1832396.807367904 and parameters: {'model__iterations': 607, 'model__learning_rate': 0.02807299534581878, 'model__depth': 7}. Best is trial 2 with value: -1832396.807367904.
[I 2024-06-07 15:39:12,127] Trial 4 finished with value: -1830356.404820146 and parameters: {'model__iterations': 632, 'model__learning_rate': 0.0377850286223221, 'model__depth': 6}. Best is trial 4 with value: -1830356.404820146.
[I 2024-06-07 15:39:15,746] Trial 1 finished with value: -1891125.323330813 and parameters: {'model__iterations': 1019, 'model__learning_rate': 0.004316839882528766

In [17]:
# Best hyper-parameters found by the search; reused for the final fit and logged to MLflow
best_params = study.best_params_
best_params

{'model__iterations': 632,
 'model__learning_rate': 0.0377850286223221,
 'model__depth': 6}

In [18]:
# Train the model with the best hyper-parameters.
cbr_model.set_params(**best_params)

cbr_model.fit(X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [19]:
# Model scoring helper.
def evaluate(pipeline, X_test, y_test):
    """Score a fitted pipeline on a hold-out set.

    Parameters
    ----------
    pipeline
        Object exposing ``predict(X_test)``.
    X_test
        Features passed to ``pipeline.predict``.
    y_test : pandas.Series
        True target values.

    Returns
    -------
    dict
        Metric name -> value: target mean, MAE, MSE, R2, mean absolute
        percentage error (as a fraction, not multiplied by 100) and the wall
        time of the ``predict`` call in seconds. All values except R2 and the
        timing are rounded to two decimals.

    Notes
    -----
    The percentage error divides by ``y_test``, so zero targets produce
    infinite or NaN values.
    """
    # perf_counter is a monotonic high-resolution clock intended for measuring intervals
    started = time.perf_counter()
    prediction = pipeline.predict(X_test)
    prediction_time = time.perf_counter() - started

    target_mean = y_test.mean()
    mae = mean_absolute_error(y_test, prediction)
    mse = mean_squared_error(y_test, prediction)
    r2 = r2_score(y_test, prediction)
    mape = (abs(y_test - prediction) / y_test).mean()

    # np.round works for numpy scalars and for plain Python floats alike,
    # whereas the `.round` method exists only on numpy scalars.
    return {
        'Среднее значение целевой переменной': np.round(target_mean, 2),
        'Средний модуль ошибки': np.round(mae, 2),
        'Средняя квадратичная ошибка': np.round(mse, 2),
        'Коэффициент детерминации': r2,
        'Средняя абсолютная ошибка в процентах': np.round(mape, 2),
        'Время предсказания в секундах': prediction_time
    }

In [20]:
# Score the fitted pipeline on the hold-out set; the resulting dict is logged to MLflow as metrics later
sbr_score = evaluate(cbr_model, X_test, y_test)
sbr_score

{'Среднее значение целевой переменной': 11079136.06,
 'Средний модуль ошибки': 1799920.93,
 'Средняя квадратичная ошибка': 4979010692131.94,
 'Коэффициент детерминации': 0.6305116516938205,
 'Средняя абсолютная ошибка в процентах': 0.18,
 'Время предсказания в секундах': 0.010658979415893555}

In [21]:
# Smoke test: make sure the model can later return a prediction for a hand-built set of inputs.

# One row with the same columns as the training features.
# building_type_int is given as a string, matching the object dtype of that column in the training frame.
X_valid = {
    "build_year": [1994],
    "building_type_int": ["4"],
    "latitude": [55.834713],
    "longitude": [37.448383],
    "ceiling_height": [2.64],
    "flats_count": [204],
    "floors_total": [17],
    "has_elevator": [1],
    "floor": [16],
    "kitchen_area": [10.10],
    "living_area": [44.799999],
    "rooms": [3],
    "total_area": [73.800003]
}

X_valid = pd.DataFrame(X_valid)

X_valid

Unnamed: 0,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,total_area
0,1994,4,55.834713,37.448383,2.64,204,17,1,16,10.1,44.799999,3,73.800003


In [22]:
# Predict for the single hand-built row and show the value rounded to two decimals
test_prediction = cbr_model.predict(X_valid)
test_prediction.round(2) 

array([15991399.93])

In [23]:
# Keep a backup copy of the model as a .bin file.
import joblib
import os

# Path where the model is stored in .bin format
model_path = '/home/mle-user/mle_projects/mle-project-sprint-3-v001/services/model/cbr_model.bin'

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the fitted pipeline to the .bin file with joblib
joblib.dump(cbr_model, model_path)

# Confirm that the file is present on disk
if os.path.exists(model_path):
    print("Модель успешно сохранена в формате .bin")
else:
    print("Ошибка при сохранении модели")

Модель успешно сохранена в формате .bin


In [69]:
# Log the model and its artifacts to MLflow.

# Reuse the experiment when it already exists, otherwise create it
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    mlflow.create_experiment(EXPERIMENT_NAME)
    if experiment is None
    else experiment.experiment_id
)

pip_requirements = '../requirements.txt'
input_example = X_test[:10]
metadata = {'model_type': 'monthly'}

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    mlflow.sklearn.log_model(
        cbr_model,
        artifact_path="models",
        pip_requirements=pip_requirements,
        input_example=input_example,
        metadata=metadata,
        registered_model_name=REGISTRY_MODEL_NAME,
        await_registration_for=6,
    )
    # Metrics, parameters and the data files written earlier in the notebook
    mlflow.log_metrics(sbr_score)
    mlflow.log_params(best_params)
    mlflow.log_artifact("columns.txt", artifact_path="dataframe")
    mlflow.log_artifact("flat_cleaned_churn", artifact_path="dataframe")

 - mlflow (current: 2.13.1, required: mlflow==2.7.1)
 - pandas (current: 2.2.2, required: pandas==2.0.1)
 - scikit-learn (current: 1.5.0, required: scikit-learn==1.3.1)
 - scipy (current: 1.13.1, required: scipy==1.11.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
Registered model 'CBR_model_2' already exists. Creating a new version of this model...
2024/06/06 08:14:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 6 seconds for model version to finish creation. Model name: CBR_model_2, version 2
Created version '2' of model 'CBR_model_2'.


In [70]:
# Check that the registered model can be loaded.
# NOTE(review): the model name and version are hard-coded here; each logging run registers a
# new version, so confirm this still points at the intended one.
model_uri = "models:/CBR_model_2/2"  # URI of the registered model version to load
loaded_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

In [73]:
# Save the model into the /services/model directory.
# NOTE(review): the joblib backup cell writes cbr_model.bin into this same directory, and
# MLflow's save_model may refuse a target path that already exists and is not empty;
# verify that a clean top-to-bottom run of the notebook gets past this cell.
target_dir = "/home/mle-user/mle_projects/mle-project-sprint-3-v001/services/model"

os.makedirs(target_dir, exist_ok=True)

mlflow.sklearn.save_model(loaded_model, target_dir)

print("Модель сохранена в директории:", target_dir)

Модель сохранена в директории: /home/mle-user/mle_projects/mle-project-sprint-3-v001/services/model


In [1]:
#проверяю возможность загрузки модели
import joblib

def load_churn_model(model_path: str):
    """Load a model with joblib, reporting the outcome on stdout.

    Parameters
    ----------
    model_path : str
        Path to the serialised model file.

    Returns
    -------
    object or None
        The loaded model, or ``None`` when loading fails for any reason;
        the failure is printed rather than raised.

    Notes
    -----
    joblib files are pickle-based: load only files from a trusted source.
    A pipeline that references a function defined in the notebook (such as
    ``floor_category``) presumably needs that function defined before
    loading; verify in a fresh kernel.
    """
    try:
        loaded = joblib.load(model_path)  # deserialise with joblib
        print("Model loaded successfully")
        return loaded
    except Exception as err:
        print(f"Failed to load model: {err}")
        return None

if __name__ == "__main__":
    # Relative path to the pickled model that is loaded as a check
    model_path = "../services/model/model.pkl"
    model = load_churn_model(model_path)


Model loaded successfully


In [2]:
# Load a model from a pickle file.
def get_model(path: str):
    """Unpickle and return the object stored at ``path``.

    Parameters
    ----------
    path : str
        Path to the pickle file; it is resolved to an absolute path before
        opening.

    Returns
    -------
    object
        Whatever object the file contains.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Imported here because the visible notebook cells never import `pickle`,
    # and this cell runs in a kernel where `os` has not been imported either.
    import os
    import pickle

    with open(os.path.abspath(path), "rb") as file:
        # SECURITY: unpickling can execute arbitrary code; load trusted files only.
        model = pickle.load(file)
    return model

Failed to load model: Can't get attribute 'floor_category' on <module '__main__'>
