In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine
import mlflow
import boto3
import os
import joblib
import pickle
# подгружаем .env
load_dotenv()


* 'schema_extra' has been renamed to 'json_schema_extra'


True

In [3]:
s3_bucket = os.environ.get('S3_BUCKET_NAME')
s3_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
s3_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")




In [4]:
#imports
import os
import random
import mlflow
import json
import logging
import time
import copy
import warnings
import numpy as np
import pandas as pd


from sqlalchemy import create_engine
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from matplotlib.colors import LinearSegmentedColormap
from phik import resources, report
from phik.report import plot_correlation_matrix
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import quantile_transform, robust_scale, scale, power_transform
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from autofeat import AutoFeatRegressor
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from category_encoders import CatBoostEncoder
from autofeat import AutoFeatRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from statistics import median
from optuna.samplers import CmaEsSampler, RandomSampler
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew, kurtosis
dst_host = os.environ.get('DB_DESTINATION_HOST')
dst_port = os.environ.get('DB_DESTINATION_PORT')
dst_username = os.environ.get('DB_DESTINATION_USER')
dst_password = os.environ.get('DB_DESTINATION_PASSWORD')
dst_db = os.environ.get('DB_DESTINATION_NAME')

dst_conn = create_engine(f'postgresql://{dst_username}:{dst_password}@{dst_host}:{dst_port}/{dst_db}')
# Variables
TABLE_NAME = "flats_features"
REGISTRY_MODEL_NAME = 'sprint_2'
EXPERIMENT_NAME = 'sprint_2'
RANDOM_STATE = 42
GRAPHICS = 'graphics'
NOTEBOOK = 'mle-project-sprint-2-v001.ipynb'
MOSCOW_CENTER = (55.755825,37.617298)
MODELS_DIR = 'models'
target = ['price']
scoring = ['neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2']
root_mean_squared_error = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
SQL = f'select * from {TABLE_NAME}'
df = pd.read_sql(SQL, dst_conn)
df = df[(df['living_area']>0) & (df['price']> 100000) ].drop(['id'], axis=1)

In [6]:
cat_columns = [x for x in df.columns if df[x].nunique() <= 6]
num_columns = [x for x in df.columns if x not in cat_columns and x !=target[0]]
num_discrete_columns = ['floor', 'ceiling_height', 'flats_count', 'floors_total']
num_time_columns = [ 'build_year', 'building_id']
num_area_columns = ['kitchen_area', 'living_area', 'total_area']
num_geo_columns = ['latitude', 'longitude']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(target, axis=1),
    df[target],
    random_state=RANDOM_STATE,
    shuffle=False) 

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin, clone
def get_metrics(model, x_train, y_train, x_val, y_val, need_fit=True):

    start_time = time.time()
    root_mean_squared_error = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))
    if isinstance(y_train, pd.core.frame.DataFrame):
        y_train = y_train.values.ravel()
    if need_fit==True:
        model = clone(model)
        model.fit(x_train, y_train)
    elapsed_fit_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(x_val)
    elapsed_predict_time = time.time() - start_time

    metrics={}
    
    metrics['mae'] = mean_absolute_error(y_val, y_pred)
    metrics['rmse'] = root_mean_squared_error(y_val, y_pred)
    metrics['r2'] = r2_score(y_val, y_pred)
    metrics['fit_time'] = elapsed_fit_time
    metrics['predict_time'] = elapsed_predict_time
    print(f"Fit Time: {metrics['fit_time']:.4f} seconds")
    print(f"Predict Time: {metrics['fit_time']:.4f} predict_time")
    print(f"Mean Absolute Error (MAE): {metrics['mae']:.2f}")
    print(f"Root Mean Squared Error (RMSE): {metrics['rmse']:.2f}")
    print(f"R² Score: {metrics['r2']:.2f}")
    return metrics


In [9]:
pol_enc = PolynomialFeatures(degree=2)
kbd_enc = KBinsDiscretizer(n_bins=5, 
                           encode='ordinal',
                           strategy='uniform', 
                           subsample=None,
                           random_state=RANDOM_STATE)
no_geo_columns = num_discrete_columns + num_time_columns
kbins_columns = num_discrete_columns + num_time_columns + num_geo_columns
standart = num_discrete_columns + num_time_columns  + num_area_columns
cb_enc = CatBoostEncoder()
categorical_transformer = ColumnTransformer(
    transformers=[('cb_encoder', cb_enc, cat_columns)]
)
all_au = ['kbd__floor', 'kbd__ceiling_height', 'kbd__flats_count',
       'kbd__floors_total', 'kbd__build_year', 'kbd__building_id',
       'kbd__latitude', 'kbd__longitude',
        'cat__cb_encoder__building_type_int',
       'cat__cb_encoder__has_elevator', 'cat__cb_encoder__rooms',
       'cat__cb_encoder__is_apartment','scaler__building_id', 'scaler__build_year', 'scaler__latitude',
       'scaler__longitude', 'scaler__ceiling_height',
       'scaler__flats_count', 'scaler__floors_total', 'scaler__floor',
       'scaler__kitchen_area', 'scaler__living_area',
       'scaler__total_area',]
afreg =  AutoFeatRegressor(
                           verbose=0, 
                           feateng_steps=1, 
                           max_gb=16, 
                           n_jobs=-1)
preprocessor = ColumnTransformer(
    transformers=[
        ('kbd', kbd_enc, kbins_columns),
        ('scaler', StandardScaler(), num_columns),
        ('cat', categorical_transformer, cat_columns)], 
        remainder='passthrough',
        verbose_feature_names_out=True,
        )
preprocessor.set_output(transform='pandas')
feature_generator = ColumnTransformer(
    transformers=[
        ('auto_feat', afreg, all_au)], 
        remainder='passthrough',)

pipeline = Pipeline([
        ('processor', Pipeline([('preprocessor', preprocessor),
                                ('feature_generator', feature_generator)])),
        ('regressor', CatBoostRegressor(n_estimators=200, verbose=0, random_state=RANDOM_STATE))
        ])
pipeline.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [10]:
processor = Pipeline([('preprocessor', preprocessor), ('feature_generator', feature_generator)])
X_train_enriched = processor.transform(X_train)
X_test_enriched = processor.transform(X_test)
X_train_enriched.shape, X_test_enriched.shape


((56673, 45), (18891, 45))

In [11]:
estimator = Ridge()
scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
sfs = SFS(estimator,
    k_features=20,       
    forward=True,      
    floating=False,     
    scoring=scorer, 
    cv=3,               
    n_jobs=-1
)
sbs = SFS(estimator,
    k_features=20,       
    forward=False,      
    floating=False,     
    scoring=scorer, 
    cv=0,               
    n_jobs=-1
)


In [12]:
union_features = FeatureUnion([
    ('sfs', sfs),
    ('sbs', sbs)
])
parameters = {'learning_rate': 0.09253061224489795, 'l2_leaf_reg': 7.575510204081633, 'iterations': 383, 'depth': 9}
pipeline_v2 = Pipeline([
        ('processor', Pipeline([('preprocessor', preprocessor),
                                            ('feature_generator', feature_generator),
                                            ('union_features', union_features)])), 
        ('regressor', CatBoostRegressor(**parameters, verbose=0, random_state=RANDOM_STATE))])
pipeline_v2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [13]:
metrics = get_metrics(pipeline_v2, X_train, y_train, X_test, y_test, need_fit=False)

Fit Time: 0.0000 seconds
Predict Time: 0.0000 predict_time
Mean Absolute Error (MAE): 1962199.92
Root Mean Squared Error (RMSE): 2483116.28
R² Score: 0.70


In [14]:
import dill

# Сохранение с использованием dill
with open('model.pkl', 'wb') as model_file:
    dill.dump(pipeline_v2, model_file)

# Загрузка с использованием dill
with open('model.pkl', 'rb') as model_file:
    loaded_pipeline = dill.load(model_file)


In [15]:
from random import randint, uniform
random_params = {
        "floor": randint(1, 60), 
        "is_apartment": randint(0, 1), 
        "kitchen_area": uniform(1, 100), 
        "living_area": uniform(1, 200), 
        "rooms": randint(1, 10), 
        "total_area": uniform(1, 300), 
        "building_id": randint(1, 20000), 
        "build_year": randint(1920, 2024),  
        "building_type_int": randint(1, 10), 
        "latitude": uniform(54, 56), 
        "longitude": uniform(36, 38), 
        "ceiling_height": uniform(1, 5), 
        "flats_count": randint(1, 1000), 
        "floors_total": randint(1, 100), 
        "has_elevator": randint(0, 1)
    }

In [16]:
df_sample = pd.DataFrame(random_params, index=[0])

In [18]:
loaded_pipeline.predict(df_sample)[0]

12761447.099149788