In [30]:
# импортируем чистый датасет из лр1
import pandas as pd

data = pd.read_pickle('/Users/sergejvaresko/iis_proj/data/clean_dataset.pkl')
data

Unnamed: 0,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,Age_of_car,Car_depreciation
0,1.208960,27000,Petrol,Dealer,Manual,0,11,4.381
1,1.558145,43000,Diesel,Dealer,Manual,0,12,7.982
2,1.981001,6900,Petrol,Dealer,Manual,0,8,7.869
3,1.047319,5200,Petrol,Dealer,Manual,0,14,3.103
4,1.526056,42450,Diesel,Dealer,Manual,0,11,5.344
...,...,...,...,...,...,...,...,...
296,2.251292,33988,Diesel,Dealer,Manual,0,9,9.349
297,1.386294,60000,Petrol,Dealer,Manual,0,10,4.514
298,1.208960,87934,Petrol,Dealer,Manual,0,16,9.791
299,2.442347,9000,Diesel,Dealer,Manual,0,8,10.058


**Разделение на тестовую и обучающую выборки в размере 25%-75%**

In [31]:
from sklearn.model_selection import train_test_split

X = data.drop('Selling_Price', axis = 1) # данные для регрессии, без целевой переменной
y = data['Selling_Price'] # целевая переменная (по которой потом будет сравнение)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)

print(f"Обучающая выборка содержит {X_train.shape}")

print(f"Тестовая выборка содержит {X_test.shape}")

Обучающая выборка содержит (224, 7)
Тестовая выборка содержит (75, 7)


**Создаем переменные для хранения имен столбцов категориальных и числовых переменных**

In [32]:
X.dtypes

Driven_kms            int64
Fuel_Type            object
Selling_type         object
Transmission         object
Owner                 int64
Age_of_car            int64
Car_depreciation    float64
dtype: object

In [33]:
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_columns = X.select_dtypes(include=['object']).columns.to_list()

print(f"Числовые признаки находятся в столбцах - {numeric_columns}")
print(f"Категориальные признаки находятся в столбцах - {categorical_columns}")

Числовые признаки находятся в столбцах - ['Driven_kms', 'Owner', 'Age_of_car', 'Car_depreciation']
Категориальные признаки находятся в столбцах - ['Fuel_Type', 'Selling_type', 'Transmission']


**Создание pipeline обработки признаков и обучения модели. Для числовых признаков используется StandardScaler, для категориальных - OrdinalEncoder для задачи регрессии. В качестве модели - RandomForest**

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

preprocessor_data = ColumnTransformer(
        transformers=[
            ('numeric', StandardScaler(), numeric_columns),
            ('categorical', OrdinalEncoder(), categorical_columns)
        ]
)

pipeline_model = Pipeline([
    ('preprocessor', preprocessor_data),
    ('regressor',RandomForestRegressor(random_state=42))
                ])

# param = {
#     'regressor__n_estimators' : [50, 100, 150, 200],
#     'regressor__max_depth' : [None, 10, 20, 25]
# }

**Обучить модель и получить на тестовой выборке метрики качества**

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

model_1v = pipeline_model.fit(X_train, y_train)

# Предсказание
y_pred_baseline = pipeline_model.predict(X_test)

# Метрики
mae = mean_absolute_error(y_test, y_pred_baseline)
mape = mean_absolute_percentage_error(y_test, y_pred_baseline)
mse = mean_squared_error(y_test, y_pred_baseline)

print("Baseline модель:")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2%}")
print(f"MSE: {mse:.2f}")

Baseline модель:
MAE: 0.35
MAPE: 3740736895105443.00%
MSE: 0.25


**http://127.0.0.1:5000 - адрес для MLFlow**

**Создание артефактов и логирование первой модели**

In [60]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")

# Можно создать эксперимент вручную с артефактами:
if not mlflow.get_experiment_by_name("Baseline_Experiment_1"):
    mlflow.create_experiment(
        name="Baseline_Experimen_1",
        artifact_location="file:///Users/sergejvaresko/iis_proj/mlflow/mlruns_artifacts"
    )


mlflow.set_experiment("Baseline_Experiment_1")

input_example = X_train.head(5)
signature = infer_signature(X_train.head(5), pipeline_model.predict(X_train.head(5)))

with mlflow.start_run():
    mlflow.set_tag("model_type", "baseline")
    mlflow.set_tag("framework", "sklearn")

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    mlflow.sklearn.log_model(
        sk_model=pipeline_model,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )


2025/04/10 00:55:43 INFO mlflow.tracking.fluent: Experiment with name 'Baseline_Experiment_1' does not exist. Creating a new experiment.


In [61]:
from mlflow import MlflowClient

client = MlflowClient()

run_id = "ea9f8aa3085a46bc9afcb04053ab15b8"
model_uri = f"runs:/{run_id}/model"
model_name = "BaselineRandomForest"
client.create_registered_model(model_name)
client.create_model_version(name=model_name, source=model_uri, run_id=run_id)


<ModelVersion: aliases=[], creation_timestamp=1744236150877, current_stage='None', description=None, last_updated_timestamp=1744236150877, name='BaselineRandomForest', run_id='ea9f8aa3085a46bc9afcb04053ab15b8', run_link=None, source='runs:/ea9f8aa3085a46bc9afcb04053ab15b8/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

**Пункт 10 Создаем новый экс и логируем все**

Создайте новую переменную X_train_fe_sklearn- копию исходной обучающей выборки, используя метод .copy() датафрейма. - готово

Создайте ColumnTransformer, содержащий как трансформации baseline-модели (сделанные ранее), так и новые. - готово

Обучите и сохраните в переменную X_train_fe_sklearn (используя метод fit_transform) получившиеся преобразования. - готово

Сохраните в файл названия столбцов получившегося датафрейма. Этот файл нужно будет залогировать в MLFlow - готово

Создайте pipeline, в котором на первом шаге будет работать ColumnTransformer, созданный в этом пункте, а на втором - модель. - готово

In [62]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from mlflow.models import infer_signature


X_train_fe_sklearn = X_train.copy()

poly_features = ['Driven_kms', 'Age_of_car']
bin_features = ['Car_depreciation', 'Owner']
cat_features = ['Fuel_Type', 'Selling_type', 'Transmission']
all_num = X_train_fe_sklearn.select_dtypes(include=['int64', 'float64']).columns.tolist() #отберем еще раз тк выборку копировали, для сохранности пред экс
base_num = list(set(all_num) - set(poly_features) - set(bin_features))


poly_transformer = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler())
])

bin_transformer = Pipeline([
    ("kbins", KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile'))
])

base_transformer = Pipeline([
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline([
    ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])


full_preprocessor = ColumnTransformer([
    ("poly_features", poly_transformer, poly_features),
    ("bin_features", bin_transformer, bin_features),
    ("base_numeric", base_transformer, base_num),
    ("categorical", cat_transformer, cat_features)
])


X_train_fe_transformed = full_preprocessor.fit_transform(X_train_fe_sklearn)

poly_names = full_preprocessor.named_transformers_['poly_features']['poly'].get_feature_names_out(poly_features)
bin_names = full_preprocessor.named_transformers_['bin_features']['kbins'].get_feature_names_out(bin_features)
final_feature_names = np.concatenate([poly_names, bin_names, base_num, cat_features])


feature_file = "feature_names_fe.txt"
with open(feature_file, "w") as f:
    for col in final_feature_names:
        f.write(f"{col}\n")


pipeline_fe = Pipeline([
    ("preprocessor", full_preprocessor),
    ("regressor", RandomForestRegressor(random_state=42))
])

pipeline_fe.fit(X_train_fe_sklearn, y_train)
y_pred = pipeline_fe.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)


mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db") 
mlflow.set_experiment("Feature_Engineering_Experiment")

with mlflow.start_run(run_name="fe_polynomial_kbins_randomforest"):
    mlflow.set_tag("task", "feature_engineering_with_sklearn")

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    input_example = X_train.head(5)
    signature = infer_signature(X_train.head(5), pipeline_fe.predict(X_train.head(5)))

    mlflow.sklearn.log_model(
        sk_model=pipeline_fe,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )

    mlflow.log_artifact(feature_file)


2025/04/10 15:30:51 INFO mlflow.tracking.fluent: Experiment with name 'Feature_Engineering_Experiment' does not exist. Creating a new experiment.


**Новые признаки и замена первоначальной трансформации**

In [63]:
!pip show autofeat

Name: autofeat
Version: 2.1.3
Summary: Automatic Feature Engineering and Selection Linear Prediction Model
Home-page: https://franziskahorn.de/autofeat
Author: Franziska Horn
Author-email: cod3licious@gmail.com
License: MIT
Location: /Users/sergejvaresko/iis_proj/.venv_lab_iis/lib/python3.9/site-packages
Requires: numba, scikit-learn, sympy, joblib, pint, scipy, pandas, numpy
Required-by: 


In [64]:
from autofeat import AutoFeatRegressor

X_train_fe_autofeat = X_train.copy() # делаем одинаково как и в пред пункте
cat_features = ['Fuel_Type', 'Selling_type', 'Transmission']
encoder = ColumnTransformer([
    ("categorical", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_features)
], remainder="passthrough")
X_train_enc = pd.DataFrame(
    encoder.fit_transform(X_train_fe_autofeat),
    columns=cat_features + [col for col in X_train.columns if col not in cat_features]
)
X_test_enc = pd.DataFrame(
    encoder.transform(X_test),
    columns=cat_features + [col for col in X_test.columns if col not in cat_features]
)
autofeat_model = AutoFeatRegressor(verbose=1, feateng_steps=2)

X_train_autofeat = autofeat_model.fit_transform(X_train_enc, y_train)
X_test_autofeat = autofeat_model.transform(X_test_enc)
autofeat_feature_file = "autofeat_feature_names.txt"
with open(autofeat_feature_file, "w") as f:
    for name in X_train_autofeat.columns:
        f.write(f"{name}\n")
pipeline_autofeat = Pipeline([
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])
pipeline_autofeat.fit(X_train_autofeat, y_train)
y_pred = pipeline_autofeat.predict(X_test_autofeat)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db") 
mlflow.set_experiment("AutoFeat_Experiment")

with mlflow.start_run(run_name="autofeat_feateng_v2_rf"):
    mlflow.set_tag("model_type", "autofeat + RF")
    mlflow.set_tag("feature_engineering", "autofeat")

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    signature = infer_signature(X_train_autofeat, y_pred)
    mlflow.sklearn.log_model(
        sk_model=pipeline_autofeat,
        artifact_path="model",
        signature=signature,
        input_example=X_train_autofeat.iloc[:5]
    )

    mlflow.log_artifact(autofeat_feature_file)

[featsel] Scaling data.../            406 feature tuples combineddone.


  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


[AutoFeat]     9/   10 new features

2025/04/10 18:32:48 INFO mlflow.tracking.fluent: Experiment with name 'AutoFeat_Experiment' does not exist. Creating a new experiment.


[AutoFeat]     9/   10 new features

**С использованием библиотеки mlxtend отобрать N наиболее важных признаков. N выбирается с учетом количества признаков на предыдущем шаге, ориентировочный диапазон - от 20% до 70%.**

In [68]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

n_features_to_select = int(len(final_feature_names) * 0.5)  # 50% от признаков


sfs_forward = SFS(
    estimator=LinearRegression(),
    k_features=n_features_to_select,
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1
)

# признаки после fit_transform из пункта 10 нашей лр
sfs_forward.fit(X_train_fe_transformed, y_train)

selected_idx_forward = list(sfs_forward.k_feature_idx_)
selected_names_forward = [final_feature_names[i] for i in selected_idx_forward]

with open("sfs_forward_feature_names.txt", "w") as f:
    for name in selected_names_forward:
        f.write(f"{name}\n")

with open("sfs_forward_feature_idx.txt", "w") as f:
    for idx in selected_idx_forward:
        f.write(f"{idx}\n")

def select_features_forward(X):
    return X[:, selected_idx_forward]

feature_selector_forward = FunctionTransformer(select_features_forward)

pipeline_sfs_forward = Pipeline([
    ("preprocessor", full_preprocessor),  # тот же, что в п.10
    ("select", feature_selector_forward), # отбор признаков
    ("regressor", RandomForestRegressor(random_state=42))
])
pipeline_sfs_forward.fit(X_train_fe_sklearn, y_train)
y_pred = pipeline_sfs_forward.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)


mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("Feature_Selection_Experiment")

with mlflow.start_run(run_name="sfs_forward_rf"):
    mlflow.set_tag("feature_selection", "SFS forward")
    mlflow.set_tag("selected_features", str(len(selected_idx_forward)))
    mlflow.set_tag("base_features", str(len(final_feature_names)))

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    # лог модели
    signature = infer_signature(X_train_fe_transformed[:, selected_idx_forward], y_pred)
    mlflow.sklearn.log_model(
        sk_model=pipeline_sfs_forward,
        artifact_path="model",
        signature=signature,
        input_example=X_train_fe_transformed[:5, selected_idx_forward]
    )

    # лог файлов
    mlflow.log_artifact("sfs_forward_feature_names.txt")
    mlflow.log_artifact("sfs_forward_feature_idx.txt")


2025/04/10 20:10:28 INFO mlflow.tracking.fluent: Experiment with name 'Feature_Selection_Experiment' does not exist. Creating a new experiment.
  "inputs": [
    [
      -0.1309703640428355,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.1309703640428355,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.48874306581838584,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.48874306581838584,
      1.0,
      0.0,
      2.0,
      0.0,
      1.0
    ],
    [
      -1.2042884693694866,
      0.0,
      0.0,
      2.0,
      1.0,
      0.0
    ]
  ]
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_inpu

**повтор предыдущего действия с SequentialFeatureSelector последовательно удаляя признаки (forward=False), и\или с помощью RFE из sklearn.**

In [69]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

n_features_to_select_back = int(len(final_feature_names) * 0.4)  # например 40% теперь поставим
sfs_backward = SFS(
    estimator=LinearRegression(),
    k_features=n_features_to_select_back,
    forward=False,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=1
)

sfs_backward.fit(X_train_fe_transformed, y_train)

selected_idx_back = list(sfs_backward.k_feature_idx_)
selected_names_back = [final_feature_names[i] for i in selected_idx_back]

with open("sfs_backward_feature_names.txt", "w") as f:
    for name in selected_names_back:
        f.write(f"{name}\n")

with open("sfs_backward_feature_idx.txt", "w") as f:
    for idx in selected_idx_back:
        f.write(f"{idx}\n")


rfe = RFE(estimator=LinearRegression(), n_features_to_select=n_features_to_select_back)
rfe.fit(X_train_fe_transformed, y_train)

rfe_selected_idx = np.where(rfe.support_)[0].tolist()
rfe_selected_names = [final_feature_names[i] for i in rfe_selected_idx]

with open("rfe_feature_names.txt", "w") as f:
    for name in rfe_selected_names:
        f.write(f"{name}\n")

with open("rfe_feature_idx.txt", "w") as f:
    for idx in rfe_selected_idx:
        f.write(f"{idx}\n")

intersection = list(set(selected_names_back) & set(rfe_selected_names))
union = list(set(selected_names_back) | set(rfe_selected_names))

print("Совпадающие признаки (пересечение):", intersection)
print("Объединённые признаки (union):", union)

def select_features_back(X):
    return X[:, selected_idx_back]

pipeline_sfs_back = Pipeline([
    ("preprocessor", full_preprocessor),
    ("select", FunctionTransformer(select_features_back)),
    ("regressor", RandomForestRegressor(random_state=42))
])

pipeline_sfs_back.fit(X_train_fe_sklearn, y_train)
y_pred = pipeline_sfs_back.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("Feature_Selection_Experiment")

with mlflow.start_run(run_name="sfs_backward_rf"):
    mlflow.set_tag("feature_selection", "SFS backward")
    mlflow.set_tag("selected_features", str(len(selected_idx_back)))
    mlflow.set_tag("base_features", str(len(final_feature_names)))

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    signature = infer_signature(X_train_fe_transformed[:, selected_idx_back], y_pred)
    mlflow.sklearn.log_model(
        sk_model=pipeline_sfs_back,
        artifact_path="model",
        signature=signature,
        input_example=X_train_fe_transformed[:5, selected_idx_back]
    )

    mlflow.log_artifact("sfs_backward_feature_names.txt")
    mlflow.log_artifact("sfs_backward_feature_idx.txt")
    mlflow.log_artifact("rfe_feature_names.txt")
    mlflow.log_artifact("rfe_feature_idx.txt")


Совпадающие признаки (пересечение): ['Car_depreciation_1.0', 'Car_depreciation_0.0', 'Selling_type']
Объединённые признаки (union): ['Driven_kms Age_of_car', 'Selling_type', 'Car_depreciation_2.0', 'Car_depreciation_0.0', 'Car_depreciation_3.0', 'Age_of_car^2', 'Car_depreciation_1.0']


  "inputs": [
    [
      -0.21725804671643173,
      1.0,
      0.0,
      0.0,
      1.0
    ],
    [
      -0.21725804671643173,
      1.0,
      0.0,
      0.0,
      1.0
    ],
    [
      -0.5024092330317483,
      1.0,
      0.0,
      0.0,
      1.0
    ],
    [
      -0.5024092330317483,
      0.0,
      0.0,
      1.0,
      0.0
    ],
    [
      -0.9912398381437197,
      1.0,
      0.0,
      0.0,
      1.0
    ]
  ]
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: X has 5 features, but ColumnTransformer is expecting 7 features as input.


**У Random Forest будем настраивать параметры**

In [70]:
import optuna
from optuna.samplers import TPESampler

# j,thnrf для обучения и оценки модели
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    max_features = trial.suggest_float("max_features", 0.1, 1.0)

    model = Pipeline([
        ("preprocessor", full_preprocessor),
        ("select", FunctionTransformer(select_features_forward)),
        ("regressor", RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42
        ))
    ])

    model.fit(X_train_fe_sklearn, y_train)
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    return mae  # минимизируем

study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=15)

best_params = study.best_params
print("Лучшие параметры:", best_params)


best_model = Pipeline([
    ("preprocessor", full_preprocessor),
    ("select", FunctionTransformer(select_features_forward)),
    ("regressor", RandomForestRegressor(
        **best_params,
        random_state=42
    ))
])

best_model.fit(X_train_fe_sklearn, y_train)
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

from mlflow import MlflowClient

mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("Hyperparameter_Tuning_RF")

with mlflow.start_run(run_name="rf_tuned_sfs_forward_v2"):
    mlflow.set_tag("model_type", "RandomForest tuned")
    mlflow.set_tag("tuning", "optuna")
    mlflow.set_tag("selected_features", str(len(selected_idx_forward)))

    mlflow.log_params(best_params)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    signature = infer_signature(X_train_fe_transformed[:, selected_idx_forward], y_pred)
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        signature=signature,
        input_example=X_train_fe_transformed[:5, selected_idx_forward],
        registered_model_name="BaselineRandomForest"
    )


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-04-10 21:18:05,013] A new study created in memory with name: no-name-567ee764-9018-46e5-bdab-3f3fc01efac2
[I 2025-04-10 21:18:05,109] Trial 0 finished with value: 0.383234973547487 and parameters: {'n_estimators': 144, 'max_depth': 15, 'max_features': 0.7587945476302645}. Best is trial 0 with value: 0.383234973547487.
[I 2025-04-10 21:18:05,200] Trial 1 finished with value: 0.4055432516313216 and parameters: {'n_estimators': 200, 'max_depth': 5, 'max_features': 0.2403950683025824}. Best is trial 0 with value: 0.383234973547487.
[I 2025-04-10 21:18:05,242] Trial 2 finished with value: 0.3765577053435806 and parameters: {'n_estimators': 64, 'max_depth': 14, 'max_features': 0.6410035105688879}. Best is trial 2 with value: 0.3765577053435806.
[I 2025-04-10 21:18:05,383] Trial 3 finished with value: 0.45778036654871973 and parameters: {'n_estimators': 227, 'max_depth': 3, 'max_features': 0.9729188669457949}. Best is trial 2 with valu

Лучшие параметры: {'n_estimators': 85, 'max_depth': 6, 'max_features': 0.4297256589643225}


Registered model 'BaselineRandomForest' already exists. Creating a new version of this model...
Created version '2' of model 'BaselineRandomForest'.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 4929.50it/s] 
  "inputs": [
    [
      -0.1309703640428355,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.1309703640428355,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.48874306581838584,
      0.0,
      0.0,
      2.0,
      1.0,
      1.0
    ],
    [
      -0.48874306581838584,
      1.0,
      0.0,
      2.0,
      0.0,
      1.0
    ],
    [
      -1.2042884693694866,
      0.0,
      0.0,
      2.0,
      1.0,
      0.0
    ]
  ]
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input exampl

In [72]:
# на предыдущем шаге некорректно сравнили метрики - теперь делаем на основе действительно лучшей модели
import optuna
from optuna.samplers import TPESampler
from autofeat import AutoFeatRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow import MlflowClient
from mlflow.exceptions import RestException


X_train_fe_autofeat = X_train.copy()
cat_features = ['Fuel_Type', 'Selling_type', 'Transmission']

encoder = ColumnTransformer([
    ("categorical", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_features)
], remainder="passthrough")

X_train_enc = pd.DataFrame(
    encoder.fit_transform(X_train_fe_autofeat),
    columns=cat_features + [col for col in X_train.columns if col not in cat_features]
)
X_test_enc = pd.DataFrame(
    encoder.transform(X_test),
    columns=cat_features + [col for col in X_test.columns if col not in cat_features]
)


autofeat_model = AutoFeatRegressor(verbose=1, feateng_steps=2)
X_train_autofeat = autofeat_model.fit_transform(X_train_enc, y_train)
X_test_autofeat = autofeat_model.transform(X_test_enc)


def objective(trial):
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 50, 300),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        max_features=trial.suggest_float("max_features", 0.1, 1.0),
        random_state=42
    )
    model.fit(X_train_autofeat, y_train)
    preds = model.predict(X_test_autofeat)
    mae = mean_absolute_error(y_test, preds)
    return mae  # минимизируем!

study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=15)

best_params = study.best_params
print("Лучшие параметры:", best_params)

best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train_autofeat, y_train)
y_pred = best_model.predict(X_test_autofeat)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)


mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("AutoFeat_Tuned")

client = MlflowClient()
model_name = "BaselineRandomForest"

# Удалим существующую версию 2
for mv in client.search_model_versions(f"name='{model_name}'"):
    if mv.version == "2":
        client.delete_model_version(name=model_name, version=2)

with mlflow.start_run(run_name="autofeat_rf_hyperopt_v2"):
    mlflow.set_tag("model_type", "autofeat + RF tuned")
    mlflow.set_tag("based_on", "autofeat_feateng_v2_rf")
    mlflow.set_tag("tuning", "optuna")
    
    mlflow.log_params(best_params)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    signature = infer_signature(X_train_autofeat, y_pred)

    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        input_example=X_train_autofeat.iloc[:5],
        signature=signature,
        registered_model_name=model_name
    )



[featsel] Scaling data...done.        406 feature tuples combined


  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


[AutoFeat]     9/   10 new features

[I 2025-04-10 21:34:45,393] A new study created in memory with name: no-name-a60ddea5-8b37-41f9-8e93-d36df84ecfc8
[I 2025-04-10 21:34:45,510] Trial 0 finished with value: 0.28826068216418577 and parameters: {'n_estimators': 144, 'max_depth': 20, 'max_features': 0.7587945476302645}. Best is trial 0 with value: 0.28826068216418577.


[AutoFeat]     9/   10 new features

[I 2025-04-10 21:34:45,606] Trial 1 finished with value: 0.3104503222618805 and parameters: {'n_estimators': 200, 'max_depth': 5, 'max_features': 0.2403950683025824}. Best is trial 0 with value: 0.28826068216418577.
[I 2025-04-10 21:34:45,654] Trial 2 finished with value: 0.2992764919781907 and parameters: {'n_estimators': 64, 'max_depth': 18, 'max_features': 0.6410035105688879}. Best is trial 0 with value: 0.28826068216418577.
[I 2025-04-10 21:34:45,784] Trial 3 finished with value: 0.3045257146214363 and parameters: {'n_estimators': 227, 'max_depth': 3, 'max_features': 0.9729188669457949}. Best is trial 0 with value: 0.28826068216418577.
[I 2025-04-10 21:34:45,911] Trial 4 finished with value: 0.3105702448074429 and parameters: {'n_estimators': 258, 'max_depth': 6, 'max_features': 0.26364247048639056}. Best is trial 0 with value: 0.28826068216418577.
[I 2025-04-10 21:34:45,975] Trial 5 finished with value: 0.30469389299815836 and parameters: {'n_estimators': 96, 'max_depth': 8, 'max_

Лучшие параметры: {'n_estimators': 134, 'max_depth': 15, 'max_features': 0.9952889269957197}


Registered model 'BaselineRandomForest' already exists. Creating a new version of this model...
Created version '3' of model 'BaselineRandomForest'.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 5192.81it/s] 


**Обучить модель с помощью алгоритма CatBoost с выбранным набором признаков**

In [77]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow import MlflowClient


xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    verbosity=0
)

xgb_model.fit(X_train_autofeat, y_train)
y_pred = xgb_model.predict(X_test_autofeat)

mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("AutoFeat_Tuned")

client = MlflowClient()
model_name = "BaselineRandomForest"

with mlflow.start_run(run_name="autofeat_xgboost_v4"):
    mlflow.set_tag("model_type", "autofeat + XGBoost")
    mlflow.set_tag("based_on", "autofeat_feateng_v2_rf")
    mlflow.set_tag("tuning", "manual default (xgboost)")

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mape", mape)
    mlflow.log_metric("mse", mse)

    signature = infer_signature(X_train_autofeat, y_pred)

    mlflow.sklearn.log_model(
        sk_model=xgb_model,
        artifact_path="model",
        input_example=X_train_autofeat.iloc[:5],
        signature=signature,
        registered_model_name=model_name
    )


Registered model 'BaselineRandomForest' already exists. Creating a new version of this model...
Created version '4' of model 'BaselineRandomForest'.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 5435.05it/s] 


# По всем прогонам найдена лучшая по метрикам модель, для нее сейчас сделаем прогон на всей выборке и все залогируем

In [78]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from autofeat import AutoFeatRegressor
from xgboost import XGBRegressor
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow import MlflowClient


full_data = pd.concat([X_train, X_test], axis=0)
full_target = pd.concat([y_train, y_test], axis=0)

cat_features = ['Fuel_Type', 'Selling_type', 'Transmission']
encoder = ColumnTransformer([
    ("categorical", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_features)
], remainder="passthrough")

full_data_enc = pd.DataFrame(
    encoder.fit_transform(full_data),
    columns=cat_features + [col for col in full_data.columns if col not in cat_features]
)


autofeat_model = AutoFeatRegressor(verbose=0, feateng_steps=2)
full_data_autofeat = autofeat_model.fit_transform(full_data_enc, full_target)


final_model = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    verbosity=0
)
final_model.fit(full_data_autofeat, full_target)

input_example = full_data_autofeat.iloc[:5]
signature = infer_signature(full_data_autofeat, final_model.predict(full_data_autofeat.iloc[:5]))


features_file = "final_autofeat_feature_names.txt"
with open(features_file, "w") as f:
    for col in full_data_autofeat.columns:
        f.write(f"{col}\n")

requirements_path = "/Users/sergejvaresko/iis_proj/requirements.txt"


mlflow.set_tracking_uri("sqlite:////Users/sergejvaresko/iis_proj/mlflow/mlflow.db")
mlflow.set_experiment("Final_Model_Training")

client = MlflowClient()
model_name = "BaselineRandomForest"

with mlflow.start_run(run_name="final_autofeat_xgboost_production"):
    mlflow.set_tag("stage", "production-ready")
    mlflow.set_tag("based_on", "autofeat_xgboost_v4")

    mlflow.log_artifact(features_file)
    mlflow.log_artifact(requirements_path)

    mlflow.sklearn.log_model(
        sk_model=final_model,
        artifact_path="model",
        input_example=input_example,
        signature=signature,
        registered_model_name=model_name
    )

# Обновляем stage на Production
latest_version = max([
    int(mv.version)
    for mv in client.search_model_versions(f"name='{model_name}'")
])

client.transition_model_version_stage(
    name=model_name,
    version=latest_version,
    stage="Production",
    archive_existing_versions=True
)


  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025/04/10 23:10:27 INFO mlflow.tracking.fluent: Experiment with name 'Final_Model_Training' does not exist. Creating a new experiment.
Registered model 'BaselineRandomForest' already exists. Creating a new version of this model...
Created version '5' of model 'BaselineRandomForest'.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 5573.30it/s] 
  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1744315829377, current_stage='Production', description=None, last_updated_timestamp=1744315829436, name='BaselineRandomForest', run_id='cab059e767df4e13a49baae66fcacdad', run_link=None, source='/Users/sergejvaresko/iis_proj/research/mlruns/9/cab059e767df4e13a49baae66fcacdad/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>