# Библиотеки

In [20]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import mlflow
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from sklearn.feature_selection import SequentialFeatureSelector 
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import make_scorer

# Загрузка и подготовка данных

In [2]:
with open("../data/clean_train_data.pkl", "rb") as f:
    df = pickle.load(f)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1935 entries, 1 to 1999
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   battery_power  1935 non-null   int64   
 1   blue           1935 non-null   category
 2   clock_speed    1935 non-null   float64 
 3   dual_sim       1935 non-null   category
 4   fc             1935 non-null   int64   
 5   four_g         1935 non-null   category
 6   int_memory     1935 non-null   int64   
 7   m_dep          1935 non-null   float64 
 8   mobile_wt      1935 non-null   int64   
 9   n_cores        1935 non-null   int64   
 10  pc             1935 non-null   int64   
 11  px_height      1935 non-null   int64   
 12  px_width       1935 non-null   int64   
 13  ram            1935 non-null   int64   
 14  sc_h           1935 non-null   int64   
 15  sc_w           1935 non-null   int64   
 16  talk_time      1935 non-null   int64   
 17  three_g        1935 non-null   categor

In [3]:
X = df.drop('price_range', axis=1)
y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75)

In [4]:
numeric_features = df.select_dtypes(exclude="category").columns.drop('price_range')
categorical_features = df.select_dtypes('category').columns

# Pipeline

In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numeric_features),
        ('categorical', TargetEncoder(), categorical_features)
])
pipeline_baseline = Pipeline([
    ('transform', preprocessor),
    ('classification', RandomForestClassifier())
])

# Baseline-модель 

In [6]:
estimator = pipeline_baseline.fit(X_train, y_train)
predictions = estimator.predict(X_test)
estimator

0,1,2
,steps,"[('transform', ...), ('classification', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Mlflow

In [7]:
def calc_metrics(y_test, predictions, average='weighted'):
    metrics = {}
    metrics["recall"] = recall_score(y_test, predictions, average=average)   
    metrics["precision"] = precision_score(y_test, predictions, average=average)
    metrics["f1"] = f1_score(y_test, predictions, average=average)
    return metrics

In [8]:
TARGET_HOST = "localhost"
TARGET_PORT = 5000
TRACKING_URI = f"http://{TARGET_HOST}:{TARGET_PORT}"
REGISTRY_URI = TRACKING_URI

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_registry_uri(REGISTRY_URI)

In [9]:
EXPERIMENT_NAME = 'MobilePrice Classification'
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

signature = mlflow.models.infer_signature(model_input=X_train.head(5))
input_example = X_train.head(5)
req_file = '../requirements.txt'
params_dict = pipeline_baseline.get_params()



In [10]:
# Когда создаем новый эксперимент, то: 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:
# experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_baseline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(calc_metrics(y_test, predictions))
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2025/10/18 18:15:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://localhost:5000/#/experiments/1/runs/0a6f555d8b704eed8511ade47a794b3f.
2025/10/18 18:15:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


# Генерация новых признаков 

In [11]:
def generate_features_polynomial(data, colname_template, transformer, init_features):
    new_data = transformer.fit_transform(data[init_features])
    new_features = [colname_template.format(i) for i in range(new_data.shape[1])]
    data[new_features] = new_data
    return data

In [None]:
X_train_fe_sklearn = X_train.copy()
X_test_fe_sklearn = X_test.copy()
poly_features_init = ['sc_h', 'sc_w']
kbins_features_init = ['battery_power', 'n_cores']


X_train_fe_sklearn = generate_features_polynomial(
    X_train_fe_sklearn, 'polynomial_{}', PolynomialFeatures(degree=2), poly_features_init
)

X_test_fe_sklearn = generate_features_polynomial(
    X_test_fe_sklearn, 'polynomial_{}', PolynomialFeatures(degree=2), poly_features_init
)

X_train_fe_sklearn[poly_features_init] = X_train_fe_sklearn[poly_features_init].astype('float128')
X_test_fe_sklearn[poly_features_init] = X_test_fe_sklearn[poly_features_init].astype('float128')



new_data = KBinsDiscretizer(n_bins=3).fit_transform(X_train_fe_sklearn[kbins_features_init])
X_train_fe_sklearn[[f'kbins_{i}' for i in range(new_data.shape[1])]] = new_data.todense()

new_data = KBinsDiscretizer(n_bins=3).fit_transform(X_test_fe_sklearn[kbins_features_init])
X_test_fe_sklearn[[f'kbins_{i}' for i in range(new_data.shape[1])]] = new_data.todense()

with open('../mlflow/new_feature_cols.txt', 'w') as f:
    print(*X_train_fe_sklearn.columns, sep=',', file=f)

pipeline_new_features = Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric',
            StandardScaler(),
            X_train_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('categorical', TargetEncoder(), categorical_features)])),
    ('classifier', RandomForestClassifier())
])
estimator = pipeline_new_features.fit(X_train_fe_sklearn, y_train)
display(estimator)



0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
predictions = estimator.predict(X_test_fe_sklearn)

In [17]:
RUN_NAME = "new_features"
input_example = X_train_fe_sklearn.head(5)
signature = mlflow.models.infer_signature(model_input=input_example)

params_dict = pipeline_new_features.get_params()

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_new_features, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(calc_metrics(y_test, predictions))
    mlflow.log_artifact('../mlflow/new_feature_cols.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2025/10/18 18:19:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run new_features at: http://localhost:5000/#/experiments/1/runs/881732228b30411295da48662900b272.
2025/10/18 18:19:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


# Отбор наиболее важных признаков

In [18]:
classifier_main_features = RandomForestClassifier()
selector = SequentialFeatureSelector(
    classifier_main_features, n_features_to_select=5, direction='forward'
)

selector.fit(X_train_fe_sklearn, y_train)

idx = selector.get_support(indices=True)
with open('../mlflow/main_features.txt', 'w') as f:
    print(*idx, sep=',', file=f)
    print(*X_train_fe_sklearn.columns[idx], sep=',', file=f)

print('main features:', *X_train_fe_sklearn.columns[idx])

pipeline_main_features = Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric', StandardScaler(),
            X_train_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('categorical', TargetEncoder(), categorical_features)])),
    ('selection', selector),
    ('classifier', classifier_main_features)
])
estimator = pipeline_main_features.fit(X_train_fe_sklearn, y_train)
display(estimator)
predictions = estimator.predict(X_test_fe_sklearn)



main features: battery_power px_width ram sc_w total_pixels


0,1,2
,steps,"[('preprocessor', ...), ('selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,estimator,RandomForestClassifier()
,n_features_to_select,5
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
RUN_NAME = "main_features"
input_example = X_train_fe_sklearn.iloc[:, idx].head(5)
signature = mlflow.models.infer_signature(model_input=input_example)

params_dict = pipeline_main_features.get_params()

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_main_features, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(calc_metrics(y_test, predictions))
    mlflow.log_artifact('../mlflow/main_features.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
  "dataframe_split": {
    "columns": [
      "battery_power",
      "px_width",
      "ram",
      "sc_w",
      "total_pixels"
    ],
    "data": [
      [
        1113,
        1312,
        277,
        2,
        1373664
      ],
      [
        1986,
        599,
        3476,
        8,
        150349
      ],
      [
        1940,
        858,
        2297,
        6,
        96096
      ],
      [
        840,
        1081,
        3486,
        8,
        1066947
      ],
      [
       

# Настройка параметров для лучшей модели

In [25]:
param_grid = {
    'classifier__n_estimators': [50, 100], #, 200, 300], 
    'classifier__max_depth': [None, 10, 15, 20, 25, 30],
    'classifier__max_features': [i/10 for i in range(1,10)],
}

gs = GridSearchCV(
    pipeline_main_features, 
    param_grid, 
    cv=3,
    scoring=make_scorer(f1_score, average='binary')
)

gs.fit(X_train_fe_sklearn, y_train)
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение f1-score:", gs.best_score_)
print("Лучшая модель:", gs.best_estimator_)

Traceback (most recent call last):
  File "/home/vanina/Documents/study/IIS/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vanina/Documents/study/IIS/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 308, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vanina/Documents/study/IIS/.venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 408, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vanina/Documents/study/IIS/.venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 218, in wrap

Лучшие гиперпараметры: {'classifier__max_depth': None, 'classifier__max_features': 0.1, 'classifier__n_estimators': 50}
Лучшее значение f1-score: nan
Лучшая модель: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep',
       'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h',
       'sc_w', 'talk_time', 'total_pixels', 'screen_size', 'polynomial_0',
       'polynomial_1', 'polynomial_2', 'polynomial_3', 'polynomial_4',
       'polynomi...', 'kbins_2', 'kbins_3', 'kbins_4',
       'kbins_5'],
      dtype='object')),
                                                 ('categorical',
                                                  TargetEncoder(),
                                                  Index(['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi'], dtype='object'))])),
        

In [26]:
print("Лучшие гиперпараметры:", gs.best_params_)
print("Лучшее значение f1-score:", gs.best_score_)
print("Лучшая модель:", gs.best_estimator_)

Лучшие гиперпараметры: {'classifier__max_depth': None, 'classifier__max_features': 0.1, 'classifier__n_estimators': 50}
Лучшее значение f1-score: nan
Лучшая модель: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep',
       'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h',
       'sc_w', 'talk_time', 'total_pixels', 'screen_size', 'polynomial_0',
       'polynomial_1', 'polynomial_2', 'polynomial_3', 'polynomial_4',
       'polynomi...', 'kbins_2', 'kbins_3', 'kbins_4',
       'kbins_5'],
      dtype='object')),
                                                 ('categorical',
                                                  TargetEncoder(),
                                                  Index(['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi'], dtype='object'))])),
        

In [27]:
classifier_optimized = RandomForestClassifier(max_depth=None, max_features=0.1, n_estimators=50)

pipeline_optimized = Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric', StandardScaler(),
            X_train_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('categorical', TargetEncoder(), categorical_features)])),
    ('selection', selector),
    ('classifier', classifier_optimized)
])
estimator = pipeline_optimized.fit(X_train_fe_sklearn, y_train)
display(estimator)
predictions = estimator.predict(X_test_fe_sklearn)

0,1,2
,steps,"[('preprocessor', ...), ('selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,estimator,RandomForestClassifier()
,n_features_to_select,5
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.1
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
RUN_NAME = "best hyperparams"
input_example = X_train_fe_sklearn.head(5)
signature = mlflow.models.infer_signature(model_input=input_example)

params_dict = pipeline_main_features.get_params()

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_main_features, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(calc_metrics(y_test, predictions))
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
  "dataframe_split": {
    "columns": [
      "battery_power",
      "px_width",
      "ram",
      "sc_w",
      "total_pixels"
    ],
    "data": [
      [
        1113,
        1312,
        277,
        2,
        1373664
      ],
      [
        1986,
        599,
        3476,
        8,
        150349
      ],
      [
        1940,
        858,
        2297,
        6,
        96096
      ],
      [
        840,
        1081,
        3486,
        8,
        1066947
      ],
      [
       

# Обучение лучшей модели на всей выборке

In [29]:
X_fe_sklearn = pd.concat([X_train_fe_sklearn, X_test_fe_sklearn])
y = pd.concat([y_train, y_test])

In [30]:
pipeline_all_data= Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric', StandardScaler(),
            X_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('categorical', TargetEncoder(), categorical_features)])),
    ('selection', selector),
    ('classifier', classifier_main_features)
])

estimator = pipeline_main_features.fit(X_fe_sklearn, y)
display(estimator)


0,1,2
,steps,"[('preprocessor', ...), ('selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,estimator,RandomForestClassifier()
,n_features_to_select,5
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
RUN_NAME = "all_dataset"
input_example = X_fe_sklearn.iloc[:, idx].head(5)
signature = mlflow.models.infer_signature(model_input=input_example)

params_dict = pipeline_all_data.get_params()

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline_all_data, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_artifact('../mlflow/main_features.txt')
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - scikit-learn-1.7.2 (current: uninstalled, required: scikit-learn-1.7.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
  "dataframe_split": {
    "columns": [
      "battery_power",
      "px_width",
      "ram",
      "sc_w",
      "total_pixels"
    ],
    "data": [
      [
        1113,
        1312,
        277,
        2,
        1373664
      ],
      [
        1986,
        599,
        3476,
        8,
        150349
      ],
      [
        1940,
        858,
        2297,
        6,
        96096
      ],
      [
        840,
        1081,
        3486,
        8,
        1066947
      ],
      [
       