# Проведение экспериментов по настройке модели
## Загрузка модулей

In [41]:
import numpy as np
import pandas as pd
import os
import sys
import pickle

## Загрузка данных

In [43]:
with open("../data/clean_data.pkl", "rb") as f:
    data = pickle.load(f)

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1935 entries, 1 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   battery_power  1935 non-null   int64   
 1   blue           1935 non-null   category
 2   clock_speed    1935 non-null   float64 
 3   dual_sim       1935 non-null   category
 4   fc             1935 non-null   int64   
 5   four_g         1935 non-null   category
 6   int_memory     1935 non-null   int64   
 7   m_dep          1935 non-null   float64 
 8   mobile_wt      1935 non-null   int64   
 9   n_cores        1935 non-null   int64   
 10  pc             1935 non-null   int64   
 11  px_height      1935 non-null   int64   
 12  px_width       1935 non-null   int64   
 13  ram            1935 non-null   int64   
 14  sc_h           1935 non-null   int64   
 15  sc_w           1935 non-null   int64   
 16  talk_time      1935 non-null   int64   
 17  three_g        1935 non-null   categor

In [None]:
numerical_features = data.select_dtypes(exclude="category").columns.drop('price_range')
nominal_features = data.select_dtypes('category').columns
target = 'price_range'

In [None]:
numerical_features, nominal_features

(Index(['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep',
        'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h',
        'sc_w', 'talk_time'],
       dtype='object'),
 Index(['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi'], dtype='object'))

## Pipeline
### Подготовка

In [47]:
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from IPython.display import display


In [48]:
X = data.drop(target, axis=1)
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numerical_features),
        ('nominal', TargetEncoder(), nominal_features)
])
pipeline = Pipeline([
    ('transform', preprocessor),
    ('classification', RandomForestClassifier())
])

### baseline

In [68]:
estimator = pipeline.fit(X_train, y_train)
predictions = estimator.predict(X_test)
estimator

0,1,2
,steps,"[('transform', ...), ('classification', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('nominal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### mlflow

In [51]:
import mlflow
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

In [61]:
def calc_metrics(y_test, predictions, average='weighted'):
    metrics = {}
    metrics["recall"] = recall_score(y_test, predictions, average=average)   
    metrics["precision"] = precision_score(y_test, predictions, average=average)
    metrics["f1"] = f1_score(y_test, predictions, average=average)
    return metrics

In [None]:
TARGET_HOST = "localhost"
TARGET_PORT = 5000
TRACKING_URI = f"http://{TARGET_HOST}:{TARGET_PORT}"
REGISTRY_URI = TRACKING_URI
EXPERIMENT_NAME = 'Mobile Price Classification'
SIGNATURE = mlflow.models.infer_signature(model_input=X_train.head(5))
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_registry_uri(REGISTRY_URI)



In [66]:
def log_mlflow_model(run_name, estimator, metrics, signature=SIGNATURE, artifacts=None):
    if (exp := mlflow.get_experiment_by_name(EXPERIMENT_NAME)) is not None:
        experiment_id = exp.experiment_id
    else:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id) as run:
        run_id = run.info.run_id 
        # mlflow.log_artifact('../requirements.txt')
        if artifacts is not None:
            for a in artifacts:
                mlflow.log_artifact(a)
        mlflow.sklearn.log_model(estimator, artifact_path="models", signature=signature)
        mlflow.log_metrics(metrics)
    
    run = mlflow.get_run(run_id)
    return run.info.status =='FINISHED'

In [69]:
ok = log_mlflow_model(
    'baseline', estimator, metrics=calc_metrics(y_test, predictions))
assert ok

2025/09/28 12:36:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline at: http://localhost:5000/#/experiments/1/runs/3e4cb7d811c14bfdb38d55f3b644c9e6.
2025/09/28 12:36:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


## Генерация новых признаков

In [55]:
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer

In [87]:
def update_data(data, colname_template, transformer, init_features):
    new_data = transformer.fit_transform(data[init_features])
    new_features = [colname_template.format(i) for i in range(new_data.shape[1])]
    data[new_features] = new_data
    return data

In [None]:
X_fe_sklearn = X.copy()
poly_features_init = ['sc_h', 'sc_w']
kbins_features_init = ['battery_power', 'n_cores']
numerical_features_remains = numerical_features.drop(poly_features_init + kbins_features_init)

X_fe_sklearn = update_data(
    X_fe_sklearn, 'poly_{}', PolynomialFeatures(degree=2), poly_features_init
)

new_data = KBinsDiscretizer(n_bins=3).fit_transform(X_fe_sklearn[kbins_features_init])
X_fe_sklearn[[f'kbins_{i}' for i in range(new_data.shape[1])]] = new_data.todense()

with open('../mlflow/new_feature_cols.txt', 'w') as f:
    print(*X_fe_sklearn.columns, sep=',', file=f)

X_train_fe_sklearn, X_test_fe_sklearn, y_train, y_test = train_test_split(X_fe_sklearn, y, train_size=.75)

pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric',
            StandardScaler(),
            X_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('nominal', TargetEncoder(), nominal_features)])),
    ('classifier', RandomForestClassifier())
])
estimator = pipeline.fit(X_train_fe_sklearn, y_train)
display(estimator)
predictions = estimator.predict(X_test_fe_sklearn)

log_mlflow_model(
    'new features', estimator,
    metrics=calc_metrics(y_test, predictions),
    signature=mlflow.models.infer_signature(model_input=X_train_fe_sklearn.head(5)),
    artifacts=['../mlflow/new_feature_cols.txt']
)




0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('nominal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


2025/09/28 13:33:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run new features at: http://localhost:5000/#/experiments/1/runs/b625eef14f7042edbfde9fce67873274.
2025/09/28 13:33:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


True

## Выбор наиболее значимых признаков

In [104]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
classifier = RandomForestClassifier()
selector = SequentialFeatureSelector(
    classifier, n_features_to_select=10, direction='forward'
)
selector.fit(X_train_fe_sklearn, y_train)

idx = selector.get_support(indices=True)
with open('../mlflow/selected_features.txt', 'w') as f:
    print(*idx, sep=',', file=f)
    print(*X_fe_sklearn.columns[idx], sep=',', file=f)

print('selected features:', *X_fe_sklearn.columns[idx])

pipeline = Pipeline([
    ('preprocessor', ColumnTransformer([
        (
            'numeric', StandardScaler(),
            X_fe_sklearn.select_dtypes(exclude='category').columns
        ),
        ('nominal', TargetEncoder(), nominal_features)])),
    ('selection', selector),
    ('classifier', classifier)
])
estimator = pipeline.fit(X_train_fe_sklearn, y_train)
display(estimator)
predictions = estimator.predict(X_test_fe_sklearn)

log_mlflow_model(
    '10 selected features', estimator,
    metrics=calc_metrics(y_test, predictions),
    signature=mlflow.models.infer_signature(model_input=X_train_fe_sklearn.iloc[:, idx].head(5)),
    artifacts=['../mlflow/selected_features.txt']
)

selected features: battery_power four_g mobile_wt px_height px_width ram touch_screen poly_0 kbins_0 kbins_1


0,1,2
,steps,"[('preprocessor', ...), ('selection', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('nominal', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,target_type,'auto'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,

0,1,2
,estimator,RandomForestClassifier()
,n_features_to_select,10
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


2025/09/28 14:23:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run 10 selected features at: http://localhost:5000/#/experiments/1/runs/8d782dde118548259da1db70c5a3fa69.
2025/09/28 14:23:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


True

## Подброр гиперпараметров

In [111]:
from sklearn.model_selection import ParameterGrid

In [120]:
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [3, 5, 10],
    'classifier__max_features': [.1, .4, .7]
}
grid = ParameterGrid(param_grid)

for i, params in enumerate(grid):
    name = f'trial_{i}'
    print(f'{i}. current {params=}', ' ' * 10, end='\r')
    pipeline.set_params(**params)
    artifact = os.path.join('../mlflow/', f'{name}.txt')
    with open(artifact, 'w') as f:
        print(*params.items(), sep='\n', file=f)

    estimator = pipeline.fit(X_train_fe_sklearn, y_train)
    predictions = estimator.predict(X_test_fe_sklearn)
    log_mlflow_model(
        name, estimator, calc_metrics(y_test, predictions), 
        signature=mlflow.models.infer_signature(model_input=X_test_fe_sklearn.head(5)),
        artifacts=[artifact]
    )


0. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.1, 'classifier__n_estimators': 10}           

2025/09/28 15:20:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_0 at: http://localhost:5000/#/experiments/1/runs/ca8ffe3a55dd42a7ad8af7b27ac7dbff.
2025/09/28 15:20:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


1. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.1, 'classifier__n_estimators': 50}           

2025/09/28 15:23:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_1 at: http://localhost:5000/#/experiments/1/runs/81c3489b3dc74dc18a27e309547a6d90.
2025/09/28 15:23:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


2. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.1, 'classifier__n_estimators': 100}           

2025/09/28 15:31:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_2 at: http://localhost:5000/#/experiments/1/runs/d1ee87e13dce4b67963b23a3c9a6c8fc.
2025/09/28 15:31:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


3. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.4, 'classifier__n_estimators': 10}           

2025/09/28 15:32:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_3 at: http://localhost:5000/#/experiments/1/runs/bda78d2c606643be8015a5d39620f0aa.
2025/09/28 15:32:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


4. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.4, 'classifier__n_estimators': 50}           

2025/09/28 15:36:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_4 at: http://localhost:5000/#/experiments/1/runs/1ee9ecf2f8594077bd910f1bb4c7d80b.
2025/09/28 15:36:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


5. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.4, 'classifier__n_estimators': 100}           

2025/09/28 15:52:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_5 at: http://localhost:5000/#/experiments/1/runs/499a46198aa140c1b68a41764fa2f64b.
2025/09/28 15:52:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


6. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.7, 'classifier__n_estimators': 10}           

2025/09/28 15:53:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_6 at: http://localhost:5000/#/experiments/1/runs/31254edddaaa4fa08f157ed552aab446.
2025/09/28 15:53:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


7. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.7, 'classifier__n_estimators': 50}           

2025/09/28 15:57:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_7 at: http://localhost:5000/#/experiments/1/runs/6337a0c362d145a2a7e5a95e1a338513.
2025/09/28 15:57:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


8. current params={'classifier__max_depth': 3, 'classifier__max_features': 0.7, 'classifier__n_estimators': 100}           

2025/09/28 17:14:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_8 at: http://localhost:5000/#/experiments/1/runs/11120cb3c0f047bd8c2d00c15d461706.
2025/09/28 17:14:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


9. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.1, 'classifier__n_estimators': 10}           

2025/09/28 17:15:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_9 at: http://localhost:5000/#/experiments/1/runs/bae53a10a75a4fb7a69df1ae431e0061.
2025/09/28 17:15:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


10. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.1, 'classifier__n_estimators': 50}           

2025/09/28 17:19:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_10 at: http://localhost:5000/#/experiments/1/runs/a1695f5fd3cf4b2f926e508410a7dbb9.
2025/09/28 17:19:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


11. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.1, 'classifier__n_estimators': 100}           

2025/09/28 17:27:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_11 at: http://localhost:5000/#/experiments/1/runs/9b18e3ec562b4224b109fe4a143d9d7a.
2025/09/28 17:27:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


12. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.4, 'classifier__n_estimators': 10}           

2025/09/28 17:28:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_12 at: http://localhost:5000/#/experiments/1/runs/4019a379e6fd40a8a4f3eb177d7d724a.
2025/09/28 17:28:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


13. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.4, 'classifier__n_estimators': 50}           

2025/09/28 17:32:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_13 at: http://localhost:5000/#/experiments/1/runs/2c0536aad2904910a277a9000b9545c6.
2025/09/28 17:32:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


14. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.4, 'classifier__n_estimators': 100}           

2025/09/28 17:41:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_14 at: http://localhost:5000/#/experiments/1/runs/753a08afa2274855b7c5c86f33fbd461.
2025/09/28 17:41:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


15. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.7, 'classifier__n_estimators': 10}           

2025/09/28 17:42:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_15 at: http://localhost:5000/#/experiments/1/runs/b38e481ad2eb43e9a95e75c2752c4f23.
2025/09/28 17:42:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


16. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.7, 'classifier__n_estimators': 50}           

2025/09/28 17:47:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_16 at: http://localhost:5000/#/experiments/1/runs/0f0ec03c6c974a38a71ad32d15c82a36.
2025/09/28 17:47:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


17. current params={'classifier__max_depth': 5, 'classifier__max_features': 0.7, 'classifier__n_estimators': 100}           

2025/09/28 17:58:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_17 at: http://localhost:5000/#/experiments/1/runs/a2a0fb9d5bd44e698efe846564438a32.
2025/09/28 17:58:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


18. current params={'classifier__max_depth': 10, 'classifier__max_features': 0.1, 'classifier__n_estimators': 10}           

2025/09/28 18:00:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_18 at: http://localhost:5000/#/experiments/1/runs/e07942ed12274713bc563b59e41a7304.
2025/09/28 18:00:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


19. current params={'classifier__max_depth': 10, 'classifier__max_features': 0.1, 'classifier__n_estimators': 50}           

2025/09/28 18:04:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run trial_19 at: http://localhost:5000/#/experiments/1/runs/b60196bec9ff445b8b1095d6710887da.
2025/09/28 18:04:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


20. current params={'classifier__max_depth': 10, 'classifier__max_features': 0.1, 'classifier__n_estimators': 100}           

KeyboardInterrupt: 