In [77]:
import os
import mlflow


In [78]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [79]:
df = pd.read_pickle('../data/clean_data.pkl')

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Car_Name       301 non-null    category
 1   Year           301 non-null    category
 2   Selling_Price  301 non-null    float32 
 3   Present_Price  301 non-null    float32 
 4   Driven_kms     301 non-null    int32   
 5   Fuel_Type      301 non-null    category
 6   Selling_type   301 non-null    category
 7   Transmission   301 non-null    category
 8   Owner          301 non-null    int8    
dtypes: category(5), float32(2), int32(1), int8(1)
memory usage: 6.4 KB


In [81]:
df = df.rename(columns={'Selling_Price': 'target'})
df

Unnamed: 0,Car_Name,Year,target,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0
299,city,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0


In [82]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)

In [83]:
cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()
cat_features

['Car_Name', 'Year', 'Fuel_Type', 'Selling_type', 'Transmission']

In [84]:
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features

['Present_Price', 'Driven_kms', 'Owner']

In [85]:
s_scaler = StandardScaler()
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом
regressor = RandomForestRegressor()

In [86]:
# Для удобной работы со столбцами
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков
    ],
    remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования

In [87]:

pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor)])

pipeline.fit(X_train, y_train)

In [88]:
predictions = pipeline.predict(X_test) 

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': np.float64(0.6429434291812542),
 'mape': np.float64(0.37846500354290635),
 'mse': np.float64(1.8206972689037493)}

In [89]:

# Работаем с MLflow локально
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5002

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri) 

In [90]:
# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

In [91]:
# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)



In [92]:
# Будем логировать requirements и артефакт - текстовый файл
req_file = '../requirements.txt'
art = '../comment.txt'

In [93]:
# Параметры, котороые будут залогированы, можем задавать вручную или полностью взять из модели
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()

In [94]:
# Когда создаем новый эксперимент, то: 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:
#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/10/22 16:50:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5002/#/experiments/1/runs/b7633adfe85c4c52a89a466d44d78b80.
2024/10/22 16:50:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/1.


In [98]:
from mlflow.models import infer_signature
input_example = X_train.head(5)
signature =  infer_signature(model_input = X_train.head(5))

In [100]:
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)

In [101]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)

In [102]:
predictions = pipeline.predict(X_test) 
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': np.float64(0.6840565249536025),
 'mape': np.float64(0.3810990417583104),
 'mse': np.float64(2.304099009471227)}

In [103]:
# !!! Проверить название прогона а также все логируемые параметры и артефакты, что они соответствуют второй "маленькой" модели. 


RUN_NAME = 'smaller_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/10/22 17:09:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5002/#/experiments/1/runs/d822e98603854cc495f1dd73ea407f50.
2024/10/22 17:09:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/1.


**Feature engineerig**

In [105]:

from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler

In [106]:
X_train_sklearn = X_train.copy()

In [107]:
pf = PolynomialFeatures(degree=2)

In [108]:
X_train_sklearn

Unnamed: 0,Car_Name,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
155,Honda Activa 4G,2017,0.510000,4300,Petrol,Individual,Automatic,0
104,Royal Enfield Classic 350,2017,1.470000,4100,Petrol,Individual,Manual,0
285,jazz,2016,8.500000,15059,Petrol,Dealer,Automatic,0
219,verna,2012,9.400000,36000,Petrol,Dealer,Manual,0
291,brio,2015,6.100000,31427,Petrol,Dealer,Manual,0
...,...,...,...,...,...,...,...,...
75,etios g,2015,6.800000,36000,Petrol,Dealer,Manual,0
22,sx4,2011,8.010000,50000,Petrol,Dealer,Automatic,0
72,corolla altis,2013,18.610001,56001,Petrol,Dealer,Manual,0
15,ertiga,2016,10.790000,43000,Diesel,Dealer,Manual,0


In [109]:
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
    pipeline.fit(X_train, y_train)

 - mlflow (current: 2.16.0, required: mlflow==2.17.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:18:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5002/#/experiments/1/runs/1a4c37b3822f43e39df04702653eb403.
2024/10/22 17:18:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/1.


In [110]:
pf.fit_transform(X_train_sklearn[['Present_Price','Driven_kms']])

array([[1.00000000e+00, 5.09999990e-01, 4.30000000e+03, 2.60099990e-01,
        2.19299996e+03, 1.84900000e+07],
       [1.00000000e+00, 1.47000003e+00, 4.10000000e+03, 2.16090008e+00,
        6.02700012e+03, 1.68100000e+07],
       [1.00000000e+00, 8.50000000e+00, 1.50590000e+04, 7.22500000e+01,
        1.28001500e+05, 2.26773481e+08],
       ...,
       [1.00000000e+00, 1.86100006e+01, 5.60010000e+04, 3.46332123e+02,
        1.04217864e+06, 3.13611200e+09],
       [1.00000000e+00, 1.07900000e+01, 4.30000000e+04, 1.16424099e+02,
        4.63969998e+05, 1.84900000e+09],
       [1.00000000e+00, 7.30000019e-01, 1.20000000e+04, 5.32900028e-01,
        8.76000023e+03, 1.44000000e+08]])

In [113]:
sp = SplineTransformer(n_knots=3, degree=3)
sp.fit_transform(X_train_sklearn[['Driven_kms']])

array([[1.59174223e-01, 6.66436925e-01, 1.74388264e-01, 5.87060755e-07,
        0.00000000e+00],
       [1.59562848e-01, 6.66460389e-01, 1.73976264e-01, 4.99159983e-07,
        0.00000000e+00],
       [1.39185616e-01, 6.63367490e-01, 1.97413878e-01, 3.30161856e-05,
        0.00000000e+00],
       ...,
       [7.84167130e-02, 6.22769436e-01, 2.96984763e-01, 1.82908835e-03,
        0.00000000e+00],
       [9.52392302e-02, 6.40172664e-01, 2.63766811e-01, 8.21294755e-04,
        0.00000000e+00],
       [1.44687491e-01, 6.64595243e-01, 1.90700995e-01, 1.62714322e-05,
        0.00000000e+00]])

In [114]:
qt = QuantileTransformer()
qt.fit_transform(X_train_sklearn[['Driven_kms']])



array([[0.06696429],
       [0.0625    ],
       [0.25892857],
       [0.55803571],
       [0.46205357],
       [0.99107143],
       [0.83482143],
       [0.21428571],
       [0.52232143],
       [0.71428571],
       [0.39732143],
       [0.70089286],
       [0.57142857],
       [0.41294643],
       [0.71428571],
       [0.63392857],
       [0.57589286],
       [0.17857143],
       [0.11383929],
       [0.9375    ],
       [0.42857143],
       [0.87946429],
       [0.53571429],
       [0.98660714],
       [0.15625   ],
       [0.1875    ],
       [0.3125    ],
       [0.77008929],
       [0.00892857],
       [0.54910714],
       [0.79464286],
       [0.36607143],
       [0.47544643],
       [0.20089286],
       [0.09375   ],
       [0.125     ],
       [0.77008929],
       [0.29910714],
       [0.45535714],
       [0.23883929],
       [0.61383929],
       [0.01785714],
       [0.44866071],
       [0.10267857],
       [0.02678571],
       [0.04464286],
       [0.13392857],
       [0.968

In [115]:
pf = PolynomialFeatures(degree=2)
qt = QuantileTransformer()
sp = SplineTransformer(n_knots=3, degree=3)

In [116]:
# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

In [126]:
preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков
        ('quantile', qt,num_features),
        ('poly', pf_pipeline, ['Present_Price','Driven_kms']), # В преобразования добавляем созданный ранее pipeline
        ('spline', sp, ['Driven_kms'])
    ],
    remainder='drop',
    ) # Удаляем столбцы, которые не затронуты преобразования

In [127]:
## не влезаем в float64 в полиномальном преобразовании. Использовать его нужно с умом!
X_train_sklearn[['Present_Price','Driven_kms']] = X_train_sklearn[['Present_Price','Driven_kms']].astype('float128')
X_train_sklearn[['Present_Price','Driven_kms']] = X_train_sklearn[['Present_Price','Driven_kms']].astype('float128')

In [128]:
X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)
X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())

2024/10/22 17:30:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '93ca800af4a64249bbcd0a8db8b9b2c7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
 - mlflow (current: 2.16.0, required: mlflow==2.17.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:30:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run bustling-rat-201 at: http://127.0.0.1:5002/#/experiments/0/runs/93ca800af4a64249bbcd0a8db8b9b2c7.
2024/10/22 17:30:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/0.


In [129]:
# Удобно использовать для отображения всех строк\столбцов в DataFrame
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_sklearn)

Unnamed: 0,num__Present_Price,num__Driven_kms,num__Owner,cat__Car_Name,cat__Year,cat__Fuel_Type,cat__Selling_type,cat__Transmission,quantile__Present_Price,quantile__Driven_kms,quantile__Owner,poly__1,poly__Present_Price,poly__Driven_kms,poly__Present_Price^2,poly__Present_Price Driven_kms,poly__Driven_kms^2,spline__Driven_kms_sp_0,spline__Driven_kms_sp_1,spline__Driven_kms_sp_2,spline__Driven_kms_sp_3,spline__Driven_kms_sp_4
0,-0.812979,-0.797744,-0.180741,22.0,12.0,2.0,1.0,0.0,0.017857,0.066964,0.0,0.0,-0.812979,-0.797744,-0.246635,-0.526441,-0.186065,0.159174,0.666437,0.174388,5.870608e-07,0.0
1,-0.709078,-0.802502,-0.180741,34.0,12.0,2.0,1.0,1.0,0.234375,0.062500,0.0,0.0,-0.709078,-0.802502,-0.243498,-0.520862,-0.186163,0.159563,0.666460,0.173976,4.991600e-07,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,0.299637,0.122902,-0.180741,58.0,11.0,1.0,0.0,1.0,0.785714,0.680804,0.0,0.0,0.299637,0.122902,-0.054935,0.145502,-0.078957,0.095239,0.640173,0.263767,8.212948e-04,0.0
224,-0.789169,-0.614567,-0.180741,25.0,8.0,2.0,1.0,1.0,0.098214,0.200893,0.0,0.0,-0.789169,-0.614567,-0.246184,-0.516885,-0.178721,0.144687,0.664595,0.190701,1.627143e-05,0.0


In [130]:
pipeline_sklearn = Pipeline(steps=[
    ('transform', preprocessor_sklearn),
    ('model', regressor)
])

model_sklearn = pipeline_sklearn.fit(X_train, y_train)

2024/10/22 17:34:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5b5f180e2e67486684f7e5629ceb484e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
 - mlflow (current: 2.16.0, required: mlflow==2.17.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:34:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run incongruous-shad-225 at: http://127.0.0.1:5002/#/experiments/0/runs/5b5f180e2e67486684f7e5629ceb484e.
2024/10/22 17:34:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/0.


In [131]:
model_sklearn

In [151]:
predictions = model_sklearn.predict(X_test) 
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics



{'mae': np.float64(0.6116434278848925),
 'mape': np.float64(0.31559991304928764),
 'mse': np.float64(1.776924905435717)}

In [152]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'fe_sklearn1'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(model_sklearn, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(model_sklearn.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

2024/10/22 18:30:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run fe_sklearn1 at: http://127.0.0.1:5002/#/experiments/1/runs/e1b9d6dc5df4472abf4af769ef33f6d9.
2024/10/22 18:30:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5002/#/experiments/1.


**mlextend**

In [146]:
from mlxtend.feature_selection import SequentialFeatureSelector 
#from sklearn.feature_selection import SequentialFeatureSelector

In [148]:
sfs = SequentialFeatureSelector(RandomForestRegressor(n_estimators=3), 
                                k_features=3,
                                forward=True,
                                floating=False, # True to drop selected features
                                scoring='neg_mean_absolute_error',
                                cv=2)

sfs.fit(X_train_sklearn,y_train)



AttributeError: `np.NINF` was removed in the NumPy 2.0 release. Use `-np.inf` instead.