In [163]:
import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [164]:
df = pd.read_pickle("../data/car_data_edited.pkl")

In [165]:
df.head(5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,mileage_level
0,ritz,2014,3.349609,5.589844,27000,Petrol,Dealer,Manual,0,mid
1,sx4,2013,4.75,9.539062,43000,Diesel,Dealer,Manual,0,mid
2,ciaz,2017,7.25,9.851562,6900,Petrol,Dealer,Manual,0,low
3,wagon r,2011,2.849609,4.148438,5200,Petrol,Dealer,Manual,0,low
4,swift,2014,4.601562,6.871094,42450,Diesel,Dealer,Manual,0,mid


In [166]:
df = df.rename(columns={'Selling_Price': 'target'})

In [167]:
df.head(5)

Unnamed: 0,Car_Name,Year,target,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,mileage_level
0,ritz,2014,3.349609,5.589844,27000,Petrol,Dealer,Manual,0,mid
1,sx4,2013,4.75,9.539062,43000,Diesel,Dealer,Manual,0,mid
2,ciaz,2017,7.25,9.851562,6900,Petrol,Dealer,Manual,0,low
3,wagon r,2011,2.849609,4.148438,5200,Petrol,Dealer,Manual,0,low
4,swift,2014,4.601562,6.871094,42450,Diesel,Dealer,Manual,0,mid


In [168]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=7)

In [169]:
cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()
cat_features

['Car_Name',
 'Year',
 'Fuel_Type',
 'Selling_type',
 'Transmission',
 'mileage_level']

In [170]:
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features

['Present_Price', 'Driven_kms', 'Owner']

In [171]:
s_scaler = StandardScaler()
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом
regressor = RandomForestRegressor()

In [172]:
# Для удобной работы со столбцами
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков
    ],
    remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования

In [173]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor)])

pipeline.fit(X_train, y_train)

In [174]:
predictions = pipeline.predict(X_test) 

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': 0.7476821126302081,
 'mape': 0.17738610452551604,
 'mse': 2.003424498746852}

In [175]:
# Работаем с MLflow локально
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5001

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)   
mlflow.set_registry_uri(registry_uri)   

In [176]:
# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

In [177]:
# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их
from mlflow.models import infer_signature

signature =  infer_signature(model_input = X_train.head(5))
input_example = X_train.head(5)



In [178]:
# Будем логировать requirements и артефакт - текстовый файл
req_file = '../requirements.txt'
art = '../comment.txt'

In [179]:
# Параметры, котороые будут залогированы, можем задавать вручную или полностью взять из модели
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()

In [180]:
# Когда создаем новый эксперимент, то: 
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:
#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')

 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:53:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5001/#/experiments/1/runs/8b35461d554746f9bfc635304f780f56.
2024/10/22 17:53:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1.


In [181]:
# mlflow.sklearn.autolog()

# with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
#     pipeline.fit(X_train, y_train)

In [182]:
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)

In [183]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)

In [184]:
predictions = pipeline.predict(X_test) 
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': 0.8910489294443952,
 'mape': 0.24156829746260977,
 'mse': 3.050278255628128}

In [185]:
RUN_NAME = 'smaller_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')


 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:53:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5001/#/experiments/1/runs/acdbc71a6aac4d059bf3aebe67861957.
2024/10/22 17:53:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1.


In [186]:
regressor3 = CatBoostRegressor()

In [187]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor3)])

pipeline.fit(X_train, y_train)

Learning rate set to 0.032324
0:	learn: 4.7153467	total: 350us	remaining: 350ms
1:	learn: 4.6370845	total: 650us	remaining: 324ms
2:	learn: 4.5568435	total: 890us	remaining: 296ms
3:	learn: 4.4847538	total: 1.06ms	remaining: 264ms
4:	learn: 4.4233423	total: 1.33ms	remaining: 264ms
5:	learn: 4.3465459	total: 1.5ms	remaining: 249ms
6:	learn: 4.2789600	total: 1.77ms	remaining: 251ms
7:	learn: 4.2092038	total: 2.07ms	remaining: 257ms
8:	learn: 4.1522367	total: 2.31ms	remaining: 254ms
9:	learn: 4.0893077	total: 2.58ms	remaining: 255ms
10:	learn: 4.0285265	total: 2.86ms	remaining: 258ms
11:	learn: 3.9669526	total: 3.13ms	remaining: 258ms
12:	learn: 3.9087721	total: 3.4ms	remaining: 258ms
13:	learn: 3.8557176	total: 3.67ms	remaining: 258ms
14:	learn: 3.8029993	total: 3.92ms	remaining: 258ms
15:	learn: 3.7429440	total: 4.28ms	remaining: 263ms
16:	learn: 3.6791326	total: 4.53ms	remaining: 262ms
17:	learn: 3.6310104	total: 4.76ms	remaining: 260ms
18:	learn: 3.5737747	total: 5.03ms	remaining: 260

In [188]:
predictions = pipeline.predict(X_test) 
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)   
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics

{'mae': 0.9270591269280699,
 'mape': 0.5746140589734728,
 'mse': 2.14151799141817}

In [189]:
RUN_NAME = 'catboost_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id 
    mlflow.sklearn.log_model(pipeline, 
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id) 
assert (run.info.status =='FINISHED')


 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:53:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run catboost_model at: http://127.0.0.1:5001/#/experiments/1/runs/13c9290b4f19431389577e8a3d2c6d21.
2024/10/22 17:53:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1.


In [190]:
class MlExperiment():
    def __init__(self, RUN_NAME, EXPERIMENT_NAME, preprocessor, regressor) -> None:
        self.RUN_NAME = RUN_NAME
        self.EXPERIMENT_NAME = EXPERIMENT_NAME
        self.pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', regressor)])

    def model_fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)  
          
    def model_predict(self, X_test, y_test):
        predictions = self.pipeline.predict(X_test) 
        self.metrics = {}
        self.metrics["mae"] = mean_absolute_error(y_test, predictions)   
        self.metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
        self.metrics["mse"] = mean_squared_error(y_test, predictions)


    def start_run(self, signature, input_example, art):
        experiment_id = mlflow.get_experiment_by_name(self.EXPERIMENT_NAME).experiment_id
        with mlflow.start_run(run_name=self.RUN_NAME, experiment_id=experiment_id) as run:
            # получаем уникальный идентификатор запуска эксперимента
            run_id = run.info.run_id 
            mlflow.sklearn.log_model(self.pipeline, 
                                    artifact_path="models",
                                    signature=signature,
                                    input_example=input_example,
                                    pip_requirements=req_file
                                    )
            mlflow.log_metrics(self.metrics)
            mlflow.log_artifact(art)
            mlflow.log_params(self.pipeline.get_params())

        run = mlflow.get_run(run_id) 
        assert (run.info.status =='FINISHED')

        
        

In [191]:
new_exp = MlExperiment('test', EXPERIMENT_NAME, preprocessor, LinearRegression())

In [192]:
new_exp.model_fit(X_train, y_train)

In [193]:
new_exp.model_predict(X_test, y_test)

In [194]:
new_exp.start_run(signature, input_example, art)

 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 17:53:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run test at: http://127.0.0.1:5001/#/experiments/1/runs/a288a632dc8441d0b8a930bd8ad5c021.
2024/10/22 17:53:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1.


# Feature engineering

In [229]:
from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler

In [250]:
X_train_sklearn = X_train.copy()

In [251]:
pf = PolynomialFeatures(degree=2)

In [252]:
X_train_sklearn


Unnamed: 0,Car_Name,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner,mileage_level
116,Royal Enfield Classic 350,2013,1.469727,33000,Petrol,Individual,Manual,0,mid
162,Bajaj Pulsar NS 200,2012,0.990234,14500,Petrol,Individual,Manual,0,low
134,TVS Apache RTR 160,2017,0.810059,11800,Petrol,Individual,Manual,0,low
62,fortuner,2014,35.968750,78000,Diesel,Dealer,Automatic,0,high
175,Hero Honda CBZ extreme,2011,0.787109,75000,Petrol,Individual,Manual,0,high
...,...,...,...,...,...,...,...,...,...
213,i20,2011,6.789062,31604,Petrol,Dealer,Manual,0,mid
68,corolla altis,2011,13.742188,88000,Petrol,Dealer,Manual,0,high
26,swift,2013,5.871094,55138,Petrol,Dealer,Manual,0,high
198,Bajaj Discover 125,2011,0.569824,35000,Petrol,Individual,Manual,1,mid


In [253]:
pf.fit_transform(X_train_sklearn[['Present_Price', 'Driven_kms']])


array([[1.00000000e+00, 1.46972656e+00, 3.30000000e+04, 2.16009617e+00,
        4.85009766e+04, 1.08900000e+09],
       [1.00000000e+00, 9.90234375e-01, 1.45000000e+04, 9.80564117e-01,
        1.43583984e+04, 2.10250000e+08],
       [1.00000000e+00, 8.10058594e-01, 1.18000000e+04, 6.56194925e-01,
        9.55869141e+03, 1.39240000e+08],
       ...,
       [1.00000000e+00, 5.87109375e+00, 5.51380000e+04, 3.44697418e+01,
        3.23720367e+05, 3.04019904e+09],
       [1.00000000e+00, 5.69824219e-01, 3.50000000e+04, 3.24699640e-01,
        1.99438477e+04, 1.22500000e+09],
       [1.00000000e+00, 5.69824219e-01, 2.40000000e+04, 3.24699640e-01,
        1.36757812e+04, 5.76000000e+08]])

In [254]:
sp = SplineTransformer(n_knots=3, degree=3)

In [255]:
sp.fit_transform(X_train_sklearn[['Driven_kms']])

array([[1.09701260e-01, 6.50834618e-01, 2.39096855e-01, 3.67267367e-04,
        0.00000000e+00],
       [1.40180422e-01, 6.63612457e-01, 1.96177763e-01, 2.93573172e-05,
        0.00000000e+00],
       [1.45052173e-01, 6.64665846e-01, 1.90266544e-01, 1.54371665e-05,
        0.00000000e+00],
       ...,
       [7.94665211e-02, 6.24041272e-01, 2.94747121e-01, 1.74508522e-03,
        0.00000000e+00],
       [1.06699342e-01, 6.48902507e-01, 2.43958822e-01, 4.39328668e-04,
        0.00000000e+00],
       [1.23907622e-01, 6.58229508e-01, 2.17724023e-01, 1.38846791e-04,
        0.00000000e+00]])

In [256]:
qt = QuantileTransformer()

In [257]:
qt.fit_transform(X_train_sklearn[['Driven_kms']])



array([[0.51793722],
       [0.19730942],
       [0.15695067],
       [0.93946188],
       [0.92376682],
       [0.03587444],
       [0.9103139 ],
       [0.98206278],
       [0.73542601],
       [0.72197309],
       [0.79372197],
       [0.29596413],
       [0.1793722 ],
       [0.08520179],
       [0.1838565 ],
       [0.75336323],
       [0.89686099],
       [0.89237668],
       [0.47982063],
       [0.93273543],
       [0.31390135],
       [0.21524664],
       [0.41704036],
       [0.67713004],
       [0.66367713],
       [0.52914798],
       [0.0896861 ],
       [0.13901345],
       [0.90134529],
       [0.44394619],
       [0.78923767],
       [0.87668161],
       [0.40358744],
       [0.78475336],
       [0.32286996],
       [0.70179372],
       [0.68161435],
       [0.98654709],
       [0.62780269],
       [0.6367713 ],
       [0.86547085],
       [0.34529148],
       [0.09865471],
       [0.25112108],
       [0.28475336],
       [0.54932735],
       [0.77130045],
       [0.273

In [258]:
pf = PolynomialFeatures(degree=2)
qt = QuantileTransformer()
sp = SplineTransformer(n_knots=3, degree=3)

In [259]:
# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

In [260]:
preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),  # преобразования для числовых признаков
        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков
        ('quantile', qt,num_features),
        ('poly', pf_pipeline, ['Present_Price', 'Driven_kms']), # В преобразования добавляем созданный ранее pipeline
        ('spline', sp, ['Driven_kms'])
    ],
    remainder='drop',
    ) # Удаляем столбцы, которые не затронуты преобразования

In [261]:
X_train_sklearn[['Present_Price', 'Driven_kms']] = X_train_sklearn[['Present_Price', 'Driven_kms']].astype('float64')
X_train_sklearn[['Present_Price', 'Driven_kms']] = X_train_sklearn[['Present_Price', 'Driven_kms']].astype('float64')


In [262]:
X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)
X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())



In [263]:
# Удобно использовать для отображения всех строк\столбцов в DataFrame
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display (X_train_sklearn)

Unnamed: 0,num__Present_Price,num__Driven_kms,num__Owner,cat__Car_Name,cat__Year,cat__Fuel_Type,cat__Selling_type,cat__Transmission,cat__mileage_level,quantile__Present_Price,quantile__Driven_kms,quantile__Owner,poly__1,poly__Present_Price,poly__Driven_kms,poly__Present_Price^2,poly__Present_Price Driven_kms,poly__Driven_kms^2,spline__Driven_kms_sp_0,spline__Driven_kms_sp_1,spline__Driven_kms_sp_2,spline__Driven_kms_sp_3,spline__Driven_kms_sp_4
0,-0.673208,-0.112090,-0.165900,38.0,10.0,2.0,1.0,1.0,2.0,0.280269,0.517937,0.0,0.0,-0.673208,-0.112090,-0.215020,-0.425486,-0.118363,0.109701,0.650835,0.239097,0.000367,0.0
1,-0.728080,-0.563611,-0.165900,13.0,9.0,2.0,1.0,1.0,1.0,0.246637,0.197309,0.0,0.0,-0.728080,-0.563611,-0.216997,-0.474820,-0.170291,0.140180,0.663612,0.196178,0.000029,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,-0.776191,-0.063277,6.027714,9.0,8.0,2.0,1.0,1.0,2.0,0.082960,0.549327,1.0,0.0,-0.776191,-0.063277,-0.218097,-0.466749,-0.110326,0.106699,0.648903,0.243959,0.000439,0.0
223,-0.776191,-0.331749,-0.165900,25.0,13.0,2.0,1.0,0.0,2.0,0.082960,0.385650,0.0,0.0,-0.776191,-0.331749,-0.218097,-0.475806,-0.148678,0.123908,0.658230,0.217724,0.000139,0.0


In [264]:
new = MlExperiment('fe_sklearn', EXPERIMENT_NAME, preprocessor_sklearn, regressor)
new.model_fit(X_train, y_train)
new.model_predict(X_test, y_test)
new.start_run(signature, input_example, art)

 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


 - bokeh (current: 3.6.0, required: bokeh==3.4.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/10/22 18:27:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run fe_sklearn at: http://127.0.0.1:5001/#/experiments/1/runs/fe91a7c4cab147c4be8fbe7224f41c20.
2024/10/22 18:27:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1.


In [265]:
new.pipeline