# Модель предсказания стоимости автомобилей

# 1. Подготовка данных

In [122]:
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('/datasets/autos.csv')

In [106]:
data.shape

(354369, 16)

In [107]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
DateCrawled          354369 non-null object
Price                354369 non-null int64
VehicleType          316879 non-null object
RegistrationYear     354369 non-null int64
Gearbox              334536 non-null object
Power                354369 non-null int64
Model                334664 non-null object
Kilometer            354369 non-null int64
RegistrationMonth    354369 non-null int64
FuelType             321474 non-null object
Brand                354369 non-null object
NotRepaired          283215 non-null object
DateCreated          354369 non-null object
NumberOfPictures     354369 non-null int64
PostalCode           354369 non-null int64
LastSeen             354369 non-null object
dtypes: int64(7), object(9)
memory usage: 43.3+ MB


In [108]:
data.head(3)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,NotRepaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,2016-03-21 09:50:58,0,,2005,manual,0,colt,150000,7,petrol,mitsubishi,yes,2016-03-21 00:00:00,0,2694,2016-03-21 10:42:49
354365,2016-03-14 17:48:27,2200,,2005,,0,,20000,1,,sonstige_autos,,2016-03-14 00:00:00,0,39576,2016-04-06 00:46:52
354366,2016-03-05 19:56:21,1199,convertible,2000,auto,101,fortwo,125000,3,petrol,smart,no,2016-03-05 00:00:00,0,26135,2016-03-11 18:17:12
354367,2016-03-19 18:57:12,9200,bus,1996,manual,102,transporter,150000,3,gasoline,volkswagen,no,2016-03-19 00:00:00,0,87439,2016-04-07 07:15:26


Удалим колонки, которые явно не имеют значимости для предсказания цены:

- DateCrawled - дата получения объявления из базы
- DateCreated - дата создания объявления
- LastSeen - последняя активность пользователя
- NumberOfPictures - количество фото, оно везде равно 0
- RegistrationMonth - малоинформативный столбец, часть значений которого к тому же равна 0

In [109]:
# Remove the crawl/listing timestamps, the picture counter and the
# registration month before modelling.
data = data.drop(
    ['DateCrawled', 'LastSeen', 'DateCreated', 'NumberOfPictures', 'RegistrationMonth'],
    axis=1,
)

Заменим пропуски в столбцах VehicleType, Gearbox, Model, FuelType, NotRepaired на значение unknown («неизвестно»): пропусков слишком много, чтобы удалять такие строки, а надёжно восстановить пропущенные значения нечем.

In [110]:
# Mark missing values in the categorical columns with an explicit
# 'unknown' category instead of dropping the rows.
data = data.fillna({
    'VehicleType': 'unknown',
    'Gearbox': 'unknown',
    'Model': 'unknown',
    'FuelType': 'unknown',
    'NotRepaired': 'unknown',
})

In [111]:
data

Unnamed: 0,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,NotRepaired,PostalCode
0,480,unknown,1993,manual,0,golf,150000,petrol,volkswagen,unknown,70435
1,18300,coupe,2011,manual,190,unknown,125000,gasoline,audi,yes,66954
2,9800,suv,2004,auto,163,grand,125000,gasoline,jeep,unknown,90480
3,1500,small,2001,manual,75,golf,150000,petrol,volkswagen,no,91074
4,3600,small,2008,manual,69,fabia,90000,gasoline,skoda,no,60437
...,...,...,...,...,...,...,...,...,...,...,...
354364,0,unknown,2005,manual,0,colt,150000,petrol,mitsubishi,yes,2694
354365,2200,unknown,2005,unknown,0,unknown,20000,unknown,sonstige_autos,unknown,39576
354366,1199,convertible,2000,auto,101,fortwo,125000,petrol,smart,no,26135
354367,9200,bus,1996,manual,102,transporter,150000,gasoline,volkswagen,no,87439


Объединим категориальные колонки в список и закодируем их с помощью LabelEncoder:

In [112]:
categorical_columns = ['VehicleType', 'Gearbox', 'Model', 'FuelType','Brand', 'NotRepaired']

# Replace every category with an integer code. The single encoder is refit
# for each column, so after the loop it only remembers the last column.
le = LabelEncoder()
for column in categorical_columns:
    data[column] = le.fit_transform(data[column].astype('str'))

Определим целевой признак и отделим от остальных:

In [113]:
# The target is the Price column; every remaining column is a feature.
y = data['Price']
X = data.drop('Price', axis=1)

Разделим данные в соотношении 14:3:3 (70 % / 15 % / 15 %): помимо валидационной выборки выделяем равную ей по размеру тестовую выборку для итоговой проверки модели.

In [114]:
# Hold out 30% of the rows, then halve that hold-out into equally sized
# test and validation parts (70/15/15 overall).
X_train, X_test_valid, y_train, y_test_valid = train_test_split(
    X, y, test_size=0.3, random_state=12345
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test_valid, y_test_valid, test_size=0.5, random_state=12345
)

Напишем функцию, которая обучит переданную модель, измерит время обучения и предсказания, а также вернёт метрику качества RMSE.

In [115]:
def mean_of_mse(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model``, predict on the validation data and time both steps.

    Despite its name, the function reports RMSE (the square root of the
    mean squared error), not a mean of several MSE values.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            It is fitted in place.
        X_train: Features passed to ``model.fit``.
        y_train: Target passed to ``model.fit``.
        X_valid: Features passed to ``model.predict``.
        y_valid: True target values the predictions are scored against.

    Returns:
        Tuple ``(training_time, predict_time, rmse)``; both times are
        elapsed seconds.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # intervals cannot be skewed by system clock adjustments the way
    # time.time() differences can.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = model.predict(X_valid)
    predict_time = time.perf_counter() - start

    rmse = mean_squared_error(y_valid, y_pred) ** 0.5
    return training_time, predict_time, rmse

# 2. Обучение моделей

##### LGBMRegressor

Для поиска оптимальной комбинации гиперпараметров для LGBMRegressor будем использовать GridSearchCV с кросс-валидацией на трёх фолдах. Остановимся на двух гиперпараметрах - max_depth и num_leaves

In [116]:
# Shuffled 3-fold splitter with a fixed seed.
cv = KFold(n_splits=3, shuffle=True, random_state=12345)

# LightGBM search space: 3 x 3 = 9 combinations.
param_grid = dict(
    max_depth=[6, 8, 10],
    num_leaves=[50, 100, 200],
)

In [117]:
# Exhaustive search over param_grid; scored by negative MSE, so the best
# candidate is the one with the lowest mean squared error.
grid = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=12345),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=False,
)

In [118]:
%%time
grid.fit(X_train, y_train)

CPU times: user 4min 53s, sys: 1.67 s, total: 4min 54s
Wall time: 4min 57s


GridSearchCV(cv=KFold(n_splits=3, random_state=12345, shuffle=True),
             error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=12345,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10], 'num_leaves': [50, 100, 200]},
             pre_dispatch='

In [119]:
grid.best_params_

{'max_depth': 10, 'num_leaves': 200}

In [120]:
print(mean_of_mse(grid.best_estimator_, X_train, y_train, X_valid, y_valid))

(16.581926822662354, 1.2960987091064453, 1737.2136387783273)


#### CatBoost

In [123]:
# Base CatBoost regressor; the label-encoded columns are declared as
# categorical features.
CatBoost = CatBoostRegressor(
    loss_function='RMSE',
    cat_features=categorical_columns,
    random_seed=12345,
    silent=True,
)

# Search space: 5 log-spaced learning rates (1e-3 .. 1) x 2 x 3 = 30 combinations.
param_grid_CatBoost = dict(
    learning_rate=np.logspace(-3, 0, 5),
    iterations=[40, 60],
    depth=[6, 8, 10],
)

grid_CatBoost = GridSearchCV(
    estimator=CatBoost,
    param_grid=param_grid_CatBoost,
    scoring='neg_mean_squared_error',
    cv=cv,
    verbose=False,
)

In [124]:
%%time
grid_CatBoost.fit(X_train, y_train)

CPU times: user 26min, sys: 2min 24s, total: 28min 25s
Wall time: 33min 21s


GridSearchCV(cv=KFold(n_splits=3, random_state=12345, shuffle=True),
             error_score='raise-deprecating',
             estimator=<catboost.core.CatBoostRegressor object at 0x7f6072ff9510>,
             iid='warn', n_jobs=None,
             param_grid={'depth': [6, 8, 10], 'iterations': [40, 60],
                         'learning_rate': array([0.001     , 0.00562341, 0.03162278, 0.17782794, 1.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=False)

In [125]:
grid_CatBoost.best_params_

{'depth': 10, 'iterations': 60, 'learning_rate': 0.1778279410038923}

In [126]:
print(mean_of_mse(grid_CatBoost.best_estimator_, X_train, y_train, X_valid, y_valid))

(38.91535472869873, 0.17096686363220215, 1817.7041504111924)


#### RandomForestRegressor

In [130]:
# Search space: 4 depths x 4 forest sizes = 16 combinations.
param_grid_RandomForestRegressor = dict(
    max_depth=[4, 6, 8, 10],
    n_estimators=[10, 30, 60, 100],
)

# refit=False: only the best parameters are kept, no final model is
# trained by the search itself.
grid_RandomForestRegressor = GridSearchCV(
    estimator=RandomForestRegressor(random_state=12345),
    param_grid=param_grid_RandomForestRegressor,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    refit=False,
)

In [131]:
%%time
grid_RandomForestRegressor.fit(X_train, y_train)

CPU times: user 11min 29s, sys: 0 ns, total: 11min 29s
Wall time: 11min 31s


GridSearchCV(cv=KFold(n_splits=3, random_state=12345, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False,
                                             random_state=12345, verbose=0,
                                             warm

In [132]:
grid_RandomForestRegressor.best_params_

{'max_depth': 10, 'n_estimators': 100}

In [135]:
# Build the forest with the best parameters reported by the grid search.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=12345,
)

In [136]:
%%time
model.fit(X_train, y_train)

CPU times: user 1min, sys: 0 ns, total: 1min
Wall time: 1min


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=12345,
                      verbose=0, warm_start=False)

In [137]:
%%time
y_pred = model.predict(X_valid)

CPU times: user 582 ms, sys: 0 ns, total: 582 ms
Wall time: 581 ms


In [138]:
(mean_squared_error(y_valid, y_pred))**0.5

2017.2798944034384

# 3. Анализ моделей

In [140]:
# Summary of the measurements collected above; all times are in seconds.
# LightGBM and CatBoost rows come from the (training_time, predict_time, RMSE)
# tuples printed by mean_of_mse; the RandomForestRegressor row uses the %%time
# wall times of its fit / predict cells (1 min -> 60 s) and its validation RMSE.
result = pd.DataFrame({'index' : ['LightGBM', 'CatBoost', 'RandomForestRegressor'],
                       'trainig_time' : [16.58, 38.92, 60.00],
                       'prediction_time' : [1.30, 0.17, 0.58],
                       'RMSE' : [1737.21, 1817.70, 2017.28]})
result

Unnamed: 0,index,trainig_time,prediction_time,RMSE
0,LightGBM,16.58,1.3,1737.21
1,CatBoost,38.92,0.17,1817.7
2,RandomForestRegressor,60.0,0.58,2017.28


- Быстрее всех обучается LightGBM - 16,58 секунды. CatBoost обучается 38,92 секунды, случайный лес - около 1 минуты. По скорости предсказания лидирует CatBoost (0,17 секунды), далее идут случайный лес (0,58 секунды) и LightGBM (1,30 секунды). Качество предсказаний по метрике RMSE располагается в том же порядке, что и скорость обучения: 1737.21 (LightGBM), 1817.70 (CatBoost) и 2017.28 (случайный лес).
- Поиск лучших гиперпараметров быстрее всего у LGBM-регрессии и занимает около 5 минут. У модели случайного леса он занимает 11,5 минуты, у модели категориального бустинга - более 33 минут, что значительно увеличивает совокупную продолжительность обучения моделей.
- Исходя из требований к качеству и скорости предсказания, а также времени обучения, оптимальным сочетанием гиперпараметров будет - max_depth = 10, num_leaves = 200 у модели LGBMRegressor.

Проверим на тестовой выборке качество LGBM модели:

In [143]:
# Refit the selected LightGBM configuration on the training split and score
# it once on the held-out test split. random_state matches the seed used by
# every other estimator in this notebook so the run is reproducible.
model = lgb.LGBMRegressor(max_depth=10, num_leaves=200, random_state=12345)
model.fit(X_train, y_train)
test_prediction = model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, test_prediction)))

1739.0545369572476
