In [1]:
import pandas as pd
import numpy as np
import math
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from IPython.core.display import display

In [3]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [5]:
# Fixed random seed; presumably passed as random_state to splits/models further down — TODO confirm
SEED = 42
# Name of the target column; used as the default `target` argument of heatmap()
target = 'price'

In [6]:
def mape(y_true, y_pred):
    """
    Mean absolute percentage error, returned as a fraction (0.1 means 10 %).

    :param y_true: actual values (any array-like: list, numpy array, pandas Series)
    :param y_pred: predicted values, same length as y_true
    :return: mean of |y_pred - y_true| / |y_true|; a zero in y_true gives inf/nan
    """
    # Convert to plain arrays so that lists are accepted and two pandas Series
    # are compared position by position instead of being aligned on index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [7]:
# Helper that lays a list out as a dataframe (only to make the on-screen output look nicer)
def list_cut(to_cut):
    """
    Display a list as a table with 7 sorted items per column.

    :param to_cut: list to be converted into a dataframe
    :return: nothing is returned; the resulting table is shown on screen
    """
    per_column = 7
    ordered = list(np.sort(to_cut))

    # split the sorted list into consecutive stacks of `per_column` items
    stacks = []
    for start in range(0, len(ordered), per_column):
        stacks.append(ordered[start:start + per_column])

    # one stack per column, named vectors0, vectors1, ...; gaps in the last column become ''
    table = pd.DataFrame(stacks).T
    table.columns = ["vectors" + str(n) for n in range(math.ceil(len(ordered) / per_column))]
    table = table.fillna('')

    # show on screen
    display(table)

In [8]:
# Correlation check against the target vector, drawn as a heatmap
def heatmap(drop_list, dd, target = target):
    """
    :param drop_list: columns to leave out of the correlation matrix
    :param dd: dataset (DataFrame) to analyse
    :param target: target vector; defaults to the value of the outer `target` variable
    :return: nothing is returned; shows the excluded columns as a table and draws the correlation plot
    """
    # columns that stay in the plot: alphabetical order, with the target placed last
    kept = set(dd.columns).difference(drop_list, {target})
    columns = list(np.sort(list(kept)))
    columns.append(target)

    sns.set(font_scale=1)
    plt.subplots(figsize=(16, 16))
    correlation = dd[columns].corr()
    sns.heatmap(correlation, square=True, annot=True, fmt=".3f",
                linewidths=0.1, cmap="RdBu", vmin=-1, vmax=1)

    # for readability the excluded columns are shown as a table, 7 rows per column
    list_cut(drop_list)

In [9]:
# Folder that holds all csv files read below.
# NOTE(review): absolute local path — the notebook only runs on the author's machine;
# the space before 'project_05' looks accidental — confirm the real directory name.
path = '/Users/irenaradzevich/Documents/sergei_dev/ project_05/'

In [14]:
# Reference dataset; judging by the file name it was collected on 2020-09-09 — TODO confirm
auto = pd.read_csv(path+'all_auto_ru_09_09_2020.csv')

In [12]:
# Additional training records; they are appended to `train` in the data preparation section
add = pd.read_csv(path+'train2.csv')

In [16]:
# Load the test set and the main train set, then report the shape of all four frames.
# NOTE(review): `add` and `auto` come from cells with out-of-order execution counts;
# run the cells top to bottom so that both exist before this print.
test = pd.read_csv(path+'test.csv')
train = pd.read_csv(path+'train.csv')
print(f'Train data shape : {train.shape},\n  add data shape : {add.shape},\n auto data shape : {auto.shape},\n test data shape : {test.shape}')

Train data shape : (103794, 33),
 add data shape : (25465, 33),
 auto data shape : (89378, 26),
 test data shape : (34686, 32)


---
# 0. Creating data
---

In [18]:
# append addon data to train dataset
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same result)
train = pd.concat([train, add])

# drop duplicates and records that have no price or no body type
# (per the author's note these are listings of new cars, whose webpage structure is different)
train.drop_duplicates(inplace=True)
train.dropna(subset=['price', 'bodyType'], inplace=True)
print(f'Records {train.shape[0]}, duplicates {train.duplicated().sum()}')

# statistics of NAN in train dataset
nulls = train.isnull().sum()
print('Number of empty features values in train dataset:')
nulls[nulls.values >0]

Records 111974, duplicates 0
Number of empty features values in train dataset:


complectation_dict          70
image                        2
modelDate               111974
model_info              111974
numberOfDoors               70
vehicleConfiguration    111974
vendor                  111974
Владельцы                    7
Владение                 80244
ПТС                          1
Состояние                   11
Таможня                     17
dtype: int64

In [19]:
# fill empty 'numberOfDoors' with the most common door count of the same bodyType
# (only the missing values are filled; door counts that are already known stay untouched)
doors = (
    train.dropna(subset=['numberOfDoors'])
    .groupby('bodyType')['numberOfDoors']
    .agg(lambda counts: counts.mode().iloc[0])
    .to_dict()
)
# positional (numpy) mask and values: the index of train holds repeated labels after
# the addon data was appended, so label-based alignment is avoided here
missing = train['numberOfDoors'].isna().to_numpy()
train.loc[missing, 'numberOfDoors'] = train.loc[missing, 'bodyType'].map(doors).to_numpy()

# fill empty 'complectation_dict' with empty dict
train['complectation_dict'] = train['complectation_dict'].apply(lambda q: dict() if pd.isna(q) else q)

In [22]:
# convert the whole-number vectors to int, one column at a time
for column in ('numberOfDoors', 'parsing_unixtime', 'productionDate', 'sell_id', 'mileage'):
    train[column] = train[column].apply(int)

In [23]:
# count the missing values per vector and show only the vectors that still have gaps
nulls = train.isna().sum()
print('Number of empty features values in train dataset:')
nulls[nulls > 0]

Number of empty features values in train dataset:


image                        2
modelDate               111974
model_info              111974
vehicleConfiguration    111974
vendor                  111974
Владельцы                    7
Владение                 80244
ПТС                          1
Состояние                   11
Таможня                     17
dtype: int64

In [24]:
test.isnull().sum()

bodyType                    0
brand                       0
car_url                     0
color                       0
complectation_dict      28268
description                 0
engineDisplacement          0
enginePower                 0
equipment_dict           9996
fuelType                    0
image                       0
mileage                     0
modelDate                   0
model_info                  0
model_name                  0
name                        0
numberOfDoors               0
parsing_unixtime            0
priceCurrency               0
productionDate              0
sell_id                     0
super_gen                   0
vehicleConfiguration        0
vehicleTransmission         0
vendor                      0
Владельцы                   0
Владение                22691
ПТС                         1
Привод                      0
Руль                        0
Состояние                   0
Таможня                     0
dtype: int64

---
### Checking the 'test' and 'auto' datasets.
It is very important to understand when they were parsed. Prices in Russia keep rising,
so an ML model trained on today's prices can predict today's prices well, but not the prices
of a test dataset parsed several months ago, because the overall price level was different then.

In [25]:
auto.price.mean(), train.price.mean() , train.price.mean()/auto.price.mean()

(1294586.3563303659, 1625792.069507207, 1.2558390265410155)

In [26]:
import time

# earliest and latest parsing moment of the test dataset, as readable local time
for stamp in (test.parsing_unixtime.min(), test.parsing_unixtime.max()):
    print(time.ctime(stamp))

Mon Oct 19 14:35:06 2020
Mon Oct 26 14:04:24 2020


### Resume:
The test data was parsed in October 2020, close to when the auto dataset was parsed (September 2020).
Today's prices in my train dataset are higher by 25.584%.
So we must divide the current prices by the factor 1.255839.

In [27]:
# Keep the scraped price in 'price_current' (stored only once), then derive the
# deflated price from it, so re-running this cell does not divide the price twice.
if 'price_current' not in train.columns:
    train['price_current'] = train.price

# 1.255839 is the ratio train.price.mean() / auto.price.mean() computed above
train.price = train.price_current / 1.255839

----
# 1. Cleaning data


First of all, I will normalize how the data is filled in the train and test datasets, because some vectors were downloaded in different formats.

In [56]:
# Show how the values of one vector differ between the train and test datasets
def vector_items(vector: str):
    """
    Print which values of a vector occur in only one of the two datasets.

    :param vector: name of vector
    :return: nothing is returned; prints the values found only in train,
             the values found only in test, and all unique values of test
    """
    train_set, test_set = set(train[vector]), set(test[vector].unique())
    train_dif = train_set.difference(test_set)
    test_dif = test_set.difference(train_set)

    report = (
        f'Statistics of items in vector: {vector}.\n ---------------------------',
        f'Train has {len(train_set)} unique items and has difference:\n{train_dif},',
        '--',
        f'Test has {len(test_set)} unique items and has difference:\n{test_dif}.',
        'Test_set values:',
        test_set,
    )
    print(*report, sep='\n')

## 1.2. bodyType

In [29]:
vector_items('bodyType')

Statistics of items in vector: bodyType.
 ---------------------------
Train has 23 unique items and has difference: set(),
--
Test has 24 unique items and has difference: {'фастбек'}.
{'кабриолет', 'пикап одинарная кабина', 'минивэн', 'внедорожник открытый', 'универсал 5 дв.', 'пикап двойная кабина', 'седан-хардтоп', 'микровэн', 'тарга', 'купе-хардтоп', 'внедорожник 5 дв.', 'седан', 'родстер', 'пикап полуторная кабина', 'компактвэн', 'лимузин', 'фургон', 'купе', 'внедорожник 3 дв.', 'хэтчбек 3 дв.', 'хэтчбек 5 дв.', 'лифтбек', 'фастбек', 'седан 2 дв.'}


The set of body types in test is wider than in train by one value: 'фастбек'. It is OK.

## brand, model_name, model (new vector)

In [30]:
vector_items('brand')

Statistics of items in vector: brand.
 ---------------------------
Train has 12 unique items and has difference: set(),
--
Test has 12 unique items and has difference: set().
{'VOLVO', 'SKODA', 'BMW', 'INFINITI', 'LEXUS', 'VOLKSWAGEN', 'TOYOTA', 'MERCEDES', 'AUDI', 'MITSUBISHI', 'NISSAN', 'HONDA'}


Brand was the key used for downloading, so the vectors are equal.

In [31]:
train.model_name = train.model_name.str.upper()

In [32]:
train[train.model_name == 'S-КЛАСС']

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,...,vendor,Владельцы,Владение,ПТС,Привод,Руль,Состояние,Таможня,price,price_current
41563,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Ventilated disc', 'feeding': '...",Авто новое пробег 2500км в самой топовой компл...,2.9 л,249 л.с.,"{'e-adjustment-wheel': True, 'multi-wheel': Tr...",Дизель,...,,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен,1.254142e+07,15750000.0
41564,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Ventilated disc', 'feeding': '...",Автомобиль приобретён у официального дилера Па...,3.0 л,367 л.с.,"{'asr': True, 'roller-blind-for-rear-window': ...",Бензин,...,,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен,1.262104e+07,15850000.0
41565,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Ventilated disc', 'feeding': '...",ЛУКАВТО официальный дилер Mercedes Benz в Рос...,2.9 л,249 л.с.,"{'asr': True, 'roller-blind-for-rear-window': ...",Дизель,...,,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен,1.273969e+07,15999000.0
41566,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Ventilated disc', 'feeding': '...",Внимание Только для клиентов AVILON Автомобили...,3.0 л,367 л.с.,"{'cruise-control': True, 'asr': True, 'roller-...",Бензин,...,,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен,1.272456e+07,15980000.0
41567,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Ventilated disc', 'feeding': '...",В пленке самая полная комплектация нет только ...,3.0 л,367 л.с.,"{'asr': True, 'tinted-glass': True, 'roller-bl...",Бензин,...,,1 владелец,8 месяцев,Оригинал,полный,Левый,Не требует ремонта,Растаможен,1.234235e+07,15500000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9545,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,синий,"{'back-brake': 'Ventilated disc', 'feeding': '...",Хороший автомобиль без нареканий требует мелки...,6.0 л,394 л.с.,{},Бензин,...,,1 владелец,,Оригинал,задний,Левый,Не требует ремонта,Растаможен,3.025866e+05,380000.0
9546,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,синий,"{'back-brake': 'Disc', 'feeding': 'None', 'ful...",Нормальное состояние по кузову есть не критичн...,3.2 л,224 л.с.,{},Бензин,...,,3 или более,,Дубликат,задний,Левый,Не требует ремонта,Растаможен,2.946238e+05,370000.0
9547,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{'back-brake': 'Disc', 'feeding': 'None', 'ful...",Авто без проблемм Не шпаклеванный не из кусков...,4.3 л,279 л.с.,"{'cruise-control': True, 'engine-proof': True,...",Бензин,...,,3 или более,,Оригинал,задний,Левый,Не требует ремонта,Растаможен,2.910843e+05,365555.0
9548,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,серый,"{'back-brake': 'Ventilated disc', 'feeding': '...",ЗАВОДСКАЯ БРОНЕКАПСУЛА Богатая комплектация Ра...,5.0 л,320 л.с.,{},Бензин,...,,3 или более,,Дубликат,задний,Левый,Не требует ремонта,Растаможен,2.906424e+05,365000.0


In [33]:
test[test.model_name == 'S_KLASSE']

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,...,vehicleConfiguration,vehicleTransmission,vendor,Владельцы,Владение,ПТС,Привод,Руль,Состояние,Таможня
15416,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,синий,,Позвоните нам и получите дополнительную скидку...,5.5 LTR,388 N12,"{""electro-window-back"":true,""alloy-wheel-disks...",бензин,...,SEDAN AUTOMATIC 5.5,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
15449,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,В ДТП не участвовала.\n\n。Подушка безопасности...,4.7 LTR,435 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",бензин,...,SEDAN AUTOMATIC 4.7,автоматическая,EUROPEAN,3 или более,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
15453,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,ЛОТ: 01200668\nFAVORIT MOTORS KIA Юг\n\nВы мож...,3.0 LTR,235 N12,"{""asr"":true,""tinted-glass"":true,""esp"":true,""ad...",дизель,...,SEDAN AUTOMATIC 3.0,автоматическая,EUROPEAN,3 или более,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
15492,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,"{""id"":""21016733"",""name"":""S 450 4MATIC"",""availa...",МБ-Беляево — официальный дилер «Мерседес-Бенц»...,3.0 LTR,367 N12,"{""cruise-control"":true,""roller-blind-for-rear-...",бензин,...,SEDAN AUTOMATIC 3.0,автоматическая,EUROPEAN,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
15497,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,Автодилер года 2020\n\nПобедитель в номинации ...,4.7 LTR,435 N12,"{""cruise-control"":true,""airbag-rear-side"":true...",бензин,...,SEDAN AUTOMATIC 4.7,автоматическая,EUROPEAN,3 или более,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34214,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,Автомобиль в отличном состоянии.Обслуживался в...,5.0 LTR,306 N12,,бензин,...,SEDAN AUTOMATIC 5.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34226,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,Автомобиль Приобретался у официального дилера ...,4.0 LTR,469 N12,,бензин,...,SEDAN AUTOMATIC 4.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34239,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,"Продаётся легендарный S500 короткая база, полн...",5.0 LTR,306 N12,,бензин,...,SEDAN AUTOMATIC 5.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34241,седан,MERCEDES,https://auto.ru/cars/used/sale/mercedes/s_klas...,чёрный,,Автомобиль продается официальным дилером Автом...,3.0 LTR,235 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",дизель,...,SEDAN AUTOMATIC 3.0,автоматическая,EUROPEAN,3 или более,,Оригинал,полный,Левый,Не требует ремонта,Растаможен


In [34]:
vector_items('model_name')

Statistics of items in vector: model_name.
 ---------------------------
Train has 532 unique items and has difference: {'CR-X', 'CL-КЛАСС', 'N-WGN', 'COROLLA II', 'CL-КЛАСС AMG', 'SL-КЛАСС AMG', 'PAJERO IO', 'LANCER RALLIART', 'GLS AMG', 'POLO GTI', 'SL-КЛАСС', 'GRAND HIACE', 'RS 3', 'DELICA D:5', 'Z3 M', 'FR-V', 'M-КЛАСС AMG', 'S60 CROSS COUNTRY', 'CLK-КЛАСС', 'PAJERO MINI', 'CIVIC TYPE R', 'TYPE 1', 'CORONA EXIV', '4 СЕРИИ', 'A-КЛАСС AMG', 'E-TRON', 'GLC COUPE', 'MAYBACH GLS', 'GOLF R', 'GLS', 'NAVARA (FRONTIER)', '3 СЕРИИ', '8 СЕРИИ', '5 СЕРИИ', 'RS Q3', 'GT-R', 'MOBILIO SPIKE', 'MR-S', '190 (W201)', 'RAV4', 'SPRINTER TRUENO', 'CLA', 'X4 M', 'FAIRLADY Z', 'WILL CYPHA', 'QASHQAI+2', 'TT RS', '6 СЕРИИ', 'GLE COUPE', 'GLE COUPE AMG', 'SCEPTER', 'TOWN ACE', 'VERSO-S', 'TERRANO REGULUS', 'CLC-КЛАСС', 'MASTERACE SURF', '240 SERIES', 'COROLLA SPACIO', 'GLE AMG', 'LANCER EVOLUTION', 'S-КЛАСС AMG', 'COROLLA LEVIN', 'E-КЛАСС AMG', 'CLA AMG', 'C-КЛАСС', 'N-BOX', 'CLS', '7 СЕРИИ', 'JUKE NISMO',

Model names are filled in different formats; an example is shown above. I prefer to build uniform model names from the URLs, because the URLs did not change over time, and to exclude the model name from the analysis.

In [35]:
# example: https://auto.ru/cars/used/sale/audi/a4/1106130217-e5fd99e4/
# split on '/': item 6 is the brand ('audi'), item 7 is the model ('a4')
for frame in (train, test):
    frame['model'] = frame.car_url.apply(lambda url: url.split('/')[7])

In [36]:
vector_items('model')

Statistics of items in vector: model.
 ---------------------------
Train has 532 unique items and has difference: set(),
--
Test has 544 unique items and has difference: {'120', 'lm', 'origin', 'golf_r32', '140', 'e3', '100_series', 'sera', 'popular', 'i', '280zx', 'simplex'}.
{'nx', 'x6', 'allroad', 'mega_cruiser', 'q', 'golf_r32', 'v70', 'xc70', 'integra', 'wingroad', 'esquire', 'corona', 'ridgeline', 'cedric', 'karoq', 'mr2', 'popular', 'fr_v', 'torneo', 'rc', 'odyssey_na', 'tts', '8er', 'caddy', 'pajero_pinin', 'airwave', 'm3', '350z', 'silvia', 'gloria', 'tt_rs', 'grandis', 'galant', 'qx50', 'maybach_gls', 'supra', 'nv200', 'x2', 'caravelle', 'tt', 'q7', 'fortuner', 'tercel', '140', 'murano', 'm1', 'corsa', 'verso', 'mark_x_zio', 's7', 'delica', 'z', 'qashqai_plus_2', 'bassara', 'avancier', 'maxima', 'qx30', 'gt86', 'civic_type_r', 'jazz', 'odyssey', 'm6', 'x1', '80', 'golf_r', 'tiguan', 'master_ace_surf', 'succeed', 'sl_klasse', 'vigor', 'stream', 'progres', 'porte', 'm2', 'sprin

---
### resume:
The vectors 'model_name' and 'name' are very noisy. It may be worth dropping them and using only
the vectors 'brand' and 'model'.


---
# color

In [37]:
vector_items('color')

Statistics of items in vector: color.
 ---------------------------
Train has 16 unique items and has difference: set(),
--
Test has 16 unique items and has difference: set().
{'синий', 'жёлтый', 'оранжевый', 'зелёный', 'чёрный', 'белый', 'коричневый', 'пурпурный', 'фиолетовый', 'розовый', 'серебристый', 'красный', 'голубой', 'золотистый', 'серый', 'бежевый'}


The color - is OK.

---
## engineDisplacement, fuelType

Resume for following code:

1. Rename some long values in the train dataset. Maybe it is worth deleting them, because the test dataset has no cars with these fuel types?
2. Electric cars have no engine displacement. These vectors are filled in different ways in the train and test datasets. So I need to make a new vector that combines the fuel type and the engine capacity, to classify engines with a single vector. For electric cars the capacity will be the power in kW.
3. For ML I transform the vector 'enginePower', leaving only the power in horsepower as a number.
4. In the vector engineDisplacement, replace the English suffix 'LTR' with the Russian 'л'.

In [38]:
vector_items('fuelType')

Statistics of items in vector: fuelType.
 ---------------------------
Train has 9 unique items and has difference: {'Гибрид, газобаллонное оборудование', 'Электро', 'Бензин, газобаллонное оборудование', 'Гибрид', 'Дизель', 'Бензин', 'Дизель, газобаллонное оборудование', 'Газ, газобаллонное оборудование', 'Газ'},
--
Test has 5 unique items and has difference: {'бензин', 'электро', 'газ', 'дизель', 'гибрид'}.
{'бензин', 'электро', 'газ', 'дизель', 'гибрид'}


In [39]:
# shorten the long tail ', газобаллонное оборудование' to '-гбо', then switch to lower case
train.fuelType = (
    train.fuelType
    .str.replace(', газобаллонное оборудование', '-гбо')
    .str.lower()
)

In [40]:
vector_items('fuelType')

Statistics of items in vector: fuelType.
 ---------------------------
Train has 9 unique items and has difference: {'гибрид-гбо', 'газ-гбо', 'бензин-гбо', 'дизель-гбо'},
--
Test has 5 unique items and has difference: set().
{'бензин', 'электро', 'газ', 'дизель', 'гибрид'}


In [41]:
# Helper that takes only the numeric data from a string
def find_number(field: str):
    """
    Return the first number found in ``field``.

    :param field: string to find number in, e.g. '2.0 LTR' or '249 л.с.'
    :return: number (in string format), or None when ``field`` contains no
             digits or is not a string at all (None, NaN, ...)
    """
    # anything that is not a string cannot be searched; report "no number"
    if not isinstance(field, str):
        return None

    # mask for search: digits followed by more digits/'.'/',' | fraction like '.5' | plain digits
    # (raw string, so that '\d' is not read as an invalid string escape)
    p = r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'

    # only the first match is ever needed, so one search is enough
    found = re.search(p, field)
    return found[0] if found is not None else None

In fact, electric cars have no engine displacement.
For electric cars the vector 'engineDisplacement' is filled differently in the two datasets:
in the test dataset it holds only an empty *'LTR'* value, in the train dataset it holds the *horsepower value*.
The vector 'enginePower' holds the *horsepower value* in the test dataset and the *kW power value* in the train dataset.

We need to make this uniform. So we will make a new vector 'engineName' using the mask ['fuelType'+' '+'engineDisplacement'] for non-electric cars and ['fuelType'+' '+ kW power value] for electric cars.

In [42]:
# make the dict from pairs (horse power, kVt power)
# the groupby index holds every distinct (engineDisplacement, enginePower) pair of electro cars
electro_pairs = (
    train[train.fuelType == 'электро']
    .groupby(['engineDisplacement', 'enginePower'])['enginePower']
    .count()
)
qq = electro_pairs.to_dict()

# key: number taken from engineDisplacement, value: number taken from enginePower
kvt = {int(find_number(horse)): int(find_number(power)) for horse, power in qq}
kvt

{109: 80,
 115: 85,
 136: 100,
 150: 110,
 154: 113,
 170: 125,
 179: 132,
 184: 135,
 218: 160,
 313: 230,
 408: 300,
 41: 30,
 64: 47,
 67: 49,
 74: 100}

In [43]:
# Build the value of the new uniform engine-type vector, as described above
def engineDisplacement(record):
    """
    Compose the engine name for one record.

    :param record: row from dataset (call by apply with axis=1); the fields
                   fuelType, engineDisplacement and enginePower are read
    :return: value for new vector: 'fuelType displacement' for non-electro cars,
             'электро <kVt power>' for electro cars ('электро ' with an empty
             tail when the kVt power cannot be found)
    """
    # case of gasoline and diesel
    if record.fuelType != 'электро':
        return record.fuelType + ' ' + find_number(record.engineDisplacement)

    # electro, train dataset: engineDisplacement is filled with a number,
    # so the kVt power can be taken directly from enginePower
    if find_number(record.engineDisplacement):
        return record.fuelType + ' ' + find_number(record.enginePower)

    # electro, test dataset: only the horse power is filled
    horse_power = int(record.enginePower.split()[0])

    # use dict to find value of kVt power from horse power; a horse power that is
    # not in the dict leaves an empty tail. dict.get replaces a bare `except:`,
    # which would also have hidden unrelated errors such as a missing `kvt`.
    value_kvt = str(kvt.get(horse_power, ''))
    return record.fuelType + ' ' + value_kvt

In [44]:
train['engineName'] = train.apply(engineDisplacement, axis=1)
test['engineName'] = test.apply(engineDisplacement, axis=1)

In [47]:
vector_items('engineName')

Statistics of items in vector: engineName.
 ---------------------------
Train has 158 unique items and has difference:
 {'бензин-гбо 2.8', 'бензин-гбо 1.5', 'бензин-гбо 2.9', 'бензин-гбо 4.5', 'электро 100', 'бензин-гбо 3.3', 'бензин-гбо 2.1', 'бензин-гбо 1.4', 'дизель-гбо 2.8', 'бензин-гбо 4.8', 'бензин-гбо 4.6', 'бензин-гбо 2.3', 'бензин-гбо 1.6', 'бензин-гбо 4.2', 'бензин-гбо 3.5', 'бензин-гбо 5.5', 'бензин-гбо 1.3', 'дизель 3.3', 'бензин-гбо 4.7', 'дизель-гбо 3.0', 'бензин-гбо 5.0', 'бензин-гбо 2.5', 'бензин-гбо 3.2', 'электро 135', 'бензин-гбо 4.3', 'электро 47', 'бензин-гбо 1.8', 'бензин-гбо 6.0', 'газ 1.8', 'электро 30', 'бензин-гбо 5.7', 'газ-гбо 1.4', 'бензин-гбо 3.6', 'бензин-гбо 2.7', 'бензин-гбо 2.4', 'бензин-гбо 2.0', 'электро 113', 'гибрид-гбо 3.3', 'бензин-гбо 5.6', 'бензин-гбо 3.0', 'бензин-гбо 2.6', 'бензин-гбо 4.0', 'дизель-гбо 4.5', 'газ-гбо 2.0', 'бензин-гбо 1.9', 'гибрид 1.4', 'бензин-гбо 1.7', 'бензин-гбо 3.7', 'электро 230', 'бензин-гбо 3.8', 'дизель 1.8', 'газ-г

In [48]:
# the test dataset marks the power unit as 'N12' (e.g. '388 N12'); rename it to 'л.с.' as in train
test.enginePower = test.enginePower.str.replace('N12', 'л.с.')

In [49]:
# keep a copy of the textual enginePower, because enginePower itself is overwritten with numbers below
train['enginePowerValue'] = train.enginePower
test['enginePowerValue'] = test.enginePower

In [50]:
def enginePowerExchange(record):
    """
    Pick the power number for one record.

    :param record: row from dataset (call by apply with axis=1)
    :return: number (in string format) taken from engineDisplacement when
             enginePower is given in 'кВт', otherwise taken from enginePower
    """
    unit = record.enginePower.split()[1]
    source = record.engineDisplacement if unit == 'кВт' else record.enginePower
    return find_number(source)

In [51]:
# replace the textual enginePower with its number (still in string format here).
# NOTE(review): not re-runnable — once enginePower has been converted to int in the
# next cell, enginePowerExchange fails on .split(); restart from the data loading cells.
train.enginePower = train.apply(enginePowerExchange, axis=1)
test.enginePower = test.apply(enginePowerExchange, axis=1)

In [53]:
train.enginePower = train.enginePower.apply(int)
test.enginePower = test.enginePower.apply(int)

In [54]:
vector_items('enginePower')

Statistics of items in vector: enginePower.
 ---------------------------
Train has 329 unique items and has difference:
 {134, 274, 537, 410, 670, 415, 161, 34, 417, 41, 299, 176, 564, 314, 187, 700, 448, 449, 456, 206, 344, 89, 237, 503},
--
Test has 315 unique items and has difference:
 {32, 195, 550, 487, 457, 42, 44, 430, 214, 30}.
{30, 32, 38, 40, 42, 44, 46, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 60, 61, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 173, 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 188, 189, 190, 192, 193, 194, 195, 197, 199, 200, 201,

In [58]:
vector_items('engineDisplacement')

Statistics of items in vector: engineDisplacement.
 ---------------------------
Train has 68 unique items and has difference:
{'1.0 л', '4.6 л', '179 л.с.', '154 л.с.', '5.0 л', '3.5 л', '170 л.с.', '2.3 л', '1.4 л', '3.3 л', '4.7 л', '5.5 л', '4.5 л', '3.7 л', '6.0 л', '2.2 л', '3.4 л', '5.8 л', '184 л.с.', '3.2 л', '1.3 л', '1.5 л', '1.1 л', '136 л.с.', '3.1 л', '4.1 л', '2.9 л', '2.8 л', '3.0 л', '3.6 л', '5.2 л', '2.7 л', '5.4 л', '408 л.с.', '6.6 л', '6.3 л', '1.6 л', '1.7 л', '150 л.с.', '2.1 л', '2.0 л', '218 л.с.', '313 л.с.', '3.9 л', '74 л.с.', '6.2 л', '1.8 л', '4.3 л', '67 л.с.', '5.9 л', '64 л.с.', '2.4 л', '2.5 л', '41 л.с.', '4.8 л', '4.0 л', '2.6 л', '3.8 л', '109 л.с.', '0.7 л', '5.7 л', '1.9 л', '1.2 л', '5.6 л', '4.2 л', '4.4 л', '4.9 л', '115 л.с.'},
--
Test has 55 unique items and has difference:
{'4.6 LTR', '1.3 LTR', '4.3 LTR', '2.3 LTR', '4.5 LTR', '4.8 LTR', '2.2 LTR', '6.3 LTR', '6.6 LTR', '5.2 LTR', '6.0 LTR', '2.4 LTR', '5.0 LTR', '3.7 LTR', '3.0 LTR', '2.9 

In [59]:
# use the same unit text as the train dataset: 'LTR' -> 'л'
test.engineDisplacement = test.engineDisplacement.str.replace('LTR', 'л')

In [60]:
vector_items('engineDisplacement')

Statistics of items in vector: engineDisplacement.
 ---------------------------
Train has 68 unique items and has difference:
{'64 л.с.', '41 л.с.', '184 л.с.', '150 л.с.', '179 л.с.', '136 л.с.', '154 л.с.', '218 л.с.', '313 л.с.', '109 л.с.', '170 л.с.', '74 л.с.', '67 л.с.', '408 л.с.', '115 л.с.'},
--
Test has 55 unique items and has difference:
{' л', '5.3 л'}.
Test_set values:
{'1.0 л', '4.6 л', '5.3 л', '5.0 л', '3.5 л', '2.3 л', '1.4 л', '3.3 л', ' л', '5.5 л', '4.7 л', '4.5 л', '3.7 л', '6.0 л', '2.2 л', '3.4 л', '5.8 л', '1.3 л', '1.5 л', '1.1 л', '3.1 л', '4.1 л', '2.9 л', '2.8 л', '3.0 л', '3.6 л', '5.2 л', '2.7 л', '4.4 л', '5.4 л', '6.6 л', '6.3 л', '1.6 л', '1.7 л', '2.1 л', '2.0 л', '3.9 л', '6.2 л', '1.8 л', '4.3 л', '5.9 л', '2.5 л', '2.4 л', '4.8 л', '4.0 л', '2.6 л', '3.8 л', '0.7 л', '5.7 л', '1.2 л', '5.6 л', '4.2 л', '3.2 л', '4.9 л', '1.9 л'}


---
# vehicleConfiguration

The vehicle configuration in the train dataset has to be combined from the body type, the vehicle transmission and the engine displacement.

In [61]:
vector_items('vehicleTransmission')

Statistics of items in vector: vehicleTransmission.
 ---------------------------
Train has 4 unique items and has difference:
set(),
--
Test has 4 unique items and has difference:
set().
Test_set values:
{'механическая', 'вариатор', 'роботизированная', 'автоматическая'}


In [62]:
# Russian transmission names (the four values of vehicleTransmission) mapped to the
# English words used inside vehicleConfiguration, e.g. 'SEDAN AUTOMATIC 3.0'
transmission = {
    'механическая': 'MECHANICAL',
    'вариатор': 'VARIATOR',
    'роботизированная': 'ROBOT',
    'автоматическая': 'AUTOMATIC'
}

In [63]:
# every (bodyType, vehicleConfiguration) pair that occurs in the test dataset
temp_df = (
    test.groupby(['bodyType', 'vehicleConfiguration'])['vehicleConfiguration']
    .count()
    .to_dict()
)

# the first word of vehicleConfiguration is the English name of the body type
bodyTypeEng = {body: config.split()[0] for body, config in temp_df}

bodyTypeEng

{'внедорожник 3 дв.': 'ALLROAD_3_DOORS',
 'внедорожник 5 дв.': 'ALLROAD_5_DOORS',
 'внедорожник открытый': 'ALLROAD_OPEN',
 'кабриолет': 'CABRIO',
 'компактвэн': 'COMPACTVAN',
 'купе': 'COUPE',
 'купе-хардтоп': 'COUPE_HARDTOP',
 'лимузин': 'LIMOUSINE',
 'лифтбек': 'LIFTBACK',
 'микровэн': 'MICROVAN',
 'минивэн': 'MINIVAN',
 'пикап двойная кабина': 'PICKUP_TWO',
 'пикап одинарная кабина': 'PICKUP_ONE',
 'пикап полуторная кабина': 'PICKUP_ONE_HALF',
 'родстер': 'ROADSTER',
 'седан': 'SEDAN',
 'седан 2 дв.': 'SEDAN_2_DOORS',
 'седан-хардтоп': 'SEDAN_HARDTOP',
 'тарга': 'TARGA',
 'универсал 5 дв.': 'WAGON_5_DOORS',
 'фастбек': 'FASTBACK',
 'фургон': 'VAN',
 'хэтчбек 3 дв.': 'HATCHBACK_3_DOORS',
 'хэтчбек 5 дв.': 'HATCHBACK_5_DOORS'}

In [64]:
""" function to construct filling for vector 'vehicleConfiguration' from several another vectors"""
def vehicleConfiguration(record):
    """
    Compose the 'vehicleConfiguration' string for a single dataset row.

    :param record: record from dataset (passed in by DataFrame.apply with axis=1)
    :return: string shaped like 'COUPE ROBOT 2.5'; the trailing displacement
             part is left out when find_number returns an empty (falsy) value
    """
    parts = [
        bodyTypeEng[record.bodyType],
        transmission[record.vehicleTransmission],
    ]

    displacement = find_number(record.engineDisplacement)
    if displacement:
        parts.append(displacement)

    return ' '.join(parts)

In [65]:
# Assign through [] rather than attribute access: attribute assignment only
# updates a column that already exists and would otherwise silently create a
# plain Python attribute on the DataFrame instead of a column.
train['vehicleConfiguration'] = train.apply(vehicleConfiguration, axis=1)

In [66]:
vector_items('vehicleConfiguration')

Statistics of items in vector: vehicleConfiguration.
 ---------------------------
Train has 764 unique items and has difference:
{'HATCHBACK_5_DOORS AUTOMATIC 136', 'ALLROAD_5_DOORS MECHANICAL 1.3', 'ALLROAD_5_DOORS AUTOMATIC 408', 'ROADSTER ROBOT 2.0', 'LIFTBACK MECHANICAL 1.5', 'HATCHBACK_3_DOORS MECHANICAL 2.2', 'SEDAN_2_DOORS MECHANICAL 1.8', 'VAN AUTOMATIC 2.0', 'VAN AUTOMATIC 1.8', 'CABRIO ROBOT 3.0', 'VAN MECHANICAL 3.0', 'COMPACTVAN MECHANICAL 2.3', 'ALLROAD_3_DOORS MECHANICAL 2.7', 'HATCHBACK_3_DOORS MECHANICAL 3.0', 'ALLROAD_3_DOORS MECHANICAL 0.7', 'WAGON_5_DOORS VARIATOR 1.6', 'MINIVAN ROBOT 1.4', 'SEDAN_HARDTOP AUTOMATIC 2.2', 'HATCHBACK_3_DOORS VARIATOR 2.0', 'COMPACTVAN VARIATOR 1.4', 'HATCHBACK_5_DOORS AUTOMATIC 115', 'MICROVAN ROBOT 0.7', 'HATCHBACK_5_DOORS AUTOMATIC 2.5', 'HATCHBACK_5_DOORS AUTOMATIC 150', 'MICROVAN AUTOMATIC 41', 'PICKUP_TWO AUTOMATIC 3.3', 'HATCHBACK_5_DOORS AUTOMATIC 179', 'VAN MECHANICAL 1.8', 'ROADSTER ROBOT 3.2', 'ROADSTER MECHANICAL 2.0', 'VAN 

In [67]:
# Remove the model-related helper vectors from both datasets
# ('price_current' is dropped from train only).
train.drop(columns=['modelDate', 'model_info', 'model_name', 'name', 'price_current'], inplace=True)
test.drop(columns=['modelDate', 'model_info', 'model_name', 'name'], inplace=True)

---
# vendor

The vendor vector in the test dataset has only two unique values, so I fill it in the train dataset using a brand-vendor dictionary built from test.

In [68]:
# Prepare to fill the vector 'vendor' in the train dataset:
# build the brand -> vendor dictionary from the (brand, vendor) pairs seen in test.
vend = dict(test.groupby(['brand', 'vendor']).size().index.tolist())

In [69]:
# Use [] assignment so the column is written even if 'vendor' does not exist
# yet; attribute assignment would then only set a Python attribute.
# Brands missing from `vend` are mapped to NaN.
train['vendor'] = train['brand'].map(vend)

In [70]:
vector_items('vendor')

Statistics of items in vector: vendor.
 ---------------------------
Train has 2 unique items and has difference:
set(),
--
Test has 2 unique items and has difference:
set().
Test_set values:
{'JAPANESE', 'EUROPEAN'}


---
# Владение, Владельцы

Transform the vector 'Владельцы' to a numeric value and delete the vector 'Владение', because it has a lot of NA.

In [71]:
vector_items('Владельцы')

Statistics of items in vector: Владельцы.
 ---------------------------
Train has 4 unique items and has difference:
{nan, '1 владелец', '2 владельца'},
--
Test has 3 unique items and has difference:
{'1\xa0владелец', '2\xa0владельца'}.
Test_set values:
{'3 или более', '1\xa0владелец', '2\xa0владельца'}


In [72]:
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place call works on an intermediate Series and is
# not guaranteed to reach `train` (it has no effect under pandas Copy-on-Write).
train['Владельцы'] = train['Владельцы'].fillna('3 или более')

In [73]:
# Keep the textual label in 'owners'. Test labels are written with a
# non-breaking space (U+00A0) while train labels use a regular space, so the
# separator is normalised here; otherwise the same category would be spelled
# differently in the two datasets.
test['owners'] = test['Владельцы'].str.replace('\xa0', ' ', regex=False)
# Reduce 'Владельцы' to the number that find_number extracts from the label.
test['Владельцы'] = test['Владельцы'].apply(find_number)

In [74]:
# Preserve the textual label in 'owners', then reduce 'Владельцы'
# to the number that find_number extracts from that label.
train['owners'] = train['Владельцы']
train['Владельцы'] = train['owners'].map(find_number)

In [75]:
# Convert the extracted owner counts to integers in both datasets.
train['Владельцы'] = train['Владельцы'].astype('int64')
test['Владельцы'] = test['Владельцы'].astype('int64')

In [76]:
vector_items('Владельцы')

Statistics of items in vector: Владельцы.
 ---------------------------
Train has 3 unique items and has difference:
set(),
--
Test has 3 unique items and has difference:
set().
Test_set values:
{1, 2, 3}


In [77]:
# The vector 'Владение' is removed from both datasets.
train.drop(columns=['Владение'], inplace=True)
test.drop(columns=['Владение'], inplace=True)

---
# ПТС

Filling NA with 'Дубликат'

In [78]:
test['ПТС'].value_counts()

Оригинал    30098
Дубликат     4587
Name: ПТС, dtype: int64

In [79]:
# Missing 'ПТС' values are filled with 'Дубликат'. The result is assigned back
# because fillna(inplace=True) on a selected column operates on an
# intermediate Series and has no effect under pandas Copy-on-Write.
train['ПТС'] = train['ПТС'].fillna('Дубликат')
test['ПТС'] = test['ПТС'].fillna('Дубликат')

In [80]:
vector_items('ПТС')

Statistics of items in vector: ПТС.
 ---------------------------
Train has 2 unique items and has difference:
set(),
--
Test has 2 unique items and has difference:
set().
Test_set values:
{'Оригинал', 'Дубликат'}


---
# Состояние

In the train dataset there are some records with NA in 'Состояние'. Analysis of these records led to the conclusion that they relate to car accidents, so I delete these records from the dataset.
After that the vector itself is deleted as useless for ML.

In [81]:
vector_items('Состояние')

Statistics of items in vector: Состояние.
 ---------------------------
Train has 2 unique items and has difference:
{nan},
--
Test has 1 unique items and has difference:
set().
Test_set values:
{'Не требует ремонта'}


In [82]:
train[train['Состояние'] != 'Не требует ремонта']

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,...,ПТС,Привод,Руль,Состояние,Таможня,price,model,engineName,enginePowerValue,owners
36691,внедорожник 5 дв.,MERCEDES,https://auto.ru/cars/used/sale/mercedes/glb_kl...,белый,"{'back-brake': 'Disc', 'feeding': 'Turbocharge...",Куплен 25 декабря 2020 у официального дилера в...,1.3 л,150,{},бензин,...,Оригинал,передний,Левый,,Растаможен,2134032.0,glb_klasse,бензин 1.3,150 л.с.,1 владелец
67936,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,серебристый,"{'back-brake': 'Disc', 'feeding': 'None', 'ful...",Битая Цена ремонта на 150 тр,1.6 л,102,"{'cruise-control': True, 'tinted-glass': True,...",бензин,...,Оригинал,передний,Левый,,Растаможен,159256.1,octavia,бензин 1.6,102 л.с.,3 или более
77393,седан,TOYOTA,https://auto.ru/cars/used/sale/toyota/corolla/...,серый,"{'feeding': 'None', 'horse-power': 105, 'kvt-p...",Надо Делать ДВС Обмен Торг,1.6 л,105,{},бензин,...,Оригинал,передний,Правый,,Растаможен,79628.04,corolla,бензин 1.6,105 л.с.,2 владельца
77431,седан,TOYOTA,https://auto.ru/cars/used/sale/toyota/corolla/...,белый,"{'back-brake': 'Drum', 'feeding': 'None', 'ran...",продается легенда японского автопрома битая по...,1.6 л,115,{},бензин,...,Дубликат,передний,Правый,,Растаможен,59721.03,corolla,бензин 1.6,115 л.с.,3 или более
89883,хэтчбек 5 дв.,VOLKSWAGEN,https://auto.ru/cars/used/sale/volkswagen/golf...,красный,"{'back-brake': 'Drum', 'feeding': 'None', 'ful...",Проблемы с электрикой производилась замена вых...,1.6 л,75,"{'ptf': True, 'abs': True, 'eco-leather': True...",бензин,...,Дубликат,передний,Левый,,Растаможен,119442.1,golf,бензин 1.6,75 л.с.,3 или более
89888,хэтчбек 5 дв.,VOLKSWAGEN,https://auto.ru/cars/used/sale/volkswagen/golf...,чёрный,"{'back-brake': 'Drum', 'feeding': 'None', 'ful...",Замена порогов задняя юбка днище обработана ан...,1.6 л,72,{},бензин,...,Оригинал,передний,Левый,,Растаможен,127404.9,golf,бензин 1.6,72 л.с.,3 или более
94015,универсал 5 дв.,VOLKSWAGEN,https://auto.ru/cars/used/sale/volkswagen/pass...,белый,"{'back-brake': 'Drum', 'feeding': 'None', 'ful...",На разбор или под восстановление без АКБ Торг ...,1.8 л,107,{},бензин,...,Оригинал,передний,Левый,,Растаможен,51758.23,passat,бензин 1.8,107 л.с.,3 или более
16755,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/rapid/110...,серый,"{'back-brake': 'Drum', 'feeding': 'None', 'ful...",Официальный дилерский центр РОЛЬФ Премиум Вешк...,1.6 л,110,"{'airbag-driver': True, 'rain-sensor': True, '...",бензин,...,Оригинал,передний,Левый,,Растаможен,613135.9,rapid,бензин 1.6,110 л.с.,1 владелец
18626,хэтчбек 5 дв.,TOYOTA,https://auto.ru/cars/used/sale/toyota/corolla/...,белый,"{'back-brake': 'Disc', 'feeding': 'None', 'ful...",Машина была привезена в Россию из Германии ещё...,1.6 л,110,{},бензин,...,Оригинал,передний,Левый,,Растаможен,167218.9,corolla,бензин 1.6,110 л.с.,2 владельца
21554,седан,VOLKSWAGEN,https://auto.ru/cars/used/sale/volkswagen/vent...,серый,"{'back-brake': 'Drum', 'feeding': 'None', 'ful...",Битый двигатель не пострадал система охлаждени...,1.8 л,75,{},бензин,...,Оригинал,передний,Левый,,Растаможен,79628.04,vento,бензин 1.8,75 л.с.,3 или более


In [83]:
# Remove the train records whose 'Состояние' is NA (modifies train in place).
train.dropna(subset=['Состояние'], inplace=True)

In [84]:
vector_items('Состояние')

Statistics of items in vector: Состояние.
 ---------------------------
Train has 1 unique items and has difference:
set(),
--
Test has 1 unique items and has difference:
set().
Test_set values:
{'Не требует ремонта'}


In [85]:
# Only one value of 'Состояние' is left in both datasets, so the vector is removed.
train.drop(columns=['Состояние'], inplace=True)
test.drop(columns=['Состояние'], inplace=True)

---
# Таможня

The same story with the vector 'Таможня'. Analysis of the records with NA led to the conclusion that they relate to cars that have not cleared customs or that have dubious ownership or purchase histories, so I delete these records from the dataset.
After that the vector itself is deleted as useless for ML.

In [86]:
vector_items('Таможня')

Statistics of items in vector: Таможня.
 ---------------------------
Train has 2 unique items and has difference:
{nan},
--
Test has 1 unique items and has difference:
set().
Test_set values:
{'Растаможен'}


In [87]:
# Remove the train records whose 'Таможня' is NA (modifies train in place).
train.dropna(subset=['Таможня'], inplace=True)

In [88]:
vector_items('Таможня')

Statistics of items in vector: Таможня.
 ---------------------------
Train has 1 unique items and has difference:
set(),
--
Test has 1 unique items and has difference:
set().
Test_set values:
{'Растаможен'}


In [89]:
# Only one value of 'Таможня' is left in both datasets, so the vector is removed.
train.drop(columns=['Таможня'], inplace=True)
test.drop(columns=['Таможня'], inplace=True)

---
# Prepare for Feature engineering

In [93]:
# Delete the same set of vectors from both datasets
# before the feature engineering step.
train.drop(columns=['complectation_dict', 'equipment_dict', 'car_url',
                    'image', 'description', 'priceCurrency'], inplace=True)
test.drop(columns=['complectation_dict', 'equipment_dict', 'car_url',
                   'image', 'description', 'priceCurrency'], inplace=True)

In [94]:
# Append the test dataset to the train dataset:
#  - test rows get a placeholder price of 0,
#  - the vector 'test' marks where a row came from (1 = test, 0 = train).
test['price'] = 0
test['test'] = 1
train['test'] = 0
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the frames the same way and keeps each frame's own index.
data = pd.concat([train, test])

In [95]:
data.shape, train.shape, test.shape

((146632, 25), (111946, 25), (34686, 25))

In [96]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146632 entries, 0 to 34685
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   bodyType              146632 non-null  object 
 1   brand                 146632 non-null  object 
 2   color                 146632 non-null  object 
 3   engineDisplacement    146632 non-null  object 
 4   enginePower           146632 non-null  int64  
 5   fuelType              146632 non-null  object 
 6   mileage               146632 non-null  int64  
 7   numberOfDoors         146632 non-null  int64  
 8   parsing_unixtime      146632 non-null  int64  
 9   productionDate        146632 non-null  int64  
 10  sell_id               146632 non-null  int64  
 11  super_gen             146632 non-null  object 
 12  vehicleConfiguration  146632 non-null  object 
 13  vehicleTransmission   146632 non-null  object 
 14  vendor                146632 non-null  object 
 15  В

---
# FE

* average mileage per year
* average mileage by owner
* rarity
* older 3-5 year
* color clustering
* body clustering
*

In [87]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87746 entries, 0 to 103793
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   bodyType              87746 non-null  object 
 1   brand                 87746 non-null  object 
 2   car_url               87746 non-null  object 
 3   color                 87746 non-null  object 
 4   engineDisplacement    87746 non-null  object 
 5   enginePower           87746 non-null  int64  
 6   fuelType              87746 non-null  object 
 7   mileage               87746 non-null  int64  
 8   numberOfDoors         87746 non-null  int64  
 9   parsing_unixtime      87746 non-null  int64  
 10  productionDate        87746 non-null  int64  
 11  sell_id               87746 non-null  int64  
 12  super_gen             87746 non-null  object 
 13  vehicleConfiguration  87746 non-null  object 
 14  vehicleTransmission   87746 non-null  object 
 15  vendor            

In [79]:
test.isnull().sum()

bodyType                    0
brand                       0
car_url                     0
color                       0
complectation_dict      28268
description                 0
engineDisplacement          0
enginePower                 0
equipment_dict           9996
fuelType                    0
image                       0
mileage                     0
numberOfDoors               0
parsing_unixtime            0
priceCurrency               0
productionDate              0
sell_id                     0
super_gen                   0
vehicleConfiguration        0
vehicleTransmission         0
vendor                      0
Владельцы                   0
ПТС                         0
Привод                      0
Руль                        0
model                       0
engineName                  0
enginePowerValue            0
owners                      0
dtype: int64

In [84]:
train.isnull().sum()

bodyType                0
brand                   0
car_url                 0
color                   0
complectation_dict      0
description             0
engineDisplacement      0
enginePower             0
equipment_dict          0
fuelType                0
image                   1
mileage                 0
numberOfDoors           0
parsing_unixtime        0
priceCurrency           0
productionDate          0
sell_id                 0
super_gen               0
vehicleConfiguration    0
vehicleTransmission     0
vendor                  0
Владельцы               0
ПТС                     0
Привод                  0
Руль                    0
price                   0
model                   0
engineName              0
enginePowerValue        0
owners                  0
dtype: int64

----
# CatBoost

In [None]:
# Vectors that are passed to the models.
features = ['bodyType', 'brand', 'color', 'engineDisplacement',
            'enginePower', 'fuelType', 'mileage',
            'numberOfDoors', 'productionDate', 'vehicleTransmission']
# Vectors that are replaced by their integer category codes.
features_obj =['bodyType', 'brand', 'color', 'engineDisplacement',
               'enginePower', 'fuelType',
               'numberOfDoors', 'vehicleTransmission']

# New frame with the selected vectors; the ones listed in features_obj are
# overwritten (column order unchanged) by the codes of their categories.
train_f = train[features].assign(
    **{name: train[name].astype('category').cat.codes for name in features_obj}
)

In [None]:
train_f

In [None]:
target = 'price'
# Feature matrix: the encoded frame train_f; target vector: the price from train.
x = train_f
y = train[target]

In [None]:
# Absolute correlation of every vector with price, strongest first;
# the leading entry (normally price against itself) is skipped.
stack = x.assign(price=y)
stack.corr()['price'].abs().sort_values(ascending=False).iloc[1:]

In [None]:
heatmap([],stack)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [None]:
# Baseline CatBoost regressor fitted on the raw price; the fitted model is
# saved to 'catboost_single_model_baseline.model'.
# NOTE(review): the hold-out split (X_test, y_test) is passed as eval_set with
# use_best_model=True and is also the data the MAPE is reported on in the next
# cell, so that score may be optimistic — consider a separate validation split.
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          )
model.fit(X_train, y_train,
          #cat_features=cat_features_ids,
          eval_set=(X_test, y_test),
          verbose_eval=0,
          use_best_model=True,
          #plot=True
          )

model.save_model('catboost_single_model_baseline.model')

In [None]:
# Score the model on the hold-out split; the printed message reads
# "Model accuracy by the MAPE metric", with the value shown in percent.
predict = model.predict(X_test)
print(f"Точность модели по метрике MAPE: {(mape(y_test, predict))*100:0.2f}%")

In [None]:
# Same CatBoost setup, but fitted on log(price); the fitted model is saved to
# 'catboost_single_model_2_baseline.model'.
# NOTE(review): eval_metric='MAPE' is computed on the log-scale targets passed
# to eval_set here, not on prices — confirm that this is the intended
# criterion for use_best_model.
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          )
model.fit(X_train, np.log(y_train),
          #cat_features=cat_features_ids,
          eval_set=(X_test, np.log(y_test)),
          verbose_eval=0,
          use_best_model=True,
          #plot=True
          )

model.save_model('catboost_single_model_2_baseline.model')

In [None]:
# The model predicts log(price), so predictions are mapped back with np.exp
# before the MAPE (printed in percent) is computed against the real prices.
predict_test = np.exp(model.predict(X_test))
print(f"Точность модели по метрике MAPE: {(mape(y_test, predict_test))*100:0.2f}%")

---
# Test various models

In [None]:
# Candidate regressors to compare, keyed by the name used in the results table.
# NOTE(review): GradientBoostingRegressor is imported at the top of the notebook
# but is not part of this dictionary — confirm whether that is intentional.
test_classifires = {
    'RandomForest': RandomForestRegressor(n_estimators=300, random_state = SEED),
    'DecisionTree': DecisionTreeRegressor(random_state=SEED),
    'ExtraTrees'  : ExtraTreesRegressor(random_state=SEED),
    'LightGBM'    : LGBMRegressor(random_state=SEED),
    'XGBoost'     : XGBRegressor(random_state=SEED)
}

In [None]:
# For every candidate regressor store two hold-out MAPE values (in percent):
# fitted on the raw price, and fitted on log(price) with the predictions
# mapped back through exp.
results = dict()
for name, regressor in tqdm(test_classifires.items()):
    regressor.fit(X_train, y_train)
    predict = regressor.predict(X_test)

    regressor.fit(X_train, np.log(y_train))
    predict_log = np.exp(regressor.predict(X_test))

    results[name] = [round(mape(y_test, values) * 100, 2)
                     for values in (predict, predict_log)]
results

---
# RandomForest best parameters

In [None]:
# Search ranges for the RandomForest hyper-parameters
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
bootstrap = [True, False]
param_dist = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

# Randomized search: 100 sampled parameter combinations, each evaluated with
# 3-fold cross validation. The forest is fitted on log(price).
model_rfc = RandomForestRegressor(random_state = SEED)
rs = RandomizedSearchCV(model_rfc,
                        param_dist,
                        n_iter = 100,
                        cv = 3,
                        verbose = 1,
                        n_jobs=-1,
                        random_state=SEED)
rs.fit(X_train, np.log(y_train))
# Copy the winning parameters before adding the seed, so that rs.best_params_
# keeps exactly what the search returned instead of being mutated through an alias.
rfc_best = {**rs.best_params_, 'random_state': SEED}
rfc_best
# approx. 16 minutes
# {'n_estimators': 200,
# 'min_samples_split': 2,
# 'min_samples_leaf': 2,
# 'max_features': 'sqrt',
# 'max_depth': 14,
# 'bootstrap': False,
# 'random_state': 42}

In [None]:
# Fit a forest with the selected parameters on log(price) and
# report its hold-out MAPE in percent.
model_rfc = RandomForestRegressor(**rfc_best).fit(X_train, np.log(y_train))
predict_log = np.exp(model_rfc.predict(X_test))
round(mape(y_test, predict_log) * 100, 2)

In [None]:
# Search results ordered from the best rank down, with the timing,
# per-split, std and raw 'params' columns removed.
rs_df = (
    pd.DataFrame(rs.cv_results_)
    .sort_values('rank_test_score')
    .reset_index(drop=True)
    .drop(columns=[
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time',
        'std_score_time',
        'params',
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
        'std_test_score',
    ])
)

In [None]:
# One bar chart of the mean CV score per searched hyper-parameter, on a 2 x 3 grid.
fig, axs = plt.subplots(ncols=3, nrows=2)
sns.set(style="whitegrid", color_codes=True, font_scale = 2)
fig.set_size_inches(30,25)

# (hyper-parameter, bar colour, y-axis limits), listed row by row of the grid
panels = [
    ('n_estimators',      'lightgrey',  [.83, .95]),
    ('min_samples_split', 'coral',      [.85, .95]),
    ('min_samples_leaf',  'lightgreen', [.80, .95]),
    ('max_features',      'wheat',      [.80, .95]),
    ('max_depth',         'lightpink',  [.80, .95]),
    ('bootstrap',         'skyblue',    [.80, .95]),
]
for ax, (param, colour, y_limits) in zip(axs.flat, panels):
    sns.barplot(x='param_' + param, y='mean_test_score', data=rs_df, ax=ax, color=colour)
    ax.set_ylim(y_limits)
    ax.set_title(label = param, size=30, weight='bold')
plt.show()

---
# DecisionTree

---
# GradientBoosting

---
# ExtraTrees

---
# LightGBM

---
# XGBoost