## imports

In [352]:
import pandas as pd
import numpy as np
import re
import time
import json

from lib.data_viz_functions import *

In [216]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## reading data from disk

In [240]:
train = pd.read_pickle("data/train_df_full_part1.pkl.zip", compression="zip")
test = pd.read_pickle("data/test.pkl.zip", compression="zip")

train.shape, test.shape

((130201, 36), (34686, 32))

## functions

In [315]:
def get_number_of_weeks_from_ownings(in_str):
    if not isinstance(in_str, str):
        return None
    list_of_own = in_str.split()
    if len(list_of_own) == 5:
        return int(list_of_own[0]) * 12 + int(list_of_own[3])
    else:
        if list_of_own[1] in ['лет', 'год']:
            return int(list_of_own[0]) * 12
        else:
            return int(list_of_own[0])
        
        
def get_number_of_owners_from_owners(in_str):
    if not isinstance(in_str, str):
        return None
    else:
        result = in_str.replace('\xa0', '')
        return int(re.sub('\D', '', result))
    # return string (to cat)
    

def get_engine_value(in_str):
    parsed_str = re.findall('(\d+.\d+)', in_str)
    if len(parsed_str):
        return float(parsed_str[0])
    else:
        return None

## quick view

In [7]:
describe_nums(train.select_dtypes(exclude="object"))

Unnamed: 0,unique,n/a count,count,mean,std,min,25%,50%,75%,max
parsing_unixtime,130154,47,130154.0,1632842487.144867,131636.029755,1632613481.0,1632724602.5,1632820776.0,1632972687.5,1633049056.0
sell_id,40721,232,129969.0,1104865960.875278,1779786.356124,1003139396.0,1104900484.0,1105218565.0,1105376150.0,1105402618.0
mileage,15181,27167,103034.0,116610.915407,85154.492046,1.0,51000.0,100000.0,161000.0,1000000.0
price,8105,20436,109765.0,2801223.260639,2689068.150067,23000.0,1000000.0,2045000.0,3649000.0,39990000.0
views,4511,28203,101998.0,564.982284,1814.036025,2.0,127.0,245.0,511.0,148149.0
modelDate,53,215,129986.0,2012.934578,6.017749,1938.0,2010.0,2014.0,2017.0,2021.0
productionDate,50,213,129988.0,2015.083854,5.929774,1952.0,2012.0,2016.0,2020.0,2021.0
numberOfDoors,4,214,129987.0,4.643164,0.619354,2.0,4.0,5.0,5.0,5.0
model_info,0,130201,0.0,,,,,,,
vendor,0,130201,0.0,,,,,,,


In [8]:
describe_nums(test.select_dtypes(exclude="object"))

Unnamed: 0,unique,n/a count,count,mean,std,min,25%,50%,75%,max
parsing_unixtime,34686,0,34686.0,1603286733.619356,149307.058483,1603107306.0,1603221157.75,1603254133.0,1603290080.25,1603710264.0
sell_id,34686,0,34686.0,1098300150.880644,19112247.594511,2665.0,1099048798.0,1100910913.0,1101245023.75,1101374610.0
mileage,11268,0,34686.0,162009.767889,100676.559489,1.0,91153.5,149779.5,215000.0,1000000.0
productionDate,69,0,34686.0,2009.264602,7.047661,1904.0,2006.0,2011.0,2014.0,2020.0
modelDate,66,0,34686.0,2007.074728,7.415894,1904.0,2004.0,2008.0,2012.0,2020.0
numberOfDoors,5,0,34686.0,4.450816,0.70304,0.0,4.0,5.0,5.0,5.0


In [9]:
train.select_dtypes("object").shape, test.select_dtypes("object").shape

((130201, 26), (34686, 26))

In [10]:
train.select_dtypes("object").describe().T.sort_values("unique", ascending=False)

Unnamed: 0,count,unique,top,freq
super_gen,130135,43825,"{'sale-data-attributes': {'asciiCat': 'cars', ...",235
image,130065,42933,https://avatars.mds.yandex.net/get-autoru-vos/...,241
car_url,129969,40721,https://auto.ru/cars/new/group/toyota/rav_4/21...,428
description,129988,35274,Официальный дилер Mercedes Benz ООО РОЛЬФ фили...,3230
equipment_dict,129969,31190,{},8214
complectation_dict,105267,2348,"['cruise-control', 'multi-wheel', 'airbag-pass...",1979
name,130153,2283,Nissan X-Trail III Рестайлинг,4013
region,103034,1335,в Москве,39180
date_added,103034,939,25 сентября,44873
vehicleConfiguration,129987,564,ALLROAD_5_DOORS AUTOMATIC 3.0,10895


In [67]:
train['enginePower'] = train['enginePower'].replace('undefined N12', None)
train['enginePower'] = train[~pd.isna(train['enginePower'])]['enginePower'].str.split().str.get(0).astype('int')

In [70]:
train['engineDisplacement'] = train['engineDisplacement'].replace(' LTR', None)
train['engineDisplacement'] = train[~pd.isna(train['engineDisplacement'])]['engineDisplacement'].str.split().str.get(0).astype('float')

In [253]:
train['engineDisplacement'] = train[~pd.isna(train['engineDisplacement'])]['engineDisplacement'].apply(get_engine_value)

In [254]:
train[~pd.isna(train['engineDisplacement'])]['engineDisplacement']

0         1.8
1         1.6
2         1.6
3         1.4
4         1.4
         ... 
130196    2.0
130197    2.4
130198    1.6
130199    1.6
130200    2.0
Name: engineDisplacement, Length: 129572, dtype: float64

In [257]:
for i in range(10):
    print(f'page {i+1} of 10', end='\r')
    time.sleep(1)

page 10 of 10

In [281]:
train['used'] = train['car_url'].str.contains('used')

In [290]:
train[train['used'] == False].dropna(thresh=24).shape

(0, 37)

check NA <= 5 () (new column)  
make new or used column  
electric - drop

In [280]:
train.loc[~train['car_url'].str.contains('used')]

TypeError: bad operand type for unary ~: 'float'

make plot 

In [220]:
test.iloc[34682]

bodyType                                                            седан
brand                                                                 BMW
car_url                 https://auto.ru/cars/used/sale/bmw/5er/1101369...
color                                                              чёрный
complectation_dict                                                    NaN
description                                Продаётся отличный автомобиль.
engineDisplacement                                                2.0 LTR
enginePower                                                       190 N12
equipment_dict                                                        NaN
fuelType                                                           дизель
image                   https://autoru.naydex.net/nwI1K7152/b604fdsYg7...
mileage                                                             98000
modelDate                                                            2016
model_info              {"code":"5ER",

In [195]:
train['car_url'].str.contains('used').value_counts()

True     103034
False     26935
Name: car_url, dtype: int64

In [249]:
in_str = '2.0 LTR'
parsed_str = re.findall('(\d+.\d+)', in_str)
if len(parsed_str):
    print(parsed_str[0])
else:
    print(None)

2.0


In [11]:
test.select_dtypes("object").describe().T.sort_values("unique", ascending=False)

Unnamed: 0,count,unique,top,freq
car_url,34686,34686,https://auto.ru/cars/used/sale/bmw/3er/1078412...,1
image,34686,34557,https://avatars.mds.yandex.net/get-verba/21620...,13
description,34686,31732,Выгода до 82 000 руб. при обмене на Ваш автомо...,264
equipment_dict,24690,23705,"{""leather"":true}",108
super_gen,34686,5890,"{""id"":""6214876"",""displacement"":1598,""engine_ty...",193
name,34686,2780,1.6 AT (110 л.с.),631
complectation_dict,6418,2364,"{""id"":""4562904"",""name"":""Elegance"",""available_o...",51
model_info,34686,954,"{""code"":""OCTAVIA"",""name"":""Octavia"",""ru_name"":""...",1404
vehicleConfiguration,34686,634,ALLROAD_5_DOORS AUTOMATIC 3.0,2389
model_name,34686,544,OCTAVIA,1418


In [321]:
test['fuelType'].value_counts()

бензин     28601
дизель      5800
гибрид       223
электро       55
газ            7
Name: fuelType, dtype: int64

In [332]:
test[~pd.isna(test['complectation_dict'])]

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,image,mileage,modelDate,model_info,model_name,name,numberOfDoors,parsing_unixtime,priceCurrency,productionDate,sell_id,super_gen,vehicleConfiguration,vehicleTransmission,vendor,Владельцы,Владение,ПТС,Привод,Руль,Состояние,Таможня
2,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/superb/11...,серый,"{""id"":""20026336"",""name"":""Ambition"",""available_...","Все автомобили, представленные в продаже, прох...",1.8 LTR,152 N12,"{""cruise-control"":true,""tinted-glass"":true,""es...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,88000,2013,"{""code"":""SUPERB"",""name"":""Superb"",""ru_name"":""Су...",SUPERB,DSG 1.8 AMT (152 л.с.),5,1603226280,RUB,2014,1100658222,"{""id"":""20026323"",""nameplate"":""DSG"",""displaceme...",LIFTBACK ROBOT 1.8,роботизированная,EUROPEAN,1 владелец,,Оригинал,передний,Левый,Не требует ремонта,Растаможен
3,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,коричневый,"{""id"":""20803582"",""name"":""Ambition"",""available_...",КОМПЛЕКТ ЗИМНЕЙ (ЛЕТНЕЙ) РЕЗИНЫ ПО СЕЗОНУ В ПО...,1.6 LTR,110 N12,"{""cruise-control"":true,""roller-blind-for-rear-...",бензин,https://autoru.naydex.net/o9DBXQ270/5ac010hAY0...,95000,2013,"{""code"":""OCTAVIA"",""name"":""Octavia"",""ru_name"":""...",OCTAVIA,1.6 AT (110 л.с.),5,1603226284,RUB,2014,1100937408,"{""id"":""20105521"",""displacement"":1598,""engine_t...",LIFTBACK AUTOMATIC 1.6,автоматическая,EUROPEAN,1 владелец,,Оригинал,передний,Левый,Не требует ремонта,Растаможен
5,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia_r...,серый,"{""id"":""5011515"",""name"":""RS"",""available_options...",Продаю свой автомобиль! Автомобиль полностью и...,2.0 LTR,200 N12,"{""cruise-control"":true,""tinted-glass"":true,""es...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,172000,2008,"{""code"":""OCTAVIA_RS"",""name"":""Octavia RS"",""ru_n...",OCTAVIA_RS,2.0 AMT (200 л.с.),5,1603226291,RUB,2012,1100912634,"{""id"":""5009158"",""displacement"":1984,""engine_ty...",LIFTBACK ROBOT 2.0,роботизированная,EUROPEAN,3 или более,,Оригинал,передний,Левый,Не требует ремонта,Растаможен
6,внедорожник 5 дв.,SKODA,https://auto.ru/cars/used/sale/skoda/yeti/1101...,пурпурный,"{""id"":""20069264"",""name"":""Elegance"",""available_...",- выгода 60 000 рублей при покупке авто в Trad...,1.8 LTR,152 N12,"{""cruise-control"":true,""tinted-glass"":true,""es...",бензин,https://autoru.naydex.net/o9DBXQ270/5ac010hAY0...,107000,2009,"{""code"":""YETI"",""name"":""Yeti"",""ru_name"":""Йети"",...",YETI,1.8 AMT (152 л.с.) 4WD,5,1603226295,RUB,2012,1101228730,"{""id"":""20089129"",""displacement"":1798,""engine_t...",ALLROAD_5_DOORS ROBOT 1.8,роботизированная,EUROPEAN,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
16,лифтбек,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,белый,"{""id"":""20913628"",""name"":""Style"",""available_opt...",• Автомобиль продается официальным дилером SKO...,1.8 LTR,180 N12,"{""asr"":true,""esp"":true,""adaptive-light"":true,""...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,43800,2017,"{""code"":""OCTAVIA"",""name"":""Octavia"",""ru_name"":""...",OCTAVIA,1.8 AMT (180 л.с.),5,1603226333,RUB,2018,1101140033,"{""id"":""20898378"",""displacement"":1798,""engine_t...",LIFTBACK ROBOT 1.8,роботизированная,EUROPEAN,1 владелец,,Оригинал,передний,Левый,Не требует ремонта,Растаможен
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34665,внедорожник 5 дв.,BMW,https://auto.ru/cars/used/sale/bmw/x4/11013707...,белый,"{""id"":""21184881"",""name"":""xDrive20i xLine"",""ava...","Доброго дня!.Автомобиль в отличном состоянии, ...",2.0 LTR,184 N12,"{""esp"":true,""start-stop-function"":true,""airbag...",бензин,https://autoru.naydex.net/nwI1K7152/b604fdsYg7...,80000,2014,"{""code"":""X4"",""name"":""X4"",""ru_name"":""Х4"",""morph...",X4,20i 2.0 AT (184 л.с.) 4WD,5,1603707377,RUB,2016,1101370731,"{""id"":""20102730"",""name"":""20i"",""nameplate"":""20i...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,2 года и 7 месяцев,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34666,седан,BMW,https://auto.ru/cars/used/sale/bmw/3er/1101370...,коричневый,"{""id"":""20580761"",""name"":""328i"",""available_opti...",Третий собственник по ПТС. Машина была куплен...,2.0 LTR,245 N12,"{""asr"":true,""tinted-glass"":true,""esp"":true,""us...",бензин,https://autoru.naydex.net/nwI1K7152/b604fdsYg7...,117000,2011,"{""code"":""3ER"",""name"":""3 серии"",""ru_name"":""3 се...",3ER,328i 2.0 AT (245 л.с.),4,1603707401,RUB,2012,1101370758,"{""id"":""7959524"",""name"":""328"",""nameplate"":""328i...",SEDAN AUTOMATIC 2.0,автоматическая,EUROPEAN,3 или более,1 год и 3 месяца,Оригинал,задний,Левый,Не требует ремонта,Растаможен
34668,седан,BMW,https://auto.ru/cars/used/sale/bmw/5er/1101369...,чёрный,"{""id"":""21037026"",""name"":""520d xDrive"",""availab...",Добрый день . Предлагаю ЛУЧШИЙ автомобиль в св...,2.0 LTR,190 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",дизель,https://avatars.mds.yandex.net/get-autoru-vos/...,45000,2016,"{""code"":""5ER"",""name"":""5 серии"",""ru_name"":""5 се...",5ER,520d xDrive 2.0d AT (190 л.с.) 4WD,4,1603707483,RUB,2018,1101369894,"{""id"":""20856402"",""name"":""520"",""nameplate"":""520...",SEDAN AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,1 год и 11 месяцев,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34669,седан,BMW,https://auto.ru/cars/used/sale/bmw/3er/1101370...,чёрный,"{""id"":""21078977"",""name"":""318i"",""available_opti...","Машина немецкой сборки, на гарантии, обслужива...",1.5 LTR,136 N12,"{""engine-proof"":true,""esp"":true,""airbag-driver...",бензин,https://autoru.naydex.net/nwI1K7152/b604fdsYg7...,64600,2015,"{""code"":""3ER"",""name"":""3 серии"",""ru_name"":""3 се...",3ER,318i 1.5 AT (136 л.с.),4,1603707487,RUB,2018,1101370676,"{""id"":""20548469"",""name"":""318"",""nameplate"":""318...",SEDAN AUTOMATIC 1.5,автоматическая,EUROPEAN,1 владелец,2 года и 3 месяца,Оригинал,задний,Левый,Не требует ремонта,Растаможен


In [309]:
test.groupby('car_url')['image'].count().sort_values(ascending=False).head(10)

car_url
https://auto.ru/cars/used/sale/audi/100/1016944865-8eb8f/            1
https://auto.ru/cars/used/sale/skoda/octavia/1100434568-e9537421/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100515744-d97a41fa/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100514944-54916e94/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100513036-35594e9f/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100495414-bb724b6c/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100492150-5ae5281f/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100491772-7f29f9e8/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100487934-c961ee31/    1
https://auto.ru/cars/used/sale/skoda/octavia/1100486714-6ff064ce/    1
Name: image, dtype: int64

In [334]:
test.iloc[2]['model_info']

'{"code":"SUPERB","name":"Superb","ru_name":"Суперб","morphology":{},"nameplate":{"code":"","name":"","semantic_url":""}}'

In [323]:
test[test['vehicleConfiguration'] == 'ALLROAD_5_DOORS AUTOMATIC 2.0']

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,image,mileage,modelDate,model_info,model_name,name,numberOfDoors,parsing_unixtime,priceCurrency,productionDate,sell_id,super_gen,vehicleConfiguration,vehicleTransmission,vendor,Владельцы,Владение,ПТС,Привод,Руль,Состояние,Таможня
2373,внедорожник 5 дв.,AUDI,https://auto.ru/cars/used/sale/audi/q5/1101018...,коричневый,,Продам свою машину AUDI Q5 в комплектации EXCL...,2.0 LTR,225 N12,"{""cruise-control"":true,""tinted-glass"":true,""es...",бензин,https://autoru.naydex.net/ys1kR7800/fbd964zfcB...,114000,2012,"{""code"":""Q5"",""name"":""Q5"",""ru_name"":""Ку5"",""morp...",Q5,2.0 AT (225 л.с.) 4WD,5,1603121495,RUB,2012,1101018904,"{""id"":""8351307"",""displacement"":1984,""engine_ty...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,3 или более,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
2375,внедорожник 5 дв.,AUDI,https://auto.ru/cars/used/sale/audi/q5/1101206...,чёрный,,Фактический 1 владелец.\n\n。Подушка безопаснос...,2.0 LTR,225 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,67205,2012,"{""code"":""Q5"",""name"":""Q5"",""ru_name"":""Ку5"",""morp...",Q5,2.0 AT (225 л.с.) 4WD,5,1603121503,RUB,2015,1101206595,"{""id"":""8351307"",""displacement"":1984,""engine_ty...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
2419,внедорожник 5 дв.,AUDI,https://auto.ru/cars/used/sale/audi/q5/1101192...,чёрный,,АСЦ / ЧЕСТНО!\n«АвтоСпецЦентр» Химки – это шир...,2.0 LTR,225 N12,"{""alloy-wheel-disks"":true,""ptf"":true,""esp"":tru...",бензин,https://autoru.naydex.net/ys1kR7800/fbd964zfcB...,126700,2012,"{""code"":""Q5"",""name"":""Q5"",""ru_name"":""Ку5"",""morp...",Q5,2.0 AT (225 л.с.) 4WD,5,1603121680,RUB,2013,1101192211,"{""id"":""8351307"",""displacement"":1984,""engine_ty...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
2465,внедорожник 5 дв.,AUDI,https://auto.ru/cars/used/sale/audi/q5/1101289...,чёрный,,Комфорт: 5. Безопасность: 5. Обзор: 4. Салон: ...,2.0 LTR,225 N12,"{""cruise-control"":true,""airbag-rear-side"":true...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,81378,2012,"{""code"":""Q5"",""name"":""Q5"",""ru_name"":""Ку5"",""morp...",Q5,2.0 AT (225 л.с.) 4WD,5,1603121870,RUB,2014,1101289792,"{""id"":""8351307"",""displacement"":1984,""engine_ty...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
2466,внедорожник 5 дв.,AUDI,https://auto.ru/cars/used/sale/audi/q5/1101015...,чёрный,,Артикул: 25803\n\nПРИ ПОКУПКЕ АВТОМОБИЛЯ В КРЕ...,2.0 LTR,211 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,111000,2008,"{""code"":""Q5"",""name"":""Q5"",""ru_name"":""Ку5"",""morp...",Q5,8tiptronic 2.0 AT (211 л.с.) 4WD,5,1603121875,RUB,2011,1101015857,"{""id"":""8219747"",""nameplate"":""8tiptronic"",""disp...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34648,внедорожник 5 дв.,BMW,https://auto.ru/cars/used/sale/bmw/x4/11013729...,белый,"{""id"":""21073109"",""name"":""xDrive20d M Sport Лок...","В продаже автомобили от компании АО ABTODOM, о...",2.0 LTR,190 N12,"{""cruise-control"":true,""esp"":true,""adaptive-li...",дизель,https://autoru.naydex.net/nwI1K7152/b604fdsYg7...,36639,2014,"{""code"":""X4"",""name"":""X4"",""ru_name"":""Х4"",""morph...",X4,20d 2.0d AT (190 л.с.) 4WD,5,1603706862,RUB,2018,1101372969,"{""id"":""20102733"",""name"":""20d"",""nameplate"":""20d...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,1 владелец,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34649,внедорожник 5 дв.,BMW,https://auto.ru/cars/used/sale/bmw/x1/11013719...,белый,,"Автомобиль в отличном состоянии, дизель, небол...",2.0 LTR,184 N12,"{""wheel-heat"":true,""navigation"":true,""eco-leat...",дизель,https://avatars.mds.yandex.net/get-autoru-vos/...,82700,2012,"{""code"":""X1"",""name"":""X1"",""ru_name"":""Х1"",""morph...",X1,20d 2.0d AT (184 л.с.) 4WD,5,1603706889,RUB,2013,1101371938,"{""id"":""8247004"",""name"":""20d"",""nameplate"":""20d""...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,1 владелец,,Дубликат,полный,Левый,Не требует ремонта,Растаможен
34658,внедорожник 5 дв.,BMW,https://auto.ru/cars/used/sale/bmw/x3/11013699...,чёрный,,Комфорт: 10. Прочее: 1. Безопасность: 9. Салон...,2.0 LTR,245 N12,"{""cruise-control"":true,""engine-proof"":true,""as...",бензин,https://avatars.mds.yandex.net/get-autoru-vos/...,156000,2010,"{""code"":""X3"",""name"":""X3"",""ru_name"":""Х3"",""morph...",X3,28i xDrive 2.0 AT (245 л.с.) 4WD,5,1603707061,RUB,2013,1101369973,"{""id"":""7957949"",""name"":""28i"",""nameplate"":""28i ...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,,Оригинал,полный,Левый,Не требует ремонта,Растаможен
34665,внедорожник 5 дв.,BMW,https://auto.ru/cars/used/sale/bmw/x4/11013707...,белый,"{""id"":""21184881"",""name"":""xDrive20i xLine"",""ava...","Доброго дня!.Автомобиль в отличном состоянии, ...",2.0 LTR,184 N12,"{""esp"":true,""start-stop-function"":true,""airbag...",бензин,https://autoru.naydex.net/nwI1K7152/b604fdsYg7...,80000,2014,"{""code"":""X4"",""name"":""X4"",""ru_name"":""Х4"",""morph...",X4,20i 2.0 AT (184 л.с.) 4WD,5,1603707377,RUB,2016,1101370731,"{""id"":""20102730"",""name"":""20i"",""nameplate"":""20i...",ALLROAD_5_DOORS AUTOMATIC 2.0,автоматическая,EUROPEAN,2 владельца,2 года и 7 месяцев,Оригинал,полный,Левый,Не требует ремонта,Растаможен


In [136]:
test['Владение'] = test['Владение'].apply(get_number_of_weeks_from_ownings)
train['Владение'] = train['Владение'].apply(get_number_of_weeks_from_ownings)

In [159]:
test['Владельцы'] = test['Владельцы'].apply(get_number_of_owners_from_owners)
train['Владельцы'] = train['Владельцы'].apply(get_number_of_owners_from_owners)

In [8]:
print(
    "unique object cols in train:",
    set(train.select_dtypes("object").columns.tolist())
    - set(test.select_dtypes("object").columns.tolist()),
    "\nunique object cols in test:",
    set(test.select_dtypes("object").columns.tolist())
    - set(train.select_dtypes("object").columns.tolist()),
)


unique object cols in train: {'region', 'date_added'} 
unique object cols in test: {'vendor', 'model_info'}


## making train and test similar

In [9]:
train["model_name"] = train.model_name.apply(lambda x: str(x).lower())
test["model_name"] = test.model_name.apply(lambda x: str(x).lower())

In [10]:
vendor_voc = test[["brand", "vendor"]].drop_duplicates().set_index("brand").to_dict()["vendor"]
vendor_voc

{'SKODA': 'EUROPEAN',
 'AUDI': 'EUROPEAN',
 'HONDA': 'JAPANESE',
 'VOLVO': 'EUROPEAN',
 'BMW': 'EUROPEAN',
 'NISSAN': 'JAPANESE',
 'INFINITI': 'JAPANESE',
 'MERCEDES': 'EUROPEAN',
 'TOYOTA': 'JAPANESE',
 'LEXUS': 'JAPANESE',
 'VOLKSWAGEN': 'EUROPEAN',
 'MITSUBISHI': 'JAPANESE'}

In [11]:
train.brand.unique().tolist()

['SKODA',
 nan,
 'AUDI',
 'HONDA',
 'VOLVO',
 'BMW',
 'NISSAN',
 'INFINITI',
 'MERCEDES',
 'TOYOTA',
 'LEXUS',
 'VOLKSWAGEN',
 'MITSUBISHI']

In [12]:
train["vendor"] = train["brand"].map(vendor_voc)
train.vendor.unique().tolist()

['EUROPEAN', nan, 'JAPANESE']

In [13]:
print(len(train.loc[train.vendor.isna()]["model_name"].unique().tolist()), "na of", len(train.model_name.unique().tolist()))

1 na of 514


In [14]:
train.loc[train.vendor.isna()].shape

(213, 36)

In [15]:
train.priceCurrency.unique()

array(['RUB', 'RUR', nan], dtype=object)

In [16]:
del train["priceCurrency"]
del test["priceCurrency"]
del train["model_info"]
del test["model_info"]
del train["views"]
del train["date_added"]
del train["region"]
del train["Состояние"]
del test["Состояние"]
del train["Таможня"]
del test["Таможня"]

## view again

In [17]:
describe_nums(train.select_dtypes(exclude="object"))

Unnamed: 0,unique,n/a count,count,mean,std,min,25%,50%,75%,max
parsing_unixtime,130154,47,130154.0,1632842487.144867,131636.029755,1632613481.0,1632724602.5,1632820776.0,1632972687.5,1633049056.0
sell_id,40721,232,129969.0,1104865960.875278,1779786.356124,1003139396.0,1104900484.0,1105218565.0,1105376150.0,1105402618.0
mileage,15181,27167,103034.0,116610.915407,85154.492046,1.0,51000.0,100000.0,161000.0,1000000.0
price,8105,20436,109765.0,2801223.260639,2689068.150068,23000.0,1000000.0,2045000.0,3649000.0,39990000.0
modelDate,53,215,129986.0,2012.934578,6.017749,1938.0,2010.0,2014.0,2017.0,2021.0
productionDate,50,213,129988.0,2015.083854,5.929774,1952.0,2012.0,2016.0,2020.0,2021.0
numberOfDoors,4,214,129987.0,4.643164,0.619354,2.0,4.0,5.0,5.0,5.0


In [18]:
describe_nums(test.select_dtypes(exclude="object"))

Unnamed: 0,unique,n/a count,count,mean,std,min,25%,50%,75%,max
parsing_unixtime,34686,0,34686.0,1603286733.619356,149307.058483,1603107306.0,1603221157.75,1603254133.0,1603290080.25,1603710264.0
sell_id,34686,0,34686.0,1098300150.880644,19112247.594511,2665.0,1099048798.0,1100910913.0,1101245023.75,1101374610.0
mileage,11268,0,34686.0,162009.767889,100676.559489,1.0,91153.5,149779.5,215000.0,1000000.0
productionDate,69,0,34686.0,2009.264602,7.047661,1904.0,2006.0,2011.0,2014.0,2020.0
modelDate,66,0,34686.0,2007.074728,7.415894,1904.0,2004.0,2008.0,2012.0,2020.0
numberOfDoors,5,0,34686.0,4.450816,0.70304,0.0,4.0,5.0,5.0,5.0


In [19]:
train.select_dtypes("object").shape, test.select_dtypes("object").shape

((130201, 22), (34686, 22))

In [20]:
train.select_dtypes("object").describe().T.sort_values("unique", ascending=False)

Unnamed: 0,count,unique,top,freq
super_gen,130135,43825,"{'sale-data-attributes': {'asciiCat': 'cars', ...",235
image,130065,42933,https://avatars.mds.yandex.net/get-autoru-vos/...,241
car_url,129969,40721,https://auto.ru/cars/new/group/toyota/rav_4/21...,428
description,129988,35274,Официальный дилер Mercedes Benz ООО РОЛЬФ фили...,3230
equipment_dict,129969,31190,{},8214
complectation_dict,105267,2348,"['cruise-control', 'multi-wheel', 'airbag-pass...",1979
name,130153,2283,Nissan X-Trail III Рестайлинг,4013
vehicleConfiguration,129987,564,ALLROAD_5_DOORS AUTOMATIC 3.0,10895
model_name,130201,514,,27167
enginePower,129988,307,249 N12,14262


In [306]:
test.select_dtypes("object").describe().T.sort_values("unique", ascending=False)['top'].head(1).tolist()

['https://auto.ru/cars/used/sale/bmw/3er/1078412133-45adb3d5/']

In [344]:
test.sample(3).T

Unnamed: 0,16633,5432,8200
bodyType,внедорожник 5 дв.,внедорожник 5 дв.,внедорожник 5 дв.
brand,MERCEDES,HONDA,BMW
car_url,https://auto.ru/cars/used/sale/mercedes/g_klas...,https://auto.ru/cars/used/sale/honda/cr_v/1101...,https://auto.ru/cars/used/sale/bmw/x5_m/108984...
color,чёрный,голубой,синий
complectation_dict,,,"{""id"":""20493549"",""name"":""Базовая"",""available_o..."
description,id: 41467\n«МБ Измайлово» – официальный дилер ...,"В идеальном состоянии, ни одного крашенного эл...",Родной Цвет - Белый.\nАвтомобиль куплен весной...
engineDisplacement,3.0 LTR,2.4 LTR,4.4 LTR
enginePower,245 N12,188 N12,575 N12
equipment_dict,"{""cruise-control"":true,""engine-proof"":true,""as...","{""cruise-control"":true,""tinted-glass"":true,""es...","{""cruise-control"":true,""esp"":true,""adaptive-li..."
fuelType,дизель,бензин,бензин


In [366]:
train.iloc[97515]['complectation_dict']

"['cruise-control', 'multi-wheel', 'airbag-passenger', 'lock', 'electro-mirrors', 'mirrors-heat', 'leather', 'computer', 'seat-transformation', 'wheel-power', 'light-cleaner', 'airbag-rear-side', 'airbag-side', 'abs', 'wheel-leather', 'climate-control-1', 'auto-mirrors', 'esp', 'audiopreparation', 'electro-window-back', 'condition', 'park-assist-r', 'airbag-driver', 'aux', 'electro-window-front', 'hcc', 'airbag-curtain', 'keyless-entry', 'passenger-seat-electric', 'start-button', 'ptf', 'audiosystem-cd', 'migration-flag', 'front-seats-heat', 'bluetooth', 'wheel-configuration2', 'wheel-configuration1', 'immo']"

In [365]:
test.iloc[16633]['model_info']

'{"code":"G_KLASSE","name":"G-Класс","ru_name":"G-класс","morphology":{},"nameplate":{"code":"9264617","name":"350","semantic_url":"350"}}'

In [362]:
test['transmission'] = test['super_gen'].apply(lambda x: json.loads(x)['transmission'])

In [363]:
test['transmission'].unique()

array(['ROBOT', 'MECHANICAL', 'AUTOMATIC', 'VARIATOR'], dtype=object)

In [22]:
train.shape, test.shape

((130201, 29), (34686, 28))

In [23]:
train.loc[train.price.isna()].shape[0], train.price.shape[0], train.loc[train.price.isna()].shape[0] / train.price.shape[0]

(20436, 130201, 0.1569573198362532)

## conclusion

$y = price$ - dropna, take a log

- **car_url** - why we have different rows with the same url for train?
- **image** - maybe same images with different urls indicate fraud? - checked - to remove
- **description** - to tokenize - to read more about tokenize
- **equipment_dict** - deserialize, expand as additional cols
- **complectation_dict** - deserialize, expand as additional cols
- **name** - to check intersection train vs test
- **vehicleConfiguration** - view and maybe split to several features if splittable, and check the mean of number 3.0
- **engineDisplacement** - convert to float
- **enginePower** - convert to integer
- **Владельцы** - convert to integer
- **Владение** - calculate number of days
- **model_name** - check NAs, compare with **name** - maybe keep only one?
- **vendor** - check NAs
- **bodyType**, **color**, **brand**, **fuelType**, **vehicleTransmission**, **Привод**, **ПТС**, **Руль** - _temporary keep as is_
- mileage rename  
- compare with existing features  
- compare 4 dicts (equepment, complactation) train - test  
Numerics - fill na, log if tailed, standartize  
https://www.kaggle.com/datasets/gmbitz/all-auto-ru-09-09-2020