In [1]:
from warnings import filterwarnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

from tqdm import tqdm

import re
import time
import json

filterwarnings("ignore")

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [3]:
test_data = pd.read_pickle("data/test.pkl.zip", compression="zip")

In [4]:
test_data.iloc[12561]['description']

'Машина в отличном состоянии!!17.10.2020 было сделано Т.О заменено масло в коробке передач в двигателе,так же заменены все фильтра.Почти максимальная комплектация кроме люка!!! Без ключевой доступ. В ДТП не была не билась.Красился капот и передний бампер так как от времени были сколы.Капот фары туманки затянуты броне пленкой.Делали для себя супруга не ездит вся причина продажи.. Новая летняя резина. Продаю не спеша. '

In [5]:
valid_data = pd.DataFrame()
catalog_data = pd.DataFrame()

In [6]:
test_data['bodyType'].unique()

array(['лифтбек', 'внедорожник 5 дв.', 'хэтчбек 5 дв.', 'седан',
       'компактвэн', 'универсал 5 дв.', 'пикап одинарная кабина',
       'хэтчбек 3 дв.', 'купе', 'кабриолет', 'минивэн',
       'пикап двойная кабина', 'внедорожник 3 дв.', 'родстер', 'микровэн',
       'седан 2 дв.', 'купе-хардтоп', 'фастбек', 'тарга',
       'внедорожник открытый', 'лимузин', 'пикап полуторная кабина',
       'седан-хардтоп', 'фургон'], dtype=object)

In [7]:
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# driver = webdriver.Chrome("./chromedriver", options=options)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/home/user/.wdm/drivers/chromedriver/linux64/99.0.4844.51/chromedriver] found in cache


In [8]:
regions = [
    "leningradskaya_oblast",
    # "moskovskaya_oblast",
]
models = [
    "skoda",
    "audi",
    "honda",
    "volvo",
    # "bmw",
    # "nissan",
    # "infiniti",
    # "mercedes",
    # "toyota",
    # "lexus",
    # "volkswagen",
    # "mitsubishi",
]


In [9]:
urls_list = []

In [10]:
def get_urls_for_model_in_region(model, region):
    
    result_list = []
    cnt = True
    page_num = 1
    
    while cnt is True:
        try:
            driver.get(f'https://auto.ru/{region}/cars/{model}/used/?page={page_num}')
            next_page = driver.find_element(By.CLASS_NAME, 'ListingItemTitle__link')
            page_links = driver.find_elements(By.CLASS_NAME, 'ListingItemTitle__link')

            for item in page_links:
                result_list.append(item.get_attribute('href'))
            print(f'Parsing {page_num} page for {model} in {region}', end='\r')
            page_num += 1
        except NoSuchElementException:
            cnt = False
            print()
            print(f'There are {page_num} pages by {model}')
    
    return result_list

In [11]:
def get_car_info(url):
        
    driver.get(url)
    
    year = int(driver.find_element(By.CLASS_NAME, 'CardInfoRow_year').text.split('\n')[1])
    
    car = driver.find_elements(By.CLASS_NAME,'CardBreadcrumbs__itemText')
    brand = car[3].text
    model = car[4].text
    
    car_url = url
    
    bodytype = driver.find_element(By.CLASS_NAME, 'CardInfoRow_bodytype').text.split('\n')[1]
    kmage = int(re.sub('\D', '', driver.find_element(By.CLASS_NAME, 'CardInfoRow_kmAge').text))
    color = driver.find_element(By.CLASS_NAME, 'CardInfoRow_color').text.split('\n')[1]
    
    engine = driver.find_element(By.CLASS_NAME, 'CardInfoRow_engine').text.split('/')
    engineDisplacement = float(re.findall('(\d+.\d+)', engine[0])[0])
    enginePower = int(re.findall('\d+', engine[1])[0])
    fuelType = engine[2]
    
    super_gen = json.loads(driver.find_element(By.ID, 'sale-data-attributes').get_attribute('data-bem'))['sale-data-attributes']
    
    vehicleTransmission = driver.find_element(By.CLASS_NAME, 'CardInfoRow_transmission').text.split('\n')[1]
    drive = driver.find_element(By.CLASS_NAME, 'CardInfoRow_drive').text.split('\n')[1]
    wheel = driver.find_element(By.CLASS_NAME, 'CardInfoRow_wheel').text.split('\n')[1]
    state = driver.find_element(By.CLASS_NAME, 'CardInfoRow_state').text.split('\n')[1]
    owner = driver.find_element(By.CLASS_NAME, 'CardInfoRow_ownersCount').text.split('\n')[1]
    pts = driver.find_element(By.CLASS_NAME, 'CardInfoRow_pts').text.split('\n')[1]
    customs = driver.find_element(By.CLASS_NAME, 'CardInfoRow_customs').text.split('\n')[1]
    
    try:
        owningTime = driver.find_element(By.CLASS_NAME, 'CardInfoRow_owningTime').text.split('\n')[1]
    except NoSuchElementException:
        owningTime = None
    
    try:
        description = driver.find_element(By.CLASS_NAME, 'CardDescriptionHTML').text
    except NoSuchElementException:
        description = None
    
    sell_id = int(re.findall('\d+', driver.find_element(By.CLASS_NAME, 'CardHead__id').text)[0])
    price = int(re.sub('\D', '', driver.find_element(By.CLASS_NAME, 'OfferPriceCaption__price').text))
    
    complect_list = []
    
    complect_data = driver.find_elements(By.CLASS_NAME, 'ComplectationGroupsDesktop__itemList')

    for item in complect_data:
        complect_list.extend(item.text.replace('\n', '').split('•')[1:])
    
    catalog_url = driver.find_element(By.CLASS_NAME, 'CardCatalogLink').get_attribute('href')
    
    data_dict = {
        'brand': brand, 'model': model, 'year': year, 'bodytype': bodytype, 'kmage': kmage,
        'color': color, 'engineDisplacement': engineDisplacement, 'enginePower': enginePower, 
        'fuelType': fuelType, 'super_gen': super_gen, 'vehicleTransmission': vehicleTransmission,
        'drive': drive, 'wheel': wheel, 'state': state, 'owner': owner, 'pts': pts, 'customs': customs, 
        'owningTime': owningTime, 'description': description, 'sell_id': sell_id, 'price': price, 
        'car_url': car_url, 'catalog_url': catalog_url, 'equipment_dict': complect_list
    }
    
    return data_dict

In [12]:
def get_dicts_from_catalog(url):
    
    driver.get(url)

    car_info_full = driver.find_elements(By.CLASS_NAME, 'list-values')
    
    car_info_dict_ru = {}
    
    for item in car_info_full:
        el = item.text.split('\n')
        for i in range(0, len(el) - 1, 2):
            car_info_dict_ru[el[i]] = el[i + 1]
    
    engine_type_dict = {'бензин': 'GASOLINE', 'дизель': 'DIESEL', 
                        'гибрид': 'HYBRID', 'электро': 'ELECTRO', 'газ': 'LPG'}
    gear_type_dict = {'передний': 'FORWARD_CONTROL', 'полный': 'ALL_WHEEL_DRIVE', 'задний': 'REAR_DRIVE'}
    transmission_dict = {'автомат': 'AUTOMATIC', 'робот': 'ROBOT', 
                         'механика': 'MECHANICAL', 'вариатор': 'VARIATOR'}

    car_info_dict_en = {
        'engine_type': engine_type_dict[car_info_dict_ru['Тип двигателя']],
        'gear_type': gear_type_dict[car_info_dict_ru['Привод']],
        'transmission': transmission_dict[car_info_dict_ru['Коробка']],
        'power': int(re.findall('\d+', car_info_dict_ru['Мощность'])[0]),
        'power_kvt': int(re.findall('\d+', car_info_dict_ru['Максимальная мощность, л.с./кВт при об/мин'])[1]),
        'acceleration': car_info_dict_ru.get('Разгон до 100 км/ч, с'),
        'clearance_min': min(map(int, re.findall('\d+', car_info_dict_ru['Клиренс']))),
        'fuel_rate': float(car_info_dict_ru['Расход топлива, л город/трасса/смешанный'].split('/')[2])
    }
    
    url2 = url.replace('specifications', 'equipment')
    
    driver.get(url2)

    equipment_list = []

    equipment_data = driver.find_elements(By.CLASS_NAME, 'catalog__package-list-i')
    for item in equipment_data:
        equipment_list.append(item.text)
    
    car_data = json.loads(driver.find_element(By.CLASS_NAME, 'search-form-v2-mmm').get_attribute('data-bem'))
    equip_data = json.loads(driver.find_element(By.CLASS_NAME, 'catalog__section').get_attribute('data-bem'))
    attrib_data = json.loads(driver.find_element(By.CLASS_NAME, 'sale-data-attributes').get_attribute('data-bem'))
    
    return {'super_gen_2': car_info_dict_en, 'super_gen_3': attrib_data, 'complectation_dict': equipment_list}

## Gathering URLs from MODEL pages in Regions

In [13]:
%%time
all_urls = []
from time import sleep
from random import randint

i = 0
for region in regions:
    for model in models:
        print(f"{i+1} Processing {model} from {region}")
        sleep(randint(30, 70)/100.)
        model_urls = get_urls_for_model_in_region(model, region)
        all_urls.append(model_urls)
        i += 1

1 Processing skoda from leningradskaya_oblast
Parsing 10 page for skoda in leningradskaya_oblast

WebDriverException: Message: unknown error: session deleted because of page crash
from unknown error: cannot determine loading status
from tab crashed
  (Session info: chrome=99.0.4844.84)
Stacktrace:
#0 0x5654d80fc7d3 <unknown>
#1 0x5654d7e5851f <unknown>
#2 0x5654d7e445bd <unknown>
#3 0x5654d7e43fd5 <unknown>
#4 0x5654d7e434a5 <unknown>
#5 0x5654d7e42190 <unknown>
#6 0x5654d7e4270c <unknown>
#7 0x5654d7e50cdf <unknown>
#8 0x5654d7e51862 <unknown>
#9 0x5654d7e5f78d <unknown>
#10 0x5654d7e629aa <unknown>
#11 0x5654d7e42b36 <unknown>
#12 0x5654d7e5f321 <unknown>
#13 0x5654d7ebfe64 <unknown>
#14 0x5654d7eac5a3 <unknown>
#15 0x5654d7e81ddc <unknown>
#16 0x5654d7e82de5 <unknown>
#17 0x5654d812d49d <unknown>
#18 0x5654d814660c <unknown>
#19 0x5654d812f205 <unknown>
#20 0x5654d8146ee5 <unknown>
#21 0x5654d8123070 <unknown>
#22 0x5654d8162488 <unknown>
#23 0x5654d816260c <unknown>
#24 0x5654d817bc6d <unknown>
#25 0x7fba0e5a4b1a <unknown>


## Getting main car info

In [None]:
from itertools import chain
all_urls = chain(*all_urls)

In [None]:
i = 1
for url in tqdm(all_urls):
    print(f"Processing url {i}: {url}")
    i += 1
    try:
        sleep(randint(30, 70)/100.)
        valid_data = valid_data.append(get_car_info(url), ignore_index=True)
    except NoSuchElementException:
        print('Error 404', end='\r')

0it [00:00, ?it/s]

Processing url 1: https://auto.ru/cars/used/sale/skoda/octavia/1115028761-873852a0/?geo_id=10174


1it [00:10, 10.26s/it]

Processing url 2: https://auto.ru/cars/used/sale/skoda/kodiaq/1115133171-b703eb06/?geo_id=10174


2it [00:19,  9.81s/it]

Processing url 3: https://auto.ru/cars/used/sale/skoda/kodiaq/1106474967-1e42ebec/?geo_id=10174


3it [00:27,  8.64s/it]

Processing url 4: https://auto.ru/cars/used/sale/skoda/octavia_rs/1114827681-f0c378b5/?geo_id=10174


4it [00:35,  8.71s/it]

Processing url 5: https://auto.ru/cars/used/sale/skoda/yeti/1115187987-33fc531b/?geo_id=10174


5it [00:43,  8.25s/it]

Processing url 6: https://auto.ru/cars/used/sale/skoda/karoq/1115187608-48fbe965/?geo_id=10174


6it [00:54,  9.19s/it]

Processing url 7: https://auto.ru/cars/used/sale/skoda/octavia/1115170039-2256bc06/?geo_id=10174


7it [01:02,  8.93s/it]

Processing url 8: https://auto.ru/cars/used/sale/skoda/octavia/1114613661-3a40ff20/?geo_id=10174


8it [01:11,  9.00s/it]

Processing url 9: https://auto.ru/cars/used/sale/skoda/octavia/1115139167-9ff5035d/?geo_id=10174


9it [01:24, 10.25s/it]

Processing url 10: https://auto.ru/cars/used/sale/skoda/yeti/1115025133-ffd03051/?geo_id=10174


10it [01:37, 11.15s/it]

Processing url 11: https://auto.ru/cars/used/sale/skoda/rapid/1115186468-55ac7224/?geo_id=10174


11it [01:55, 12.97s/it]

Processing url 12: https://auto.ru/cars/used/sale/skoda/octavia/1115186496-09f47a8d/?geo_id=10174


12it [02:04, 12.02s/it]

Processing url 13: https://auto.ru/cars/used/sale/skoda/felicia/1115187747-1ba2568b/?geo_id=10174


13it [02:15, 11.46s/it]

Processing url 14: https://auto.ru/cars/used/sale/skoda/octavia/1106508752-38a07599/?geo_id=10174


14it [02:23, 10.44s/it]

Processing url 15: https://auto.ru/cars/used/sale/skoda/rapid/1106142889-2d718d93/?geo_id=10174


15it [02:32, 10.03s/it]

Processing url 16: https://auto.ru/cars/used/sale/skoda/octavia/1115178755-4649ccae/?geo_id=10174


16it [02:40,  9.49s/it]

Processing url 17: https://auto.ru/cars/used/sale/skoda/octavia/1106270033-ffd18ff7/?geo_id=10174


17it [02:53, 10.52s/it]

Processing url 18: https://auto.ru/cars/used/sale/skoda/rapid/1115098821-612a4af4/?geo_id=10174


18it [03:03, 10.25s/it]

Processing url 19: https://auto.ru/cars/used/sale/skoda/kodiaq/1115187556-2aeb3867/?geo_id=10174


19it [03:17, 11.51s/it]

Processing url 20: https://auto.ru/cars/used/sale/skoda/fabia/1115077955-6540036c/?geo_id=10174


20it [03:26, 10.82s/it]

Processing url 21: https://auto.ru/cars/used/sale/skoda/rapid/1115083632-6a00ed21/?geo_id=10174


21it [03:41, 11.94s/it]

Processing url 22: https://auto.ru/cars/used/sale/skoda/rapid/1115036293-b6814ca7/?geo_id=10174


22it [03:54, 12.41s/it]

Processing url 23: https://auto.ru/cars/used/sale/skoda/octavia/1105553230-f26940af/?geo_id=10174


23it [04:04, 11.65s/it]

Processing url 24: https://auto.ru/cars/used/sale/skoda/roomster/1115180357-09be6fd4/?geo_id=10174


24it [04:13, 10.89s/it]

Processing url 25: https://auto.ru/cars/used/sale/skoda/yeti/1106423603-9b35cf15/?geo_id=10174


25it [04:23, 10.60s/it]

Processing url 26: https://auto.ru/cars/used/sale/skoda/rapid/1106400359-37d13193/?geo_id=10174


26it [04:36, 11.12s/it]

Processing url 27: https://auto.ru/cars/used/sale/skoda/rapid/1115049641-865e81e6/?geo_id=10174


27it [04:48, 11.50s/it]

Processing url 28: https://auto.ru/cars/used/sale/skoda/octavia_rs/1105582050-8f4d193c/?geo_id=10174


28it [04:59, 11.44s/it]

Processing url 29: https://auto.ru/cars/used/sale/skoda/octavia/1106417557-53b1b2fd/?geo_id=10174


29it [05:12, 11.78s/it]

Processing url 30: https://auto.ru/cars/used/sale/skoda/octavia/1115025281-88c77bed/?geo_id=10174


30it [05:25, 12.11s/it]

Processing url 31: https://auto.ru/cars/used/sale/skoda/rapid/1115017787-3faaa244/?geo_id=10174


31it [05:40, 13.00s/it]

Processing url 32: https://auto.ru/cars/used/sale/skoda/octavia/1106583866-4b70a8d7/?geo_id=10174


31it [06:14, 12.08s/it]


WebDriverException: Message: unknown error: session deleted because of page crash
from unknown error: cannot determine loading status
from tab crashed
  (Session info: chrome=99.0.4844.84)
Stacktrace:
#0 0x55d61578f7d3 <unknown>
#1 0x55d6154eb51f <unknown>
#2 0x55d6154d75bd <unknown>
#3 0x55d6154d6fd5 <unknown>
#4 0x55d6154d64a5 <unknown>
#5 0x55d6154d5190 <unknown>
#6 0x55d6154d570c <unknown>
#7 0x55d6154e3f9e <unknown>
#8 0x55d6154e4862 <unknown>
#9 0x55d6154f278d <unknown>
#10 0x55d6154f59aa <unknown>
#11 0x55d6154d5b36 <unknown>
#12 0x55d6154f2321 <unknown>
#13 0x55d615552e64 <unknown>
#14 0x55d61553f5a3 <unknown>
#15 0x55d615514ddc <unknown>
#16 0x55d615515de5 <unknown>
#17 0x55d6157c049d <unknown>
#18 0x55d6157d960c <unknown>
#19 0x55d6157c2205 <unknown>
#20 0x55d6157d9ee5 <unknown>
#21 0x55d6157b6070 <unknown>
#22 0x55d6157f5488 <unknown>
#23 0x55d6157f560c <unknown>
#24 0x55d61580ec6d <unknown>
#25 0x7fd479549b1a <unknown>


In [None]:
valid_data.to_pickle("data/20220327_lo_shkoda_audi_honda_volvo.pkl.zip", compression="zip")

## Getting additional information from catalog (links from main DF)

In [23]:
for car_url in tqdm(valid_data['catalog_url'].value_counts().index.tolist()):
    try:
        catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
    except NoSuchElementException:
        print('Error 404', end='\r')

  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = cata

In [30]:
valid_data.to_pickle("data/20220326_valid_data.pkl.zip", compression="zip")
catalog_data.to_pickle("data/20220326_catalog_data.pkl.zip", compression="zip")

In [29]:

print("super gen 2", catalog_data["super_gen_2"][5], sep="\n")
print("super gen 3", catalog_data["super_gen_3"][5], sep="\n")

super gen 2
{'engine_type': 'DIESEL', 'gear_type': 'ALL_WHEEL_DRIVE', 'transmission': 'AUTOMATIC', 'power': 238, 'power_kvt': 175, 'acceleration': '8.3', 'clearance_min': 184, 'fuel_rate': 9.0}
super gen 3
{'sale-data-attributes': {'asciiCat': 'cars', 'puid10': '1', 'category': 'cars', 'mark': 'INFINITI', 'model': 'FX', 'state': 'used', 'class': 'J', 'type': 'suv', 'segment': 'PREMIUM', 'group': 'family', 'power': 238, 'year': 2011}}


to extract

- super gen 2: power_kvt, acceleration, clearance_min, fuel_rate
- super gen 3: mark, model, class, year, group

join by

- model
- mark
- year