In [1]:
import re
import time
import json
import random
import os
import glob
from warnings import filterwarnings

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

from tqdm import tqdm

filterwarnings("ignore")

In [2]:
# Let pandas display up to 100 rows and 100 columns before truncating a frame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

In [3]:
# Two independent empty frames: one for scraped listing rows, one for scraped catalog rows.
valid_data, catalog_data = pd.DataFrame(), pd.DataFrame()

In [4]:
# Browser session shared by every scraping helper in this notebook.
options = webdriver.ChromeOptions()
# ChromeDriverManager().install() returns the chromedriver path that Service is given.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\polyape1\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


In [5]:
# Region slugs interpolated into the listing URL.
# "moskovskaya_oblast" is kept here as a disabled option.
regions = ["leningradskaya_oblast"]

# Brand slugs interpolated into the listing URL; each one is scraped in turn.
models = [
    "skoda", "audi", "honda", "volvo",
    "bmw", "nissan", "infiniti", "mercedes",
    "toyota", "lexus", "volkswagen",
]

## Function block

In [6]:
def get_urls_for_model_in_region(model: str, region: str, max_pages: int = 99) -> list:
    '''
    Collect the URLs of all used-car listings for one brand in one region.

    Walks the paginated listing (``?page=1``, ``?page=2``, ...) with the
    module-level ``driver`` and stops at the first page that contains no
    listing title links, or after ``max_pages`` pages.

    Parameters
    ----------
    model : str
        Brand slug placed in the listing URL (e.g. ``"skoda"``).
    region : str
        Region slug placed in the listing URL (e.g. ``"leningradskaya_oblast"``).
    max_pages : int, default 99
        Upper bound on the number of listing pages to request.

    Returns
    -------
    list
        ``href`` attribute of every listing title link found, in page order.
    '''
    result_list = []
    pages_parsed = 0

    for page_num in range(1, max_pages + 1):
        driver.get(f'https://auto.ru/{region}/cars/{model}/used/?page={page_num}')
        # A single lookup is enough: find_elements returns an empty list when the
        # page has no listings, which means we ran past the last page.
        page_links = driver.find_elements(By.CLASS_NAME, 'ListingItemTitle__link')
        if not page_links:
            break

        result_list.extend(item.get_attribute('href') for item in page_links)
        pages_parsed = page_num
        print(f'Parsing {page_num} page for {model} in {region}', end='\r')

    # Printed on every exit path, including when the max_pages cap is reached.
    print()
    print(f'There are {pages_parsed} pages by {model}')

    return result_list

In [7]:
def _card_row_value(css_class: str) -> str:
    '''Return the second line of the text of the first element with ``css_class``.'''
    # The first line is presumably the row label and the second its value - TODO confirm.
    return driver.find_element(By.CLASS_NAME, css_class).text.split('\n')[1]


def get_car_info(url: str) -> dict:
    '''
    Scrape one car listing page and return its attributes as a flat dict.

    Uses the module-level ``driver``. Owning time, description and the catalog
    link are optional: when their element is missing the value is ``None``
    (or ``'No catalog link'`` for the catalog URL). Every other element is
    required, so a missing one raises.

    Parameters
    ----------
    url : str
        URL of the listing page to open.

    Returns
    -------
    dict
        One record with the keys ``brand``, ``model``, ``year``, ``bodytype``,
        ``kmage``, ``color``, ``engineDisplacement``, ``enginePower``,
        ``fuelType``, ``super_gen``, ``vehicleTransmission``, ``drive``,
        ``wheel``, ``state``, ``owner``, ``pts``, ``customs``, ``owningTime``,
        ``description``, ``sell_id``, ``price``, ``car_url``, ``catalog_url``
        and ``equipment_dict`` (a list of equipment strings).

    Raises
    ------
    NoSuchElementException
        If a required element is not present on the page.
    IndexError, ValueError
        If a required element is present but its text cannot be parsed.
    '''
    driver.get(url)

    year = int(_card_row_value('CardInfoRow_year'))

    # Brand and model are read from the 4th and 5th breadcrumb items.
    car = driver.find_elements(By.CLASS_NAME, 'CardBreadcrumbs__itemText')
    brand = car[3].text
    model = car[4].text

    bodytype = _card_row_value('CardInfoRow_bodytype')
    # Mileage and price keep only the digits of the element text.
    kmage = int(re.sub(r'\D', '', driver.find_element(By.CLASS_NAME, 'CardInfoRow_kmAge').text))
    color = _card_row_value('CardInfoRow_color')

    # Engine row is split on "/" into displacement / power / fuel type parts.
    engine = driver.find_element(By.CLASS_NAME, 'CardInfoRow_engine').text.split('/')

    try:
        # NOTE(review): the "." in this pattern is unescaped, so it matches any
        # character (e.g. "150" also matches). Kept as-is because tightening it
        # would route more listings into the fallback below - confirm intent.
        engineDisplacement = float(re.findall(r'(\d+.\d+)', engine[0])[0])
        enginePower = int(re.findall(r'\d+', engine[1])[0])
        fuelType = engine[2]
    except IndexError:
        # Fewer than three parts, or no number found in a part.
        engineDisplacement = 0
        enginePower = 0
        # NOTE(review): here fuelType is the list of parts, not a string as above.
        fuelType = engine

    sale_attributes = driver.find_element(By.ID, 'sale-data-attributes').get_attribute('data-bem')
    super_gen = json.loads(sale_attributes)['sale-data-attributes']

    vehicleTransmission = _card_row_value('CardInfoRow_transmission')
    drive = _card_row_value('CardInfoRow_drive')
    wheel = _card_row_value('CardInfoRow_wheel')
    state = _card_row_value('CardInfoRow_state')
    owner = _card_row_value('CardInfoRow_ownersCount')
    pts = _card_row_value('CardInfoRow_pts')
    customs = _card_row_value('CardInfoRow_customs')

    try:
        owningTime = _card_row_value('CardInfoRow_owningTime')
    except NoSuchElementException:
        owningTime = None

    try:
        description = driver.find_element(By.CLASS_NAME, 'CardDescriptionHTML').text
    except NoSuchElementException:
        description = None

    sell_id = int(re.findall(r'\d+', driver.find_element(By.CLASS_NAME, 'CardHead__id').text)[0])
    price = int(re.sub(r'\D', '', driver.find_element(By.CLASS_NAME, 'OfferPriceCaption__price').text))

    # Each equipment group is one bullet-separated string; the part before the
    # first bullet is dropped.
    complect_list = []
    for item in driver.find_elements(By.CLASS_NAME, 'ComplectationGroupsDesktop__itemList'):
        complect_list.extend(item.text.replace('\n', '').split('•')[1:])

    try:
        catalog_url = driver.find_element(By.CLASS_NAME, 'CardCatalogLink').get_attribute('href')
    except NoSuchElementException:
        catalog_url = 'No catalog link'

    return {
        'brand': brand, 'model': model, 'year': year, 'bodytype': bodytype, 'kmage': kmage,
        'color': color, 'engineDisplacement': engineDisplacement, 'enginePower': enginePower,
        'fuelType': fuelType, 'super_gen': super_gen, 'vehicleTransmission': vehicleTransmission,
        'drive': drive, 'wheel': wheel, 'state': state, 'owner': owner, 'pts': pts, 'customs': customs,
        'owningTime': owningTime, 'description': description, 'sell_id': sell_id, 'price': price,
        'car_url': url, 'catalog_url': catalog_url, 'equipment_dict': complect_list
    }

In [8]:
def get_dicts_from_catalog(url: str) -> dict:
    '''
    Scrape a catalog "specifications" page and its sibling "equipment" page.

    Uses the module-level ``driver``.

    Parameters
    ----------
    url : str
        URL of the specifications page. The equipment page URL is derived from
        it by replacing ``specifications`` with ``equipment``.

    Returns
    -------
    dict
        ``catalog_url`` (the input URL), ``super_gen_2`` (normalized technical
        specs), ``super_gen_3`` (parsed ``data-bem`` JSON of the
        ``sale-data-attributes`` element) and ``complectation_dict`` (list of
        equipment item texts).

    Raises
    ------
    KeyError
        If a required specification label is missing, or the engine type,
        drive or gearbox value has no entry in the translation tables.
    NoSuchElementException
        If the ``sale-data-attributes`` element is not on the equipment page.
    '''
    driver.get(url)

    # The text of each 'list-values' element is read as alternating
    # label / value lines, which are paired into a dict.
    car_info_dict_ru = {}
    for item in driver.find_elements(By.CLASS_NAME, 'list-values'):
        el = item.text.split('\n')
        for i in range(0, len(el) - 1, 2):
            car_info_dict_ru[el[i]] = el[i + 1]

    # Translation tables from the Russian page values to English codes.
    engine_type_dict = {'бензин': 'GASOLINE', 'дизель': 'DIESEL',
                        'гибрид': 'HYBRID', 'электро': 'ELECTRO', 'газ': 'LPG',
                        'СУГ': 'LPG'}
    gear_type_dict = {'передний': 'FORWARD_CONTROL', 'полный': 'ALL_WHEEL_DRIVE', 'задний': 'REAR_DRIVE'}
    transmission_dict = {'автомат': 'AUTOMATIC', 'робот': 'ROBOT',
                         'механика': 'MECHANICAL', 'вариатор': 'VARIATOR'}

    if car_info_dict_ru['Тип двигателя'] != 'электро':
        # The consumption row comes in several layouts; prefer the "mixed" figure
        # and fall back to the mean of city and highway.
        if 'Расход топлива, л город/трасса/смешанный' in car_info_dict_ru:
            fuel_rate = float(car_info_dict_ru['Расход топлива, л город/трасса/смешанный'].split('/')[2])
        elif 'Расход топлива, л смешанный' in car_info_dict_ru:
            fuel_rate = float(car_info_dict_ru['Расход топлива, л смешанный'])
        elif 'Расход топлива, л город/смешанный' in car_info_dict_ru:
            fuel_rate = float(car_info_dict_ru['Расход топлива, л город/смешанный'].split('/')[1])
        elif 'Расход топлива, л город/трасса' in car_info_dict_ru:
            fuel_rate = sum(map(float, car_info_dict_ru['Расход топлива, л город/трасса'].split('/'))) / 2
        else:
            fuel_rate = None
    else:
        # NOTE(review): for electric cars fuel_rate holds the electric range in km,
        # not a consumption figure - confirm this is intended downstream.
        fuel_rate = float(car_info_dict_ru['Запас хода на электричестве, км'])

    try:
        # The clearance text may hold several numbers; keep the smallest.
        clearance_min = min(map(int, re.findall(r'\d+', car_info_dict_ru['Клиренс'])))
    except KeyError:
        clearance_min = None

    car_info_dict_en = {
        'engine_type': engine_type_dict[car_info_dict_ru['Тип двигателя']],
        'gear_type': gear_type_dict[car_info_dict_ru['Привод']],
        'transmission': transmission_dict[car_info_dict_ru['Коробка']],
        'power': int(re.findall(r'\d+', car_info_dict_ru['Мощность'])[0]),
        # Second number of the "hp/kW at rpm" row.
        'power_kvt': int(re.findall(r'\d+', car_info_dict_ru['Максимальная мощность, л.с./кВт при об/мин'])[1]),
        'acceleration': car_info_dict_ru.get('Разгон до 100 км/ч, с'),
        'clearance_min': clearance_min,
        'fuel_rate': fuel_rate
    }

    driver.get(url.replace('specifications', 'equipment'))

    equipment_list = [
        item.text for item in driver.find_elements(By.CLASS_NAME, 'catalog__package-list-i')
    ]

    # Only this data-bem payload is returned; the two other payloads that used to
    # be parsed here were never used and have been dropped.
    attrib_data = json.loads(driver.find_element(By.CLASS_NAME, 'sale-data-attributes').get_attribute('data-bem'))

    return {'catalog_url': url, 'super_gen_2': car_info_dict_en, 'super_gen_3': attrib_data, 'complectation_dict': equipment_list}

## Gathering listing URLs for each brand and region, then scraping the main info from every car page

In [9]:
# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs('data/car_data', exist_ok=True)

for region in regions:
    for model in models:
        model_urls = get_urls_for_model_in_region(model, region)

        # Collect plain dicts and build the frame once per brand: DataFrame.append
        # was removed in pandas 2.0 and copied the whole frame on every call.
        car_records = []
        for url in tqdm(model_urls):
            time.sleep(random.random())  # random pause of up to 1 s between requests
            try:
                car_records.append(get_car_info(url))
            except Exception as exc:
                # Best-effort scrape: skip the listing but report which one failed
                # and why, instead of labelling every failure "Error 404".
                # Catching Exception (not a bare except) lets Ctrl+C stop the run.
                print(f'Skipped {url}: {type(exc).__name__}')

        pd.DataFrame(car_records).to_csv(f'data/car_data/{model}_{region}_car_data.csv')

Parsing 26 page for skoda in leningradskaya_oblast
There are 26 pages by skoda


  8%|███▊                                             | 74/946 [05:45<47:49,  3.29s/it]

Error 404

 12%|█████▌                                          | 110/946 [08:35<49:17,  3.54s/it]

Error 404

 14%|██████▍                                         | 128/946 [10:03<45:41,  3.35s/it]

Error 404

 20%|█████████▍                                      | 187/946 [14:25<52:23,  4.14s/it]

Error 404

 22%|██████████▌                                     | 209/946 [16:02<54:11,  4.41s/it]

Error 404

 24%|███████████▌                                    | 227/946 [17:15<36:06,  3.01s/it]

Error 404

 38%|██████████████████▎                             | 360/946 [26:48<35:27,  3.63s/it]

Error 404

 48%|███████████████████████▏                        | 456/946 [33:11<22:16,  2.73s/it]

Error 404

 55%|██████████████████████████▏                     | 516/946 [37:05<20:06,  2.81s/it]

Error 404

 62%|█████████████████████████████▋                  | 584/946 [42:21<23:07,  3.83s/it]

Error 404

 64%|██████████████████████████████▌                 | 602/946 [43:40<18:46,  3.28s/it]

Error 404

 71%|██████████████████████████████████              | 671/946 [49:20<21:58,  4.80s/it]

Error 404

 84%|████████████████████████████████████████        | 790/946 [58:19<09:11,  3.53s/it]

Error 404

 88%|████████████████████████████████████████▎     | 828/946 [1:01:06<05:46,  2.94s/it]

Error 404

 92%|██████████████████████████████████████████▎   | 869/946 [1:03:46<03:54,  3.04s/it]

Error 404

100%|██████████████████████████████████████████████| 946/946 [1:08:56<00:00,  4.37s/it]


Parsing 24 page for audi in leningradskaya_oblast
There are 24 pages by audi


  6%|██▊                                              | 51/890 [03:41<43:25,  3.11s/it]

Error 404

 12%|█████▊                                          | 107/890 [07:40<48:24,  3.71s/it]

Error 404

 14%|██████▊                                         | 127/890 [08:55<38:41,  3.04s/it]

Error 404

 20%|█████████▋                                      | 180/890 [12:20<37:59,  3.21s/it]

Error 404

 24%|███████████▌                                    | 215/890 [14:31<31:48,  2.83s/it]

Error 404

 30%|██████████████▎                                 | 266/890 [17:42<30:09,  2.90s/it]

Error 404

 31%|██████████████▊                                 | 275/890 [18:12<34:24,  3.36s/it]

Error 404

 37%|█████████████████▌                              | 326/890 [21:52<31:56,  3.40s/it]

Error 404

 47%|██████████████████████▍                         | 416/890 [28:01<25:02,  3.17s/it]

Error 404

 54%|██████████████████████████                      | 483/890 [32:36<20:03,  2.96s/it]

Error 404

 63%|██████████████████████████████▍                 | 564/890 [38:05<15:33,  2.86s/it]

Error 404

 80%|██████████████████████████████████████▍         | 712/890 [47:04<08:33,  2.88s/it]

Error 404

 80%|██████████████████████████████████████▌         | 716/890 [47:15<07:17,  2.51s/it]

Error 404

 81%|███████████████████████████████████████         | 725/890 [47:44<09:04,  3.30s/it]

Error 404

 88%|██████████████████████████████████████████▏     | 782/890 [51:08<04:46,  2.65s/it]

Error 404

 92%|████████████████████████████████████████████    | 817/890 [53:09<03:17,  2.71s/it]

Error 404

 93%|████████████████████████████████████████████▊   | 831/890 [53:57<03:22,  3.44s/it]

Error 404

 96%|██████████████████████████████████████████████  | 854/890 [55:16<02:04,  3.45s/it]

Error 404

 97%|██████████████████████████████████████████████▍ | 861/890 [55:38<01:14,  2.59s/it]

Error 404

100%|████████████████████████████████████████████████| 890/890 [57:19<00:00,  3.86s/it]


Parsing 6 page for honda in leningradskaya_oblast
There are 6 pages by honda


 14%|██████▊                                          | 30/214 [01:45<08:31,  2.78s/it]

Error 404

 25%|████████████▏                                    | 53/214 [03:10<08:24,  3.13s/it]

Error 404

 78%|█████████████████████████████████████▏          | 166/214 [09:52<02:08,  2.67s/it]

Error 404

 89%|██████████████████████████████████████████▌     | 190/214 [11:16<01:15,  3.15s/it]

Error 404

100%|████████████████████████████████████████████████| 214/214 [12:33<00:00,  3.52s/it]


Parsing 10 page for volvo in leningradskaya_oblast
There are 10 pages by volvo


 24%|███████████▉                                     | 89/366 [05:31<14:42,  3.19s/it]

Error 404

 31%|███████████████                                 | 115/366 [07:08<13:40,  3.27s/it]

Error 404

 46%|██████████████████████▎                         | 170/366 [10:39<11:08,  3.41s/it]

Error 404

100%|████████████████████████████████████████████████| 366/366 [23:07<00:00,  3.79s/it]


Parsing 39 page for bmw in leningradskaya_oblast
There are 39 pages by bmw


  2%|▉                                             | 29/1451 [01:56<1:13:39,  3.11s/it]

Error 404

  4%|█▊                                            | 57/1451 [03:39<1:03:26,  2.73s/it]

Error 404

 13%|██████                                       | 194/1451 [12:37<1:12:17,  3.45s/it]

Error 404

 16%|███████▏                                     | 233/1451 [15:18<1:04:47,  3.19s/it]

Error 404

 18%|████████▌                                      | 265/1451 [17:22<55:27,  2.81s/it]

Error 404

 21%|█████████▍                                   | 303/1451 [19:53<1:16:18,  3.99s/it]

Error 404

 22%|██████████▏                                    | 313/1451 [20:28<56:44,  2.99s/it]

Error 404

 26%|████████████                                   | 372/1451 [24:19<52:29,  2.92s/it]

Error 404

 28%|████████████▉                                  | 400/1451 [26:08<53:49,  3.07s/it]

Error 404

 32%|███████████████                                | 464/1451 [30:05<49:14,  2.99s/it]

Error 404

 35%|████████████████▎                              | 502/1451 [32:26<47:54,  3.03s/it]

Error 404

 38%|██████████████████                             | 557/1451 [35:46<46:59,  3.15s/it]

Error 404

 41%|███████████████████▏                           | 591/1451 [37:56<43:05,  3.01s/it]

Error 404

 43%|████████████████████▍                          | 630/1451 [40:17<48:26,  3.54s/it]

Error 404

 47%|██████████████████████▎                        | 689/1451 [44:01<40:02,  3.15s/it]

Error 404

 51%|████████████████████████▏                      | 746/1451 [48:02<36:54,  3.14s/it]

Error 404

 53%|█████████████████████████                      | 775/1451 [49:47<34:22,  3.05s/it]

Error 404

 56%|██████████████████████████▍                    | 816/1451 [52:25<32:26,  3.07s/it]

Error 404

 58%|███████████████████████████▎                   | 845/1451 [54:19<40:26,  4.00s/it]

Error 404

 64%|██████████████████████████████                 | 927/1451 [59:34<25:54,  2.97s/it]

Error 404

 66%|█████████████████████████████▋               | 956/1451 [1:01:16<21:11,  2.57s/it]

Error 404

 76%|█████████████████████████████████▋          | 1109/1451 [1:10:55<15:33,  2.73s/it]

Error 404

 81%|███████████████████████████████████▊        | 1180/1451 [1:15:15<11:54,  2.64s/it]

Error 404

 85%|█████████████████████████████████████▏      | 1228/1451 [1:18:10<10:07,  2.72s/it]

Error 404

 86%|█████████████████████████████████████▊      | 1245/1451 [1:19:05<09:03,  2.64s/it]

Error 404

 91%|████████████████████████████████████████    | 1323/1451 [1:23:56<05:55,  2.77s/it]

Error 404

 97%|██████████████████████████████████████████▌ | 1402/1451 [1:28:58<01:58,  2.43s/it]

Error 404

 99%|███████████████████████████████████████████▌| 1435/1451 [1:30:51<00:41,  2.59s/it]

Error 404

100%|████████████████████████████████████████████| 1451/1451 [1:31:45<00:00,  3.79s/it]


Parsing 21 page for nissan in leningradskaya_oblast
There are 21 pages by nissan


  7%|███▌                                             | 57/781 [03:28<33:56,  2.81s/it]

Error 404

 10%|████▊                                            | 76/781 [04:43<33:56,  2.89s/it]

Error 404

 15%|███████▍                                        | 120/781 [07:23<31:16,  2.84s/it]

Error 404

 28%|█████████████▍                                  | 218/781 [13:14<27:35,  2.94s/it]

Error 404

 32%|███████████████▍                                | 252/781 [15:13<26:57,  3.06s/it]

Error 404

 34%|███████████████                             | 268/781 [45:21<75:15:33, 528.14s/it]

Error 404

 34%|█████████████▊                          | 269/781 [4:15:55<591:36:37, 4159.76s/it]

Error 404

 35%|█████████████▊                          | 270/781 [4:39:47<474:18:10, 3341.47s/it]

Error 404

 37%|████████████████▎                           | 289/781 [4:41:56<1:15:05,  9.16s/it]

Error 404

 41%|██████████████████▊                           | 320/781 [4:45:37<41:25,  5.39s/it]

Error 404

 46%|████████████████████▉                         | 356/781 [4:49:38<36:03,  5.09s/it]

Error 404

 56%|█████████████████████████▋                    | 436/781 [4:58:28<30:45,  5.35s/it]

Error 404

 61%|████████████████████████████                  | 476/781 [5:02:55<24:14,  4.77s/it]

Error 404

 69%|███████████████████████████████▌              | 536/781 [5:07:56<10:51,  2.66s/it]

Error 404

 73%|█████████████████████████████████▌            | 569/781 [5:09:51<09:46,  2.76s/it]

Error 404

 79%|████████████████████████████████████▎         | 617/781 [5:12:41<07:27,  2.73s/it]

Error 404

 86%|███████████████████████████████████████▌      | 671/781 [5:15:58<04:51,  2.65s/it]

Error 404

 90%|█████████████████████████████████████████▎    | 701/781 [5:17:42<03:46,  2.84s/it]

Error 404

 98%|█████████████████████████████████████████████▏| 768/781 [5:21:40<00:36,  2.79s/it]

Error 404

100%|██████████████████████████████████████████████| 781/781 [5:22:25<00:00, 24.77s/it]


Parsing 3 page for infiniti in leningradskaya_oblast
There are 3 pages by infiniti


 30%|█████████████▊                                | 33/110 [02:08<03:37,  2.83s/it]

Error 404

 50%|███████████████████████                       | 55/110 [03:37<03:04,  3.36s/it]

Error 404

100%|█████████████████████████████████████████████| 110/110 [07:08<00:00,  3.89s/it]


Parsing 39 page for mercedes in leningradskaya_oblast
There are 39 pages by mercedes


  1%|▍                                          | 13/1434 [00:47<1:09:21,  2.93s/it]

Error 404

  3%|█▍                                         | 49/1434 [03:16<1:13:43,  3.19s/it]

Error 404

  6%|██▍                                        | 81/1434 [05:29<1:21:04,  3.60s/it]

Error 404

 10%|████▏                                     | 145/1434 [09:45<1:03:56,  2.98s/it]

Error 404

 11%|████▋                                     | 160/1434 [10:41<1:01:55,  2.92s/it]

Error 404

 15%|██████▍                                     | 210/1434 [14:05<58:27,  2.87s/it]

Error 404

 18%|███████▉                                    | 257/1434 [17:07<56:33,  2.88s/it]

Error 404

 20%|████████▋                                   | 283/1434 [18:48<59:35,  3.11s/it]

Error 404

 32%|█████████████▊                              | 452/1434 [29:38<50:06,  3.06s/it]

Error 404

 34%|██████████████▊                             | 483/1434 [31:48<51:12,  3.23s/it]

Error 404

 36%|███████████████▉                            | 520/1434 [34:17<44:52,  2.95s/it]

Error 404

 38%|████████████████▋                           | 545/1434 [36:03<55:54,  3.77s/it]

Error 404

 40%|█████████████████▍                          | 569/1434 [37:52<48:42,  3.38s/it]

Error 404

 43%|██████████████████▋                         | 611/1434 [40:38<44:46,  3.26s/it]

Error 404

 47%|████████████████████▋                       | 673/1434 [44:51<40:08,  3.16s/it]

Error 404

 48%|█████████████████████▏                      | 689/1434 [45:47<32:29,  2.62s/it]

Error 404

 52%|██████████████████████▊                     | 744/1434 [49:18<30:47,  2.68s/it]

Error 404

 56%|████████████████████████▍                   | 798/1434 [53:02<38:26,  3.63s/it]

Error 404

 58%|█████████████████████████▌                  | 834/1434 [55:10<29:56,  2.99s/it]

Error 404

 68%|████████████████████████████▌             | 976/1434 [1:03:54<20:18,  2.66s/it]

Error 404

 71%|█████████████████████████████            | 1017/1434 [1:06:23<18:49,  2.71s/it]

Error 404

 71%|█████████████████████████████▎           | 1025/1434 [1:06:50<19:37,  2.88s/it]

Error 404

 82%|█████████████████████████████████▍       | 1170/1434 [1:15:45<12:23,  2.82s/it]

Error 404

 84%|██████████████████████████████████▌      | 1211/1434 [1:18:14<09:33,  2.57s/it]

Error 404

 87%|███████████████████████████████████▊     | 1252/1434 [1:20:44<07:45,  2.56s/it]

Error 404

 97%|███████████████████████████████████████▉ | 1395/1434 [1:29:18<01:45,  2.69s/it]

Error 404

100%|█████████████████████████████████████████| 1434/1434 [1:31:34<00:00,  3.83s/it]


Parsing 23 page for toyota in leningradskaya_oblast
There are 23 pages by toyota


  6%|██▋                                           | 50/858 [03:30<39:10,  2.91s/it]

Error 404

 16%|███████▍                                     | 141/858 [09:11<34:32,  2.89s/it]

Error 404

 22%|█████████▊                                   | 188/858 [11:56<27:08,  2.43s/it]

Error 404

 31%|█████████████▊                               | 264/858 [17:19<31:05,  3.14s/it]

Error 404

 37%|████████████████▍                            | 314/858 [20:51<28:29,  3.14s/it]

Error 404

 41%|██████████████████▍                          | 351/858 [23:21<27:54,  3.30s/it]

Error 404

 47%|█████████████████████                        | 402/858 [26:43<21:38,  2.85s/it]

Error 404

 66%|█████████████████████████████▊               | 568/858 [37:31<14:18,  2.96s/it]

Error 404

 72%|████████████████████████████████▍            | 618/858 [40:51<12:26,  3.11s/it]

Error 404

 75%|█████████████████████████████████▉           | 647/858 [42:32<09:05,  2.59s/it]

Error 404

 89%|███████████████████████████████████████▊     | 760/858 [49:25<04:33,  2.79s/it]

Error 404

 93%|█████████████████████████████████████████▋   | 794/858 [51:23<02:41,  2.53s/it]

Error 404

 99%|████████████████████████████████████████████▋| 851/858 [54:34<00:17,  2.44s/it]

Error 404

100%|█████████████████████████████████████████████| 858/858 [54:56<00:00,  3.84s/it]


Parsing 5 page for lexus in leningradskaya_oblast
There are 5 pages by lexus


 43%|███████████████████▉                          | 71/164 [04:31<04:20,  2.81s/it]

Error 404

 63%|████████████████████████████▎                | 103/164 [06:30<03:19,  3.26s/it]

Error 404

 98%|███████████████████████████████████████████▉ | 160/164 [10:12<00:11,  2.84s/it]

Error 404

100%|█████████████████████████████████████████████| 164/164 [10:26<00:00,  3.82s/it]


Parsing 40 page for volkswagen in leningradskaya_oblast
There are 40 pages by volkswagen


  1%|▎                                          | 12/1490 [00:42<1:13:06,  2.97s/it]

Error 404

  4%|█▉                                         | 65/1490 [04:10<1:06:56,  2.82s/it]

Error 404

  7%|██▉                                       | 104/1490 [06:31<1:05:50,  2.85s/it]

Error 404

  9%|███▉                                      | 138/1490 [08:34<1:02:44,  2.78s/it]

Error 404

 11%|█████                                       | 171/1490 [10:38<57:33,  2.62s/it]

Error 404

 16%|██████▉                                     | 234/1490 [14:28<57:47,  2.76s/it]

Error 404

 23%|█████████▌                                | 339/1490 [21:27<1:00:00,  3.13s/it]

Error 404

 27%|███████████▋                                | 397/1490 [24:56<53:34,  2.94s/it]

Error 404

 33%|█████████████▊                            | 489/1490 [30:52<1:10:51,  4.25s/it]

Error 404

 35%|███████████████▍                            | 523/1490 [32:54<46:35,  2.89s/it]

Error 404

 36%|███████████████▊                            | 535/1490 [33:33<44:05,  2.77s/it]

Error 404

 40%|█████████████████▋                          | 598/1490 [37:17<40:07,  2.70s/it]

Error 404

 52%|██████████████████████▉                     | 778/1490 [51:08<40:26,  3.41s/it]

Error 404

 56%|████████████████████████▍                   | 829/1490 [55:44<33:00,  3.00s/it]

Error 404

 62%|████████████████████████▊               | 924/1490 [1:06:06<1:00:47,  6.44s/it]

Error 404

 66%|███████████████████████████▋              | 984/1490 [1:14:16<45:42,  5.42s/it]

Error 404

 70%|████████████████████████████▉            | 1050/1490 [1:23:47<40:29,  5.52s/it]

Error 404

 76%|███████████████████████████████          | 1129/1490 [1:34:00<30:30,  5.07s/it]

Error 404

 85%|██████████████████████████████████▋      | 1262/1490 [1:45:45<11:17,  2.97s/it]

Error 404

 90%|█████████████████████████████████████    | 1346/1490 [1:50:55<07:00,  2.92s/it]

Error 404

 94%|██████████████████████████████████████▌  | 1401/1490 [1:54:08<03:42,  2.50s/it]

Error 404

 98%|████████████████████████████████████████ | 1455/1490 [1:57:24<01:38,  2.82s/it]

Error 404

 98%|████████████████████████████████████████▎| 1467/1490 [1:58:03<00:59,  2.59s/it]

Error 404

100%|█████████████████████████████████████████| 1490/1490 [1:59:19<00:00,  4.81s/it]


## Getting additional information from the catalog (links taken from the main DataFrame)

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 '2022-03-19_train-test_comparison.ipynb',
 '2022-03-31_train-test_EDA.ipynb',
 '20220327_model_counts.ipynb',
 'data',
 'lib',
 'LICENSE',
 'README.md',
 'requirements.txt',
 'UsedCars_Project_Module_6.ipynb',
 'UsedCars_Project_Module_6_getting_spb_cars.ipynb',
 'UsedCars_Project_Module_6_parser.ipynb',
 'UsedCars_Project_Module_6_parser_20220401.ipynb',
 'venv']

In [27]:
# os.listdir returns names in arbitrary order; sort so the row order (and the
# seeded sample below) is reproducible.
car_csvs = sorted(c for c in os.listdir("data/car_data/") if c.endswith(".csv"))

# Read every per-brand file, then concatenate once instead of growing a frame
# with DataFrame.append (removed in pandas 2.0).
car_frames = []
for c in car_csvs:
    print("Reading", c, end=" ")
    car_frames.append(pd.read_csv(f"data/car_data/{c}"))
    print("done")

# "Unnamed: 0" is the row index that to_csv wrote; drop it and renumber rows 0..n-1.
df = pd.concat(car_frames, ignore_index=True).drop(columns="Unnamed: 0")
df.sample(5, random_state=42).T

Reading audi_leningradskaya_oblast_car_data.csv done
Reading bmw_leningradskaya_oblast_car_data.csv done
Reading honda_leningradskaya_oblast_car_data.csv done
Reading infiniti_leningradskaya_oblast_car_data.csv done
Reading lexus_leningradskaya_oblast_car_data.csv done
Reading mercedes_leningradskaya_oblast_car_data.csv done
Reading nissan_leningradskaya_oblast_car_data.csv done
Reading skoda_leningradskaya_oblast_car_data.csv done
Reading toyota_leningradskaya_oblast_car_data.csv done
Reading volkswagen_leningradskaya_oblast_car_data.csv done
Reading volvo_leningradskaya_oblast_car_data.csv done


Unnamed: 0,4047,8427,5484,3025,7757
brand,Mercedes-Benz,Volvo,Skoda,Mercedes-Benz,Volkswagen
model,C-Класс,S60,Kodiaq,V-Класс,Passat
year,1994,2013,2018,2014,2021
bodytype,седан,седан,внедорожник 5 дв.,минивэн,седан
kmage,330000,123000,146425,108712,25000
color,чёрный,чёрный,серебристый,чёрный,чёрный
engineDisplacement,1.8,2.0,1.4,2.1,1.4
enginePower,122,180,150,163,150
fuelType,Бензин,Бензин,Бензин,Дизель,Бензин
super_gen,"{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi..."


In [33]:
# Distinct values of the brand column, in order of first appearance.
df['brand'].unique().tolist()

['Audi',
 'BMW',
 'Honda',
 'Infiniti',
 'Lexus',
 'Mercedes-Benz',
 'Nissan',
 'Skoda',
 'Toyota',
 'Volkswagen',
 'Volvo']

In [35]:
# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs('data/catalog_car_data', exist_ok=True)

for single_brand in df.brand.unique():
    print("Processing", single_brand)

    # Distinct catalog links of this brand, most frequent first.
    brand_catalog_urls = df[df['brand'] == single_brand]['catalog_url'].value_counts().index.tolist()

    # Collect plain dicts and build the frame once per brand: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every call.
    catalog_records = []
    for car_url in tqdm(brand_catalog_urls):
        if car_url == 'No catalog link':
            # Placeholder written by get_car_info when the listing has no catalog link.
            continue
        try:
            catalog_records.append(get_dicts_from_catalog(car_url))
        except Exception as exc:
            # Best-effort scrape: skip the page but report which one failed and
            # why, instead of labelling every failure "Error 404".
            # Catching Exception (not a bare except) lets Ctrl+C stop the run.
            print(f'Skipped {car_url}: {type(exc).__name__}')

    pd.DataFrame(catalog_records).to_csv(f'data/catalog_car_data/{single_brand}_catalog_car_data.csv')

Processing Audi


100%|█████████████████████████████████████████████| 309/309 [28:22<00:00,  5.51s/it]


Processing BMW


100%|█████████████████████████████████████████████| 476/476 [40:41<00:00,  5.13s/it]


Processing Honda


100%|█████████████████████████████████████████████| 132/132 [10:13<00:00,  4.65s/it]


Processing Infiniti


100%|███████████████████████████████████████████████| 53/53 [04:44<00:00,  5.38s/it]


Processing Lexus


100%|███████████████████████████████████████████████| 90/90 [07:18<00:00,  4.87s/it]


Processing Mercedes-Benz


100%|█████████████████████████████████████████████| 517/517 [40:00<00:00,  4.64s/it]


Processing Nissan


100%|█████████████████████████████████████████████| 274/274 [20:41<00:00,  4.53s/it]


Processing Skoda


100%|█████████████████████████████████████████████| 230/230 [20:21<00:00,  5.31s/it]


Processing Toyota


100%|█████████████████████████████████████████████| 347/347 [26:21<00:00,  4.56s/it]


Processing Volkswagen


100%|█████████████████████████████████████████████| 448/448 [36:39<00:00,  4.91s/it]


Processing Volvo


100%|█████████████████████████████████████████████| 189/189 [15:51<00:00,  5.03s/it]


In [50]:
# glob.glob returns paths in arbitrary order; sort so the concatenated row order
# (and therefore which duplicate a later keep-first de-duplication retains) is reproducible.
cars_list = sorted(glob.glob('data/car_data/*.csv'))
cars_df = pd.concat(map(pd.read_csv, cars_list))

catalog_cars_list = sorted(glob.glob('data/catalog_car_data/*.csv'))
catalog_df = pd.concat(map(pd.read_csv, catalog_cars_list))

In [51]:
# Drop the index column that to_csv wrote and renumber the rows.
# Reassigning (instead of del / inplace=True) with errors='ignore' keeps this
# cell safe to re-run: a second run no longer fails on the already-removed column.
cars_df = cars_df.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)

# Keep the first row per catalog URL so the later merge cannot multiply listing rows.
catalog_df = (
    catalog_df
    .drop(columns='Unnamed: 0', errors='ignore')
    .drop_duplicates(subset=['catalog_url'], ignore_index=True)
)

In [52]:
# Left join: every listing row is kept and gains the catalog columns for its catalog_url.
merged_df = cars_df.merge(catalog_df, how='left', on='catalog_url')

In [53]:
# Seeded so the preview is the same on every run (matches the seed used for the earlier sample).
merged_df.sample(3, random_state=42).T

Unnamed: 0,6254,5724,84
brand,Toyota,Skoda,Audi
model,Corolla,Octavia,A6 allroad
year,2017,2013,2020
bodytype,седан,лифтбек,универсал 5 дв.
kmage,102100,140000,13832
color,чёрный,белый,коричневый
engineDisplacement,1.6,1.4,3.0
enginePower,122,140,249
fuelType,Бензин,Бензин,Дизель
super_gen,"{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi...","{'asciiCat': 'cars', 'category': 'cars', 'engi..."


In [54]:
# Persist the merged frame as a zip-compressed pickle.
parsed_data_path = 'data/20220401_spb_parsed_data.pkl.zip'
merged_df.to_pickle(parsed_data_path, compression='zip')

In [55]:
# (number of rows, number of columns) of the merged frame.
merged_df.shape

(8549, 27)