# Parsing train data from auto.ru for car price prediction

## Import

In [2]:
# import
import time
from collections.abc import Sequence
from math import ceil
import pandas as pd

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.webdriver import WebDriver
import asyncio
import aiohttp
import json
import re

import pprint as pp
from IPython.display import display

print('Import is completed.')

Import is completed.


## Functions

In [58]:
# noinspection PyBroadException
def parse_car_data(car_url:str, content, features_to_extract:list):
    """Parse data from car ad page content trying to be as (reasonably) close to test set as possible."""

    car_data = {}

    car_page = BeautifulSoup(content, 'html.parser')

    # 'bodyType'
    try: car_data['bodyType'] = car_page.find('li', class_='CardInfoRow CardInfoRow_bodytype').find('a', class_='Link Link_color_black').string
    except: pass

    # get info from script
    try:
        # 'brand', 'color', 'description',  'productionDate' and 'priceCurrency'
        ld_json_scripts = car_page.find_all('script', type="application/ld+json")
        for script in ld_json_scripts:
            cur_json = json.loads(script.string)
            if cur_json['@type'] == 'https://schema.org/Product':
                car_data.update({k:cur_json.get(k,None) for k in ['brand', 'color', 'description', 'productionDate']})
                car_data['priceCurrency'] = cur_json['offers']['priceCurrency']
                break
    except:
        pass

    # 'car_url'
    car_data['car_url'] = car_url

    # engine info
    try:
        engine_texts = car_page.find('li', class_='CardInfoRow CardInfoRow_engine').find_all('span')[-1].text.split('/')
        if '—ç–ª–µ–∫—Ç—Ä–æ' in engine_texts[2].lower():
            # 'enginePower'
            try: car_data['enginePower'] = engine_texts[0].split()[0]
            except: pass
        else:
            # 'engineDisplacement'
            try: car_data['engineDisplacement'] = engine_texts[0].split()[0]
            except: pass

            # 'enginePower'
            try: car_data['enginePower'] = engine_texts[1].split()[0]
            except: pass

        # 'fuelType'
        try: car_data['fuelType'] = engine_texts[-1].strip().lower()
        except: pass
    except:
        pass

    # get info from state script
    try:
        json_script_state = json.loads(car_page.find('script', type="application/json", id='initial-state').string)

        # 'complectation_dict'
        try: car_data['complectation_dict'] = str(json_script_state['card']['vehicle_info']['complectation'])
        except: pass

        # 'equipment_dict'
        try: car_data['equipment_dict'] = str(json_script_state['card']['vehicle_info']['equipment'])
        except: pass

        # 'super_gen'
        try: car_data['super_gen'] = str(json_script_state['card']['vehicle_info']['tech_param'])
        except: pass

        # 'modelDate'
        try: car_data['modelDate'] = json_script_state['card']['vehicle_info']['super_gen']['year_from']
        except: pass

        # 'model_info'
        try: car_data['model_info'] = str(json_script_state['card']['vehicle_info']['model_info'])
        except: pass

        # 'model_name' (no sense but let's just add to be more like test set)
        try: car_data['model_name'] = json_script_state['card']['vehicle_info']['model_info']['code']
        except: pass

        # 'name' (no sense but let's just add to be more like test set)
        try: car_data['name'] = json_script_state['card']['vehicle_info']['tech_param']['human_name']
        except: pass

        # 'numberOfDoors'
        try: car_data['numberOfDoors'] = json_script_state['card']['vehicle_info']['configuration']['doors_count']
        except: pass
    except:
        pass

    # 'image'
    try: car_data['image'] = 'https:' + car_page.select('link[as=image]')[0]['href']
    except: pass

    # 'mileage'
    try: car_data['mileage'] = ''.join(re.findall('\d',car_page.find('li', class_='CardInfoRow CardInfoRow_kmAge').find_all('span')[1].string))
    except: pass

    # 'parsing_unixtime'
    car_data['parsing_unixtime'] = int(time.time())

    # 'vehicleTransmission',
    try: car_data['vehicleTransmission'] = car_page.find('li', class_='CardInfoRow CardInfoRow_transmission').find_all('span')[1].string
    except: pass

    # '–í–ª–∞–¥–µ–ª—å—Ü—ã'
    try: car_data['–í–ª–∞–¥–µ–ª—å—Ü—ã'] = car_page.find('li', class_='CardInfoRow CardInfoRow_ownersCount').find_all('span')[1].text.replace(u'\xa0', u' ')
    except: pass

    # '–í–ª–∞–¥–µ–Ω–∏–µ'
    try: car_data['–í–ª–∞–¥–µ–Ω–∏–µ'] = car_page.find('li', class_='CardInfoRow CardInfoRow_owningTime').find_all('span')[1].text
    except: pass

    # '–ü–¢–°'
    try: car_data['–ü–¢–°'] = car_page.find('li', class_='CardInfoRow CardInfoRow_pts').find_all('span')[1].text
    except: pass

    # '–ü—Ä–∏–≤–æ–¥'
    try: car_data['–ü—Ä–∏–≤–æ–¥'] = car_page.find('li', class_='CardInfoRow CardInfoRow_drive').find_all('span')[1].text
    except: pass

    # '–†—É–ª—å'
    try: car_data['–†—É–ª—å'] = car_page.find('li', class_='CardInfoRow CardInfoRow_wheel').find_all('span')[1].text
    except: pass

    # '–°–æ—Å—Ç–æ—è–Ω–∏–µ'
    try: car_data['–°–æ—Å—Ç–æ—è–Ω–∏–µ'] = car_page.find('li', class_='CardInfoRow CardInfoRow_state').find_all('span')[1].text
    except: pass

    # '–¢–∞–º–æ–∂–Ω—è'
    try: car_data['–¢–∞–º–æ–∂–Ω—è'] = car_page.find('li', class_='CardInfoRow CardInfoRow_customs').find_all('span')[1].text
    except: pass

    # 'sell_id'
    # no need for training

    # 'vehicleConfiguration'
    # all this information is presented in other features - no need for train

    # 'vendor'
    # not found such info on site, but we can easily infer it later from test set

    # add price column
    try: car_data['Price'] = ''.join(re.findall('\d', car_page.find('span', class_='OfferPriceCaption__price').text))
    except: pass

    # fill not extracted features with None and rearrange output dict keys to follow test set format
    output_dict = {}
    for feature in features_to_extract+['Price']:
        if feature in car_data:
            output_dict[feature] = car_data[feature]
        else:
            output_dict[feature] = None

    return output_dict

def init_webdriver():
    """Webdriver initialization function. Configure webdriver to ensure correct car ads parsing."""

    # disable geolocation
    geoDisabled = webdriver.FirefoxOptions()
    geoDisabled.set_preference("geo.enabled", False)
    geoDisabled.set_preference("geo.provider.use_corelocation", False)
    geoDisabled.set_preference("geo.prompt.testing", False)
    geoDisabled.set_preference("geo.prompt.testing.allow", False)

    return webdriver.Firefox(options=geoDisabled)

def parse_car_urls(brand:str, driver:WebDriver):
    """Collect all car ad urls for specific brand via specific driver."""

    car_urls = []
    try:
        # brand_url = f'https://auto.ru/cars/{brand}/all/'
        brand_url = f'https://auto.ru/cars/{brand}/used/engine-electro/'
        driver.get(brand_url)
        brand_page = BeautifulSoup(driver.page_source, 'html.parser')

        # get number of pages for brand
        num_pages = int(brand_page.find('span', class_='ControlGroup ControlGroup_responsive_no ControlGroup_size_s ListingPagination__pages') \
                                    .find_all('a')[-1].text)

        for page_idx in range(1,num_pages+1):

            try:
                print(f'Brand \'{brand}\' (page {page_idx} from {num_pages})', end='\r', flush=True)
                page_url = brand_url + f'?page={page_idx}'
                driver.get(page_url)
                cur_page = BeautifulSoup(driver.page_source, 'html.parser')
                car_links = cur_page.find_all('a', class_='Link ListingItemTitle__link')
                car_urls.extend([link['href'] for link in car_links])
            except:
                pass

            time.sleep(1)

        print('')
    except:
        pass

    return car_urls

In [5]:
# load given test data
DATA_DIR = 'Data'
df_test = pd.read_csv(f'../{DATA_DIR}/test.csv')
display(df_test.head(5))
df_test.info()
df_test.describe()

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,...,vehicleConfiguration,vehicleTransmission,vendor,–í–ª–∞–¥–µ–ª—å—Ü—ã,–í–ª–∞–¥–µ–Ω–∏–µ,–ü–¢–°,–ü—Ä–∏–≤–æ–¥,–†—É–ª—å,–°–æ—Å—Ç–æ—è–Ω–∏–µ,–¢–∞–º–æ–∂–Ω—è
0,–ª–∏—Ñ—Ç–±–µ–∫,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,—Å–∏–Ω–∏–π,,"–í—Å–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏, –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã–µ –≤ –ø—Ä–æ–¥–∞–∂–µ, –ø—Ä–æ—Ö...",1.2 LTR,105 N12,"{""engine-proof"":true,""tinted-glass"":true,""airb...",–±–µ–Ω–∑–∏–Ω,...,LIFTBACK ROBOT 1.2,—Ä–æ–±–æ—Ç–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–∞—è,EUROPEAN,3 –∏–ª–∏ –±–æ–ª–µ–µ,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–µ—Ä–µ–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
1,–ª–∏—Ñ—Ç–±–µ–∫,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,—á—ë—Ä–Ω—ã–π,,–õ–û–¢: 01217195\n–ê–≤—Ç–æ–ø—Ä–∞–≥–∞ –°–µ–≤–µ—Ä\n–î–∞–Ω–Ω—ã–π –∞–≤—Ç–æ–º–æ–±...,1.6 LTR,110 N12,"{""cruise-control"":true,""asr"":true,""esp"":true,""...",–±–µ–Ω–∑–∏–Ω,...,LIFTBACK MECHANICAL 1.6,–º–µ—Ö–∞–Ω–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–µ—Ä–µ–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
2,–ª–∏—Ñ—Ç–±–µ–∫,SKODA,https://auto.ru/cars/used/sale/skoda/superb/11...,—Å–µ—Ä—ã–π,"{""id"":""20026336"",""name"":""Ambition"",""available_...","–í—Å–µ –∞–≤—Ç–æ–º–æ–±–∏–ª–∏, –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã–µ –≤ –ø—Ä–æ–¥–∞–∂–µ, –ø—Ä–æ—Ö...",1.8 LTR,152 N12,"{""cruise-control"":true,""tinted-glass"":true,""es...",–±–µ–Ω–∑–∏–Ω,...,LIFTBACK ROBOT 1.8,—Ä–æ–±–æ—Ç–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–µ—Ä–µ–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
3,–ª–∏—Ñ—Ç–±–µ–∫,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,–∫–æ—Ä–∏—á–Ω–µ–≤—ã–π,"{""id"":""20803582"",""name"":""Ambition"",""available_...",–ö–û–ú–ü–õ–ï–ö–¢ –ó–ò–ú–ù–ï–ô (–õ–ï–¢–ù–ï–ô) –†–ï–ó–ò–ù–´ –ü–û –°–ï–ó–û–ù–£ –í –ü–û...,1.6 LTR,110 N12,"{""cruise-control"":true,""roller-blind-for-rear-...",–±–µ–Ω–∑–∏–Ω,...,LIFTBACK AUTOMATIC 1.6,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–µ—Ä–µ–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
4,–ª–∏—Ñ—Ç–±–µ–∫,SKODA,https://auto.ru/cars/used/sale/skoda/octavia/1...,–±–µ–ª—ã–π,,–õ–û–¢: 01220889\n–ê–≤—Ç–æ–ø—Ä–∞–≥–∞ –°–µ–≤–µ—Ä\n\n–í—ã –º–æ–∂–µ—Ç–µ –ø–æ...,1.8 LTR,152 N12,"{""cruise-control"":true,""asr"":true,""esp"":true,""...",–±–µ–Ω–∑–∏–Ω,...,LIFTBACK AUTOMATIC 1.8,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–µ—Ä–µ–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34686 entries, 0 to 34685
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   bodyType              34686 non-null  object
 1   brand                 34686 non-null  object
 2   car_url               34686 non-null  object
 3   color                 34686 non-null  object
 4   complectation_dict    6418 non-null   object
 5   description           34686 non-null  object
 6   engineDisplacement    34686 non-null  object
 7   enginePower           34686 non-null  object
 8   equipment_dict        24690 non-null  object
 9   fuelType              34686 non-null  object
 10  image                 34686 non-null  object
 11  mileage               34686 non-null  int64 
 12  modelDate             34686 non-null  int64 
 13  model_info            34686 non-null  object
 14  model_name            34686 non-null  object
 15  name                  34686 non-null

Unnamed: 0,mileage,modelDate,numberOfDoors,parsing_unixtime,productionDate,sell_id
count,34686.0,34686.0,34686.0,34686.0,34686.0,34686.0
mean,162009.767889,2007.074728,4.450816,1603287000.0,2009.264602,1098300000.0
std,100676.559489,7.415894,0.70304,149307.1,7.047661,19112250.0
min,1.0,1904.0,0.0,1603107000.0,1904.0,2665.0
25%,91153.5,2004.0,4.0,1603221000.0,2006.0,1099049000.0
50%,149779.5,2008.0,5.0,1603254000.0,2011.0,1100911000.0
75%,215000.0,2012.0,5.0,1603290000.0,2014.0,1101245000.0
max,1000000.0,2020.0,5.0,1603710000.0,2020.0,1101375000.0


In [4]:
# get html page for navigation
url = 'https://auto.ru'
response = requests.get(url)
print('GET url status=', response.status_code)
page = BeautifulSoup(response.text,'html.parser')
print(page.prettify()[:500], '.....')

GET url status= 200
<!DOCTYPE html>
<html data-reactroot="" lang="ru">
 <head>
  <link as="script" href="https://auto.ru/_crpd/2W5I8yt76/b2530f-ah6D/GgBdzIYYcCBRKMb4AePeu15H21gDkDZ4uvt444bcqkZoSu3X1jye0fP9ptkU_Lo5gMNeGZtzrJp19-pl9aVQB1uTzczE3XsqgWjC9RgmXPREglbaBlWuLmyu7viXzeqBogZuxpe1md4u_28cBjc98p8NmlyR-ikZ8vFq738mkkUbGMfdSZk-aNWvxbzepts9iQCb0wPTJ-lj5Cb03MnqCSWBIc9QctdXYrgvF809efYRy15VF3KkWn7_5-cx6VrLlI" nonce="53tRKdiuhAG010lXQYee+w==" rel="preload"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv .....


In [5]:
# get all brands presented on site
brand_links = page.find_all('a',class_='IndexMarks__item')
brand_urls = list(map(lambda a: a['href'], brand_links))
pp.pprint(brand_urls)
site_brands = [u.split('/')[-3] for u in brand_urls]
site_brands

['https://auto.ru/cars/vaz/all/',
 'https://auto.ru/cars/audi/all/',
 'https://auto.ru/cars/bmw/all/',
 'https://auto.ru/cars/chery/all/',
 'https://auto.ru/cars/chevrolet/all/',
 'https://auto.ru/cars/citroen/all/',
 'https://auto.ru/cars/daewoo/all/',
 'https://auto.ru/cars/ford/all/',
 'https://auto.ru/cars/geely/all/',
 'https://auto.ru/cars/haval/all/',
 'https://auto.ru/cars/honda/all/',
 'https://auto.ru/cars/hyundai/all/',
 'https://auto.ru/cars/infiniti/all/',
 'https://auto.ru/cars/kia/all/',
 'https://auto.ru/cars/land_rover/all/',
 'https://auto.ru/cars/lexus/all/',
 'https://auto.ru/cars/mini/all/',
 'https://auto.ru/cars/mazda/all/',
 'https://auto.ru/cars/mercedes/all/',
 'https://auto.ru/cars/mitsubishi/all/',
 'https://auto.ru/cars/nissan/all/',
 'https://auto.ru/cars/opel/all/',
 'https://auto.ru/cars/peugeot/all/',
 'https://auto.ru/cars/porsche/all/',
 'https://auto.ru/cars/renault/all/',
 'https://auto.ru/cars/skoda/all/',
 'https://auto.ru/cars/ssang_yong/all/',
 

['vaz',
 'audi',
 'bmw',
 'chery',
 'chevrolet',
 'citroen',
 'daewoo',
 'ford',
 'geely',
 'haval',
 'honda',
 'hyundai',
 'infiniti',
 'kia',
 'land_rover',
 'lexus',
 'mini',
 'mazda',
 'mercedes',
 'mitsubishi',
 'nissan',
 'opel',
 'peugeot',
 'porsche',
 'renault',
 'skoda',
 'ssang_yong',
 'subaru',
 'suzuki',
 'toyota',
 'volkswagen',
 'volvo',
 'gaz',
 'uaz']

In [8]:
# brands in test data
brands = [brand.lower() for brand in df_test['brand'].unique()]
brands

['skoda',
 'audi',
 'honda',
 'volvo',
 'bmw',
 'nissan',
 'infiniti',
 'mercedes',
 'toyota',
 'lexus',
 'volkswagen',
 'mitsubishi']

In [7]:
# check if we have all needed brands on site
brands <= site_brands

True

Great, we have all brands from test set on site.
So, get ad urls from site for all brands that are in test set.

In [11]:
# prepare webdriver to bypass site parsing protection
driver = init_webdriver()

try:
    driver.get('https://auto.ru/')
    time.sleep(1)

    # reject geolocation request in order to parse page by page (it is not allowed when you don't do it first..)
    no_location_button = driver.find_element_by_xpath('//span[@data-decision="no"]')
    no_location_button.click()
    time.sleep(3)

    # collect car urls for brands using already configured webdriver
    urls = []
    for brand in brands:
        print(f"Processing brand \'{brand}\':")
        urls.extend(parse_car_urls(brand, driver))

    # path = f'Data/car_urls__{time.strftime("%H_%M__%d_%m_%Y")}.csv'
    path = f'Data/car_electro_urls__{time.strftime("%H_%M__%d_%m_%Y")}.csv'
    pd.DataFrame(data=urls, columns=['car_url']).to_csv(path, index=False)

    print('Car urls parsing is completed.')
    print(f'Parsing results were saved to {path}.')
except Exception as e:
    print(e)
finally:
    driver.quit()

  no_location_button = driver.find_element_by_xpath('//span[@data-decision="no"]')


Processing brand 'skoda':
Processing brand 'audi':
Brand 'audi' (page 2 from 2)
Processing brand 'honda':
Processing brand 'volvo':
Processing brand 'bmw':
Brand 'bmw' (page 2 from 2)
Processing brand 'nissan':
Brand 'nissan' (page 7 from 7)
Processing brand 'infiniti':
Processing brand 'mercedes':
Processing brand 'toyota':
Processing brand 'lexus':
Processing brand 'volkswagen':
Processing brand 'mitsubishi':


In [20]:
# let's see what features are in test set
test_features = list(df_test.columns)
pp.pprint(test_features)

['bodyType',
 'brand',
 'car_url',
 'color',
 'complectation_dict',
 'description',
 'engineDisplacement',
 'enginePower',
 'equipment_dict',
 'fuelType',
 'image',
 'mileage',
 'modelDate',
 'model_info',
 'model_name',
 'name',
 'numberOfDoors',
 'parsing_unixtime',
 'priceCurrency',
 'productionDate',
 'sell_id',
 'super_gen',
 'vehicleConfiguration',
 'vehicleTransmission',
 'vendor',
 '–í–ª–∞–¥–µ–ª—å—Ü—ã',
 '–í–ª–∞–¥–µ–Ω–∏–µ',
 '–ü–¢–°',
 '–ü—Ä–∏–≤–æ–¥',
 '–†—É–ª—å',
 '–°–æ—Å—Ç–æ—è–Ω–∏–µ',
 '–¢–∞–º–æ–∂–Ω—è']


Let's parse data from collected car ads urls.

In [55]:
# load the most recent car ads urls
car_urls = pd.read_csv(DATA_DIR+'/car_urls__03_01__31_01_2022.csv')['car_url']
car_urls.shape

(42305,)

In [19]:
# load electro car urls
# alas have to do it separately.. because there were no electro car ads in basic parsing results, don't no why..
car_electro_urls = pd.read_csv(DATA_DIR+'/car_electro_urls__11_34__06_02_2022.csv')['car_url']
car_electro_urls.shape

(144,)

In [80]:
class Batcher:
    def __init__(self, src_list:Sequence, batch_size:int):
        self.src_list = src_list
        self.batch_size = batch_size
        self.batch_count = ceil(len(src_list) / batch_size)
        self._full_batch_count = len(src_list) // batch_size
        self.idx_from = None
        self.idx_to = None

    # noinspection PyRedundantParentheses
    def batches(self):
        for b_idx in range(self._full_batch_count):
            self.idx_from, self.idx_to = b_idx*self.batch_size, (b_idx + 1)*self.batch_size - 1
            yield (b_idx, self.src_list[self.idx_from: self.idx_to])
        if self.batch_size * self._full_batch_count < len(self.src_list):
            self.idx_from, self.idx_to = self.batch_size * self._full_batch_count, len(self.src_list)-1
            yield (self._full_batch_count, self.src_list[self.idx_from: self.idx_to])

async def get_car_data_async(task_idx, car_url, session, features_to_extract):
    try:
        # print(f'Task {task_idx} started.')
        # print(f'get_car_data_async(): get url {car_url}')
        async with session.get(url=car_url) as response:
            content = await response.read()

            # print(f'Task {task_idx} completed.')
            output_dict = parse_car_data(car_url, content.decode('utf-8'), features_to_extract)

    except Exception as e:
        print(f"Unable to get url {car_url} due to {e.__class__} with args={e.args}.")
        # create empty dict
        output_dict = dict.fromkeys(features_to_extract)

    return output_dict

async def process_batch_async(urls, features_to_extract):
    async with aiohttp.ClientSession() as session:
        ret = await asyncio.gather(*[get_car_data_async(i, url, session, features_to_extract) for i, url in enumerate(urls)])
        return ret

In [81]:
batcher = Batcher(car_electro_urls, batch_size=20)

# path = f'{DATA_DIR}/parsed_car_data__{time.strftime("%H_%M__%d_%m_%Y")}.csv'
path = f'{DATA_DIR}/parsed_car_electro_data__{time.strftime("%H_%M__%d_%m_%Y")}.csv'

for batch_idx, batch_urls in batcher.batches():
    print(f"Processing batch {batch_idx+1} from {batcher.batch_count}. "
          f"Urls range [{batcher.idx_from+1} to {batcher.idx_to+1}] from total {len(batcher.src_list)} urls")
    beg_time = time.time()
    batch_result = await process_batch_async(batch_urls, test_features)
    print(f'Batch processing time={time.time() - beg_time}')
    df_batch = pd.DataFrame(batch_result)
    df_batch.index += batch_idx * batcher.batch_size

    # append batch data to csv file
    with open(path,'a+', encoding='utf-8') as file:
        file.write(df_batch.to_csv(header=True if batch_idx == 0 else False, index_label='‚Ññ'))

    time.sleep(2)

print('Car ads parsing is completed.')
print(f'Parsing results were saved to {path}.')

Processing batch 1 from 8. Urls range [1 to 20] from total 144 urls
Batch processing time=2.6626741886138916
Processing batch 2 from 8. Urls range [21 to 40] from total 144 urls
Batch processing time=2.6264407634735107
Processing batch 3 from 8. Urls range [41 to 60] from total 144 urls
Batch processing time=2.6588096618652344
Processing batch 4 from 8. Urls range [61 to 80] from total 144 urls
Batch processing time=2.571946620941162
Processing batch 5 from 8. Urls range [81 to 100] from total 144 urls
Batch processing time=2.5104641914367676
Processing batch 6 from 8. Urls range [101 to 120] from total 144 urls
Batch processing time=2.6284894943237305
Processing batch 7 from 8. Urls range [121 to 140] from total 144 urls
Batch processing time=2.4719736576080322
Processing batch 8 from 8. Urls range [141 to 144] from total 144 urls
Batch processing time=0.9772720336914062
Car ads parsing is completed.
Parsing results were saved to Data/parsed_car_electro_data__13_16__06_02_2022.csv.


Check the amount of useful records in parsed data:

In [8]:
df_parsed = pd.read_csv(f'{DATA_DIR}/parsed_car_data__17_55__02_02_2022.csv')
df_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42305 entries, 0 to 42304
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ‚Ññ                     42305 non-null  int64  
 1   bodyType              33195 non-null  object 
 2   brand                 41635 non-null  object 
 3   car_url               42305 non-null  object 
 4   color                 41635 non-null  object 
 5   complectation_dict    38510 non-null  object 
 6   description           41635 non-null  object 
 7   engineDisplacement    32624 non-null  float64
 8   enginePower           32624 non-null  float64
 9   equipment_dict        41552 non-null  object 
 10  fuelType              32624 non-null  object 
 11  image                 30612 non-null  object 
 12  mileage               33195 non-null  float64
 13  modelDate             40061 non-null  float64
 14  model_info            41552 non-null  object 
 15  model_name       

In [105]:
df_parsed[~df_parsed['bodyType'].isna() & ~df_parsed['Price'].isna()].shape

(30426, 34)

So we have ~30_000 (potentially) useful records for training. It is comparable with test set size (~35_000) - not bad and there is a hope.

One more check for sample representativity.

In [5]:
df_test['fuelType'].value_counts()

–±–µ–Ω–∑–∏–Ω     28601
–¥–∏–∑–µ–ª—å      5800
–≥–∏–±—Ä–∏–¥       223
—ç–ª–µ–∫—Ç—Ä–æ       55
–≥–∞–∑            7
Name: fuelType, dtype: int64

In [24]:
df_test[df_test['fuelType']=='—ç–ª–µ–∫—Ç—Ä–æ']

Unnamed: 0,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,fuelType,...,vehicleConfiguration,vehicleTransmission,vendor,–í–ª–∞–¥–µ–ª—å—Ü—ã,–í–ª–∞–¥–µ–Ω–∏–µ,–ü–¢–°,–ü—Ä–∏–≤–æ–¥,–†—É–ª—å,–°–æ—Å—Ç–æ—è–Ω–∏–µ,–¢–∞–º–æ–∂–Ω—è
5241,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/e_tron/110...,–±–µ–ª—ã–π,,Audi e-tron Prestige quattro –æ—Ç Moscow Tesla C...,LTR,408 N12,"{""asr"":true,""esp"":true,""usb"":true,""e-adjustmen...",—ç–ª–µ–∫—Ç—Ä–æ,...,ALLROAD_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,1 –≥–æ–¥ –∏ 3 –º–µ—Å—è—Ü–∞,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–æ–ª–Ω—ã–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
7992,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11010431...,—Å–∏–Ω–∏–π,,"–ê—Ç–æ–º–æ–±–∏–ª—å –∏–∑ –ì–µ—Ä–º–∞–Ω–∏–∏,–±–µ–∑ –ø—Ä–æ–±–µ–≥–∞ –ø–æ –†–§.\n–ù–∞ –≥...",LTR,170 N12,,—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
8252,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11012780...,—Å–µ—Ä—ã–π,,"–ü—Ä–æ–±–µ–≥ 54000 –∫–º, –∑–∞–ø–∞—Å —Ö–æ–¥–∞ –Ω–∞ –∑–∞—Ä—è–¥–∫–µ 130 –∫–º+...",LTR,170 N12,"{""park-assist-f"":true,""park-assist-r"":true}",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,1 –º–µ—Å—è—Ü,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
8350,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11002716...,–±–µ–ª—ã–π,"{""id"":""21624597"",""name"":""i3 120 Ah"",""available...","–ê–≤—Ç–æ–º–æ–±–∏–ª—å –≤ –Ω–∞–ª–∏—á–∏–∏ –ú–æ—Å–∫–≤–µ , —Å –ü–¢–° , –ø—Ä–æ–∏–∑–≤–æ–¥...",LTR,170 N12,"{""cruise-control"":true,""asr"":true,""esp"":true,""...",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
8684,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11011745...,—á—ë—Ä–Ω—ã–π,,BMW I 3 REX\n—Ñ–µ–≤—Ä–∞–ª—å 2017 –≥–æ–¥–∞ –≤—ã–ø—É—Å–∫–∞.\n–ü—Ä–∏–æ–±...,LTR,170 N12,"{""cruise-control"":true,""glonass"":true,""navigat...",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
8722,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11000973...,–≥–æ–ª—É–±–æ–π,,\n–ü—Ä–∏–±—ã–ª–æ –∏–∑ –ê–º–µ—Ä–∏–∫–∏ –≤ –∞–≤–≥—É—Å—Ç–µ 2020 –≥–æ–¥–∞. –ü–æ–ª–Ω...,LTR,170 N12,,—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
8917,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11008715...,–±–µ–∂–µ–≤—ã–π,,-–°–æ—Å—Ç–æ—è–Ω–∏–µ –Ω–æ–≤–æ–≥–æ –∞–≤—Ç–æ–º–æ–±–∏–ª—è .\n-Carfax –≤—Å—è –∏—Å...,LTR,170 N12,"{""glonass"":true,""roof-rails"":true}",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
10817,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/11012664...,—Å–µ—Ä—ã–π,,–ü—Ä–æ–¥–∞—é BMW I3 +REX .\n\n–ü–æ–ª–Ω–æ—Ü–µ–Ω–Ω—ã–π —ç–ª–µ–∫—Ç—Ä–æ–º–æ–±...,LTR,170 N12,,—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
10936,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/10996733...,—á—ë—Ä–Ω—ã–π,,–ê–≤—Ç–æ–º–æ–±–∏–ª—å BMW i3 (–ø–æ–ª–Ω–æ—Å—Ç—å—é —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–∏–π )–∑–∞...,LTR,170 N12,"{""cruise-control"":true,""asr"":true,""tinted-glas...",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω
11043,—Ö—ç—Ç—á–±–µ–∫ 5 –¥–≤.,BMW,https://auto.ru/cars/used/sale/bmw/i3/10993761...,—Å–µ—Ä—ã–π,,–°–æ—Å—Ç–æ—è–Ω–∏–µ –Ω–æ–≤–æ–≥–æ –∞–≤—Ç–æ–º–æ–±–∏–ª—è . \n–§–æ—Ç–æ –ò–ú–ï–ù–ù–û –≠–¢...,LTR,170 N12,"{""glonass"":true,""airbag-driver"":true,""aux"":tru...",—ç–ª–µ–∫—Ç—Ä–æ,...,HATCHBACK_5_DOORS AUTOMATIC,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,EUROPEAN,1¬†–≤–ª–∞–¥–µ–ª–µ—Ü,8 –º–µ—Å—è—Ü–µ–≤,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω


In [3]:
df_parsed['fuelType'].value_counts()

–ë–µ–Ω–∑–∏–Ω          22927
–î–∏–∑–µ–ª—å           8004
–ì–∏–±—Ä–∏–¥           1376
–æ–±–æ—Ä—É–¥–æ–≤–∞–Ω–∏–µ      251
–ì–∞–∑                66
Name: fuelType, dtype: int64

Unfortunately we have no electro car here, but we have a little of them in test set. We'll parse them again and add to combined train set.

In [13]:
df_parsed_electro = pd.read_csv(f'{DATA_DIR}/parsed_car_electro_data__13_16__06_02_2022.csv')
display(df_parsed_electro.head())
df_parsed_electro.info()
df_parsed_electro.shape

Unnamed: 0,‚Ññ,bodyType,brand,car_url,color,complectation_dict,description,engineDisplacement,enginePower,equipment_dict,...,vehicleTransmission,vendor,–í–ª–∞–¥–µ–ª—å—Ü—ã,–í–ª–∞–¥–µ–Ω–∏–µ,–ü–¢–°,–ü—Ä–∏–≤–æ–¥,–†—É–ª—å,–°–æ—Å—Ç–æ—è–Ω–∏–µ,–¢–∞–º–æ–∂–Ω—è,Price
0,0,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/e_tron_spo...,–±–µ–ª—ã–π,"{'id': '22760877', 'name': 'Sport 55 quattro',...",–ë—É–¥—å –æ–¥–Ω–∏–º –∏–∑ –ø–µ—Ä–≤—ã—Ö –æ–±–ª–∞–¥–∞—Ç–µ–ª–µ–π —ç–ª–µ–∫—Ç—Ä–∏—á–µ—Å–∫–æ–π...,,408.0,"{'cruise-control': True, 'asr': True, 'esp': T...",...,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,,1 –≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–æ–ª–Ω—ã–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω,8860000.0
1,1,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/q4/1106535...,—Å–µ—Ä—ã–π,{'id': '0'},–°–∞–º—ã–π –Ω–æ–≤—ã–π –∏ —Å–∞–º—ã–π —Ç–æ–ø–æ–≤—ã–π E-TRON Q4 !!!\r\n–¢...,,170.0,"{'usb': True, 'aux': True, 'projection-display...",...,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,,1 –≤–ª–∞–¥–µ–ª–µ—Ü,2 –º–µ—Å—è—Ü–∞,–û—Ä–∏–≥–∏–Ω–∞–ª,–∑–∞–¥–Ω–∏–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω,8170000.0
2,2,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/e_tron_s_s...,—Å–∏–Ω–∏–π,"{'id': '23050466', 'name': 'Quattro', 'availab...","–ù–æ–≤—ã–π –∞–≤—Ç–æ –∏–∑ –ì–µ—Ä–º–∞–Ω–∏–∏, –ø–æ–ª–Ω–æ—Å—Ç—å—é —Ä–∞—Å—Ç–∞–º–æ–∂–µ–Ω, ...",,503.0,"{'sport-seats': True, 'e-adjustment-wheel': Tr...",...,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,,1 –≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–æ–ª–Ω—ã–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω,11430000.0
3,3,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/e_tron/111...,—á—ë—Ä–Ω—ã–π,"{'id': '22291462', 'name': 'Design 55 quattro'...",–ê–≤—Ç–æ–º–æ–±–∏–ª—å –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω –∏ –æ–±—Å–ª—É–∂–∏–≤–∞–ª—Å—è —É –æ—Ñ–∏—Ü–∏–∞–ª...,,408.0,"{'e-adjustment-wheel': True, 'multi-wheel': Tr...",...,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,,1 –≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–æ–ª–Ω—ã–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω,10780000.0
4,4,–≤–Ω–µ–¥–æ—Ä–æ–∂–Ω–∏–∫ 5 –¥–≤.,AUDI,https://auto.ru/cars/used/sale/audi/e_tron_spo...,—á—ë—Ä–Ω—ã–π,"{'id': '22760308', 'name': '55 quattro', 'avai...",–ê–≤—Ç–æ–º–æ–±–∏–ª—å –ø—Ä–æ–¥–∞–µ—Ç—Å—è –∫–æ–º–ø–∞–Ω–∏–µ–π –ê–≤—Ç–æ–ø—Ä–µ–º—å–µ—Ä-–°–æ—á...,,408.0,"{'cruise-control': True, 'asr': True, 'esp': T...",...,–∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∞—è,,1 –≤–ª–∞–¥–µ–ª–µ—Ü,,–û—Ä–∏–≥–∏–Ω–∞–ª,–ø–æ–ª–Ω—ã–π,–õ–µ–≤—ã–π,–ù–µ —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞,–†–∞—Å—Ç–∞–º–æ–∂–µ–Ω,8310000.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ‚Ññ                     136 non-null    int64  
 1   bodyType              133 non-null    object 
 2   brand                 133 non-null    object 
 3   car_url               136 non-null    object 
 4   color                 133 non-null    object 
 5   complectation_dict    133 non-null    object 
 6   description           133 non-null    object 
 7   engineDisplacement    13 non-null     float64
 8   enginePower           133 non-null    float64
 9   equipment_dict        133 non-null    object 
 10  fuelType              133 non-null    object 
 11  image                 126 non-null    object 
 12  mileage               133 non-null    float64
 13  modelDate             133 non-null    float64
 14  model_info            133 non-null    object 
 15  model_name           

(136, 34)

In [15]:
# combine parsed data for non-electro and electro cars
df_parsed.append(df_parsed_electro, ignore_index=True).to_csv(f'{DATA_DIR}/parsed_car_data_all__06_02_2022.csv',index=False)

In [17]:
df_parsed_all = pd.read_csv(f'{DATA_DIR}/parsed_car_data_all__06_02_2022.csv')
df_parsed_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42441 entries, 0 to 42440
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ‚Ññ                     42441 non-null  int64  
 1   bodyType              33328 non-null  object 
 2   brand                 41768 non-null  object 
 3   car_url               42441 non-null  object 
 4   color                 41768 non-null  object 
 5   complectation_dict    38643 non-null  object 
 6   description           41768 non-null  object 
 7   engineDisplacement    32637 non-null  float64
 8   enginePower           32757 non-null  float64
 9   equipment_dict        41685 non-null  object 
 10  fuelType              32757 non-null  object 
 11  image                 30738 non-null  object 
 12  mileage               33328 non-null  float64
 13  modelDate             40194 non-null  float64
 14  model_info            41685 non-null  object 
 15  model_name       

## Trials