In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

from tqdm import tqdm

import re
import time
import json

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [3]:
test_data = pd.read_pickle("data/test.pkl.zip", compression="zip")

In [4]:
test_data.iloc[12561]['description']

'Машина в отличном состоянии!!17.10.2020 было сделано Т.О заменено масло в коробке передач в двигателе,так же заменены все фильтра.Почти максимальная комплектация кроме люка!!! Без ключевой доступ. В ДТП не была не билась.Красился капот и передний бампер так как от времени были сколы.Капот фары туманки затянуты броне пленкой.Делали для себя супруга не ездит вся причина продажи.. Новая летняя резина. Продаю не спеша. '

In [5]:
valid_data = pd.DataFrame()
catalog_data = pd.DataFrame()

In [6]:
test_data['bodyType'].unique()

array(['лифтбек', 'внедорожник 5 дв.', 'хэтчбек 5 дв.', 'седан',
       'компактвэн', 'универсал 5 дв.', 'пикап одинарная кабина',
       'хэтчбек 3 дв.', 'купе', 'кабриолет', 'минивэн',
       'пикап двойная кабина', 'внедорожник 3 дв.', 'родстер', 'микровэн',
       'седан 2 дв.', 'купе-хардтоп', 'фастбек', 'тарга',
       'внедорожник открытый', 'лимузин', 'пикап полуторная кабина',
       'седан-хардтоп', 'фургон'], dtype=object)

In [7]:
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# driver = webdriver.Chrome("./chromedriver", options=options)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/home/user/.wdm/drivers/chromedriver/linux64/99.0.4844.51/chromedriver] found in cache


In [8]:
regions = [
    "leningradskaya_oblast",
    # "moskovskaya_oblast",
]
models = [
    # "skoda",
    # "audi",
    # "honda",
    # "volvo",
    # "bmw",
    "nissan",
    "infiniti",
    # "mercedes",
    "toyota",
    "lexus",
    # "volkswagen",
    "mitsubishi",
]


In [9]:
urls_list = []

In [10]:
def get_urls_for_model_in_region(model, region):
    
    result_list = []
    cnt = True
    page_num = 1
    
    while cnt is True:
        try:
            driver.get(f'https://auto.ru/{region}/cars/{model}/used/?page={page_num}')
            next_page = driver.find_element(By.CLASS_NAME, 'ListingItemTitle__link')
            page_links = driver.find_elements(By.CLASS_NAME, 'ListingItemTitle__link')

            for item in page_links:
                result_list.append(item.get_attribute('href'))
            print(f'Parsing {page_num} page for {model} in {region}', end='\r')
            page_num += 1
        except NoSuchElementException:
            cnt = False
            print()
            print(f'There are {page_num} pages by {model}')
    
    return result_list

In [11]:
def get_car_info(url):
        
    driver.get(url)
    
    year = int(driver.find_element(By.CLASS_NAME, 'CardInfoRow_year').text.split('\n')[1])
    
    car = driver.find_elements(By.CLASS_NAME,'CardBreadcrumbs__itemText')
    brand = car[3].text
    model = car[4].text
    
    car_url = url
    
    bodytype = driver.find_element(By.CLASS_NAME, 'CardInfoRow_bodytype').text.split('\n')[1]
    kmage = int(re.sub('\D', '', driver.find_element(By.CLASS_NAME, 'CardInfoRow_kmAge').text))
    color = driver.find_element(By.CLASS_NAME, 'CardInfoRow_color').text.split('\n')[1]
    
    engine = driver.find_element(By.CLASS_NAME, 'CardInfoRow_engine').text.split('/')
    engineDisplacement = float(re.findall('(\d+.\d+)', engine[0])[0])
    enginePower = int(re.findall('\d+', engine[1])[0])
    fuelType = engine[2]
    
    super_gen = json.loads(driver.find_element(By.ID, 'sale-data-attributes').get_attribute('data-bem'))['sale-data-attributes']
    
    vehicleTransmission = driver.find_element(By.CLASS_NAME, 'CardInfoRow_transmission').text.split('\n')[1]
    drive = driver.find_element(By.CLASS_NAME, 'CardInfoRow_drive').text.split('\n')[1]
    wheel = driver.find_element(By.CLASS_NAME, 'CardInfoRow_wheel').text.split('\n')[1]
    state = driver.find_element(By.CLASS_NAME, 'CardInfoRow_state').text.split('\n')[1]
    owner = driver.find_element(By.CLASS_NAME, 'CardInfoRow_ownersCount').text.split('\n')[1]
    pts = driver.find_element(By.CLASS_NAME, 'CardInfoRow_pts').text.split('\n')[1]
    customs = driver.find_element(By.CLASS_NAME, 'CardInfoRow_customs').text.split('\n')[1]
    
    try:
        owningTime = driver.find_element(By.CLASS_NAME, 'CardInfoRow_owningTime').text.split('\n')[1]
    except NoSuchElementException:
        owningTime = None
    
    try:
        description = driver.find_element(By.CLASS_NAME, 'CardDescriptionHTML').text
    except NoSuchElementException:
        description = None
    
    sell_id = int(re.findall('\d+', driver.find_element(By.CLASS_NAME, 'CardHead__id').text)[0])
    price = int(re.sub('\D', '', driver.find_element(By.CLASS_NAME, 'OfferPriceCaption__price').text))
    
    complect_list = []
    
    complect_data = driver.find_elements(By.CLASS_NAME, 'ComplectationGroupsDesktop__itemList')

    for item in complect_data:
        complect_list.extend(item.text.replace('\n', '').split('•')[1:])
    
    catalog_url = driver.find_element(By.CLASS_NAME, 'CardCatalogLink').get_attribute('href')
    
    data_dict = {
        'brand': brand, 'model': model, 'year': year, 'bodytype': bodytype, 'kmage': kmage,
        'color': color, 'engineDisplacement': engineDisplacement, 'enginePower': enginePower, 
        'fuelType': fuelType, 'super_gen': super_gen, 'vehicleTransmission': vehicleTransmission,
        'drive': drive, 'wheel': wheel, 'state': state, 'owner': owner, 'pts': pts, 'customs': customs, 
        'owningTime': owningTime, 'description': description, 'sell_id': sell_id, 'price': price, 
        'car_url': car_url, 'catalog_url': catalog_url, 'equipment_dict': complect_list
    }
    
    return data_dict

In [12]:
def get_dicts_from_catalog(url):
    
    driver.get(url)

    car_info_full = driver.find_elements(By.CLASS_NAME, 'list-values')
    
    car_info_dict_ru = {}
    
    for item in car_info_full:
        el = item.text.split('\n')
        for i in range(0, len(el) - 1, 2):
            car_info_dict_ru[el[i]] = el[i + 1]
    
    engine_type_dict = {'бензин': 'GASOLINE', 'дизель': 'DIESEL', 
                        'гибрид': 'HYBRID', 'электро': 'ELECTRO', 'газ': 'LPG'}
    gear_type_dict = {'передний': 'FORWARD_CONTROL', 'полный': 'ALL_WHEEL_DRIVE', 'задний': 'REAR_DRIVE'}
    transmission_dict = {'автомат': 'AUTOMATIC', 'робот': 'ROBOT', 
                         'механика': 'MECHANICAL', 'вариатор': 'VARIATOR'}

    car_info_dict_en = {
        'engine_type': engine_type_dict[car_info_dict_ru['Тип двигателя']],
        'gear_type': gear_type_dict[car_info_dict_ru['Привод']],
        'transmission': transmission_dict[car_info_dict_ru['Коробка']],
        'power': int(re.findall('\d+', car_info_dict_ru['Мощность'])[0]),
        'power_kvt': int(re.findall('\d+', car_info_dict_ru['Максимальная мощность, л.с./кВт при об/мин'])[1]),
        'acceleration': car_info_dict_ru.get('Разгон до 100 км/ч, с'),
        'clearance_min': min(map(int, re.findall('\d+', car_info_dict_ru['Клиренс']))),
        'fuel_rate': float(car_info_dict_ru['Расход топлива, л город/трасса/смешанный'].split('/')[2])
    }
    
    url2 = url.replace('specifications', 'equipment')
    
    driver.get(url2)

    equipment_list = []

    equipment_data = driver.find_elements(By.CLASS_NAME, 'catalog__package-list-i')
    for item in equipment_data:
        equipment_list.append(item.text)
    
    car_data = json.loads(driver.find_element(By.CLASS_NAME, 'search-form-v2-mmm').get_attribute('data-bem'))
    equip_data = json.loads(driver.find_element(By.CLASS_NAME, 'catalog__section').get_attribute('data-bem'))
    attrib_data = json.loads(driver.find_element(By.CLASS_NAME, 'sale-data-attributes').get_attribute('data-bem'))
    
    return {'super_gen_2': car_info_dict_en, 'super_gen_3': attrib_data, 'complectation_dict': equipment_list}

## Gathering URLs from MODEL pages in Regions

In [13]:
%%time
all_urls = []
# from time import sleep
# from random import randint

i = 0
for region in regions:
    for model in models:
        print(f"{i+1} Processing {model} from {region}")
        # sleep(randint(30, 70)/100.)
        model_urls = get_urls_for_model_in_region(model, region)
        all_urls.append(model_urls)
        i += 1

1 Processing nissan from leningradskaya_oblast
Parsing 20 page for nissan in leningradskaya_oblast
There are 21 pages by nissan
2 Processing infiniti from leningradskaya_oblast
Parsing 3 page for infiniti in leningradskaya_oblast
There are 4 pages by infiniti
3 Processing toyota from leningradskaya_oblast
Parsing 5 page for toyota in leningradskaya_oblast

## Getting main car info

In [None]:
for url in tqdm(model_spb_urls):
    try:
        valid_data = valid_data.append(get_car_info(url), ignore_index=True)
    except NoSuchElementException:
        print('Error 404', end='\r')

In [None]:
valid_data.to_pickle("data/20220327_lo_nissan_infiniti_toyota_lexus_mitsubishi.pkl.zip", compression="zip")

## Getting additional information from catalog (links from main DF)

In [23]:
for car_url in tqdm(valid_data['catalog_url'].value_counts().index.tolist()):
    try:
        catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
    except NoSuchElementException:
        print('Error 404', end='\r')

  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = catalog_data.append(get_dicts_from_catalog(car_url), ignore_index=True)
  catalog_data = cata

In [30]:
valid_data.to_pickle("data/20220326_valid_data.pkl.zip", compression="zip")
catalog_data.to_pickle("data/20220326_catalog_data.pkl.zip", compression="zip")

In [29]:

print("super gen 2", catalog_data["super_gen_2"][5], sep="\n")
print("super gen 3", catalog_data["super_gen_3"][5], sep="\n")

super gen 2
{'engine_type': 'DIESEL', 'gear_type': 'ALL_WHEEL_DRIVE', 'transmission': 'AUTOMATIC', 'power': 238, 'power_kvt': 175, 'acceleration': '8.3', 'clearance_min': 184, 'fuel_rate': 9.0}
super gen 3
{'sale-data-attributes': {'asciiCat': 'cars', 'puid10': '1', 'category': 'cars', 'mark': 'INFINITI', 'model': 'FX', 'state': 'used', 'class': 'J', 'type': 'suv', 'segment': 'PREMIUM', 'group': 'family', 'power': 238, 'year': 2011}}


to extract

- super gen 2: power_kvt, acceleration, clearance_min, fuel_rate
- super gen 3: mark, model, class, year, group

join by

- model
- mark
- year