In [1]:
import pandas as pd
import re
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import json
from time import gmtime, strftime

In [2]:
# Car brands to scrape. Each value is used as-is in the listing URL and as
# the stem of the per-brand CSV file name.
# For a run limited to BMW (the only brand in the test data) use ['bmw'].
marks = [
    'bmw',
    'mercedes',
    'volkswagen',
    'nissan',
    'toyota',
    'audi',
    'skoda',
    'mitsubishi',
    'volvo',
    'honda',
    'porsche',
    'infiniti',
    'lexus',
    'suzuki',
    'mini',
]

In [3]:
def create_train_set(marks):
    """Entry point: scrape every brand in ``marks`` and time the whole run.

    Args:
        marks: list of car brand names, forwarded to ``parse_cars_parallel``.

    Returns:
        A two-element list ``[brands, seconds]``: the value returned by
        ``parse_cars_parallel`` followed by the elapsed wall-clock time
        in seconds (float).
    """
    started_at = datetime.now()
    parsed_marks = parse_cars_parallel(marks)
    elapsed = datetime.now() - started_at
    return [parsed_marks, elapsed.total_seconds()]


def parse_cars_parallel(marks):
    """Scrape several brands concurrently with a pool of 6 workers.

    Args:
        marks: iterable of brand names; each one is passed to
            ``get_list_cars``.

    Returns:
        List with one ``get_list_cars`` result per brand, in the same
        order as ``marks``.

    Raises:
        Whatever exception a worker raised; ``pool.map`` re-raises it in
        the calling thread.
    """
    pool = Pool(6)
    try:
        return pool.map(get_list_cars, marks)
    finally:
        # Release the workers even when pool.map re-raises a worker error;
        # otherwise every failed re-run in the kernel leaks six threads.
        pool.close()
        pool.join()


def get_list_cars(mark):
    """Scrape all listing pages of one brand and save them to ``<mark>.csv``.

    The search is split into six consecutive mileage bands. Within a band,
    pages 1..99 are requested in order and the band is abandoned as soon as
    a page yields no cars.

    Args:
        mark: brand name; used in the listing URL and the CSV file name.

    Returns:
        The same ``mark`` that was passed in.
    """
    # One query-string suffix per mileage band; the page number is appended.
    band_suffixes = (
        '&km_age_to=50000&page=',
        '&km_age_from=50001&km_age_to=100000&page=',
        '&km_age_from=100001&km_age_to=150000&page=',
        '&km_age_from=150001&km_age_to=200000&page=',
        '&km_age_from=200001&km_age_to=250000&page=',
        '&km_age_from=250001&page=',
    )
    url_base = 'https://auto.ru/moskva/cars/' + mark + '/used/?sort=price-asc'
    collected = []
    for suffix in band_suffixes:
        band_url = url_base + suffix
        for page_no in range(1, 100):
            page_cars = get_all_cars_on_page(band_url + str(page_no))  # list of dictionaries
            if len(page_cars) == 0:
                # An empty page ends this mileage band.
                break
            collected.extend(page_cars)
    create_csv(collected, mark)
    return mark


def create_csv(list_cars, mark):
    """Write the scraped cars of one brand to ``<mark>.csv`` and log it.

    Args:
        list_cars: list of per-car dictionaries. Only the keys named in the
            column list below are written; keys a car lacks become empty
            cells and any other keys are dropped.
        mark: brand name; used as the CSV file name stem and in the log line.

    Returns:
        Always ``1``.
    """
    column_order = [
        'bodyType', 'brand', 'color', 'fuelType', 'modelDate',
        'name', 'numberOfDoors', 'productionDate',
        'vehicleConfiguration', 'vehicleTransmission',
        'engineDisplacement', 'enginePower', 'mileage',
        'Привод', 'Руль', 'Владельцы', 'ПТС', 'Комплектация',
        'description', 'Владение', 'seller_type', 'url', 'price',
    ]
    pd.DataFrame(list_cars, columns=column_order).to_csv(f'{mark}.csv', index=False)
    # The timestamp is taken in UTC (gmtime) after the file has been written.
    stamp = strftime("%H:%M:%S", gmtime())
    print(stamp + ' ' + mark + '.csv создан')
    return 1


def get_all_cars_on_page(url):
    """Return every car found on one listing page.

    Args:
        url: address of the listing page.

    Returns:
        List of dictionaries, one per car. The list is empty when the page
        cannot be fetched, when it has no listing container, or when no
        listing produced a non-empty record. This function never raises
        for a bad page or a bad listing.
    """
    try:
        soup = get_soup(url)
        container = soup.find('div', class_='ListingCars-module__container ListingCars-module__list')
        if container is None:
            # No listing container on the page: treated as an empty page.
            return []
        cars = container.find_all('div', class_='ListingItem-module__container')
    except Exception:
        # Best-effort scrape: a page that cannot be fetched or parsed
        # counts as empty.
        return []

    list_cars = []
    for car in cars:
        try:
            car_info = get_car_info(car)
        except Exception:
            # A single malformed listing must not throw away the cars
            # already collected from this page, so skip only that listing.
            continue
        if len(car_info) > 0:
            list_cars.append(car_info)
    return list_cars


def get_car_info(car):
    """Build the record for a single listing.

    Args:
        car: BeautifulSoup element holding one listing.

    Returns:
        An empty dict when the listing's region label is not 'Москва'.
        Otherwise a dict of the listing's ``<meta>`` tags
        (``itemprop`` -> ``content``, non-breaking spaces replaced by plain
        spaces) merged with the result of ``get_info_on_page`` for the
        listing's ``url``; on a key clash the detail-page value wins.
    """
    region_label = car.find('span', class_="MetroListPlace__regionName MetroListPlace_nbsp").text
    if region_label != 'Москва':
        return {}

    record = {}
    for meta_tag in car.find_all('meta'):  # itemprop properties become dictionary entries
        content = meta_tag['content'].replace("\xa0", " ")
        record[meta_tag['itemprop']] = content
    # Detail-page fields are layered on top of the listing fields.
    record.update(get_info_on_page(record['url']))
    return record


def get_info_on_page(url):
    """Collect the details of one car from its own page.

    Args:
        url: address of the car's page.

    Returns:
        Dictionary built from the ``<li>`` rows of the 'CardInfo' element
        (text of the first ``<span>`` -> text of the second ``<span>``,
        non-breaking spaces replaced by plain spaces). If the element with
        id 'initial-state' parses as JSON, the keys 'mileage',
        'description', 'Комплектация' and 'seller_type' are added for each
        value present in it. Returns ``{}`` if the page cannot be fetched
        or the 'CardInfo' rows cannot be read.
    """
    try:
        soup = get_soup(url)
        car_info = {}
        for row in soup.find(class_='CardInfo').find_all('li'):
            spans = row.find_all('span')
            label = spans[0].text.replace("\xa0", " ")
            car_info[label] = spans[1].text.replace("\xa0", " ")
    except Exception:
        return {}

    try:
        state = json.loads(soup.find(id="initial-state").text)
    except Exception:
        # No usable JSON state: return what the visible card provided.
        return car_info

    # Output key -> path of nested keys inside the JSON state.
    json_fields = (
        ('mileage', ('card', 'state', 'mileage')),
        ('description', ('card', 'description')),
        ('Комплектация', ('card', 'vehicle_info', 'equipmentGroups')),
        ('seller_type', ('card', 'seller_type')),
    )
    for target_key, path in json_fields:
        try:
            node = state
            for step in path:
                node = node[step]
        except Exception:
            # Field absent from this card: leave the key unset.
            continue
        car_info[target_key] = node
    return car_info


def get_soup(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup object.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the calling worker thread indefinitely. The default of
            30 seconds is a conservative choice and can be tuned.

    Returns:
        BeautifulSoup document built with the 'html.parser' backend.

    Raises:
        requests.exceptions.RequestException: on a connection error or
            when the timeout expires.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = 'utf-8'  # decode the body as UTF-8 regardless of the response headers
    return BeautifulSoup(r.text, 'html.parser')

In [4]:
# Run the scrape for every brand in `marks`. Each brand's CSV is written as a
# side effect; `res` is [list of brands, elapsed seconds].
res = create_train_set(marks)

21:20:36 audi.csv создан
21:40:08 nissan.csv создан
21:41:48 toyota.csv создан
21:55:38 volkswagen.csv создан
22:13:39 volvo.csv создан
22:16:32 bmw.csv создан
22:17:47 skoda.csv создан
22:19:22 honda.csv создан
22:29:50 mercedes.csv создан
22:30:52 mitsubishi.csv создан
22:31:02 porsche.csv создан
22:32:36 suzuki.csv создан
22:34:39 mini.csv создан
22:35:47 infiniti.csv создан
22:38:12 lexus.csv создан


In [5]:
# Brands whose per-brand CSV files are combined into one frame.
# For a single-brand run use ['bmw'] instead.
marks = [
    'bmw', 'mercedes', 'volkswagen', 'nissan', 'toyota',
    'audi', 'skoda', 'mitsubishi', 'volvo', 'honda',
    'porsche', 'infiniti', 'lexus', 'suzuki', 'mini',
]
# Read "<brand>.csv" for each brand, in list order, and stack the frames.
train = pd.concat([pd.read_csv(brand + ".csv") for brand in marks])
train.head()

Unnamed: 0,bodyType,brand,color,fuelType,modelDate,name,numberOfDoors,productionDate,vehicleConfiguration,vehicleTransmission,...,Привод,Руль,Владельцы,ПТС,Комплектация,description,Владение,seller_type,url,price
0,седан,BMW,чёрный,бензин,1987,2.5 MT,4,1990,SEDAN MECHANICAL 2.5,механическая,...,задний,Левый,3 или более,Оригинал,,Продам легенду!!! Заменен мотор (М50Б25) кузов...,,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1014240...,150000
1,седан,BMW,белый,бензин,1995,2.0 AT,4,1998,SEDAN AUTOMATIC 2.0,автоматическая,...,задний,Левый,3 или более,Оригинал,,"Продам ласточку. Машина старая, если ищите нов...",8 месяцев,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1098018...,190000
2,седан,BMW,серебристый,бензин,1998,1.9 MT,4,2001,SEDAN MECHANICAL 1.9,механическая,...,задний,Левый,1 владелец,Оригинал,,Срочно,,PRIVATE,https://auto.ru/cars/used/sale/bmw/3er/1099111...,195000
3,седан,BMW,зелёный,бензин,2000,2.5 AT,4,2001,SEDAN AUTOMATIC 2.5,автоматическая,...,задний,Левый,3 или более,Оригинал,"[{'name': 'Салон', 'values': ['Солнцезащитная ...","Авто 2001 года, двигать, коробка работают без ...",,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1099311...,220000
4,седан,BMW,серый,бензин,1987,2.0 MT,4,1988,SEDAN MECHANICAL 2.0,механическая,...,задний,Левый,2 владельца,Оригинал,,Продаю машину по причине перехода на ф10 кузов...,,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1099287...,220000


In [6]:
# Show the dtypes and non-null counts of the combined frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34672 entries, 0 to 258
Data columns (total 23 columns):
bodyType                34672 non-null object
brand                   34672 non-null object
color                   34672 non-null object
fuelType                34672 non-null object
modelDate               34672 non-null int64
name                    34672 non-null object
numberOfDoors           34672 non-null int64
productionDate          34672 non-null int64
vehicleConfiguration    34672 non-null object
vehicleTransmission     34672 non-null object
engineDisplacement      34672 non-null object
enginePower             34672 non-null object
mileage                 34672 non-null int64
Привод                  34672 non-null object
Руль                    34672 non-null object
Владельцы               34664 non-null object
ПТС                     34669 non-null object
Комплектация            24539 non-null object
description             32952 non-null object
Владение               

In [7]:
# Remove rows that are exact duplicates of an earlier row. The recorded
# info() outputs show 34672 entries before and 34669 after.
train2 = train.drop_duplicates()

In [8]:
# Re-check the dtypes and non-null counts after de-duplication.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34669 entries, 0 to 258
Data columns (total 23 columns):
bodyType                34669 non-null object
brand                   34669 non-null object
color                   34669 non-null object
fuelType                34669 non-null object
modelDate               34669 non-null int64
name                    34669 non-null object
numberOfDoors           34669 non-null int64
productionDate          34669 non-null int64
vehicleConfiguration    34669 non-null object
vehicleTransmission     34669 non-null object
engineDisplacement      34669 non-null object
enginePower             34669 non-null object
mileage                 34669 non-null int64
Привод                  34669 non-null object
Руль                    34669 non-null object
Владельцы               34661 non-null object
ПТС                     34666 non-null object
Комплектация            24536 non-null object
description             32949 non-null object
Владение               

In [9]:
# Save the de-duplicated frame as train.csv, without the index column.
train2.to_csv('train.csv', index=False)