In [14]:
import pandas as pd
import re
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
import json
from time import gmtime, strftime

In [18]:
# Full list of brands the scraper is meant to cover.
marks = [
    'bmw',
    'mercedes',
    'volkswagen',
    'nissan',
    'toyota',
    'audi',
    'skoda',
    'mitsubishi',
    'volvo',
    'honda',
    'infiniti',
    'lexus',
    'suzuki',
]
# The test data contains BMW only, so the full list is overridden here.
marks = ['bmw']

In [40]:
def create_train_set(marks):
    """Entry point: scrape the given brands and measure how long it took.

    Args:
        marks: list of car brand names to scrape.

    Returns:
        A two-element list ``[brands, seconds]``: the value returned by
        ``parse_cars_parallel(marks)`` followed by the elapsed wall-clock
        time in seconds.
    """
    started_at = datetime.now()
    parsed_marks = parse_cars_parallel(marks)
    elapsed = datetime.now() - started_at
    return [parsed_marks, elapsed.total_seconds()]


def parse_cars_parallel(marks, n_threads=6):
    """Scrape several brands concurrently.

    Args:
        marks: iterable of brand names; each one is passed to
            ``get_list_cars``.
        n_threads: number of pool workers (defaults to 6, the value that
            used to be hard-coded).

    Returns:
        List with the return value of ``get_list_cars`` for every brand,
        in the same order as ``marks``.
    """
    # The ``with`` block releases the pool even when a worker raises;
    # previously close()/join() were skipped on any exception from map().
    with Pool(n_threads) as pool:
        return pool.map(get_list_cars, marks)


def get_list_cars(mark):
    """Collect every listing of one brand and write it to ``<mark>.csv``.

    The search is issued once per mileage band (six bands in total). Within
    a band, pages 1..99 are requested in order and paging stops at the first
    page that yields no cars.

    Args:
        mark: brand name as it appears in the listing URL.

    Returns:
        The same ``mark`` that was passed in.
    """
    url_base = 'https://auto.ru/moskva/cars/' + mark + '/used/?sort=price-asc'
    # One query-string suffix per mileage band, ending with the page key.
    mileage_filters = (
        '&km_age_to=50000&page=',
        '&km_age_from=50001&km_age_to=100000&page=',
        '&km_age_from=100001&km_age_to=150000&page=',
        '&km_age_from=150001&km_age_to=200000&page=',
        '&km_age_from=200001&km_age_to=250000&page=',
        '&km_age_from=250001&page=',
    )
    collected = []
    for mileage_filter in mileage_filters:
        band_url = url_base + mileage_filter
        page_no = 1
        while page_no < 100:
            page_cars = get_all_cars_on_page(band_url + str(page_no))
            if len(page_cars) == 0:
                # An empty page ends this mileage band.
                break
            collected.extend(page_cars)
            page_no += 1
    create_csv(collected, mark)
    return mark


def create_csv(list_cars, mark):
    """Write the cars of one brand to ``<mark>.csv`` and log the time.

    Args:
        list_cars: list of dictionaries, one per car.
        mark: brand name; used as the file name stem.

    Returns:
        Always 1.
    """
    # Fixed column set and order of the output file; dictionary keys that
    # are not listed here are left out, missing ones become empty cells.
    column_order = [
        'bodyType', 'brand', 'color', 'fuelType',
        'modelDate', 'name', 'numberOfDoors',
        'productionDate', 'vehicleConfiguration',
        'vehicleTransmission', 'engineDisplacement',
        'enginePower', 'mileage', 'Привод', 'Руль',
        'Владельцы', 'ПТС', 'Комплектация', 'description',
        'Владение', 'seller_type', 'url', 'price',
    ]
    pd.DataFrame(list_cars, columns=column_order).to_csv(f'{mark}.csv', index=False)
    # Progress message stamped with the current UTC time.
    stamp = strftime("%H:%M:%S", gmtime())
    print(stamp, mark + '.csv создан')
    return 1


def get_all_cars_on_page(url):
    """Return all cars found on one listing page.

    Args:
        url: address of the listing page.

    Returns:
        List of dictionaries, one per car. The list is empty when the page
        cannot be fetched, has no listing container, or contains no usable
        cars.
    """
    try:
        soup = get_soup(url)
        container = soup.find(
            'div',
            class_='ListingCars-module__container ListingCars-module__list')
    except Exception:
        # Best effort: a page that cannot be fetched or parsed yields no cars.
        return []
    if container is None:
        return []

    list_cars = []
    for car in container.find_all('div', class_='ListingItem-module__container'):
        try:
            car_info = get_car_info(car)
        except Exception:
            # Skip only the malformed listing. Previously a single failure
            # discarded every car already parsed on this page, and the
            # resulting empty list also stopped pagination in the caller.
            continue
        if len(car_info) > 0:
            list_cars.append(car_info)
    return list_cars


def get_car_info(car):
    """Extract the data of one listing.

    Args:
        car: BeautifulSoup element of a single listing.

    Returns:
        Dictionary with the listing's ``<meta itemprop=...>`` values merged
        with the details returned by ``get_info_on_page`` (the details win
        on key clashes). An empty dictionary is returned when the listing
        has no region label or its region is not 'Москва'.
    """
    car_info = {}
    region_tag = car.find(
        'span', class_="MetroListPlace__regionName MetroListPlace_nbsp")
    # A listing without a region label used to raise AttributeError here.
    if region_tag is None or region_tag.text != 'Москва':
        return car_info

    # Build the dictionary from the itemprop properties.
    for meta in car.find_all('meta'):
        key = meta.get('itemprop')
        value = meta.get('content')
        if key is None or value is None:
            # Ignore meta tags that do not carry an itemprop/content pair.
            continue
        car_info[key] = value.replace("\xa0", " ")

    # Only follow the link to the car's own page when one was collected.
    url = car_info.get('url')
    if url:
        car_info.update(get_info_on_page(url))
    return car_info


def get_info_on_page(url):
    """Collect the details of one car from its own page.

    Args:
        url: address of the car's page.

    Returns:
        Dictionary built from the label/value pairs of the page's
        ``CardInfo`` list, extended (where available) with ``mileage``,
        ``description``, ``Комплектация`` and ``seller_type`` taken from the
        JSON in the ``initial-state`` element. An empty dictionary is
        returned when the page or its ``CardInfo`` list cannot be read.
    """
    try:
        soup = get_soup(url)
        rows = soup.find(class_='CardInfo').find_all('li')
        car_info = {}
        for row in rows:
            spans = row.find_all('span')
            # First span holds the label, second one the value.
            label = spans[0].text.replace("\xa0", " ")
            car_info[label] = spans[1].text.replace("\xa0", " ")
    except Exception:
        return {}

    try:
        state = json.loads(soup.find(id="initial-state").text)
    except Exception:
        # No usable embedded JSON: return what the visible list provided.
        return car_info

    # Target key -> path of keys inside the embedded JSON.
    state_fields = (
        ('mileage', ('card', 'state', 'mileage')),
        ('description', ('card', 'description')),
        ('Комплектация', ('card', 'vehicle_info', 'equipmentGroups')),
        ('seller_type', ('card', 'seller_type')),
    )
    for field, path in state_fields:
        try:
            value = state
            for key in path:
                value = value[key]
        except Exception:
            # Field is absent for this car; leave it out.
            continue
        car_info[field] = value
    return car_info


def get_soup(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup object.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before the request is
            aborted with an exception. Without a timeout a stalled
            connection would block the calling worker indefinitely.

    Returns:
        BeautifulSoup object built with the 'html.parser' parser from the
        response text decoded as UTF-8.
    """
    r = requests.get(url, timeout=timeout)
    # Force UTF-8 instead of relying on the encoding guessed by requests.
    r.encoding = 'utf-8'
    return BeautifulSoup(r.text, 'html.parser')

70:80: E501 line too long (98 > 79 characters)
76:5: E722 do not use bare 'except'
85:80: E501 line too long (91 > 79 characters)
87:80: E501 line too long (82 > 79 characters)
100:80: E501 line too long (103 > 79 characters)
117:80: E501 line too long (82 > 79 characters)


In [41]:
# Run the scraper for the selected brands; res is [brands, elapsed_seconds].
res = create_train_set(marks)

14:45:40 bmw.csv создан


In [42]:
# Brands whose per-brand CSV files (written by create_csv) are merged.
marks = ['bmw', 'mercedes', 'volkswagen',
         'nissan', 'toyota', 'audi', 'skoda',
         'mitsubishi','volvo','honda',
         'infiniti', 'lexus', 'suzuki']
marks = ['bmw']  # only bmw.csv was produced by the run above
# ignore_index=True gives the combined frame one continuous 0..n-1 index;
# without it every brand's rows would restart at 0 and labels would repeat.
train = pd.concat([pd.read_csv(f'{mark}.csv') for mark in marks],
                  ignore_index=True)
train.head()

Unnamed: 0,bodyType,brand,color,fuelType,modelDate,name,numberOfDoors,productionDate,vehicleConfiguration,vehicleTransmission,...,Привод,Руль,Владельцы,ПТС,Комплектация,description,Владение,seller_type,url,price
0,седан,BMW,белый,бензин,1995,2.0 AT,4,1998,SEDAN AUTOMATIC 2.0,автоматическая,...,задний,Левый,3 или более,Оригинал,,"Продам ласточку. Машина старая, если ищите нов...",7 месяцев,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1098018...,190000
1,седан,BMW,пурпурный,дизель,1987,2.5 MT,4,1992,SEDAN MECHANICAL 2.5,механическая,...,задний,Левый,1 владелец,Оригинал,"[{'name': 'Прочее', 'values': ['Защита картера...",Все вопросы по телефону! \nВсе новое! Полное в...,,PRIVATE,https://auto.ru/cars/used/sale/bmw/5er/1095955...,350000
2,внедорожник 5 дв.,BMW,синий,бензин,1999,4.4 AT,5,2001,ALLROAD_5_DOORS AUTOMATIC 4.4,автоматическая,...,полный,Левый,3 или более,Оригинал,"[{'name': 'Прочее', 'values': ['Защита картера...","Капитальный ремонт двигателя, масло не ест воо...",,PRIVATE,https://auto.ru/cars/used/sale/bmw/x5/10990932...,420000
3,седан,BMW,бежевый,бензин,1936,2.0 MT,4,1938,SEDAN MECHANICAL 2.0,механическая,...,задний,Левый,2 владельца,Оригинал,,"Мотор и кпп - Ford Sierra. \nНа ходу, стоит на...",15 лет и 2 месяца,PRIVATE,https://auto.ru/cars/used/sale/bmw/326/1092958...,440000
4,седан,BMW,синий,бензин,2008,2.0 AT,4,2010,SEDAN AUTOMATIC 2.0,автоматическая,...,задний,Левый,3 или более,Оригинал,"[{'name': 'Безопасность', 'values': ['Антипроб...",В комплекте зимняя резина на 16’’ литых дисках...,7 лет и 1 месяц,PRIVATE,https://auto.ru/cars/used/sale/bmw/3er/1098784...,680000


In [43]:
# Column overview of the loaded data: dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 23 columns):
bodyType                4926 non-null object
brand                   4926 non-null object
color                   4926 non-null object
fuelType                4926 non-null object
modelDate               4926 non-null int64
name                    4926 non-null object
numberOfDoors           4926 non-null int64
productionDate          4926 non-null int64
vehicleConfiguration    4926 non-null object
vehicleTransmission     4926 non-null object
engineDisplacement      4926 non-null object
enginePower             4926 non-null object
mileage                 4926 non-null int64
Привод                  4926 non-null object
Руль                    4926 non-null object
Владельцы               4924 non-null object
ПТС                     4926 non-null object
Комплектация            3481 non-null object
description             4755 non-null object
Владение                2177 non-null obje

In [44]:
# Drop rows that are exact duplicates across all columns; train is left untouched.
train2 = train.drop_duplicates()

In [45]:
# Same overview after de-duplication, to see how many rows were removed.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4913 entries, 0 to 4925
Data columns (total 23 columns):
bodyType                4913 non-null object
brand                   4913 non-null object
color                   4913 non-null object
fuelType                4913 non-null object
modelDate               4913 non-null int64
name                    4913 non-null object
numberOfDoors           4913 non-null int64
productionDate          4913 non-null int64
vehicleConfiguration    4913 non-null object
vehicleTransmission     4913 non-null object
engineDisplacement      4913 non-null object
enginePower             4913 non-null object
mileage                 4913 non-null int64
Привод                  4913 non-null object
Руль                    4913 non-null object
Владельцы               4911 non-null object
ПТС                     4913 non-null object
Комплектация            3471 non-null object
description             4743 non-null object
Владение                2172 non-null obje

In [11]:
# Save the de-duplicated data to train.csv without the index column.
# NOTE(review): this cell's execution count (11) is out of sequence with the
# cells above - confirm the notebook still works with Restart & Run All.
train2.to_csv('train.csv', index=False)