In [None]:
# libraries that will be used in the project
import requests
from bs4 import BeautifulSoup

import pandas as pd

import re
import time
import json

In [None]:
# Load the test dataset: its brand/model pairs and its column names are used
# further down to decide what to scrape and in which order to store the values.
test = pd.read_csv('test.csv')

---
# PART I
Collect the advertisement urls from the search result pages.

In [None]:
"""
    FUNCTION
    Collects all car urls for a specified brand and model from the `auto.ru` website.
    For each brand and model the number of available pages is calculated and the urls
    from all these pages are saved into a list.
"""

def collect_car_urls(
        brand: str,
        model: str
):
    """
    Collect the advertisement urls of one brand/model from the `auto.ru` search pages.

    The first search page is used to work out how many result pages exist; the
    advertisement links of every page are then gathered into a single list.

    :param brand: brand name, inserted into the search url path
    :param model: model name, inserted into the search url path
    :return: list of advertisement urls from all result pages; an empty list when
             the first page shows no offer counter or no advertisement links
    """

    # constants:
    # tail of url: asks for the listing to be rendered as a table
    TAIL = '&output_type=table/'
    # css class of the advertisement title links on a search page
    LINK_CLASS = 'Link ListingItemTitle__link'

    main_url = f'https://auto.ru/cars/{brand}/{model}/all/?sort=price-desc&page='
    # alternative: urls of used cars only
    # main_url = f'https://auto.ru/cars/{brand}/{model}/used/?sort=price-desc&page='
    # alternative: urls of used cars for the last 7 days
    # (the page filter also has 1, 2, 3, 14, 21, 31 days)
    # main_url = f'https://auto.ru/cars/{brand}/{model}/used/?sort=price-desc&top_days=7&page='

    def fetch_page(page_num):
        """Download one search result page and return it parsed."""
        page_response = requests.get(main_url + str(page_num) + TAIL)
        return BeautifulSoup(page_response.content.decode('utf-8'), 'html.parser')

    def ad_links(soup):
        """Return the href of every advertisement title link found on a parsed page."""
        return [a.get('href') for a in soup.find_all('a', class_=LINK_CLASS)]

    main_soap = fetch_page(1)
    first_page_urls = ad_links(main_soap)

    # the counter element holds the total number of offers; non-breaking spaces are
    # removed first (presumably they are used as thousands separators)
    counter = main_soap.find('span', class_='ButtonWithLoader__content')
    digits = re.findall(r'\d+', counter.text.replace('\xa0', '')) if counter is not None else []

    # if no data - report it and return nothing
    if not digits or not first_page_urls:
        print(f'IMPORTANT. There is no data for {brand} {model}.')
        return []

    urls_total = int(digits[0])
    ads_per_page = len(first_page_urls)
    # ceiling division: a last, partially filled page still counts as a page
    pages_num = -(-urls_total // ads_per_page)

    print(f"Total pages for {brand} {model} is {pages_num}.")

    # the first page is already downloaded, so its links are reused as they are
    all_urls = list(first_page_urls)

    # cycle to collect urls from the remaining search pages, the last one included
    for page_num in range(2, pages_num + 1):

        # print step of processing (every 10 pages), you can make another step
        if page_num % 10 == 0:
            print(f"...Extracting page {page_num} from {pages_num}.")

        page_soap = fetch_page(page_num)
        time.sleep(0.1)                             # sleep to avoid CAPTCHA or ban

        # collecting all advertisement's urls
        all_urls.extend(ad_links(page_soap))

    return all_urls

---
### Main cycle
---

In [None]:
# select brands and models, which we will need to collect, as they are in test dataset
# take it from test dataset.
# every distinct (brand, model_name) pair of the test dataset, as a list of tuples
models = (
    test[['brand', 'model_name']]
    .groupby(['brand', 'model_name'])['model_name']
    .count()
    .index
    .tolist()
)

In [None]:
# make some variables to collect data and create empty file
# start with no urls and build an empty one-column frame (`car_url`) from them;
# the next cell writes this frame out to create the output file
urls = []
list_urls = pd.DataFrame({'car_url': urls})

In [None]:
# write `list_urls` to list_urls.csv with its header row; the collecting loop
# further down appends its rows to this same file
list_urls.to_csv('list_urls.csv', index=False)

In [None]:
"""
    ! WARNING ! Collecting data is a long process.
    ----------------------------------------------
    Every 20 pages take approximately 55 sec.
"""

for brand, model in models:
    # announce which brand-model pair is being processed
    print(f"Extracting data for the brand {brand} {model}:")

    # gather the urls from all result pages of this pair
    urls = collect_car_urls(brand, model)

    # append them to the file (no header: it was written when the file was created)
    list_urls = pd.DataFrame({'car_url': urls})
    list_urls.to_csv('list_urls.csv', mode='a', header=False, index=False)

    print(f"Extracted data for the {brand} {model} saved to file",
          '-----------------------------------------------------',
          sep='\n')

---
# RESULT

In [None]:
# total parsed urls
# reload everything the collecting loop wrote and show the number of rows
df = pd.read_csv('list_urls.csv')
len(df)
# 122_472

In [None]:
# but we have some duplicates
# (printed explicitly: a notebook cell only displays its last expression)
print(df.duplicated().sum())
# 558

# drop them
df.drop_duplicates(inplace=True)
len(df)
# 121_914

In [None]:
# The collected urls also contain adverts of new cars, but their web pages, as it
# turned out, have a different structure and the parsing function can't collect
# all the necessary data from them. So only the used cars are kept.
#
# Each url is split by '/' once: part 4 is the section that is compared with
# 'used' below, parts 6 and 7 are stored as brand and model. The `.str` accessor
# yields NaN for a missing or too short url instead of raising, and such rows
# are dropped by the 'used' filter.
url_parts = df.car_url.str.split('/')
df['brand'] = url_parts.str[6]
df['model'] = url_parts.str[7]
df['novice'] = url_parts.str[4]
df_used = df[df.novice == 'used']

In [None]:
# plain list of the used-car urls; this is what the parsing loop iterates over
all_urls = df_used.car_url.to_list()
len(all_urls)
# 99_096

---
# PART II
Get information about every car from its web page

In [None]:
# list of the features to extract: every column of the test dataset, with 'price' appended
features = test.columns.to_list() + ['price']

In [None]:
"""
    For the specified url, extract information from the `auto.ru` webpage about all necessary features listed in the `features` list, such as model_name, mileage, price, etc.
    The function returns the list of values for all specified features in the same order as the fields are presented in the `features` list.
"""
def extract_url_data(url: str):
    """
    Extract the values of all target features from one advert page.

    The base record is the page's `application/ld+json` script. It is completed with
    values read from the card rows of the page, from the catalog page linked from the
    advert (complectation_dict, numberOfDoors) and from the page's `initial-state`
    json (equipment_dict). Extraction is best effort: a feature that cannot be found
    is returned as None, and a missing ld+json block no longer discards the values
    found elsewhere on the page.

    :param url: webpage url
    :return: list of values in the same order as the names in the global `features` list
    """
    response = requests.get(url)
    page = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

    # main set of the features
    data = _load_json_script(page, type="application/ld+json")
    if not isinstance(data, dict):
        # continue with an empty record so the values scraped below are kept
        data = {}

    # features "car_url", "priceCurrency" and "price"
    # taken from the nested dictionary "offers", all three or none of them
    offers = data.get('offers')
    if isinstance(offers, dict) and all(key in offers for key in ('url', 'priceCurrency', 'price')):
        data['car_url'] = offers['url']
        data['priceCurrency'] = offers['priceCurrency']
        data['price'] = offers['price']

    # feature "mileage": first run of digits once non-breaking spaces are removed
    mileage_text = _last_span_text(page, 'CardInfoRow CardInfoRow_kmAge')
    if mileage_text is not None:
        digits = re.findall(r'\d+', mileage_text.replace('\xa0', ''))
        if digits:
            data['mileage'] = int(digits[0])

    # features stored exactly as the text of their card row
    for feature, row_class in _CARD_ROW_FEATURES.items():
        row_text = _last_span_text(page, row_class)
        if row_text is not None:
            data[feature] = row_text

    # feature "Владельцы": card row text with non-breaking spaces turned into spaces
    owners_text = _last_span_text(page, 'CardInfoRow CardInfoRow_ownersCount')
    if owners_text is not None:
        data['Владельцы'] = owners_text.replace('\xa0', ' ')

    # feature "model_name": text of the second breadcrumbs popup
    crumbs = page.find_all(
        'div', class_='InfoPopup InfoPopup_theme_plain InfoPopup_withChildren BreadcrumbsPopup')
    if len(crumbs) > 1:
        data['model_name'] = crumbs[1].text

    # feature "parsing_unixtime": the real time of parsing
    data['parsing_unixtime'] = int(time.time())

    # feature "sell_id": first run of digits in the card head id element
    id_tag = page.find('div', class_='CardHead__infoItem CardHead__id')
    if id_tag is not None:
        digits = re.findall(r'\d+', id_tag.text)
        if digits:
            data['sell_id'] = int(digits[0])

    # feature "super_gen": json stored in the "data-bem" attribute
    attributes_tag = page.find('div', id="sale-data-attributes")
    if attributes_tag is not None:
        try:
            data['super_gen'] = json.loads(attributes_tag.get('data-bem'))
        except (TypeError, ValueError):
            # attribute absent (None) or not valid json
            pass

    # feature "description": replacing some noise with spaces in values
    description = data.get('description')
    if isinstance(description, str):
        data['description'] = re.sub(r'\W+', ' ', description)

    # features "engineDisplacement", "enginePower", "fuelType":
    # one card row of the form "<displacement> / <power> / <fuel>"
    engine_text = _last_span_text(page, "CardInfoRow CardInfoRow_engine")
    if engine_text is not None:
        engine_parts = engine_text.replace('\xa0', ' ').split(' / ')
        if len(engine_parts) == 3:
            data['engineDisplacement'], data['enginePower'], data['fuelType'] = engine_parts

    # features "complectation_dict" and "numberOfDoors":
    # taken from the catalog page that the advert links to
    catalog_state = None
    catalog_link = page.find('a', class_='Link SpoilerLink CardCatalogLink SpoilerLink_type_default')
    catalog_url = catalog_link.get('href') if catalog_link is not None else None
    if catalog_url:
        try:
            response_catalog = requests.get(catalog_url)
            page_catalog = BeautifulSoup(response_catalog.content.decode('utf-8'), 'html.parser')
            catalog_state = _load_json_script(page_catalog, type="application/json", id='initial-state')
        except (requests.RequestException, UnicodeDecodeError):
            # the catalog page is optional: the advert is kept without these features
            pass
    try:
        specifications = catalog_state['state']['compare']['selected'][0]['specifications']
    except (TypeError, KeyError, IndexError):
        specifications = None
    if specifications is not None:
        data['complectation_dict'] = specifications
        try:
            data['numberOfDoors'] = specifications['doors-count']
        except (TypeError, KeyError, IndexError):
            pass

    # feature "equipment_dict": taken from the "initial-state" json of the advert page
    page_state = _load_json_script(page, type="application/json", id='initial-state')
    try:
        data['equipment_dict'] = page_state['card']['vehicle_info']['equipment']
    except (TypeError, KeyError, IndexError):
        pass

    # two features are not found in webpage: model_info, vendor

    # values in the order of the `features` list; None for everything not found
    return [data.get(col) for col in features]


# features copied as-is from the last <span> of the card row with the given css class
_CARD_ROW_FEATURES = {
    'bodyType': 'CardInfoRow CardInfoRow_bodytype',
    'Владение': 'CardInfoRow CardInfoRow_owningTime',
    'ПТС': 'CardInfoRow CardInfoRow_pts',
    'Привод': 'CardInfoRow CardInfoRow_drive',
    'Руль': 'CardInfoRow CardInfoRow_wheel',
    'Состояние': 'CardInfoRow CardInfoRow_state',
    'Таможня': 'CardInfoRow CardInfoRow_customs',
    'vehicleTransmission': 'CardInfoRow CardInfoRow_transmission',
}


def _last_span_text(page, row_class):
    """Return the text of the last <span> of the <li> row with this class, or None if absent."""
    row = page.find('li', class_=row_class)
    if row is None:
        return None
    spans = row.find_all('span')
    return spans[-1].text if spans else None


def _load_json_script(page, **attrs):
    """Parse the json content of the first <script> matching attrs; None if absent or invalid."""
    tag = page.find('script', **attrs)
    if tag is None or not tag.string:
        return None
    try:
        return json.loads(tag.string)
    except ValueError:
        return None

---
### Main cycle (PART II)
---

In [None]:
# initialise the collectors: `final_list` buffers the parsed rows,
# `final_df` is an empty frame with the `features` columns
final_list = []
final_df = pd.DataFrame(columns=features)

In [None]:
# create file to add data.
""" WARNING
    If you are restarting after an interruption - do NOT execute this cell:
    it overwrites the file and wipes the data that has already been collected
"""
# write `final_df` with its header row to train.csv; the parsing loop below
# appends its batches to this file
final_df.to_csv('train.csv', index=False, header=True)

In [None]:
"""
    ! WARNING ! Collecting data is a long process.
    ----------------------------------------------
    Approximately 50 records take 90-100 sec.
"""

final_list = []
for n, url in enumerate(all_urls):
    # printing a message about the current status of the program execution
    if n % 50 == 0:
        print(f"The # of current processing URL is {n}, url is {url}.")

    final_list.append(extract_url_data(url))

    # every 1000 collected records are appended to the csv-file and the buffer is
    # emptied; checking the buffer size (not `n`) avoids an empty write at n == 0
    if len(final_list) >= 1000:
        final_df = pd.DataFrame(data=final_list, columns=features)
        final_df.to_csv('train.csv', index=False, header=False, mode='a')
        print("Collected data was add to csv-file.")

        final_list = []

# the last batch is usually smaller than 1000 records: save it as well,
# otherwise it would stay only in memory
if final_list:
    final_df = pd.DataFrame(data=final_list, columns=features)
    final_df.to_csv('train.csv', index=False, header=False, mode='a')
    print("Collected data was add to csv-file.")

    final_list = []


---
# Some helpful scripts
1. Add the collected data to the file when the process was interrupted

In [None]:
# if process was interrupted, we can ad collected data to file
# turn the rows still buffered in `final_list` into a frame and report how many there are
final_df = pd.DataFrame(data=final_list, columns=features)
print(f'Add to file: {final_df.shape[0]}')

In [None]:
# append the buffered rows to train.csv (no header), then reset both the frame
# and the buffer
final_df.to_csv('train.csv', index=False, header=False, mode='a')
final_df = pd.DataFrame(columns=features)
final_list=[]

2. Calculate the url index from which to start again after an interruption

In [None]:
train = pd.read_csv('train.csv')
df = pd.read_csv('list_urls.csv')
df.drop_duplicates(inplace=True)

# rebuild the used-car selection from the file instead of relying on a `df_used`
# left over from an earlier cell: after a kernel restart that variable is gone
df['novice'] = df.car_url.str.split('/').str[4]
df_used = df[df.novice == 'used']

# making list with urls
all_urls = df_used.car_url.to_list()
# find index of last record url from train
all_urls.index(train.car_url.iloc[-1])

# the found index must be used in the main cycle above as:
# for n, url in enumerate(all_urls[index+1:]):

---
# RESULT

In [None]:
# reload the collected train data and print its shape next to the test data shape
train = pd.read_csv('train.csv')
print(f'Train data shape : {train.shape},\n test data shape : {test.shape}')
# Train data shape : (103794, 33),
#  test data shape : (34686, 33)