# Car Sales from 2015 to 2023

In [1]:
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
# automate download
driver_path = ChromeDriverManager().install()  # path handed to Service() each time init_driver() starts a browser

In [2]:
def init_driver():
    """Start and return a new headless Chrome WebDriver.

    The browser is launched through the chromedriver binary at the
    module-level ``driver_path``. Every call creates an independent
    browser; the caller is responsible for shutting it down.

    Returns
    -------
    selenium.webdriver.Chrome
        A freshly started driver with the options configured below.
    """
    opts = Options()

    # Command-line switches, added in the same order as before.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage', 'lang=zh_CN.utf-8'):
        opts.add_argument(flag)

    # Drop the 'enable-logging' switch from the switches Chrome is started with.
    opts.add_experimental_option('excludeSwitches', ['enable-logging'])

    # NOTE(review): presumably meant to skip images and stylesheets to speed
    # up page loads -- confirm that Chrome honours both preference keys.
    opts.add_experimental_option(
        "prefs",
        {
            "profile.managed_default_content_settings.images": 2,
            'permissions.default.stylesheet': 2,
        },
    )

    return webdriver.Chrome(service=Service(driver_path), options=opts)

In [3]:
def get_model_year(year):
    """
    Get model sales data of one year.

    Opens the January-to-December model ranking for ``year`` on
    xl.16888.com in a fresh headless browser, reads the sales table on each
    result page, and follows the "next" link until it can no longer be
    clicked. The browser is always shut down before returning, even when
    scraping fails part-way.

    Parameters
    ----------
    year : int
        Calendar year; it is formatted into the URL with ``%d``.

    Returns
    -------
    pandas.DataFrame
        One row per usable table row. ``model`` and ``sales`` hold the raw
        cell text; ``class``, ``fuel``, ``weight``, ``frontB``, ``rearB`` and
        ``link`` are created filled with ``None``.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the sales table cannot be located on a page.
    """
    url = 'https://xl.16888.com/style-%d01-%d12-1.html' % (year, year)
    table_selector = (
        'body > div:nth-child(5) > div.xl-section.clr > div.xl-section-r.fr'
        ' > div > div.xl-table-view > div.xl-table-data > table > tbody'
    )

    data = []
    driver = init_driver()
    try:
        try:
            driver.get(url)
        except Exception:
            # Best effort, as before: report the failed load and still try to
            # read whatever was rendered. `Exception` (not a bare except)
            # lets a kernel interrupt stop the scrape.
            print("No Data.")
        wait = WebDriverWait(driver, 10)

        # iterate pages
        while True:
            table = driver.find_element(By.CSS_SELECTOR, table_selector)
            for row in table.find_elements(By.TAG_NAME, 'tr'):
                cells = row.find_elements(By.TAG_NAME, 'td')
                # the last cell of each row is not part of the sales data
                line = [cell.text for cell in cells[:-1]]
                # Rows without <td> cells and rows too short to contain both
                # a model and a sales figure are skipped instead of raising
                # IndexError and losing the whole year.
                if len(line) >= 3:
                    data.append([line[1], line[2]])

            try:
                # wait for the next-page button
                next_page = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.lineBlock.next'))
                )
                next_page.click()
                # the old button going stale signals that the next page loaded
                wait.until(EC.staleness_of(next_page))
            except Exception:
                # no clickable "next" link (or the page never changed): stop paging
                print('All pages done!')
                break
    finally:
        # quit() ends the whole browser session; close() would only close the
        # window and leave the chromedriver process behind.
        driver.quit()

    df = pd.DataFrame(data, columns=['model', 'sales'])
    for column in ('class', 'fuel', 'weight', 'frontB', 'rearB', 'link'):
        df[column] = None

    return df

In [4]:
def get_model_link(df):
    """
    Fill the ``link`` column with each model's page URL.

    Loads https://auto.16888.com/ once in a fresh headless browser and, for
    every row, looks for a link whose text equals the row's ``model`` value.
    When one is found its ``href`` is stored in ``link``; otherwise the row
    is left untouched. The browser is always shut down before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``model`` and ``link`` columns and be addressable by the
        labels ``0 .. len(df) - 1``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    url = 'https://auto.16888.com/'
    driver = init_driver()
    try:
        try:
            driver.get(url)
        except Exception:
            # Best effort, as before: report and carry on with whatever loaded.
            print("No Data.")

        for i in tqdm(range(df.shape[0])):
            model = df.loc[i, 'model']
            try:
                anchor = driver.find_element(By.LINK_TEXT, model)
                df.loc[i, 'link'] = anchor.get_attribute('href')
            except Exception:
                # Model not listed on the index page: keep the existing value.
                # `Exception` rather than a bare except so that a kernel
                # interrupt still stops the loop.
                continue
    finally:
        # quit() ends the browser session; close() would only close the window.
        driver.quit()
    return df

In [5]:
# Zero-based positions, within the newline-split text of the #config_main
# table, of the line that carries each value; the first whitespace-separated
# token of a line is dropped and the rest is kept.
# NOTE(review): these positions depend on the site's current table layout --
# confirm if the page changes.
_CONFIG_LINE_INDEX = {
    'class': 5,
    'weight': 26,
    'fuel': 49,
    'frontB': 66,
    'rearB': 67,
}


def _quit_quietly(driver):
    """Shut the browser down, ignoring errors from an already-dead session."""
    try:
        driver.quit()
    except Exception:
        # Cleanup is best effort; a failed quit must not mask the real outcome.
        pass


def _open_with_retry(link, max_retries=10):
    """Return a new driver that has loaded ``link``, or None if every attempt failed."""
    for attempt in range(max_retries + 1):
        driver = None
        try:
            driver = init_driver()
            driver.get(link)
            return driver
        except Exception:
            # Release the half-started browser before trying again.
            if driver is not None:
                _quit_quietly(driver)
            if attempt < max_retries:
                print("Create driver failed. Retry...")
    return None


def _select_model_year(driver, year, newest=2024, oldest=2001):
    """Click the first available ``Year<y>`` element, trying ``year`` upwards, then downwards."""
    upwards = range(year, newest + 1)
    downwards = range(year, oldest - 1, -1)
    candidates = list(upwards) + [y for y in downwards if y not in upwards]
    for y in candidates:
        try:
            driver.find_element(By.XPATH, '//*[@id="Year%d"]' % y).click()
            return True
        except Exception:
            # that model year is not offered (or not clickable); try the next one
            continue
    return False


def _parse_config(config_text):
    """Map column name -> stringified token list, or None if the table text is too short."""
    lines = config_text.split('\n')
    if len(lines) <= max(_CONFIG_LINE_INDEX.values()):
        return None
    return {
        column: str(lines[index].split()[1:])
        for column, index in _CONFIG_LINE_INDEX.items()
    }


def get_model_config(df, year):
    """
    Fill the configuration columns of every row that has a model link.

    For each row with a ``link``, a fresh headless browser opens the model
    page, follows the '参数配置' (parameter configuration) link, selects a
    model year (``year`` or the nearest later one up to 2024, otherwise the
    nearest earlier one down to 2001) and reads the ``config_main`` table.
    Rows whose page cannot be opened, navigated or parsed are left unchanged.
    Each browser is shut down before the next row is processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``link`` column and be addressable by the labels
        ``0 .. len(df) - 1``. It is modified in place.
    year : int
        Sales year; written to a ``year`` column on every row and used as
        the starting point for the model-year search.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``class``, ``fuel``, ``weight``,
        ``frontB`` and ``rearB`` set (as the ``str()`` of a token list) for
        every row that could be scraped.
    """
    df['year'] = year

    for i in tqdm(range(df.shape[0])):
        link = df.loc[i, 'link']
        # Only real URLs are worth a browser launch; None, '' and NaN are skipped.
        if not isinstance(link, str) or not link:
            continue

        driver = _open_with_retry(link)
        if driver is None:
            continue

        try:
            # open the configuration page
            try:
                driver.find_element(By.LINK_TEXT, '参数配置').click()
            except Exception:
                continue

            if not _select_model_year(driver, year):
                continue

            try:
                config_text = driver.find_element(By.ID, 'config_main').text
            except Exception:
                tqdm.write("No config table for %s; row skipped." % link)
                continue

            values = _parse_config(config_text)
            if values is None:
                # A short table used to raise IndexError and discard the whole year.
                tqdm.write("Config table too short for %s; row skipped." % link)
                continue

            for column, value in values.items():
                df.loc[i, column] = value
        finally:
            # Runs on every path, including the `continue`s above, so no
            # browser is left behind.
            _quit_quietly(driver)
    return df

In [6]:
def get_sales(year):
    """Run the three scraping stages for one year and return the resulting frame.

    The stages run in order: the yearly sales table, then the model page
    links, then the per-model configuration values.
    """
    ranked = get_model_year(year)
    linked = get_model_link(ranked)
    return get_model_config(linked, year)

In [7]:
# Scrape each year and write one workbook per year.
# NOTE(review): the notebook title says 2015-2023 but this range covers
# 2000-2014 -- confirm which range is intended.
os.makedirs("data", exist_ok=True)  # to_excel does not create a missing folder
for year in range(2000, 2015):
    try:
        sales = get_sales(year)
        sales.to_excel("data/car_sale_%d.xlsx" % year, index=False)
    except Exception as err:
        # Keep going with the remaining years, but say which one failed and
        # why instead of dropping it silently. `Exception` (not a bare
        # except) lets a kernel interrupt stop the run.
        print("Year %d failed: %r" % (year, err))

All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


0it [00:00, ?it/s]
0it [00:00, ?it/s]


All pages done!


100%|██████████| 95/95 [01:01<00:00,  1.55it/s]
100%|██████████| 95/95 [07:42<00:00,  4.87s/it]


All pages done!


100%|██████████| 114/114 [01:28<00:00,  1.29it/s]
100%|██████████| 114/114 [09:19<00:00,  4.91s/it]


All pages done!


100%|██████████| 132/132 [02:07<00:00,  1.04it/s]
 25%|██▌       | 33/132 [02:37<08:35,  5.21s/it]

Create driver failed. Retry...


100%|██████████| 132/132 [13:00<00:00,  5.91s/it]


All pages done!


100%|██████████| 174/174 [02:22<00:00,  1.22it/s]
100%|██████████| 174/174 [18:20<00:00,  6.33s/it]


All pages done!


100%|██████████| 212/212 [03:10<00:00,  1.11it/s]
 92%|█████████▏| 194/212 [23:24<01:13,  4.09s/it]

Create driver failed. Retry...


100%|██████████| 212/212 [26:43<00:00,  7.56s/it]


All pages done!


100%|██████████| 257/257 [04:00<00:00,  1.07it/s]
100%|██████████| 257/257 [26:28<00:00,  6.18s/it]


All pages done!


100%|██████████| 312/312 [04:32<00:00,  1.14it/s]
100%|██████████| 312/312 [26:09<00:00,  5.03s/it]


All pages done!


100%|██████████| 373/373 [05:43<00:00,  1.09it/s]
100%|██████████| 373/373 [30:16<00:00,  4.87s/it]
