# Web crawler

In [1]:
import pandas as pd
import numpy as np

# Webscraping libraries
from urllib.request import urlopen # url inspector
from bs4 import BeautifulSoup
import re
from selenium import webdriver # connects to chrome browser
import warnings
warnings.filterwarnings('ignore')

# Web crawler imports
import requests
from requests import get

# Web crawlers random seeds/time delays
from time import sleep
from random import randint

# image viewer for cell outputs
from IPython.display import display, Markdown, Latex, Image, display_html, HTML

## Set up the crawler

In [478]:
# Raw (uncleaned) column names produced by the scraper and consumed by the cleaner.
_RAW_COLUMNS = ['price', 'mileage', 'door/body', 'eng/tran', 'year/make/model']

# Literal replacements applied, in this order, to the "year make model" title so
# that multi-word brands/models survive the later split on single spaces.
_TITLE_REPLACEMENTS = [
    ("Class", ""),  # Mercedes "C Class" -> "C" so titles follow year/brand/model
    ("Land Rover", "Land-Rover"),
    ("Range Rover Sport", "Range-Rover-Sport"),
    ("Discovery Sport", "Discovery-Sport"),
    ("Aston Martin", "Aston-Martin"),
    ("Alfa Romeo", "Alfa-Romeo"),
    ("Corvette", "Chevrolet"),
    ("C7", "Corvette-C7"),
]


def _node_text(node):
    """Return the stripped text of a parsed HTML node, or "" when the node is missing."""
    return "" if node is None else node.get_text().strip()


def _item_span_text(items, position):
    """Return the text of the <span> inside items[position], or "" if either is absent."""
    if position >= len(items):
        return ""
    return _node_text(items[position].span)


def _scrape_page(url):
    """Download one results page and return its listings as a list of raw dicts."""
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    # containers that hold the information about each car
    listings = soup.findAll("div", {"class": "ucatid20"})
    # price elements, collected separately from the listing containers
    prices = soup.findAll("div", {"class": "avprice"})

    # Prices are paired with listings by position, so a count mismatch would
    # silently attach prices to the wrong cars. Skip such a page instead.
    if len(prices) != len(listings):
        print("Skipping {}: found {} prices for {} listings.".format(
            url, len(prices), len(listings)))
        return []

    records = []
    for listing, price_tag in zip(listings, prices):
        details = listing.div
        if details is None:
            continue
        items = details.findAll("div", {"class": "item"})
        # Exactly one value per field per car; fields missing on the site become
        # "" and the row is dropped later by the cleaning step.
        records.append({
            'price': price_tag.get_text(),
            'mileage': _item_span_text(items, 3),
            'door/body': _item_span_text(items, 2),
            'eng/tran': _node_text(details.span),
            'year/make/model': _node_text(items[0]) if items else "",
        })
    return records


def _clean_listings(raw):
    """Turn the raw scraped strings into the typed, de-duplicated result frame."""
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave "£" and "," in place.
    price = raw['price'].str.strip().str.replace("£|,", "", regex=True)
    mileage = raw['mileage'].str.replace("miles|,", "", regex=True).str.strip()

    # "2 Door Coupe" -> ["2", "Coupe"]; "1995cc Automatic" -> ["1995", "Automatic"]
    door_body = raw['door/body'].str.replace("Door", "", regex=False).str.split()
    eng_tran = raw['eng/tran'].str.replace("cc", "", regex=False).str.split()

    title = raw['year/make/model']
    for old, new in _TITLE_REPLACEMENTS:
        title = title.str.replace(old, new, regex=False)
    # Split once on single spaces; .str.get() yields NaN for missing tokens.
    title_tokens = title.str.split(' ')

    columns = ['price(£)', 'mileage(mi)', 'door_count', 'body_style',
               'engine_size(cc)', 'transmission', 'year', 'brand', 'model']
    cleaned = pd.DataFrame({
        'price(£)': pd.to_numeric(price, errors='coerce'),
        'mileage(mi)': pd.to_numeric(mileage, errors='coerce'),
        'door_count': pd.to_numeric(door_body.str.get(0), errors='coerce'),
        'body_style': door_body.str.get(1),
        'engine_size(cc)': pd.to_numeric(eng_tran.str.get(0), errors='coerce'),
        'transmission': eng_tran.str.get(1),
        'year': pd.to_numeric(title_tokens.str.get(0), errors='coerce'),
        'brand': title_tokens.str.get(1),
        'model': title_tokens.str.get(2),
    }, columns=columns)

    # Drop rows with any missing feature, then exact duplicate cars.
    return cleaned.dropna(axis=0).drop_duplicates(keep='first')


def web_crawler():
    """
    Interactively scrape used-car listings from autovillage.co.uk.

    Function use:

    The function asks for the body style(s) of the cars you want to scrape
    (separated by spaces and/or commas; leave blank for all styles). It then
    asks for the number of pages to scrape per body style. Finally, it asks
    for a name under which to save the result as a csv in the current
    directory.

    Function output:

    Returns a cleaned pandas DataFrame with the columns price(£), mileage(mi),
    door_count, body_style, engine_size(cc), transmission, year, brand and
    model. Rows with any missing or unparseable value are dropped and
    duplicated cars are removed. Requesting 0 pages returns an empty frame
    with the same columns.

    Save or make a variable:

    Leave the save input empty and no file is written. Otherwise the frame is
    written to "<name>.csv".

    You can assign the result of this function to a variable in a Jupyter
    notebook and manipulate the returned pandas object as normal.

    Raises:

    ValueError if the number of pages entered is not an integer. Network
    errors raised by urlopen propagate to the caller.
    """

    # body style available inputs
    all_styles = ["saloon", "hatchback", "4x4", "estate", "coupe", "convertible", "mpv"]

    # take user input of body style. If left blank all body styles will be taken
    print("Body styles = saloon, hatchback, 4x4, estate, coupe, convertible, or mpv.")
    input_body = input("Enter a body style (or leave blank for all): ")
    # Commas become separators so "saloon,coupe" yields two styles, not one.
    bodystyle = input_body.replace(",", " ").lower().split()
    if not bodystyle:
        bodystyle = all_styles

    # amount of pages input
    input_pages = input("Enter the amount of pages you want to scrape: ")
    # NOTE(review): page numbers start at 0 here - confirm the site does not
    # number its result pages from 1.
    pages = range(0, int(input_pages))

    records = []
    for i in pages:
        for p in bodystyle:
            url = 'https://www.autovillage.co.uk/used-car/page/{}/filter/bodystyle/{}'.format(i, p)
            records.extend(_scrape_page(url))

    # Build the frame once, after all pages are scraped; explicit columns keep
    # the frame well-formed even when nothing was scraped.
    raw = pd.DataFrame(records, columns=_RAW_COLUMNS)
    df = _clean_listings(raw)

    # Save under the user defined name. If left blank the save does not happen.
    input_save = input("Name to save file as (leave blank to not save): ")
    save_file_name = str(input_save).strip()
    if save_file_name:
        df.to_csv(save_file_name + ".csv")

    return df

In [479]:
# Run the crawler interactively (it prompts for body style, page count and save name);
# as the last expression in the cell, the returned DataFrame is shown as the cell output.
web_crawler()

Body styles = saloon, hatchback, 4x4, estate, coupe, convertible, or mpv.


Enter a body style (or leave blank for all):  coupe
Enter the amount of pages you want to scrape:  1
Name to save file as (leave blank to not save):  


Unnamed: 0,price(£),mileage(mi),door_count,body_style,engine_size(cc),transmission,year,brand,model
0,19998,28826,2.0,Coupe,1995.0,Automatic,2018.0,BMW,4
1,25000,20613,5.0,Coupe,2993.0,Automatic,2015.0,BMW,X4
2,149850,1500,2.0,Coupe,3996.0,Automatic,2019.0,Bentley,Continental
3,46950,350,2.0,Coupe,2996.0,Automatic,2019.0,Mercedes-Benz,C
4,76940,4763,2.0,Coupe,5200.0,Automatic,2017.0,Audi,R8
5,13890,82000,4.0,Coupe,1995.0,Automatic,2016.0,BMW,4
6,12090,31647,2.0,Coupe,1499.0,Automatic,2016.0,BMW,2
7,12395,88200,2.0,Coupe,1968.0,Manual,2015.0,Audi,A5
9,6490,73000,4.0,Coupe,1591.0,Manual,2012.0,Hyundai,Veloster
