# Web crawler

In [1]:
import pandas as pd
import numpy as np

# Webscraping libraries
from urllib.request import urlopen # url inspector
from bs4 import BeautifulSoup
import re








## Set up the crawler

In [154]:
# Raw column names used while the scraped text is still unsplit.
_RAW_COLUMNS = ['price', 'mileage', 'door/body', 'eng/tran', 'year/make/model']

# Literal replacements applied, in this order, to the "year make model" text so
# that multi-word brands/models survive the later split on single spaces.
# Order matters: "Corvette" is rewritten to the brand before "C7" is expanded.
_NAME_FIXES = (
    ("Class", ""),
    ("Land Rover", "Land-Rover"),
    ("Range Rover Sport", "Range-Rover-Sport"),
    ("Discovery Sport", "Discovery-Sport"),
    ("Aston Martin", "Aston-Martin"),
    ("Alfa Romeo", "Alfa-Romeo"),
    ("Corvette", "Chevrolet"),
    ("C7", "Corvette-C7"),
)


def _prompt_body_styles(all_styles):
    """Ask which body styles to scrape; a blank answer selects every style."""
    answer = input("Enter a body style (or leave blank for all): ")
    # Commas are separators: turn them into whitespace before splitting.
    # Deleting them would fuse "saloon,estate" into the bogus style "saloonestate".
    chosen = answer.lower().replace(",", " ").split()
    return chosen if chosen else list(all_styles)


def _prompt_page_numbers():
    """Ask how many pages to scrape and return the page numbers 1..n (1 if blank)."""
    answer = input("Enter the amount of pages you want to scrape (or leave blank for 1): ").strip()
    page_count = int(answer) if answer else 1
    # Start at 1 so that asking for n pages fetches exactly n pages; starting
    # at 0 fetched one page more than requested.
    return range(1, page_count + 1)


def _item_span_text(items, position):
    """Return the text of the <span> inside items[position], or "" when absent."""
    if position >= len(items):
        return ""
    span = items[position].span
    return "" if span is None else span.get_text()


def _parse_listing(listing):
    """Extract the raw text fields of one listing container, or None if it is empty."""
    block = listing.div
    if block is None:
        return None
    items = block.findAll("div", {"class": "item"})
    if not items:
        return None
    first_span = block.span
    return {
        'year/make/model': items[0].get_text().strip(),
        'eng/tran': "" if first_span is None else first_span.get_text(),
        'door/body': _item_span_text(items, 2),
        # Exactly one mileage value per listing keeps every column the same length.
        'mileage': _item_span_text(items, 3),
    }


def _scrape_listing_page(url):
    """Download one results page and return a list of raw listing records."""
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    listings = soup.findAll("div", {"class": "ucatid20"})
    price_tags = soup.findAll("div", {"class": "avprice"})

    # Prices are located separately from the listing containers, so they can
    # only be paired by position. If the counts disagree the pairing is
    # unreliable: leave the price blank (the row is dropped later) rather than
    # attach the wrong price to a car.
    if len(price_tags) == len(listings):
        prices = [tag.text for tag in price_tags]
    else:
        prices = [""] * len(listings)

    records = []
    for listing, price in zip(listings, prices):
        record = _parse_listing(listing)
        if record is not None:
            record['price'] = price
            records.append(record)
    return records


def _clean_listings(records):
    """Turn raw listing records into the cleaned, typed, de-duplicated frame."""
    raw = pd.DataFrame(records, columns=_RAW_COLUMNS)

    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal by default, which would leave "£" and "," in place.
    price = raw['price'].str.replace("£|,", "", regex=True).str.strip()
    mileage = raw['mileage'].str.replace("miles|,", "", regex=True).str.strip()

    names = raw['year/make/model']
    for old, new in _NAME_FIXES:
        names = names.str.replace(old, new, regex=False)

    # Split once and pick tokens by position; a missing token becomes NaN
    # instead of raising when a row has fewer or more words than expected.
    door_body = raw['door/body'].str.replace("Door", "", regex=False).str.split()
    eng_tran = raw['eng/tran'].str.replace("cc", "", regex=False).str.split()
    name_parts = names.str.split(' ')

    cleaned = pd.DataFrame({
        'price(£)': pd.to_numeric(price, errors='coerce'),
        'mileage(mi)': pd.to_numeric(mileage, errors='coerce'),
        'door_count': pd.to_numeric(door_body.str[0], errors='coerce'),
        'body_style': door_body.str[1],
        'engine_size(cc)': pd.to_numeric(eng_tran.str[0], errors='coerce'),
        'transmission': eng_tran.str[1],
        'year': pd.to_numeric(name_parts.str[0], errors='coerce'),
        'brand': name_parts.str[1],
        'model': name_parts.str[2],
    })

    # Rows with any missing value are discarded, then exact duplicates.
    return cleaned.dropna(axis=0).drop_duplicates(keep='first')


def web_crawler():
    """Interactively scrape used-car listings from autovillage.co.uk.

    Prompts on stdin for (1) the body styles to scrape, (2) the number of
    result pages per body style and (3) an optional CSV file name, then
    downloads the pages, cleans the listings and returns them.

    Returns:
        pandas.DataFrame with the columns price(£), mileage(mi), door_count,
        body_style, engine_size(cc), transmission, year, brand and model.
        Rows with missing values and duplicated rows are removed; the frame
        is empty when nothing could be scraped.

    Raises:
        ValueError: if the page count entered is not an integer.
        urllib.error.URLError: if a results page cannot be downloaded.

    Side effects:
        Prints a banner and progress text, and writes "<name>.csv" to the
        current directory only when a non-blank file name is entered.
    """
    print("""
    Auto Village Webscraper!
    
    A car web scraping program created by Paul Aleksis Williams.
    
    Program description:
    
    
    This programs scrapes cars listed on the autovillage.co.uk website, cleans the
    entries, drops duplicated entries, drops entries with missing values, and
    returns a data frame that contains 1 target variable and 8 features.
    
    Program inputs:
    
    
    1. Input the body style of cars you want to scrape ot leave empty for all.
       options: saloon, hatchback, 4x4, estate, coupe, convertible, mpv. For
       multiple styles seperate input with a comma.
    
    2. Input the number of pages to scrape there are 10 cars per page. If left
       blank, it will scrape 1 page per body style.
    
    3. Input a name for the file to save it as a csv in the current directory.
       If left blank, It will not save but you can set it as a variable to view
       the table.
       
       
    Program output:
    
    
    The function returns a cleaned data frame of cars scraped from autovillage.co.uk
    dropping all the missing values within the websites interface, and removing any
    duplicated cars.
    
    
    Program save feature:
    
    
    You can leave the save input empty and the file will not be saved.
    
    You can set a variable for this function in jupyter notebooks and manipulate
    the created pandas object as normal.
    
    
    """)

    # body style available inputs
    all_styles = ["saloon", "hatchback", "4x4", "estate", "coupe", "convertible", "mpv"]

    print("\n")
    print("Inputs:")
    print("------------------------------------------------------------------------------------------------------")
    bodystyle = _prompt_body_styles(all_styles)
    pages = _prompt_page_numbers()

    # Collect one record per listing; the frame is built once, after the loops.
    records = []
    for page in pages:
        for style in bodystyle:
            url = 'https://www.autovillage.co.uk/used-car/page/{}/filter/bodystyle/{}'.format(page, style)
            records.extend(_scrape_listing_page(url))

    df = _clean_listings(records)

    # Count how many cars got scraped
    print("\n")
    print("Output:")
    print("------------------------------------------------------------------------------------------------------")
    print("Done! you scraped: ", len(df), "cars!")

    # Save under a user-defined name; a blank name means "do not save".
    save_file_name = input("Name to save file as (leave blank to not save): ").strip()

    print("------------------------------------------------------------------------------------------------------")
    if save_file_name == "":
        print("File Not Saved!")
    else:
        # Write only in this branch: writing unconditionally left a stray
        # ".csv" file behind whenever the name was blank.
        df.to_csv(save_file_name + ".csv")
        print("File saved as {}".format(save_file_name+".csv"), "in the current directory!")

    return df

## Test it:

In [153]:
# Run the scraper interactively (it prompts for body style, page count and a
# file name) and keep the returned DataFrame for inspection.
df1 = web_crawler()


    Auto Village Webscraper!
    
    A car web scraping program created by Paul Aleksis Williams.
    
    Program description:
    
    
    This programs scrapes cars listed on the autovillage.co.uk website, cleans the
    entries, drops duplicated entries, drops entries with missing values, and
    returns a data frame that contains 1 target variable and 8 features.
    
    Program inputs:
    
    
    1. Input the body style of cars you want to scrape ot leave empty for all.
       options: saloon, hatchback, 4x4, estate, coupe, convertible, mpv. For
       multiple styles seperate input with a comma.
    
    2. Input the number of pages to scrape there are 10 cars per page. If left
       blank, it will scrape 1 page per body style.
    
    3. Input a name for the file to save it as a csv in the current directory.
       If left blank, It will not save but you can set it as a variable to view
       the table.
       
       
    Program output:
    
    
    The function re

Enter a body style (or leave blank for all):  
Enter the amount of pages you want to scrape (or leave blank for 1):  


------------------------------------------------------------------------------------------------------
Output:
------------------------------------------------------------------------------------------------------
Done! you scraped:  68 cars!


Name to save file as (leave blank to not save):  


------------------------------------------------------------------------------------------------------
File Not Saved!


In [147]:
# Show the scraped DataFrame as this cell's rich output.
df1

Unnamed: 0,price(£),mileage(mi),door_count,body_style,engine_size(cc),transmission,year,brand,model
0,8995,55000,4.0,Saloon,1595,Manual,2013.0,Mercedes-Benz,C
1,11500,16296,4.0,Saloon,2000,Manual,2017.0,Mazda,3
2,10790,62800,4.0,Saloon,2179,Automatic,2014.0,Jaguar,XF
3,9499,56606,4.0,Saloon,1995,Manual,2013.0,BMW,5
4,24990,9784,4.0,Saloon,1998,Automatic,2017.0,BMW,5
...,...,...,...,...,...,...,...,...,...
65,10990,30000,5.0,MPV,1560,Automatic,2017.0,Citroen,Berlingo
66,16649,49444,5.0,MPV,1997,Manual,2016.0,Ford,Galaxy
67,14295,10000,5.0,MPV,1364,Automatic,2017.0,Vauxhall,Zafira
68,9495,61650,5.0,MPV,1598,Manual,2015.0,Renault,Grand


In [30]:
from IPython.display import HTML, display

In [87]:
def car_viewer():
    """Show the listing photos from one autovillage.co.uk results page.

    Prompts on stdin for a body style, a brand and a page number, downloads
    that results page and renders the image of every listing inline.

    Returns:
        None. The images are shown through IPython's display machinery.

    Raises:
        ValueError: if the page number entered is not an integer.
        urllib.error.URLError: if the results page cannot be downloaded.
    """
    # Normalise the answers so stray spaces or capitals do not end up in the URL.
    bodystyle = input("What body type of car do you want to view? ").strip().lower()

    # Brands appear in the URL capitalised, with hyphens in place of spaces.
    brand = input("What brand do you want to view? ").strip().title().replace(" ", "-")

    # A blank answer means the first page instead of crashing in int().
    input_pages = input("What page do you want to view? ").strip()
    pages = int(input_pages) if input_pages else 1

    my_url = 'https://www.autovillage.co.uk/used-car/{}/page/{}/filter/bodystyle/{}'.format(brand, pages, bodystyle)

    # The context manager closes the connection once the HTML has been read.
    with urlopen(my_url) as my_client:
        image_soup = BeautifulSoup(my_client.read(), "html.parser")

    for listing in image_soup.findAll("div", {"class": "ucatid20"}):
        image_holder = listing.find("div", {"class": "mb5"})
        # Skip listings without a photo rather than raising IndexError or
        # rendering the literal text "None".
        if image_holder is None or image_holder.img is None:
            continue
        # NOTE(review): this renders markup taken verbatim from the remote
        # page inside the notebook; only use it with a source you trust.
        display(HTML(str(image_holder.img)))

    # display() yields None, so the function has always returned None when it
    # succeeded; returning it explicitly also covers pages with no listings,
    # which used to raise UnboundLocalError.
    return None

In [98]:
# Second interactive run of the scraper; the cleaned result is bound to `df`.
# NOTE(review): the recorded output below shows a different banner from the one
# web_crawler currently prints, so it appears to come from an earlier version
# of the function - re-run to refresh it.
df = web_crawler()

Auto Village Webscraper!
------------------------------------------------------------------------------------------------------
Body style choices (seperate with commas): saloon, hatchback, 4x4, estate, coupe, convertible, or mpv.
Pages: a single integer (note there are 10 cars per page)
Save File: name you want to call your csv file
------------------------------------------------------------------------------------------------------


Inputs:
------------------------------------------------------------------------------------------------------


Enter a body style (or leave blank for all):  
Enter the amount of pages you want to scrape:  3


------------------------------------------------------------------------------------------------------
You scraped:  195 cars


Name to save file as (leave blank to not save):  


------------------------------------------------------------------------------------------------------
File Not Saved!


In [99]:
# Preview the first rows of the frame returned by the run above.
df.head()

Unnamed: 0,price(£),mileage(mi),door_count,body_style,engine_size(cc),transmission,year,brand,model
0,7498,46152.0,4.0,Saloon,1685.0,Manual,2014.0,Hyundai,I40
1,26190,7126.0,4.0,Saloon,1500.0,Automatic,2019.0,Mercedes-Benz,C
2,15600,25997.0,4.0,Saloon,1600.0,Manual,2016.0,Mercedes-Benz,C
4,17498,57290.0,4.0,Saloon,1968.0,Automatic,2015.0,Audi,A6
5,7999,38000.0,4.0,Saloon,1968.0,Manual,2014.0,Volkswagen,Passat


In [100]:
# Launch the interactive image viewer; it prompts for body type, brand and page.
car_viewer()

What body type of car do you want to view?  saloon
What brand do you want to view?  hyundai
What page do you want to view?  1
