# Saloons

### Imports

In [1]:
import pandas as pd
import numpy as np

# Webscraping libraries
from urllib.request import urlopen # url inspector
from bs4 import BeautifulSoup
import re
from selenium import webdriver # connects to chrome browser
import warnings
warnings.filterwarnings('ignore')

# Web crawler imports
import requests
from requests import get

# Web crawlers random seeds/time delays
from time import sleep
from random import randint

# image viewer for cell outputs
from IPython.display import display, Markdown, Latex, Image, display_html, HTML

## View Cars

In [2]:
body_type=["Saloon", 
           "Hatchback", 
           "Estate", 
           "Coupe", 
           "Convertible", 
           "4x4", 
           "MPV"]

In [3]:
my_url = 'https://www.autovillage.co.uk/used-car/filter/bodystyle/saloon'
my_client = urlopen(my_url) # open up a connection to the webpage
image_viewer =my_client.read() # reads all the html from the webpage

In [4]:
image_soup = BeautifulSoup(image_viewer, "html.parser")
container_image = image_soup.findAll("div", {"class":"ucatid20"}) # found the ucatid20 tag from inspecting webpage and selecting the entire container


In [5]:
# Lets view our cars by using a for loop with the image flag we parsed into

for item in range(0,len(container_image)):
    display(HTML(str(container_image[item].findAll("div", {"class":"mb5"})[0].img)))


## Web Crawler

In [6]:
# Features
price =[] # car price
year_make_model =[] # year made, brand name, model
eng_tran =[] # engine size and transmission type
door_body =[] # number of doors and body style
mileage =[] # number of miles on the odometer

In [7]:
# set up crawler (150 page test limit to return 1,500 cars)

for i in range(0,150): #range of pages to scrape
    url= 'https://www.autovillage.co.uk/used-car/page/{}/filter/bodystyle/saloon'.format(i)
    html= urlopen(url)
    autovillage_page= html.read()
    soup= BeautifulSoup(autovillage_page, "html.parser")
    
    # parsed into the container that holds information about the cars
    container= soup.findAll("div", {"class":"ucatid20"})
    # parsed to be price only
    container2= soup.findAll("div", {"class":"avprice"})
    
    for item in container2:
        #price
        price.append(item.text)
       
    for item in range(0,len(container)):
    
        #year, make, and model
        car_names= container[item].div.findAll("div", {"class":"item"})[0]
        year_make_model.append(car_names.get_text().strip())
    
        #engine size and transmission type
        tran = container[item].div.span
        eng_tran.append(tran.get_text())
    
        # number of doors and car body type
        door_bod = container[item].div.findAll("div", {"class":"item"})[2].span
        door_body.append(door_bod.get_text())
    
        # Car mileage
        car_mileage = container[item].div.findAll("div", {"class":"item"})[3].span
        mileage.append(car_mileage)
        

## Shape of Features

In [8]:
# lets count how many cars we have in our features it should be 10
print("Rows in price:",len(price))
print("Rows in mileage:",len(mileage))
print("Rows in door count/body style:",len(door_body))
print("Rows in engine size/transmission:",len(eng_tran))
print("Rows in year/make/model:",len(year_make_model))

Rows in price: 1500
Rows in mileage: 1500
Rows in door count/body style: 1500
Rows in engine size/transmission: 1500
Rows in year/make/model: 1500


## Create Data Frame

In [9]:
# Create the DataFrame
saloon_df = pd.DataFrame({'price':price, 
                       'mileage':mileage, 
                       'door/body':door_body, 
                       'eng/tran':eng_tran, 
                       'year/make/model':year_make_model})

In [10]:
saloon_df

Unnamed: 0,price,mileage,door/body,eng/tran,year/make/model
0,"£23,993","[ 11,337 miles]",4 Door Saloon,1969cc Automatic,2019 Volvo S90
1,"£9,500","[ 89,100 miles]",4 Door Saloon,1968cc Automatic,2016 Volkswagen Passat
2,"£30,498",[ 9 miles],4 Door Saloon,1395cc Automatic,2019 Volkswagen Passat
3,"£39,990","[ 1,612 miles]",Saloon,1969cc Automatic,2019 Volvo S60
4,"£7,995","[ 93,000 miles]",4 Door Saloon,2993cc Automatic,2010 Jaguar XF
...,...,...,...,...,...
1495,"£19,498","[ 20,575 miles]",4 Door Saloon,1395cc Automatic,2018 Audi A4
1496,"£1,495","[ 197,562 miles]",4 Door Saloon,1997cc Manual,2008 Volvo S40
1497,"£12,295","[ 97,600 miles]",4 Door Saloon,1995cc Automatic,BMW 3 Series
1498,"£5,495","[ 99,501 miles]",4 Door Saloon,1968cc Manual,2012 Volkswagen Passat


## Clean the Data Frame

I can use `str.extract()` ,`str.replace()` . and `.astype(int)` to clean my data.

## Save the Data Frame

In [11]:
# Create our save directory path
saloon_save_path = '../../Raw-Data/saloon.csv'
saloon_df.to_csv(saloon_save_path) # running this cell multiple times overwrites save