# Coupes

### Imports

In [1]:
import pandas as pd
import numpy as np

# Webscraping libraries
from urllib.request import urlopen # url inspector
from bs4 import BeautifulSoup
import re
from selenium import webdriver # connects to chrome browser
import warnings
warnings.filterwarnings('ignore')

# Web crawler imports
import requests
from requests import get

# Web crawlers random seeds/time delays
from time import sleep
from random import randint

# image viewer for cell outputs
from IPython.display import display, Markdown, Latex, Image, display_html, HTML

## View Cars

In [2]:
body_type=["Saloon", 
           "Hatchback", 
           "Estate", 
           "Coupe", 
           "Convertible", 
           "4x4", 
           "MPV"]

In [3]:
my_url = 'https://www.autovillage.co.uk/used-car/filter/bodystyle/coupe'
my_client = urlopen(my_url) # open up a connection to the webpage
image_viewer =my_client.read() # reads all the html from the webpage

In [4]:
image_soup = BeautifulSoup(image_viewer, "html.parser")
container_image = image_soup.findAll("div", {"class":"ucatid20"}) # found the ucatid20 tag from inspecting webpage and selecting the entire container


In [5]:
# Lets view our cars by using a for loop with the image flag we parsed into

for item in range(0,len(container_image)):
    display(HTML(str(container_image[item].findAll("div", {"class":"mb5"})[0].img)))


## Web Crawler

In [6]:
# Features
price =[] # car price
year_make_model =[] # year made, brand name, model
eng_tran =[] # engine size and transmission type
door_body =[] # number of doors and body style
mileage =[] # number of miles on the odometer

In [7]:
# set up crawler (150 page test limit) to return 1,500 cars

for i in range(0,150): #range of pages to scrape
    url= 'https://www.autovillage.co.uk/used-car/page/{}/filter/bodystyle/coupe'.format(i)
    html= urlopen(url)
    autovillage_page= html.read()
    soup= BeautifulSoup(autovillage_page, "html.parser")
    
    
    container= soup.findAll("div", {"class":"ucatid20"})
    container2= soup.findAll("div", {"class":"avprice"})
    
    for item in container2:
    #price
        price.append(item.text)
        # web scraper code use soup as conection
    for item in range(0,len(container)):
    
        #year, make, and model
        car_names= container[item].div.findAll("div", {"class":"item"})[0]
        year_make_model.append(car_names.get_text().strip())
    
        #engine size and transmission type
        tran = container[item].div.span
        eng_tran.append(tran.get_text())
    
        # number of doors and car body type
        door_bod = container[item].div.findAll("div", {"class":"item"})[2].span
        door_body.append(door_bod.get_text())
    
        # Car mileage
        car_mileage = container[item].div.findAll("div", {"class":"item"})[3].span
        mileage.append(car_mileage)

## Shape of Features

In [8]:
# lets count how many cars we have in our features it should be 10
print("Rows in price:",len(price))
print("Rows in mileage:",len(mileage))
print("Rows in door count/body style:",len(door_body))
print("Rows in engine size/transmission:",len(eng_tran))
print("Rows in year/make/model:",len(year_make_model))

Rows in price: 1500
Rows in mileage: 1500
Rows in door count/body style: 1500
Rows in engine size/transmission: 1500
Rows in year/make/model: 1500


## Create Data Frame

In [9]:
# Create the DataFrame
coupe_df = pd.DataFrame({'price':price, 
                       'mileage':mileage, 
                       'door/body':door_body, 
                       'eng/tran':eng_tran, 
                       'year/make/model':year_make_model})

In [10]:
coupe_df

Unnamed: 0,price,mileage,door/body,eng/tran,year/make/model
0,"£11,250","[ 2,460 miles]",2 Door Coupe,1000cc Automatic,2019 Smart Fortwo
1,"£26,990","[ 1,350 miles]",3 Door Coupe,3700cc Manual,2019 Nissan 370Z
2,"£18,300","[ 18,064 miles]",2 Door Coupe,1995cc Automatic,2017 BMW 4 Series
3,"£23,665","[ 39,600 miles]",Coupe,4000cc Manual,1999 TVR Cerbera
4,"£10,379","[ 93,500 miles]",5 Door Coupe,1995cc Automatic,2015 BMW 4 Series
...,...,...,...,...,...
1495,"£114,077","[ 24,300 miles]",2 Door Coupe,6600cc Automatic,2014 Rolls-Royce Wraith
1496,"£6,340","[ 34,779 miles]",3 Door Coupe,999cc Manual,2016 Seat Ibiza
1497,"£69,995","[ 79,000 miles]",2 Door Coupe,5343cc Manual,1973 Jaguar E-Type
1498,"£22,790",[ 123 miles],2 Door Coupe,Manual,2019 BMW 2 Series


## Clean the Data Frame

I can use `str.extract()` ,`str.replace()` . and `.astype(int)` to clean my data.

## Save the Data Frame

In [11]:
# Create our save directory path
coupe_save_path = '../../Raw-Data/coupe.csv'
coupe_df.to_csv(coupe_save_path) # running this cell multiple times overwrites save