### In this project, a web scraper will be built to scrape vehicle images (and other features) into a MySQL database for later analysis. 

In [1]:
# Import from the necessary modules.

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from urllib.request import urlretrieve
import re
import pymysql
import time
import random
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image

In [2]:
"""
Load pycodestyle_mage and run flake8 to check code compliance with PEP8
standards.
"""

%load_ext pycodestyle_magic
%flake8_on

1: E999 SyntaxError: invalid syntax


In [3]:
"""
Create a short function that uses a driver object to get the
HTML page source, which can be used to create a BeautifulSoup
object for parsing.
"""


def driver_page_source(sel_driver):
    """
    Function that uses the driver to return the page source
    that is then used to create a BeautifulSoup object.
    """
    page_source = sel_driver.page_source
    soup_obj = BeautifulSoup(page_source, 'html.parser')
    return soup_obj

In [4]:
"""
Create a function that will connect to the MySQL database and enter
the data from the variables into the database.
"""


def store_data(year, make, model, kms, private, image1, image2, image3,
               price):
    """
    This function will transfer the vehicle data from each vehicle
    into the MySQL database.
    """
    try:
        # Connect to database with connection object.
        conn = pymysql.connect(host='localhost', user='root',
                               passwd='Afshin123', db='mysql', charset='utf8')
        # Use connection object to create cursor.
        cur = conn.cursor()
        # Use cursor to execute SQL commands.
        cur.execute('USE Images_DB')
        cur.execute('INSERT INTO Cars_Practice (Car_Year, Car_Make, Car_Model,'
                    'Car_Kms, Car_Private, Car_Image1, Car_Image2, Car_Image3,'
                    'Car_Price) VALUES ("{}", "{}", "{}", "{}", "{}", "{}",'
                    '"{}", "{}", "{}")'.format(year, make, model, kms,
                                               private, image1, image2, image3,
                                               price))
        # Commit the information to the database.
        cur.connection.commit()
        # Ensure connection/cursor objects are closed.
    finally:
        cur.close()
        conn.close()

In [5]:
"""
Create a function that will alter the image for the project purposes.
"""


def alter_image(downloaded_image):
    """
    This function will take in a downloaded image with the drive location
    of the image, extract the image name, load it into Pillow, convert it
    to grayscale, ensure the size is 133x100 pixels, and finally transform
    it into, and return, a NumPy array.
    """
    # Get image name only.
    image_name = downloaded_image[0]
    # Create a Pillow Image object and load it with the image
    image_load = Image.open(image_name)
    # Change the image to grayscale
    image_gs = image_load.convert(mode='L')
    # Ensure size of image is 133x100 pixels
    image_resize = image_gs.resize((133, 100))
    # Transform image to NumPy array
    image_array = np.asarray(image_resize)
    return image_array

In [6]:
"""
Next, create a Selenium driver object to use a Google Chrome
webdriver.  Set required options
"""

chrome_options = Options()  # Initialize object
# chrome_options.add_argument('--headless')
# Use fake user-agent header.
chrome_options.add_argument('user-agent=Mozilla/5.0 (X11; Ubuntu;'
                            'Linux x86_64;rv:68.0) Gecko/20100101'
                            'Firefox/68.0')
driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver',
                          options=chrome_options)
website = 'http://www.autotrader.ca/cars/'
driver.get(website)
time.sleep(random.randint(2, 4))

# Click the page once to go to the main page if there is any overlay.
actions = ActionChains(driver)
actions.move_by_offset(random.randint(50, 75), random.randint(50, 75))
actions.click().perform()
time.sleep(random.randint(2, 4))

# Change the sort parameter so that vehicles are sorted by date,
# not by the default sort.  Note, this can also be done with
# the URL.
sort_select = Select(driver.find_element_by_id('sortBy'))
sort_select.select_by_value('CreatedDateDesc')
time.sleep(random.randint(3, 4))

In [8]:
"""
Create an empty set of vehicle links.  Create a page loop that will get
a page source and use that to return all relevant ad links.
If the ad links list returned is not empty, then a random link will be
selected from the returned list of links.  If that link is
unique, then vehicle information will be gathered.  The driver will
then return to the main page and select another randon link to go to.

When the loop is completed, we need to then go onto the next page.
"""

vehicle_links = set()  # Set ensures information is gathered from unique links.

i = 0  # Ad counter
k = 0  # Page counter
page = 0  # URL counter

while k < 2:  # Website page loop
    # Use the driver_page_source function to get a BeautifulSoup object.
    soup_obj = driver_page_source(driver)
    # Use the soup_obj to put ad links for the page into a list.
    returned = soup_obj.find_all('a', {'href': re.compile('^(/a/)'),
                                 'class': 'main-photo click'})
    # If the returned list is not empty, continue into ad loop.
    if returned != []:
        while i < random.randint(3, len(returned)):
            # Select a random link from the list of page links returned
            random_link = returned[random.randint(0, len(returned)-1)]
            # If the random link is not a duplicate (is unique),
            # add it to the set of links
            if random_link.attrs['href'] not in vehicle_links:
                new_listing_url = random_link.attrs['href']
                vehicle_links.add(new_listing_url)
                # Go to the unique ad link
                driver.get('http://autotrader.ca{}'.format(new_listing_url))
                time.sleep(random.randint(5, 7))

                # Use driver_page_source function to get the page source
                # for the ad.
                soup_obj_new = driver_page_source(driver)
                
                # Use the soup_new_obj to find h1 to get year,
                # make, and model of vehicle.
                title = soup_obj_new.find('h1').get_text().upper().split()
                print(title)
                try:
                    # The car year should be an integer.
                    car_year = int(title[0])
                except ValueError:
                    # If the string cannot be turned into an integer,
                    # leave it as a string and continue.
                    car_year = title[0]
                car_make = title[1]
                car_model = title[2]
                
                # Next, use the soup object to find the desired images.
                vehicle_images = soup_obj_new.find_all('img', {'class':
                                                       'col-xs-12 col-md-12'
                                                       'vdp-gallery-thumbphotos'})
                # With the list of images, only 3 images will be downloaded and placed
                # into an image list.  Note that the contents of the below loop can
                # easily be made into a function, as well.
                j = 0
                img_list = []
                for image in vehicle_images:
                    # Download the image and name it.
                    naming = urlretrieve(image['src'], ('photo' + str(k) + str(i)
                                                        + str(j)) + '.jpg')
                    # Use the alter_image function to return an altered image array
                    # and append that image to the list.
                    img_list.append(alter_image(naming))
                    j += 1
                    if j == 3:
                        break
                # Once out of the loop, use try/except to continue if there are no
                # images, or only 1 or 2 images of the vehicle.
                try:
                    car_image1 = img_list[0]
                except IndexError:
                    car_image1 = ''
                try:
                    car_image2 = img_list[1]
                except IndexError:
                    car_image2 = ''
                try:
                    car_image3 = img_list[2]
                except IndexError:
                    car_image3 = ''

                # Get kilometers of the vehicle
                table_data = soup_obj_new.find('div', {'id': 'vdp-specs-content'}) \
                                         .find('table').find_all('td')
                car_kms = table_data[0].get_text().split()[0].replace(',', '')
                try:
                    car_kms = int(car_kms)
                except ValueError:
                    car_kms = ''

                # Find out if the car is a private sale or not
                car_private = 1
                if soup_obj_new.find('div', {'class': 'vdp-private-icon'}) is None:
                    car_private = 0

                # Finally, get the price of the vehicle
                car_price = soup_obj_new.find('h2').get_text().replace(',', '') \
                                        .replace('$', '')
                try:
                    car_price = int(car_price)
                except ValueError:
                    car_price = ''

                # With all of the ad information, this information can now be
                # transferred to a database and committed there.
                store_data(car_year, car_make, car_model, car_kms, car_private,
                           car_image1, car_image2, car_image3, car_price)

                i += 1  # Increase loop count for ad.
                driver.back()  # Return to ads page.
                time.sleep(random.randint(3, 5))
            else:
                continue
        i = 0  # Return inner loop count to 0 for next page of ads.
        k += 1  # Increase page count.
        page += 15  # Go to the next page with the URL.
        driver.get('http://autotrader.ca/cars/?rcp=15&rcs={}&srt=9'
                   '&prx=-1&hprc=True&wcp=True&inMarket=advancedSearch'
                   .format(str(page)))
        time.sleep(random.randint(3, 5))
    else:
        continue
driver.close()

['2014', 'TOYOTA', 'HIGHLANDER', 'LE', 'BACKUP', 'CAM', '|', 'BLUETOOTH', '|', 'AUTO', 'HEADLIGHTS', '-', 'MARKHAM']
['2019', 'LINCOLN', 'MKZ', 'RESERVE', '-', 'VAUGHAN']
['2013', 'JAGUAR', 'XF', '3.0L', 'SUPERCHARGED', 'AWD', 'BLACK/BLACK', 'NAVIGATION', '-', 'NORTH', 'YORK']
['2010', 'DODGE', 'RAM', '1500', 'QUAD-HITCH-MARCHEPIEDS++', '-', 'GRANBY']
['2015', 'GMC', 'SIERRA', '2500', 'SLE+BOITE', '8', 'PIED+ECRAN.TACT', '-', 'TERREBONNE']
['2012', 'CHEVROLET', 'CRUZE', 'LT', 'TURBO', 'W/1SA', '-', 'CALEDONIA']
['2017', 'KIA', 'FORTE', '4DR', 'SDN', 'AUTO', 'EX', 'CAMERA', 'DE', 'RECULE', 'PNEU', 'HIVER', '-', 'MCMASTERVILLE']
['2013', 'VOLKSWAGEN', 'PASSAT', 'COMFORTLINE', '-', 'BELLEVILLE']
['2019', 'NISSAN', 'MURANO', 'PLATINUM', '-', 'ETOBICOKE']
['2018', 'BMW', 'X3', 'XDRIVE30I', '-', 'QUÉBEC']
['2013', 'TOYOTA', 'RAV4', 'XLE', '|', 'AWD,', 'NAV,', 'SUNROOF,', 'HEATED', 'SEATS,', 'SUNROOF', '-', 'CALEDONIA']
['2011', 'MAZDA', 'MAZDA3', 'BERLINE', '4', 'PORTES,', 'BOÎTE', 'AUTOMATI

41:1: W293 blank line contains whitespace
55:1: W293 blank line contains whitespace
59:56: E128 continuation line under-indented for visual indent
59:80: E501 line too long (82 > 79 characters)
60:80: E501 line too long (86 > 79 characters)
61:80: E501 line too long (83 > 79 characters)
67:80: E501 line too long (81 > 79 characters)
69:80: E501 line too long (83 > 79 characters)
75:80: E501 line too long (82 > 79 characters)
91:80: E501 line too long (84 > 79 characters)
101:80: E501 line too long (83 > 79 characters)
105:80: E501 line too long (81 > 79 characters)
