In [1]:
# Importing libraries

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
import time
import datetime
import smtplib
import csv
import pandas as pd

In [None]:
# Scraping the site. Amazon builds its pages with JavaScript, so parsing the raw HTML with BeautifulSoup alone won't work: the JavaScript-rendered content would be missing.
# Selenium and a webdriver are used to load the rendered page instead.

# Drive a real Chrome instance with Selenium and capture the page HTML once it has loaded.
chrome_options = Options()
# Raw string: the Windows path is full of backslashes, and sequences such as "\P" or "\W"
# are invalid escapes that newer Python versions warn about. The resulting value is unchanged.
browser = webdriver.Chrome(r'E:\Portfolio Projects\Web Scraping\chromedriver.exe', options=chrome_options)
try:
    browser.get("https://www.amazon.ca/Practical-Statistics-Data-Scientists-Essential/dp/149207294X/ref=sr_1_5?keywords=statistics&qid=1637166906&sr=8-5")
    # Wait a few seconds before reading the page source.
    time.sleep(3)
    page_html = browser.page_source
finally:
    # Always close the browser, even if loading the page failed, so no Chrome process is left behind.
    browser.quit()
soup1 = BeautifulSoup(page_html, 'html.parser')
# Re-parse the prettified markup; later cells read their fields from soup2.
soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')
print(soup2)

In [3]:
# Checking if it can find items in the scraping, let's see if it can find the title...

# Look up the two elements we need and fail with a clear message if either is missing;
# find() returns None for an absent id, which would otherwise surface as an opaque AttributeError.
title_tag = soup2.find(id = 'productTitle')
price_tag = soup2.find(id = 'price')
if title_tag is None or price_tag is None:
    raise ValueError(
        "Could not find the 'productTitle' and/or 'price' element in the scraped page; "
        "the page may not have loaded fully or its layout may differ."
    )

title = title_tag.get_text()
print(title)
price = price_tag.get_text()
print(price)
# Date of the scrape, stored alongside the title and price later on.
today = datetime.date.today()
print(today)


           Practical Statistics for Data Scientists: 50+ Essential Concepts Using R and Python
          

                       $40.00
                      
2021-11-17


In [4]:
# A little clean up

# Remove the surrounding whitespace picked up from the page markup.
title = title.strip()
price = price.strip()
# Drop the leading currency symbol only when one is actually there, so that
# re-running this cell does not chop the first digit off an already-cleaned price.
if price and not price[0].isdigit():
    price = price[1:]
print(title)
print(price)

Practical Statistics for Data Scientists: 50+ Essential Concepts Using R and Python
40.00


In [5]:
# Creating a simple dataset from this.

# Column names, followed by the single row scraped above.
header = ['Title', 'Price', 'Date']
data = [title, price, today]

# Mode 'w' recreates the file on every run; newline='' leaves line endings to the csv module.
with open('DemoWebScraperDataset.csv', mode='w', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerows([header, data])

In [10]:
# Now that the dataset (CSV file) exists, we can automate appending new rows instead of recreating the file every time.
def check_price(
    url="https://www.amazon.ca/Practical-Statistics-Data-Scientists-Essential/dp/149207294X/ref=sr_1_5?keywords=statistics&qid=1637166906&sr=8-5",
    driver_path=r'E:\Portfolio Projects\Web Scraping\chromedriver.exe',
    csv_path='DemoWebScraperDataset.csv',
):
    """Scrape the product page once and append a (title, price, date) row to the CSV.

    Args:
        url: Product page to load. Defaults to the book tracked in this notebook.
        driver_path: Location of the chromedriver executable.
        csv_path: CSV file the row is appended to.

    Raises:
        ValueError: If the page has no 'productTitle' or 'price' element.
    """
    browser = webdriver.Chrome(driver_path, options=Options())
    try:
        browser.get(url)
        # Wait a few seconds before reading the page source.
        time.sleep(3)
        page_html = browser.page_source
    finally:
        # Close the browser even when loading fails, so no Chrome process is left running.
        browser.quit()
    soup1 = BeautifulSoup(page_html, 'html.parser')
    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

    # find() returns None for a missing id; report that explicitly.
    title_tag = soup2.find(id = 'productTitle')
    price_tag = soup2.find(id = 'price')
    if title_tag is None or price_tag is None:
        raise ValueError(
            "Could not find the 'productTitle' and/or 'price' element in the scraped page; "
            "the page may not have loaded fully or its layout may differ."
        )

    title = title_tag.get_text().strip()
    price = price_tag.get_text().strip()
    # Drop the leading currency symbol, but never a digit.
    if price and not price[0].isdigit():
        price = price[1:]
    today = datetime.date.today()

    with open(csv_path, 'a+', newline = '', encoding = 'UTF8') as m:
        writer = csv.writer(m)
        # Append mode starts at the end of the file, so position 0 means the file
        # is new or empty and still needs its header row.
        if m.tell() == 0:
            writer.writerow(['Title', 'Price', 'Date'])
        writer.writerow([title, price, today])

In [9]:
# I can even have it email me when the price drops. First, create a function that sends the email...

def send_mail(sender='example@gmail.com', password='xxxxxxxxxxxxxx', recipient=None):
    """Send a short "item is on sale" alert email through Gmail's SMTP-over-SSL endpoint.

    Args:
        sender: Address used to log in and as the From address.
        password: Password for `sender`. The default is only a placeholder for the
            example account; supply real credentials at call time rather than
            saving them in the notebook.
        recipient: Address that receives the alert. Defaults to `sender`.
    """
    if recipient is None:
        recipient = sender

    subject = "The item you wanted is on sale."
    body = 'What are you waiting for? The item is on sale!'
    # A blank line ("\n\n") must separate the headers from the message body.
    msg = f'Subject: {subject}\n\n{body}'

    # SMTP_SSL is encrypted from the moment it connects, so starttls() must not be
    # called on it. The with-block closes the connection when it exits.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(sender, password)
        server.sendmail(sender, recipient, msg)

Unnamed: 0,Title,Price,Date
0,Practical Statistics for Data Scientists: 50+ ...,40.0,2021-11-17
1,Practical Statistics for Data Scientists: 50+ ...,40.0,2021-11-17


In [None]:
# Adding it all together...
# We have a function that checks the price of this item; it can be pointed at any other URL or item.
# It takes the item's name and price along with the date it was scraped, and appends them to the CSV dataset.
# Finally, the embedded call to send_mail() emails me if the price (or any other variable of my choosing) changes or meets a requirement.

def check_price(
    url="https://www.amazon.ca/Practical-Statistics-Data-Scientists-Essential/dp/149207294X/ref=sr_1_5?keywords=statistics&qid=1637166906&sr=8-5",
    driver_path=r'E:\Portfolio Projects\Web Scraping\chromedriver.exe',
    csv_path='DemoWebScraperDataset.csv',
    threshold=40,
):
    """Scrape the product page, log the result to the CSV, and email if the price is low.

    Args:
        url: Product page to load. Defaults to the book tracked in this notebook.
        driver_path: Location of the chromedriver executable.
        csv_path: CSV file the (title, price, date) row is appended to.
        threshold: send_mail() is called when the scraped price is strictly below this.

    Raises:
        ValueError: If the page has no 'productTitle' or 'price' element, or the
            price text cannot be read as a number.
    """
    # Setup for Selenium.
    browser = webdriver.Chrome(driver_path, options=Options())
    try:
        browser.get(url)
        # Wait a few seconds before reading the page source.
        time.sleep(3)
        page_html = browser.page_source
    finally:
        # Close the browser even when loading fails, so no Chrome process is left running.
        browser.quit()
    soup1 = BeautifulSoup(page_html, 'html.parser')
    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

    # Cleaning and processing.
    # find() returns None for a missing id; report that explicitly.
    title_tag = soup2.find(id = 'productTitle')
    price_tag = soup2.find(id = 'price')
    if title_tag is None or price_tag is None:
        raise ValueError(
            "Could not find the 'productTitle' and/or 'price' element in the scraped page; "
            "the page may not have loaded fully or its layout may differ."
        )

    title = title_tag.get_text().strip()
    price = price_tag.get_text().strip()
    # Drop the leading currency symbol, but never a digit.
    if price and not price[0].isdigit():
        price = price[1:]
    today = datetime.date.today()

    with open(csv_path, 'a+', newline = '', encoding = 'UTF8') as m:
        writer = csv.writer(m)
        # Append mode starts at the end of the file, so position 0 means the file
        # is new or empty and still needs its header row.
        if m.tell() == 0:
            writer.writerow(['Title', 'Price', 'Date'])
        writer.writerow([title, price, today])

    # Email me if the price drops below the threshold.
    # The scraped price is text, so convert it to a number before comparing;
    # comparing a str with an int raises TypeError. Thousands separators are removed first.
    if float(price.replace(',', '')) < threshold:
        send_mail()

In [None]:
# Now we have an automated price checker!
# I can run it continuously in a while True loop and have it check every 5 seconds, every minute, every hour, or every day.
# Here, it will check the price every day. I use this when annual sales are coming up to keep an eye on what I want!

# Runs until the kernel is interrupted: one check, then a 24-hour pause before the next.
SECONDS_PER_DAY = 24 * 60 * 60
while True:
    check_price()
    time.sleep(SECONDS_PER_DAY)