In [25]:
# Import libraries

import csv
import datetime
import os
import re
import smtplib    # Used to send emails to ourselves
import time
from email.message import EmailMessage

import requests
from bs4 import BeautifulSoup

In [26]:
# Tell requests which page to download, then hand the HTML to BeautifulSoup

# Connect to website

URL = 'https://www.amazon.in/gp/product/B07YYCKR11/ref=ox_sc_saved_title_4?smid=A2WK4OB3ROODF0&psc=1'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Accept-Encoding":"gzip, deflate",
    "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT":"1",
    "Connection":"close",
    "Upgrade-Insecure-Requests":"1",
}
# The User-Agent value is specific to our computer - get yours from https://httpbin.org/get

# Without a timeout requests can wait forever on a stalled connection.
# raise_for_status() stops the cell right away on a 4xx/5xx answer instead of
# failing later on a page that has none of the elements we look for.
page = requests.get(URL, headers = headers, timeout = 30)
page.raise_for_status()
# This is where we'll start getting the data

soup1 = BeautifulSoup(page.content, "html.parser")   # Pulling in all the html
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")  # Re-parse the prettified html; prettify just makes things look better

# soup2 is all the html that we can get
# Now we'll specify what things we actually want - e.g. product title
# We can get these ids by clicking on inspect element on the particular webpage

title = soup2.find(id = "productTitle").get_text()
print(title)
reviews = soup2.find(id = "acrCustomerReviewText").get_text()
print(reviews)
# NOTE(review): this takes the first "a-offscreen" span on the page - confirm it is the product's own price
price = soup2.find("span", attrs={'class':'a-offscreen'}).get_text()
print(price)


            WOW Skin Science Coconut Milk Hair Mask with Coconut Milk, 200 ml
           

             732 ratings
            

                        ₹349.00
                       


In [27]:
# Need to clean this up - e.g. too much blank space
# Every step below can be re-run safely: cleaning an already clean value leaves it unchanged

# Keep only the number from text such as "732 ratings", "1 rating" or "1,234 ratings"
reviews_match = re.search(r"\d[\d,]*", reviews)
reviews = reviews_match.group().replace(",", "") if reviews_match else reviews.strip()

title = title.strip()

# Keep only the amount: drops the currency symbol (whatever its length) and thousands separators
price_match = re.search(r"\d[\d,]*(?:\.\d+)?", price)
price = price_match.group().replace(",", "") if price_match else price.strip()

print(reviews)
print(title)
print(price)

732
WOW Skin Science Coconut Milk Hair Mask with Coconut Milk, 200 ml
349.00


In [28]:
# Record the day on which this data was collected (local date, no time of day)

import datetime

today = datetime.datetime.now().date()
print(today)

2023-01-18


In [29]:
# Create the csv file and seed it with the column names plus the first row of data

import csv

header = ["Title", "Price", "Reviews", "Date"]

# The scraped values are separate variables - bundle them into one list, i.e. one csv row
data = [title, price, reviews, today]

# newline="" leaves line endings to the csv module, which avoids blank lines between rows
# UTF8 - common encoding, 8 means 8-bit values are used in the encoding
# Mode "w" recreates the file, so running this cell again wipes every row written before
with open("AmazonWebScraperDataset.csv", mode="w", newline="", encoding='UTF8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([header, data])   # Header first, then the initial row of data

In [34]:
import pandas as pd

# Load the csv back into a DataFrame to check what has been written so far
df = pd.read_csv(filepath_or_buffer="AmazonWebScraperDataset.csv")
print(df)

                                               Title  Price  Reviews  \
0  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
1  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
2  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
3  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   

         Date  
0  2023-01-18  
1  2023-01-18  
2  2023-01-18  
3  2023-01-18  


In [31]:
# Add one more row to the existing csv without touching the rows already in it

with open("AmazonWebScraperDataset.csv", mode="a+", newline="", encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(data)

In [32]:
# We need to automate this whole process

def check_price():
    """Scrape the product page once and append one row to the csv dataset.

    Downloads the hard-coded Amazon product page, extracts the product title,
    the price and the number of ratings, and appends
    [title, price, reviews, today's date] to "AmazonWebScraperDataset.csv".

    Raises:
        requests.RequestException: if the request fails, times out or comes
            back with an HTTP error status.
        ValueError: if the title, review count or price element is missing
            from the downloaded page.
    """
    URL = 'https://www.amazon.in/gp/product/B07YYCKR11/ref=ox_sc_saved_title_4?smid=A2WK4OB3ROODF0&psc=1'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
        "Accept-Encoding":"gzip, deflate",
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT":"1",
        "Connection":"close",
        "Upgrade-Insecure-Requests":"1",
    }

    # Without a timeout requests can wait forever on a stalled connection,
    # which would silently freeze a loop that calls this once a day.
    page = requests.get(URL, headers = headers, timeout = 30)
    page.raise_for_status()

    # One parse is enough: surrounding whitespace is stripped from each value below
    soup = BeautifulSoup(page.content, "html.parser")

    title_tag = soup.find(id = "productTitle")
    reviews_tag = soup.find(id = "acrCustomerReviewText")
    # NOTE(review): this takes the first "a-offscreen" span on the page - confirm it is the product's own price
    price_tag = soup.find("span", attrs={'class':'a-offscreen'})
    if title_tag is None or reviews_tag is None or price_tag is None:
        raise ValueError(
            "Product title, review count or price not found - "
            "the page layout may have changed or the request was blocked"
        )

    title = title_tag.get_text().strip()
    reviews_text = reviews_tag.get_text().strip()
    price_text = price_tag.get_text().strip()

    # Keep only the number from text such as "732 ratings", "1 rating" or "1,234 ratings"
    reviews_match = re.search(r"\d[\d,]*", reviews_text)
    reviews = reviews_match.group().replace(",", "") if reviews_match else reviews_text

    # Keep only the amount: drops the currency symbol and thousands separators
    price_match = re.search(r"\d[\d,]*(?:\.\d+)?", price_text)
    price = price_match.group().replace(",", "") if price_match else price_text

    today = datetime.date.today()
    data = [title, price, reviews, today]

    # Append mode keeps the rows collected on earlier runs
    with open("AmazonWebScraperDataset.csv", "a+", newline ="", encoding = 'UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

In [None]:
# Poll forever: scrape once, then wait a full day before scraping again
while True:
    check_price()
    time.sleep(24 * 60 * 60)   # 24 hours expressed in seconds

# Each pass appends a new row (title, price, review count, date) to the csv

In [35]:
# Re-read the csv to look at the rows collected so far
df = pd.read_csv(filepath_or_buffer="AmazonWebScraperDataset.csv")
print(df)

                                               Title  Price  Reviews  \
0  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
1  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
2  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   
3  WOW Skin Science Coconut Milk Hair Mask with C...  349.0      732   

         Date  
0  2023-01-18  
1  2023-01-18  
2  2023-01-18  
3  2023-01-18  


In [36]:
# If we want to get an email everytime the price of the product drops below a certain value

def send_mail():
    """Email ourselves an alert that the product price has dropped.

    The account is read from the EMAIL_ADDRESS and EMAIL_PASSWORD environment
    variables so that no credential is stored in the notebook. The message is
    sent from that address to itself through Gmail's SMTP-over-SSL endpoint.

    Raises:
        RuntimeError: if either environment variable is missing or empty.
        smtplib.SMTPException: if the login or the delivery fails.
    """
    # NOTE(review): a password used to be written directly in this function;
    # treat it as compromised and rotate it.
    address = os.environ.get("EMAIL_ADDRESS")
    password = os.environ.get("EMAIL_PASSWORD")
    if not address or not password:
        raise RuntimeError(
            "Set the EMAIL_ADDRESS and EMAIL_PASSWORD environment variables before calling send_mail()"
        )

    subject = "The mask you want is below ₹300! Now is your chance to buy!"
    body = "Pratibha, This is the moment we have been waiting for. Now is your chance to pick up the mask of your dreams. Don't mess it up! Link here: https://www.amazon.in/gp/product/B07YYCKR11/ref=ox_sc_saved_title_4?smid=A2WK4OB3ROODF0&psc=1"

    # EmailMessage encodes the non-ASCII "₹" in the subject correctly; passing a
    # plain str to sendmail() would fail because it is encoded as ASCII.
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = address
    msg["To"] = address   # The alert goes to our own inbox
    msg.set_content(body)

    # The with-block closes the connection even if login or sending fails
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(address, password)
        server.send_message(msg)

In [37]:
def check_price2():
    """Scrape the product page, log a csv row and email an alert on a low price.

    Downloads the hard-coded Amazon product page, appends
    [title, price, reviews, today's date] to "AmazonWebScraperDataset.csv",
    and calls send_mail() when the price is 300 or less.

    Raises:
        requests.RequestException: if the request fails, times out or comes
            back with an HTTP error status.
        ValueError: if the title, review count or price element is missing
            from the page, or if no numeric price can be read from it.
    """
    URL = 'https://www.amazon.in/gp/product/B07YYCKR11/ref=ox_sc_saved_title_4?smid=A2WK4OB3ROODF0&psc=1'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
        "Accept-Encoding":"gzip, deflate",
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT":"1",
        "Connection":"close",
        "Upgrade-Insecure-Requests":"1",
    }

    # Without a timeout requests can wait forever on a stalled connection
    page = requests.get(URL, headers = headers, timeout = 30)
    page.raise_for_status()

    # One parse is enough: surrounding whitespace is stripped from each value below
    soup = BeautifulSoup(page.content, "html.parser")

    title_tag = soup.find(id = "productTitle")
    reviews_tag = soup.find(id = "acrCustomerReviewText")
    # NOTE(review): this takes the first "a-offscreen" span on the page - confirm it is the product's own price
    price_tag = soup.find("span", attrs={'class':'a-offscreen'})
    if title_tag is None or reviews_tag is None or price_tag is None:
        raise ValueError(
            "Product title, review count or price not found - "
            "the page layout may have changed or the request was blocked"
        )

    title = title_tag.get_text().strip()
    reviews_text = reviews_tag.get_text().strip()
    price_text = price_tag.get_text().strip()

    # Keep only the number from text such as "732 ratings", "1 rating" or "1,234 ratings"
    reviews_match = re.search(r"\d[\d,]*", reviews_text)
    reviews = reviews_match.group().replace(",", "") if reviews_match else reviews_text

    # The price has to be numeric here because it is compared with the threshold below
    price_match = re.search(r"\d[\d,]*(?:\.\d+)?", price_text)
    if price_match is None:
        raise ValueError(f"Could not read a price from {price_text!r}")
    price = price_match.group().replace(",", "")

    today = datetime.date.today()
    data = [title, price, reviews, today]

    # Append mode keeps the rows collected on earlier runs
    with open("AmazonWebScraperDataset.csv", "a+", newline ="", encoding = 'UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

    # The scraped price is text; comparing it with 300 directly raises TypeError
    if float(price) <= 300:
        send_mail()