# Dataset Creation - Amazon Web Scraping
- **Overview**: 
    - This project focuses on creating a dataset that tracks how the price of a single product fluctuates over time.
    - It starts with **Connecting to Amazon and Scraping the HTML**, storing the extracted values in variables and cleaning them.
    - It also contains a script that automates the process: it runs once every day and stores the Name, Price and Date (on which the script ran) in a CSV file that can be opened in Excel.
- **Functions and Libraries used**
    - BeautifulSoup
    - requests
    - time
    - datetime
    - csv
    - pandas

# Output
- A clean dataset with the product and its updated price on a per-day basis

In [80]:
from bs4 import BeautifulSoup
import requests
import time
import datetime
import csv
import pandas as pd

In [70]:
# Connecting to Website

URL = 'https://www.amazon.ca/Apple-27-inch-10th-Generation-Intel-Core-Processor/dp/B08F8HK6KF/ref=sr_1_6?crid=1RM2LE7P12X31&keywords=imac&qid=1672001818&sprefix=%2Caps%2C464&sr=8-6'

# Get headers from http://httpbin.org/get
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Without a timeout requests can wait forever on an unresponsive server.
page = requests.get(URL, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the product page.
page.raise_for_status()

soup1 = BeautifulSoup(page.content, "html.parser")

# Re-parse the prettified markup; the printed output shows the text pieces on separate lines,
# which the later cleaning step relies on when it splits the price on whitespace.
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

# Element ids were found by inspecting the product page in the browser.
title = soup2.find(id='productTitle').get_text()
# The price text comes back as the full price followed by its separate parts (symbol, whole, decimal).
price = soup2.find(id='corePriceDisplay_desktop_feature_div').get_text()

print(title)
print(price)





           2020 Apple iMac (27-Inch, 3.3GHz 6-Core 10th-Generation Intel Core I5 Processor, 8GB RAM, 512GB SSD) - English
          





            $2,449.99
           


             $
            

             2,449
             
              .
             


             99
            














In [72]:
# Cleaning the data - since we are making a dataset

# Drop the padding whitespace that surrounds the scraped title.
title = title.strip()

# The scraped price text is a run of whitespace-separated fragments; the first
# fragment is the complete price (e.g. "$2,449.99"). Strip the dollar sign so
# the currency formatting can be applied later in the spreadsheet.
price = price.split()[0].replace('$', '')

print(title)
print(price)

# Record the date on which the data was scraped.
today = datetime.date.today()

print(today)

2020 Apple iMac (27-Inch, 3.3GHz 6-Core 10th-Generation Intel Core I5 Processor, 8GB RAM, 512GB SSD) - English
2,449.99
2022-12-25


In [58]:
# Writing the scraped values into a CSV file to create the dataset.
# Mode 'w' starts the file from scratch, so run this cell only once.

header = ['Title', 'Price', 'Date']
data = [title, price, today]

with open('AmazonWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as csv_file:
    # Header row first, then the first data row.
    csv.writer(csv_file).writerows([header, data])


In [68]:
# Read the dataset back from the same relative path the writer cells use, so the
# notebook does not depend on one user's absolute Windows directory.
df = pd.read_csv("AmazonWebScraperDataset.csv")
print(df)

                                               Title     Price        Date
0  2020 Apple iMac (27-Inch, 3.3GHz 6-Core 10th-G...  2,449.99  2022-12-25


In [None]:
# Appending the data to the CSV (adds one more row; existing rows are kept)

with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(data)

# Automating process
- To automate the process, we wrap the scraping steps in a function and call it periodically from a timed loop.

In [None]:
def check_price():
    """Scrape the product's current title and price and append them to the CSV dataset.

    Fetches the hard-coded Amazon product page, extracts the product title and
    the displayed price, and appends a ``[title, price, date]`` row to
    ``AmazonWebScraperDataset.csv`` in the current working directory. The
    header row is written first when the file is missing or empty, so the
    dataset stays readable even if the file was not created beforehand.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        RuntimeError: if the title or price element cannot be found in the
            returned page.
    """
    import os  # local import: only needed for the header check below

    URL = 'https://www.amazon.ca/Apple-27-inch-10th-Generation-Intel-Core-Processor/dp/B08F8HK6KF/ref=sr_1_6?crid=1RM2LE7P12X31&keywords=imac&qid=1672001818&sprefix=%2Caps%2C464&sr=8-6'
    csv_path = 'AmazonWebScraperDataset.csv'

    # Get headers from http://httpbin.org/get
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # A timeout keeps the daily loop from hanging forever on a dead connection.
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()

    soup1 = BeautifulSoup(page.content, "html.parser")
    # Re-parse the prettified markup so the text pieces end up separated by
    # whitespace; the price clean-up below splits on that whitespace.
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

    # Element ids were found by inspecting the product page in the browser.
    title_tag = soup2.find(id='productTitle')
    price_tag = soup2.find(id='corePriceDisplay_desktop_feature_div')
    if title_tag is None or price_tag is None:
        # find() returns None when the element is absent; report that clearly
        # instead of failing with an AttributeError on None.
        raise RuntimeError(
            "Could not find the product title or price in the page returned by Amazon"
        )

    # Cleaning the data - since we are making a dataset
    title = title_tag.get_text().strip()

    # The price text is the full price followed by its separate parts; keep
    # the first fragment and drop the dollar sign (added back in the spreadsheet).
    price_parts = price_tag.get_text().split()
    if not price_parts:
        raise RuntimeError("The price element was found but contained no text")
    price = price_parts[0].replace('$', '')

    # Date on which the data was scraped.
    today = datetime.date.today()

    header = ['Title', 'Price', 'Date']
    data = [title, price, today]

    # Write the header only when starting a new (or empty) file.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # Append the new row to the CSV.
    with open(csv_path, 'a', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(data)

In [None]:
# Adding a timer - to build up the dataset by running the function at fixed intervals.
# Make sure to re-run this cell after restarting your device or losing the connection.
# Only run the cell that creates the CSV header once.

SECONDS_PER_DAY = 86400  # time.sleep() takes seconds; this is 1 day

while True:
    check_price()
    time.sleep(SECONDS_PER_DAY)

In [78]:
# Read the dataset back from the same relative path the writer code uses, so the
# notebook does not depend on one user's absolute Windows directory.
df = pd.read_csv("AmazonWebScraperDataset.csv")
print(df)

                                               Title     Price        Date
0  2020 Apple iMac (27-Inch, 3.3GHz 6-Core 10th-G...  2,449.99  2022-12-25
1  2020 Apple iMac (27-Inch, 3.3GHz 6-Core 10th-G...  2,449.99  2022-12-25
