# Amazon Web Scraper Project

As a Data Analyst, I am tasked with building an Amazon web scraper in Python that fetches the HTML of a product page, extracts product datapoints from it, and stores the data in a CSV file and a DataFrame.

Run cells 1-4 first. Then try cells 5 and 6 individually to see how the script scrapes data from a single URL versus multiple URLs.

You can set the 'seconds' argument to a positive number to have the scraper re-scrape the URL(s) at that interval (in seconds) until the kernel is interrupted.

Review the results by opening the CSV or viewing the DataFrame.

#### Import Modules

In [1]:
from bs4 import BeautifulSoup
import requests
import datetime
import csv
import pandas as pd
import os
import time

#### Hard-Code Variables

In [2]:
# Directory the CSV lives in: the current working directory, ending in a path
# separator so a file name can be appended directly. os.path.join(..., "")
# adds the platform's own separator instead of a hard-coded Windows backslash.
directory_path = os.path.join(os.getcwd(), "")
csv_name = "AmazonProducts.csv"

# Product page(s) scraped when no custom URLs are supplied.
default_url = ["https://www.amazon.com/Funny-Data-Systems-Business-Analyst/dp/B07FNW9FGJ/"]

# Retrieved from "https://httpbin.org/get" to set user-agent
page_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"}

# Column names written as the CSV header row.
table_headers = ['Title','Price','Date','Time']

#### Write Amazon Scraper Class to Encapsulate Web Scraping Methods

In [3]:
class AmazonScraper:
    """Scrape product pages and log title/price snapshots to a CSV file.

    Each scrape fetches one URL, parses the HTML, extracts the product title
    and price, stamps the row with the current date and time, appends it to
    the CSV, and reloads the CSV into a pandas DataFrame.
    """

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot block the scraping loop forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, directory_path, csv_name, default_url, page_headers, table_headers):
        """Store the scraper configuration.

        Args:
            directory_path: Directory that holds the CSV file.
            csv_name: File name of the CSV inside ``directory_path``.
            default_url: List of URLs scraped when no custom URLs are given.
            page_headers: HTTP headers sent with every page request.
            table_headers: Column names written as the CSV header row.
        """
        self.directory_path = directory_path
        self.csv_name = csv_name
        # Remember the constructor's defaults so AutoPageScraper can fall
        # back to them without reaching for a module-level global.
        self.default_url_list = default_url
        self.url_list = default_url
        # An empty default list leaves no current URL instead of raising.
        self.current_url = default_url[0] if default_url else None
        self.page_headers = page_headers
        self.table_headers = table_headers
        self.table_data = None
        self.page_response = None
        self.soup = None
        self.df = None

    def _csv_path(self):
        """Return the CSV location used by every read and write method."""
        return os.path.join(self.directory_path, self.csv_name)

    @staticmethod
    def _clean_text(element):
        """Return the stripped text of a soup element, or None if it is missing."""
        if element is None:
            return None
        return element.get_text().strip()

    # Run an HTTP Request to retrieve web page HTML
    def HTTPRequest(self):
        """Fetch ``self.current_url`` and return the HTTP status code.

        The configured ``page_headers`` are sent with the request so the
        User-Agent supplied to the constructor is actually applied.
        """
        self.page_response = requests.get(
            self.current_url,
            headers=self.page_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return self.page_response.status_code

    # Make soup with the HTML received from the HTTP request
    def MakeSoup(self):
        """Parse the last response body into a BeautifulSoup tree and return it."""
        self.soup = BeautifulSoup(self.page_response.content, "html.parser")
        return self.soup

    # Retrieve specific datapoints from the soup, clean, & store in table data list
    def RetrieveData(self):
        """Extract ``[title, price, date, time]`` from the parsed page.

        A title or price element that is absent from the page is recorded as
        ``None`` (written to the CSV as an empty cell) rather than aborting
        the whole scraping run.

        Returns:
            The list stored in ``self.table_data``.
        """
        title = self._clean_text(self.soup.find(id='productTitle'))

        whole = self._clean_text(self.soup.find(class_="a-price-whole"))
        fraction = self._clean_text(self.soup.find(class_="a-price-fraction"))
        if whole is None or fraction is None:
            price = None
        else:
            # Drop any '.' from the whole part so the assembled price
            # contains exactly one decimal point.
            whole = whole.replace('.', '').strip()
            price = f"{whole}.{fraction}"

        # Read the clock once so date and time describe the same instant
        # (and the `time` module is not shadowed by a local name).
        now = datetime.datetime.now()

        self.table_data = [title, price, now.date(), now.time()]

        return self.table_data

    # Create a CSV using the table headers if a CSV with the stored file name does not exist
    def CreateCSV(self):
        """Create the CSV with its header row.

        Returns:
            True if the file was created, False if it already existed.
        """
        try:
            # Mode 'x' fails if the file exists, so an existing CSV is never
            # truncated even if it appears between a check and the open.
            with open(self._csv_path(), 'x', newline='', encoding='UTF8') as f:
                csv.writer(f).writerow(self.table_headers)
        except FileExistsError:
            return False
        return True

    # Drop a CSV that has the stored file name if it does exist
    def DropCSV(self):
        """Delete the CSV.

        Returns:
            True if a file was removed, False if there was none to remove.
        """
        try:
            os.remove(self._csv_path())
        except FileNotFoundError:
            return False
        return True

    # Append table data to a CSV file (create CSV if non-existent)
    def AddDataToCSV(self):
        """Append ``self.table_data`` as one row, creating the CSV if needed."""
        # No-op when the file already exists.
        self.CreateCSV()

        with open(self._csv_path(), 'a', newline='', encoding='UTF8') as f:
            csv.writer(f).writerow(self.table_data)
        return True

    # Build a DataFrame with the specified CSV in the directory path
    def BuildDataFrame(self):
        """Load the CSV into ``self.df`` and return the DataFrame."""
        self.df = pd.read_csv(self._csv_path())
        return self.df

    # Run all class methods with customizations for scraping & csv/df processing
    def AutoPageScraper(self, custom_urls=False, drop_csv=False, seconds=-1):
        """Scrape every URL, log the results, and return the DataFrame.

        Args:
            custom_urls: List of URLs to scrape. Any falsy value selects the
                default URL list given to the constructor.
            drop_csv: When true, delete the existing CSV before scraping.
            seconds: When positive, sleep this many seconds after each pass
                over the URLs and repeat indefinitely; otherwise make a
                single pass.

        Returns:
            The DataFrame built from the CSV after the last scraped URL.
        """
        self.url_list = custom_urls if custom_urls else self.default_url_list
        if drop_csv:
            self.DropCSV()

        while True:
            for url in self.url_list:
                self.current_url = url
                self.HTTPRequest()
                self.MakeSoup()
                self.RetrieveData()
                self.AddDataToCSV()
                # Refresh after every row so self.df is current even if the
                # loop is interrupted part-way through a pass.
                self.BuildDataFrame()
            if seconds <= 0:
                break
            time.sleep(seconds)

        return self.df

#### Initialize AmazonScraper Object

In [4]:
# Build the scraper from the hard-coded settings defined in cell 2.
amazon_scraper = AmazonScraper(
    directory_path=directory_path,
    csv_name=csv_name,
    default_url=default_url,
    page_headers=page_headers,
    table_headers=table_headers,
)

#### Run AutoPageScraper Function with Default URL

In [5]:
# Scrape the default URL and display the resulting DataFrame.
amazon_scraper.AutoPageScraper(
    custom_urls=False,  # falsy: use the default URL list
    drop_csv=True,      # delete any existing CSV before scraping
    seconds=-1,         # not positive: make a single pass, no repeat
)

Unnamed: 0,Title,Price,Date,Time
0,Funny Got Data MIS Data Systems Business Analy...,16.99,2024-05-04,00:27:20.121257


#### Run AutoPageScraper Function with Custom URLs

In [6]:
# Product pages to scrape in place of the default URL list.
custom_urls = [
    'https://www.amazon.com/Converse-Unisex-Chuck-Taylor-Shield/dp/B01GDH9DII/',
    'https://www.amazon.com/Nintendo-SwitchTM-Neon-Blue-Joy%E2%80%91ConTM-Switch/dp/B0BFJWCYTL/',
    'https://www.amazon.com/Super-Mario-Odyssey-Nintendo-Switch/dp/B01MY7GHKJ/',
]
# Single pass over the custom URLs, starting from a fresh CSV.
amazon_scraper.AutoPageScraper(custom_urls=custom_urls, drop_csv=True, seconds=-1)

Unnamed: 0,Title,Price,Date,Time
0,Converse Men's Chuck Taylor All Star Canvas Hi...,83.36,2024-05-04,00:27:21.484753
1,Nintendo Switch™ with Neon Blue and Neon Red J...,199.99,2024-05-04,00:27:22.937506
2,Super Mario Odyssey - US Version,38.66,2024-05-04,00:27:24.296561
