# Introduction and Setup

This project is an Amazon web scraper that retrieves information about items available for sale on Amazon and converts it into a structured dataset. It uses the requests, BeautifulSoup, and pandas libraries for web scraping and data manipulation, and seaborn and matplotlib for data visualization.

In [261]:
import re

import pandas
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [262]:
# HTTP headers sent with the scraping requests. Left empty by default:
# open https://httpbin.org/get in your browser, copy the User-Agent it
# reports, and add it here as a "User-Agent" entry.
custom_headers = {}

# Breakdown of scraping amazon with BeautifulSoup

In [None]:
# Step-by-step breakdown of the scrape that the next section condenses into a
# single function. A sample product URL is fetched with the custom headers.


# Example URL for an Amazon product
url1 = "https://www.amazon.ca/Maple-Treat-Canada-Medium-1000ml/dp/B00KL86NUC"

# Request the page with the custom headers. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
page = requests.get(url1, headers=custom_headers, timeout=10)

# Display the response; it should read <Response [200]>
page

In [None]:
# Parse the response body, then parse its prettified markup a second time;
# the lookups below run against the second tree.
soup1 = BeautifulSoup(page.content, "html.parser")
pretty_markup = soup1.prettify()
soup2 = BeautifulSoup(pretty_markup, "html.parser")

# Product title: the element whose id is 'productTitle'
title_tag = soup2.find(id='productTitle')
title = title_tag.get_text()

# Price: the first element carrying the 'a-offscreen' class
price_tag = soup2.find(class_='a-offscreen')
price = price_tag.get_text()

print(title)
print(price)

In [None]:
# Trim the surrounding whitespace from both scraped strings.
# The price keeps its currency symbol here.
price = price.strip()
title = title.strip()

print(title, price, sep="\n")

In [None]:
# Brand: text of the first element carrying the 'a-span9' class
brand = soup2.find(class_='a-span9').get_text().strip()
print(brand)

# Package volume: value cell inside the 'po-unit_count' row
units_all = soup2.find(class_='a-spacing-small po-unit_count')
units = units_all.find(class_='a-size-base po-break-word').get_text().strip()
print(units)

# Number of items: value cell inside the 'po-number_of_items' row
quant_all = soup2.find(class_='a-spacing-small po-number_of_items')
quant = quant_all.find(class_='a-size-base po-break-word').get_text().strip()
print(quant)

# Unit volume: value cell inside the 'po-item_volume' row
item_all = soup2.find(class_='a-spacing-small po-item_volume')
items = item_all.find(class_='a-size-base po-break-word').get_text().strip()
print(items)

In [None]:
import pandas as pd

# Pair each column name with the value scraped above and wrap every value in
# a one-element list so the DataFrame gets exactly one row.
column_names = ('Title', 'Price', 'Brand', 'Package Total', 'Number of Items', 'Unit Volume')
column_values = (title, price, brand, units, quant, items)
data = {name: [value] for name, value in zip(column_names, column_values)}
df = pd.DataFrame(data)

# Show the resulting single-row table
df.head()

# Amazon Scraping Condensed Function

In [259]:
_NOT_FOUND = "N/A"


def _text_or_na(element):
    """Return the stripped text of a parsed element, or "N/A" if it is None."""
    return element.get_text().strip() if element is not None else _NOT_FOUND


def _overview_value(soup, row_class):
    """Return the value-cell text of the row with *row_class*, or "N/A"."""
    row = soup.find(class_=row_class)
    value_cell = row.find(class_='a-size-base po-break-word') if row is not None else None
    return _text_or_na(value_cell)


def _strip_currency_prefix(price_text):
    """Drop everything before the first digit of *price_text* ("N/A" if none)."""
    match = re.search(r"\d.*", price_text, flags=re.DOTALL)
    return match.group(0) if match else _NOT_FOUND


def scrape_amazon_product(url, timeout=10):
    """Scrape one Amazon product page into a single-row DataFrame.

    The page is requested with the module-level ``custom_headers`` and parsed
    with BeautifulSoup. Any field that cannot be located is reported as the
    string "N/A".

    Args:
        url: Address of the product page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row and the columns 'Title', 'Price', 'Brand',
        'Package Total', 'Number of Items', 'Unit Volume' and 'URL'. If the
        request or the parsing raises, the error is printed and an empty
        DataFrame is returned instead.
    """
    try:
        # Without a timeout a single unresponsive page would stall the batch.
        page = requests.get(url, headers=custom_headers, timeout=timeout)
        # Treat HTTP error responses as failures rather than parsing an error
        # page into a row made up entirely of "N/A".
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        title = _text_or_na(soup.find(id='productTitle'))

        # The price text starts with a currency marker (e.g. "$13.99").
        # Cutting at the first digit removes it whatever its length, instead
        # of assuming it is exactly one character long.
        price_element = soup.find(class_='a-offscreen')
        if price_element is not None:
            price = _strip_currency_prefix(price_element.get_text().strip())
        else:
            price = _NOT_FOUND

        brand = _text_or_na(soup.find(class_='a-span9'))

        # Package total, item count and unit volume share one row/cell layout.
        units = _overview_value(soup, 'a-spacing-small po-unit_count')
        quant = _overview_value(soup, 'a-spacing-small po-number_of_items')
        items = _overview_value(soup, 'a-spacing-small po-item_volume')

        # One-element lists give the DataFrame exactly one row.
        data = {
            'Title': [title],
            'Price': [price],
            'Brand': [brand],
            'Package Total': [units],
            'Number of Items': [quant],
            'Unit Volume': [items],
            'URL': [url],
        }
        return pd.DataFrame(data)

    except Exception as e:
        # Best-effort boundary: one bad URL must not abort a batch run.
        print(f"Error processing URL {url}: {e}")
        return pd.DataFrame()

# Try the function on the sample product page
example_url = 'https://www.amazon.ca/Maple-Treat-Canada-Medium-1000ml/dp/B00KL86NUC'
test = scrape_amazon_product(example_url)

# Show the scraped row
test.head()

Unnamed: 0,Title,Price,Brand,Package Total,Number of Items,Unit Volume,URL
0,"100% Pure Maple Syrup, Dark, Robust Taste, 1 L...",13.99,The Maple Treat,1000 milliliter,1,1000 Milliliters,https://www.amazon.ca/Maple-Treat-Canada-Mediu...


# Creating an Amazon product list from your Google Search

In [None]:
from googlesearch import search

# Google query used to collect product pages; "inurl:dp" restricts the
# results to product URLs. Changing this query is the simplest way to point
# the notebook at a different product.
query = "maple syrup site:amazon.ca inurl:dp"

# Run the search and collect the result URLs into a list
result_limit = 125
amazon_links = list(search(query, num=result_limit, stop=result_limit, pause=2))

# Print the links as a numbered list
for rank, link in enumerate(amazon_links, start=1):
    print(f"{rank}. {link}")


In [None]:
# Explanation: Since many links may not work, we need to filter and retain only the valid ones.

import requests
from googlesearch import search

# Function to filter valid links
def filter_valid_links(links, headers=None, timeout=5):
    """Return the links that answer a GET request with status code 200.

    Args:
        links: Iterable of URLs to check.
        headers: HTTP headers to send with each request. When omitted, the
            module-level ``custom_headers`` are used so that validation sends
            the same headers as ``scrape_amazon_product``.
        timeout: Seconds to wait for each response.

    Returns:
        A list of the accepted URLs, in their original order. Links that
        raise a ``requests.RequestException`` or return any other status
        code are skipped.
    """
    request_headers = custom_headers if headers is None else headers

    valid_links = []
    for link in links:
        try:
            response = requests.get(link, headers=request_headers, timeout=timeout)
        except requests.RequestException:
            # Unreachable or timed-out links are expected here; skip them.
            continue

        if response.status_code == 200:
            valid_links.append(link)

    return valid_links


# Keep only the search results that pass the validity check
valid_links = filter_valid_links(amazon_links)


# Put the surviving links into a one-column DataFrame and display it
link_columns = {"Valid Links": valid_links}
validlinks_df = pd.DataFrame(link_columns)
validlinks_df

In [None]:
# Display the DataFrame of valid links
validlinks_df

# Create final data set and cleaning it

In [230]:
# Columns produced by scrape_amazon_product; used to build an empty result
# with the expected layout when nothing could be scraped.
result_columns = ['Title', 'Price', 'Brand', 'Package Total',
                  'Number of Items', 'Unit Volume', 'URL']

# One single-row DataFrame per successfully scraped product
scraped_data_list = []

# Scrape every valid link
for url in validlinks_df['Valid Links']:
    scraped_data = scrape_amazon_product(url)

    # scrape_amazon_product reports failure with an empty DataFrame, so test
    # for emptiness; the None check alone would accept every result.
    if scraped_data is not None and not scraped_data.empty:
        scraped_data_list.append(scraped_data)

# Combine the rows into one DataFrame. pd.concat raises on an empty list, so
# fall back to an empty frame with the same columns.
if scraped_data_list:
    result_df = pd.concat(scraped_data_list, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=result_columns)
result_df

In [233]:
import numpy as np

# Millilitres per supported volume unit, keyed by lower-case singular name
ml_per_unit = {
    'milliliter': 1.0,
    'millilitre': 1.0,
    'liter': 1000.0,
    'litre': 1000.0,
    'gallon': 3785.41,
}

# Replace the 'N/A' placeholder with NaN in the entire DataFrame
result_df.replace('N/A', np.nan, inplace=True)

# Drop rows with missing values in 'Unit Volume', 'Price', and 'Number of Items'
result_df.dropna(subset=['Unit Volume', 'Price', 'Number of Items'], inplace=True)
result_df.reset_index(drop=True, inplace=True)

# Split 'Unit Volume' (e.g. "1.5 Liters") into a number and a unit word.
# Converting through a factor table keeps decimal amounts correct; appending
# "000" to the text would turn "1.5 Liters" into 1.5 ml.
volume_parts = result_df['Unit Volume'].astype(str).str.extract(
    r'^\s*(\d+(?:\.\d+)?)\s*([A-Za-z]+)\s*$'
)
unit_amount = pd.to_numeric(volume_parts[0], errors='coerce')
# Lower-casing and stripping the plural "s" lets "Liter", "liters" and
# "Liters" all map to the same factor; unknown units become NaN.
unit_factor = volume_parts[1].str.lower().str.rstrip('s').map(ml_per_unit)
item_count = pd.to_numeric(result_df['Number of Items'], errors='coerce')

# Total volume in millilitres = unit volume * number of items * ml per unit
result_df['volume_ml'] = unit_amount * item_count * unit_factor

# Remove thousands separators (a comma followed by exactly three digits) so
# prices such as "1,299.99" parse; anything still non-numeric becomes NaN.
price_value = pd.to_numeric(
    result_df['Price'].astype(str).str.replace(r',(?=\d{3}(?:\D|$))', '', regex=True),
    errors='coerce',
)

# Price per 100 ml
result_df['price_100ml'] = price_value / result_df['volume_ml'] * 100

# Rows whose volume or price could not be interpreted are dropped instead of
# aborting the whole cell on a single unparseable product.
result_df.dropna(subset=['volume_ml', 'price_100ml'], inplace=True)
result_df.reset_index(drop=True, inplace=True)


In [None]:
# Display the cleaned and processed Amazon product list
result_df

# Graphing it

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create the figure and axes for the plot
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of price per 100 ml with a KDE curve overlaid. No `palette` is
# passed: seaborn only applies a palette when a `hue` variable is set and
# otherwise just emits a warning.
sns.histplot(result_df['price_100ml'], bins=20, kde=True, edgecolor='black', ax=ax)

# Axis labels and title
ax.set(
    xlabel='Price per 100ml',
    ylabel='Frequency',
    title='Distribution of Price per 100ml',
)

# Display the plot
plt.show()
