In [9]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import logging

# Logging configuration
logging.basicConfig(level=logging.INFO)

# Headers for the HTTP request
headers = {
    "authority": "www.amazon.com",
    "pragma": "no-cache",
    "cache-control": "no-cache",
    "dnt": "1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.77 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "sec-fetch-site": "none",
    "sec-fetch-mode": "navigate",
    "sec-fetch-dest": "document",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
}

# URL of the page to scrape
URL = "https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

# Fetch the HTML content of the page
logging.info(f"Fetching URL: {URL}")
resp = requests.get(URL, headers=headers)
page_html = resp.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(page_html, "lxml")

# Find all the review elements
reviews = soup.find_all("div", {"class": "a-section celwidget"})

print(reviews)


INFO:root:Fetching URL: https://www.amazon.com/Heat-Storm-HS-1500-PHX-WIFI-Infrared-Heater/product-reviews/B07JXRWJ8D/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews


[<div class="a-section celwidget" id="customer_review-R37TE6Q4IOYWKP"><div class="a-row a-spacing-mini" data-hook="genome-widget"><a class="a-profile" data-a-size="small" href="/gp/profile/amzn1.account.AGGBODWMFF4E7L56GAT4U2UYV4AQ/ref=cm_cr_arp_d_gw_btm?ie=UTF8"><div aria-hidden="true" class="a-profile-avatar-wrapper"><div class="a-profile-avatar"><img class="a-lazy-loaded" data-src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/default._CR0,0,1024,1024_SX48_.png" src="https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/grey-pixel.gif"/><noscript><img src="https://images-na.ssl-images-amazon.com/images/S/amazon-avatars-global/default._CR0,0,1024,1024_SX48_.png"/></noscript></div></div><div class="a-profile-content"><span class="a-profile-name">Samuel Armstrong</span></div></a></div><div class="a-row"><a class="a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold" data-hook="review-title" href="/gp/customer-review

In [12]:
# List to hold the extracted review data
all_results = []

# Iterate through each review element and extract the required information
for review in reviews:
    review_text = review.find("span", {"class": "a-size-base review-text review-text-content"}).get_text().strip()
    review_date = review.find("span", {"class": "review-date"}).get_text()
    #review_header = review_header = review.find("a", {"data-hook": "review-title"}).find("span").get_text().strip()
    stars = review.find("span", {"class": "a-icon-alt"}).get_text().strip()
    product = review.find("a", {"class": "a-size-mini a-link-normal a-color-secondary"}).get_text().strip()

    # Append the extracted data to the results list
    all_results.append({
        "review_text": review_text,
        "review_date": review_date,
        #"review_title": review_header,
        "review_stars": stars,
        "review_flavor": product,
    })

# Convert the results to a DataFrame
out = pd.DataFrame.from_records(all_results)

# Log the shape of the DataFrame
logging.info(f"{out.shape[0]} reviews were extracted.")

# Save the DataFrame to a CSV file
save_name = f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.csv"
logging.info(f"Saving to {save_name}")
out.to_csv(save_name, index=False)
logging.info('Done yayy')


INFO:root:10 reviews were extracted.


INFO:root:Saving to 2024-05-30-07-51-29.csv
INFO:root:Done yayy
