In [29]:
import re
import requests
from bs4 import BeautifulSoup

In [30]:
def is_amazon_url(url):
    # Regular expression to check if the URL is an Amazon product URL
    amazon_pattern = r"(https?://)?(www\.)?amazon\.(com|in|co\.uk|de|ca|fr|co\.jp|it|es|nl|com\.mx|com\.au|com\.br|ae|sg|sa)/.*"
    return bool(re.match(amazon_pattern, url))

In [31]:
def scrape_amazon_reviews_and_category(product_url):
    if not is_amazon_url(url):
        raise ValueError("The provided URL is not an Amazon product URL.")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Referer': 'https://www.google.com/',
        'DNT': '1',  
        'Upgrade-Insecure-Requests': '1',
    }
    
    response = requests.get(product_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract Category
        category_section = soup.select_one('#wayfinding-breadcrumbs_container')  # Modify selector based on actual structure
        if category_section:
            categories = [cat.text.strip() for cat in category_section.select('a')]
        else:
            categories = []
        
        # Extract Reviews
        reviews = []
        for review in soup.select('.review'):
            title = review.select_one('.review-title').text.strip() if review.select_one('.review-title') else None
            content = review.select_one('.review-text').text.strip() if review.select_one('.review-text') else None
            rating = review.select_one('.review-rating').text.strip() if review.select_one('.review-rating') else None
            
            reviews.append({
                "title": title,
                "content": content,
                "rating": rating,
            })
        
        return {
            "categories": categories,
            "reviews": reviews,
        }
    else:
        print("Failed to fetch the page. Status Code:", response.status_code)
        return {}

In [33]:
# Example usage
try:
    url = input("Enter the Amazon URL: ")  # Replace with an actual product URL
    result = scrape_amazon_reviews_and_category(url)

    # Print extracted data
    print("\nCategories:")
    print(" > ".join(result['categories']))
    print("\nReviews:")
    for review in result['reviews']:
        print("Title:", review['title'])
        print("Rating:", review['rating'])
        print("Content:", review['content'])
        print("-" * 80)
except ValueError as e:
    print(e)



Categories:
Clothing, Shoes & Jewelry > Men > Clothing > Shirts > Button-Down Shirts

Reviews:
Title: 5.0 out of 5 stars
Comfortable and Versatile
Rating: 5.0 out of 5 stars
Content: It is super comfy. It can be casual comfort or it can be worn as a semi-formal top. It looks incredible. It fits great. It is light and breathable. Great material. Soft and gentle to the skin. Great quality, I will be buying more in different colors. Worth the money.
Read more
--------------------------------------------------------------------------------
Title: 5.0 out of 5 stars
Great fit and look!
Rating: 5.0 out of 5 stars
Content: The shirt is very nice quality, comfortable, and true to size. Looks great!
Read more
--------------------------------------------------------------------------------
Title: 4.0 out of 5 stars
Purchased for my son
Rating: 4.0 out of 5 stars
Content: He likes the pointelle knit to add interest to the shirt (not just flat or rib knit). The fit runs slightly large but he like