In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import re

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Initialize lists to store scraped data
ASINs = []
Product_Names = []
Review_Titles = []
Review_Contents = []
Review_Stars = []
Review_Dates = []
Reviewer_Names = []
Verified_Purchase = []
Helpful_Votes = []
Review_URLs = []

# Load product URLs from CSV file
df = pd.read_csv("amazon_tv_urls.csv")

# Function to extract reviews from a single page
def extract_reviews(asin, product_name):
    try:
        # Scrape the review titles
        titles = driver.find_elements(By.XPATH, "//a[@data-hook='review-title']")
        Review_Titles.extend([title.text for title in titles])

        # Scrape the review content
        contents = driver.find_elements(By.XPATH, "//span[@data-hook='review-body']")
        Review_Contents.extend([content.text for content in contents])

        # Scrape the star ratings
        stars = driver.find_elements(By.XPATH, "//i[@data-hook='review-star-rating']//span")
        Review_Stars.extend([star.get_attribute('innerHTML').split()[0] for star in stars])

        # Scrape the review date
        dates = driver.find_elements(By.XPATH, "//span[@data-hook='review-date']")
        Review_Dates.extend([date.text for date in dates])

        # Scrape the reviewer names
        names = driver.find_elements(By.XPATH, "//span[@class='a-profile-name']")
        Reviewer_Names.extend([name.text for name in names])

        # Scrape the verified purchase status
        verified = driver.find_elements(By.XPATH, "//span[@data-hook='avp-badge']")
        Verified_Purchase.extend([v.text if v else 'Not Verified' for v in verified])

        # Scrape helpful votes
        helpful = driver.find_elements(By.XPATH, "//span[@data-hook='helpful-vote-statement']")
        Helpful_Votes.extend([help.text if help else '0' for help in helpful])

        # Append the current review URL to Review_URLs list
        Review_URLs.extend([driver.current_url] * len(titles))

        # Append the current ASIN and product name to ASINs and Product_Names list
        ASINs.extend([asin] * len(titles))
        Product_Names.extend([product_name] * len(titles))

    except Exception as e:
        print(f"Error while scraping reviews: {e}")

# Function to extract product name from URL
def extract_product_name(url):
    match = re.search(r"/([a-zA-Z0-9\-]+)-dp/[A-Z0-9]+", url)
    if match:
        return match.group(1).replace("-", " ").title()
    else:
        return "N/A"

# Iterate through all URLs and scrape review data for each product
for index, url in enumerate(df['Urls'].tolist()):
    print(f"Scraping URL {index + 1}/{len(df['Urls'])}: {url}")

    # Extract product name from the URL
    product_name = extract_product_name(url)

    try:
        # Extract ASIN from the URL
        asin = url.split("/dp/")[1].split("/")[0]
        reviews_url = f"https://www.amazon.com/product-reviews/{asin}/ref=cm_cr_arp_d_paging_btm_next_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1"
        driver.get(reviews_url)
        time.sleep(2)  # Allow the page to load

        # Extract reviews from the first page and subsequent pages
        while True:
            extract_reviews(asin, product_name)
            
            # Check if the "Next" button is present for pagination
            try:
                next_button = driver.find_element(By.XPATH, "//li[@class='a-last']/a")
                if "disabled" in next_button.get_attribute("class"):
                    break  # If the "Next" button is disabled, exit the loop
                next_button.click()  # Click the "Next" button
                time.sleep(2)  # Wait for the next page to load
            except Exception:
                # If "Next" button is not found or clickable, break out of the loop
                break

    except IndexError:
        print(f"Error extracting ASIN from URL: {url}")
        continue

# Create a DataFrame ensuring all lists are the same length
max_length = max(len(ASINs), len(Product_Names), len(Review_Titles), len(Review_Contents),
                 len(Review_Stars), len(Review_Dates), len(Reviewer_Names), len(Verified_Purchase), 
                 len(Helpful_Votes), len(Review_URLs))

# Trim lists to max length or fill with None
ASINs += [None] * (max_length - len(ASINs))
Product_Names += [None] * (max_length - len(Product_Names))
Review_Titles += [None] * (max_length - len(Review_Titles))
Review_Contents += [None] * (max_length - len(Review_Contents))
Review_Stars += [None] * (max_length - len(Review_Stars))
Review_Dates += [None] * (max_length - len(Review_Dates))
Reviewer_Names += [None] * (max_length - len(Reviewer_Names))
Verified_Purchase += [None] * (max_length - len(Verified_Purchase))
Helpful_Votes += [None] * (max_length - len(Helpful_Votes))
Review_URLs += [None] * (max_length - len(Review_URLs))

# Compile the scraped data into a dictionary
reviews_dict = {
    "ASIN": ASINs,
    "Product_Name": Product_Names,
    "Review_Title": Review_Titles,
    "Review_Content": Review_Contents,
    "Review_Stars": Review_Stars,
    "Review_Date": Review_Dates,
    "Reviewer_Name": Reviewer_Names,
    "Verified_Purchase": Verified_Purchase,
    "Helpful_Votes": Helpful_Votes,
    "Review_URL": Review_URLs
}

# Convert the dictionary into a DataFrame and save it to a CSV file
df_reviews = pd.DataFrame(reviews_dict)
df_reviews.to_csv("amazon_tv_reviews.csv", index=False)

# Print the first few rows of the scraped data
print(df_reviews.head())

# Close the WebDriver
driver.quit()

Scraping URL 1/314: https://www.amazon.com/VIZIO-40-inch-1080p-Virtual-Built/dp/B0CXG3HMX1/ref=sr_1_1?crid=34BZT2YZDL89K&dib=eyJ2IjoiMSJ9.byxHhZj_hF9e7h0GLUC8XZeTmrkaohjn2E_48KkvufqLpBM0iBS-2SbsZKTI2jCHgO5x7M0L-x1Vzd2BENrkWJ37Odj1eoC3GSfjRKrnb9w1LzQNJFFxeMjkE56yfLJx7RJcuPawLyevxhgT24GWIG1L-RLJUa98we9Vgy4DOidHO1kv0P2P4AhjMytU0tX0HQGsyFrzVuwpqFKEs_fmLkeiVJzMw0wBlE1DUPPWxcg.XSq2k2yfLYkXMLXODcf8dSl-eps3cpTiUpKDBf99iK8&dib_tag=se&keywords=tv&qid=1727402869&sprefix=tv%2Caps%2C118&sr=8-1
Scraping URL 2/314: https://www.amazon.com/INSIGNIA-All-New-50-inch-Class-NS-50F301NA24/dp/B0BTTVRWPR/ref=sr_1_2?crid=34BZT2YZDL89K&dib=eyJ2IjoiMSJ9.byxHhZj_hF9e7h0GLUC8XZeTmrkaohjn2E_48KkvufqLpBM0iBS-2SbsZKTI2jCHgO5x7M0L-x1Vzd2BENrkWJ37Odj1eoC3GSfjRKrnb9w1LzQNJFFxeMjkE56yfLJx7RJcuPawLyevxhgT24GWIG1L-RLJUa98we9Vgy4DOidHO1kv0P2P4AhjMytU0tX0HQGsyFrzVuwpqFKEs_fmLkeiVJzMw0wBlE1DUPPWxcg.XSq2k2yfLYkXMLXODcf8dSl-eps3cpTiUpKDBf99iK8&dib_tag=se&keywords=tv&qid=1727402869&sprefix=tv%2Caps%2C118&sr=8-2
Scraping URL 3/31

In [2]:
df_reviews.shape

(19530, 10)