## 1. Data Collection
### Web Scraping:
- Use tool like Selenium in Python to scrape customer reviews from any e-commerce link.
- Collect relevant information such as review text, rating, and date of review.

### Data Storage:
- Store the collected data in a structured format such as a CSV file or a database.

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import random
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode

# Set your ChromeDriver path
chrome_driver_path = "change_to_your_directory/chromedriver.exe"

# Setup ChromeDriver
service = Service(executable_path=chrome_driver_path)
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--headless")  # Run in headless mode
options.add_argument(f"user-agent={'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}")

# Initialize ChromeDriver
driver = webdriver.Chrome(service=service, options=options)

# Modify the webdriver property to avoid detection
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
    Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined
    });
    """
})

def get_next_page_url(current_url, page_number):
    url_parts = urlparse(current_url)
    query_params = parse_qs(url_parts.query)
    query_params['pageNumber'] = [str(page_number)]
    new_query = urlencode(query_params, doseq=True)
    next_page_url = f"{url_parts.scheme}://{url_parts.netloc}{url_parts.path}?{new_query}"
    return next_page_url

def scrape_amazon_reviews(url, start_page=1, end_page=10):
    driver.get(url)
    time.sleep(random.uniform(2, 4))  # Random delay

    # Click the "Translate all reviews to English" button if it exists
    try:
        translate_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "input.a-button-input"))
        )
        translate_button.click()
        time.sleep(random.uniform(2, 4))  # Random delay
    except Exception as e:
        print(f"Translate button not found: {e}")

    reviews_data = []
    seen_reviews = set()

    for page in range(start_page, end_page + 1):
        try:
            # Wait for the reviews section to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "cm_cr-review_list"))
            )

            # Extract reviews
            reviews = driver.find_elements(By.CSS_SELECTOR, "div[data-hook='review']")

            for review in reviews:
                try:
                    username = review.find_element(By.CSS_SELECTOR, ".a-profile-name").text
                    rating = review.find_element(By.CSS_SELECTOR, "i[data-hook='review-star-rating'] span.a-icon-alt").text
                    comment = review.find_element(By.CSS_SELECTOR, "span[data-hook='review-body']").text
                    date = review.find_element(By.CSS_SELECTOR, "span[data-hook='review-date']").text

                    review_id = (username, comment)  # Unique identifier for the review
                    if review_id not in seen_reviews:
                        seen_reviews.add(review_id)
                        reviews_data.append({
                            "Username": username,
                            "Rating": rating,
                            "Comment": comment,
                            "Date": date
                        })
                except Exception as e:
                    print(f"Error extracting review: {e}")
                    continue

            print(f"Page {page} reviews extracted")

            # Construct the URL for the next page
            next_page_url = get_next_page_url(driver.current_url, page + 1)
            driver.get(next_page_url)
            time.sleep(random.uniform(3, 5))  # Random delay

        except Exception as e:
            print(f"Error on page {page}: {e}")
            break

    return reviews_data

def save_reviews_to_csv(reviews, filename):
    # Define CSV headers
    headers = ["Username", "Rating", "Date", "Comment"]

    # Write to CSV file
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()
        writer.writerows(reviews)

# URL to scrape
# url = "// Add the review of a specific product link here"

# Example
url = "https://www.amazon.com/Logitech-Lightspeed-Wireless-Adjustable-Programmable/product-reviews/B07QKC4WWD/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber=1&filterByStar=one_star"

# Loop to scrape 10 pages at a time
total_pages = 10
for i in range(0, total_pages, 10):
    start_page = i + 1
    end_page = start_page + 9
    reviews = scrape_amazon_reviews(url, start_page, end_page)
    
    # Add any name to CSV file based on your liking
    save_reviews_to_csv(reviews, "productName_reviews.csv")
    
    print(f"Scraped and saved reviews from pages {start_page} to {end_page}")

# Load the CSV to a DataFrame to display in Jupyter Notebook
reviews_df = pd.read_csv("amazon_g502_reviews_negativeV7.csv")

# Overview of the webscraped dataset
reviews_df


Page 1 reviews extracted
Error extracting review: Message: no such element: Unable to locate element: {"method":"css selector","selector":"i[data-hook='review-star-rating'] span.a-icon-alt"}
  (Session info: chrome-headless-shell=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF68952EEA2+31554]
	(No symbol) [0x00007FF6894A7ED9]
	(No symbol) [0x00007FF68936872A]
	(No symbol) [0x00007FF6893B8434]
	(No symbol) [0x00007FF6893B853C]
	(No symbol) [0x00007FF6893ABBAC]
	(No symbol) [0x00007FF6893DD06F]
	(No symbol) [0x00007FF6893ABA76]
	(No symbol) [0x00007FF6893DD240]
	(No symbol) [0x00007FF6893FC977]
	(No symbol) [0x00007FF6893DCDD3]
	(No symbol) [0x00007FF6893AA33B]
	(No symbol) [0x00007FF6893AAED1]
	GetHandleVerifier [0x00007FF689838B1D+3217341]
	GetHandleVerifier [0x00007FF689885AE3+3532675]
	GetHandleVerifier [0x00007FF68987B0E0+3489152

Unnamed: 0,Username,Rating,Date,Comment
0,Rick,,"Reviewed in the United States on April 7, 2024",The most was hard to stop jumping around. The ...
1,joel,,"Reviewed in the United States on September 16,...","This is honestly not a bad mouse, couple thing..."
2,Y,,"Reviewed in the United States on December 4, 2023",Play Video\nI ordered new wireless mouse but I...
3,DB,,"Reviewed in the United States on April 11, 2022",If you want to buy products that you can't cus...
4,Jacob Seldal,,"Reviewed in the United States on August 4, 2023",I’m not sure what they are teaching kids these...
...,...,...,...,...
59,Jake Adkins,,"Reviewed in the United States on July 24, 2021",After buying you will have an issue known as t...
60,Amazon Customer,,"Reviewed in the United States on March 14, 2021",Left click button will randomly depress before...
61,joel,,"Reviewed in the United States on September 16,...","This is honestly not a bad mouse, couple thing..."
62,Chad,,"Reviewed in the United States on June 7, 2020",I used the wired G502 for a few years and real...
