# In [None]:
import time
import random
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
import pandas as pd

# Set up logging: INFO level, records formatted as "<time> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize global variables (module-level state shared by the functions below)
driver = None  # current Selenium WebDriver; (re)created by reinitialize_driver()
options = None  # ChromeOptions built in collect() and reused whenever the driver is recreated
reviewList = []  # review dicts for the product being scraped; reset at the start of collect()
total_reviews_collected = 0  # Global variable to keep track of total reviews

# Function to extract reviews
def extractReview(reviewUrl, retry_count=5, backoff_factor=1.5):
    """Load one review page and append every review on it to ``reviewList``.

    The page is fetched with the module-level ``driver``. Each
    ``div[data-hook="review"]`` element becomes a dict with the keys
    ``'Review Title'``, ``'Rating'``, ``'Review Body'`` and ``'Review Date'``.
    The page's reviews are added to ``reviewList`` (and counted in
    ``total_reviews_collected``) only after the whole page has been parsed,
    so a caller that retries after an unexpected error cannot store the same
    review twice.

    Args:
        reviewUrl: URL of the review page to load.
        retry_count: Maximum number of load attempts.
        backoff_factor: Base of the exponential multiplier applied to the
            random 10-20 second pause between attempts.

    Raises:
        TimeoutException: No review element appeared within 90 seconds on
            the final attempt.
        WebDriverException: The driver failed on the final attempt.
    """
    global driver, reviewList, total_reviews_collected
    for attempt in range(retry_count):
        try:
            driver.get(reviewUrl)
            WebDriverWait(driver, 90).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-hook="review"]'))
            )
            page_source = driver.page_source
        except WebDriverException as exc:
            # TimeoutException is a WebDriverException subclass: a timeout only
            # needs another try, any other driver error also gets a new browser.
            timed_out = isinstance(exc, TimeoutException)
            if timed_out:
                logging.error(f"Timeout while loading page: {reviewUrl}")
            else:
                logging.error(f"WebDriverException: {exc}")
            if attempt >= retry_count - 1:
                raise  # out of attempts: surface the original error to the caller
            sleep_time = random.uniform(10, 20) * (backoff_factor ** attempt)
            logging.info(f"Retrying... (Attempt {attempt + 2} of {retry_count}) after sleeping for {sleep_time} seconds")
            if not timed_out:
                reinitialize_driver()
            time.sleep(sleep_time)
            continue

        soup = BeautifulSoup(page_source, 'html.parser')
        reviews = soup.find_all('div', {'data-hook': 'review'})
        logging.info(f"Extracting reviews from URL: {reviewUrl}, found {len(reviews)} reviews")

        # Parse the whole page first, then commit it in one step.
        page_reviews = [_parse_review(item) for item in reviews]
        for review in page_reviews:
            logging.info(f"Appending review: {review}")
        reviewList.extend(page_reviews)
        total_reviews_collected += len(page_reviews)  # Increment the global review count
        return  # Extraction succeeded; no further attempts needed


def _parse_review(item):
    """Build the review dict for one ``div[data-hook="review"]`` element."""
    review_tag = item.find('a', {'data-hook': 'review-title'})
    title_spans = review_tag.find_all('span') if review_tag else []
    # The title text is taken from the last <span> inside the title link.
    review_title = title_spans[-1].text.strip() if title_spans else "No title"

    rating_tag = item.find('i', {'data-hook': 'review-star-rating'})
    if not rating_tag:
        # Fall back to the generic icon alt-text span when the star tag is absent.
        rating_tag = item.find('span', {'class': 'a-icon-alt'})
    rating = rating_tag.text.strip() if rating_tag else "No rating"

    review_body_tag = item.find('span', {'data-hook': 'review-body'})
    review_body = review_body_tag.text.strip() if review_body_tag else "No review body"

    review_date_tag = item.find('span', {'data-hook': 'review-date'})
    if review_date_tag:
        # Keep only the last three whitespace-separated words of the date line.
        review_date = ' '.join(review_date_tag.text.strip().split()[-3:])
    else:
        review_date = "No review date"

    return {
        'Review Title': review_title,
        'Rating': rating,
        'Review Body': review_body,
        'Review Date': review_date,
    }

# Function to get the total number of review pages
def totalpage(reviewUrl):
    """Return the number of 10-review pages in a product's review listing.

    Loads ``reviewUrl`` with the module-level ``driver``, reads the text of
    the ``div[data-hook="cr-filter-info-review-rating-count"]`` element and
    takes the first number after the ``", "`` separator as the review count.

    Args:
        reviewUrl: URL of the review-listing page.

    Returns:
        The review count divided by 10 and rounded up, or 0 when the page
        times out, the driver fails, the element is missing, or its text
        does not have the expected shape.
    """
    global driver
    try:
        driver.get(reviewUrl)
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-hook="cr-filter-info-review-rating-count"]')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    except TimeoutException:
        logging.error(f"Timeout while loading page: {reviewUrl}")
        return 0
    except WebDriverException as e:
        logging.error(f"WebDriverException: {e}")
        return 0

    num = soup.find('div', {'data-hook': 'cr-filter-info-review-rating-count'})
    if num is None:
        return 0

    count_text = num.text.strip()
    try:
        # Second ", "-separated part, first word, thousands separators removed.
        total_reviews = int(count_text.split(", ")[1].split(" ")[0].replace(',', ''))
    except (IndexError, ValueError):
        # An unexpected text layout must not abort the whole run; report
        # "no pages" the same way the load failures above do.
        logging.error(f"Could not parse review count from: {count_text!r}")
        return 0

    return (total_reviews + 9) // 10  # ceiling division: 10 reviews per page

# Function to reinitialize the driver
def reinitialize_driver():
    """Swap the module-level ``driver`` for a fresh Chrome session.

    Any existing driver is quit first; the replacement is created from the
    module-level ``options``.
    """
    global driver, options
    previous = driver
    if previous:
        previous.quit()
    chrome_kwargs = {'options': options}
    driver = webdriver.Chrome(**chrome_kwargs)

# Function to collect reviews
def collect(prod, url):
    """Scrape up to 10 review pages for one product and save them as CSV.

    Builds headless Chrome options with a randomly chosen user agent, starts
    a fresh driver, works out the page count with ``totalpage`` and calls
    ``extractReview`` for each page (up to 5 attempts per page). Everything
    gathered in ``reviewList`` is then written to ``prod``. The driver is
    always quit on the way out, even if an exception escapes.

    Args:
        prod: Path of the CSV file to write.
        url: Review-listing URL of the product. Anything from the first
            ``&pageNumber=`` onwards is dropped before page URLs are built.
    """
    global driver, options, reviewList

    # Configure Selenium
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode (without opening a browser window)

    # Add user-agent rotation
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    ]
    options.add_argument(f"user-agent={random.choice(user_agents)}")

    reinitialize_driver()

    max_attempts = 5
    try:
        reviewList = []
        logging.info(f"Initial review URL: {url}")
        totalPage = min(totalpage(url), 10)  # Limit the total pages to 10
        logging.info(f"Total pages: {totalPage}")
        base_url = url.split('&pageNumber=')[0]  # Remove the existing page number parameter if present
        for i in range(1, totalPage + 1):
            reviewUrl = f"{base_url}&pageNumber={i}&sortBy=recent"
            for attempt in range(1, max_attempts + 1):
                try:
                    logging.info(f"Running for page {i}, URL: {reviewUrl}")
                    extractReview(reviewUrl)
                except Exception as e:
                    # Best-effort boundary: one bad page must not stop the product.
                    logging.error(f"Error on page {i}, attempt {attempt}: {e}")
                    time.sleep(random.uniform(10, 20))  # Increased random sleep
                else:
                    break  # Page extracted; stop retrying
            else:
                # Every attempt failed; move on to the next page.
                logging.error(f"Failed to process page {i} after multiple attempts.")
                continue
            # Random sleep to mimic human browsing behavior
            time.sleep(random.uniform(20, 30))  # Increased random sleep

        logging.info(f"Total reviews collected for {prod}: {len(reviewList)}")

        # Write all reviews to CSV file
        df = pd.DataFrame(reviewList)
        df.to_csv(prod, index=False)
    finally:
        # Quit the driver even on errors or Ctrl-C so no Chrome process is left behind
        if driver:
            driver.quit()

# Example usage: each key becomes the stem of the output CSV file name and each
# value is the review-listing URL handed to collect().
# NOTE(review): "intel_14_i9_unlocked" and "intel_14_i9" contain the same product
# ID (B0CGJDKLB8), as do "intel_14_i7" and "intel_14_i7_unlocked" (B0CQ1PY6D6),
# so each pair would scrape the same listing — confirm the intended URLs.
product_url = {
    "intel_14_i9_unlocked":                     "https://www.amazon.in/i9-14900K-Desktop-Processor-Integrated-Graphics/product-reviews/B0CGJDKLB8/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i9_unlocked_discrete_graphics":   "https://www.amazon.in/Intel%C2%AE-i9-14900KF-Desktop-Processor-P-cores/product-reviews/B0CGJDBCTK/ref=cm_cr_arp_d_viewopt_fmt?ie=UTF8&reviewerType=all_reviews&formatType=current_format&pageNumber=1",  # trailing &pageNumber=1 is stripped by collect()
    "intel_14_i9":                              "https://www.amazon.in/i9-14900K-Desktop-Processor-Integrated-Graphics/product-reviews/B0CGJDKLB8/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i7":                              "https://www.amazon.in/Intel-i7-14700-Desktop-Processor-P-cores/product-reviews/B0CQ1PY6D6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i7_unlocked":                     "https://www.amazon.in/Intel-i7-14700K-Desktop-Processor-P-cores/product-reviews/B0CQ1PY6D6/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i5_unlocked":                     "https://www.amazon.in/Intel-i5-14600K-Desktop-Processor-P-cores/product-reviews/B0CQ1T23GS/ref=cm_cr_getr_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i5":                              "https://www.amazon.in/Intel-i5-14600-Desktop-Processor-P-cores/product-reviews/B0CQ1RGF5C/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews",
    "intel_14_i3":                              "https://www.amazon.in/Intel-i3-14100-Desktop-Processor-P-cores/product-reviews/B0CQ1S3L53/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",

    "intel_13_i9":                              "https://www.amazon.in/Intel-i9-13900K-Desktop-Processor-P-cores/product-reviews/B0BCF54SR1/ref=cm_cr_getr_d_paging_btm_prev_1?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i7_unlocked":                     "https://www.amazon.in/Intel-i7-13700K-Desktop-Processor-P-cores/product-reviews/B0BCF57FL5/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i7":                              "https://www.amazon.in/Intel%C2%AE-CoreTM-i7-13700F-Processor-Cache/product-reviews/B0BN5XPHWW/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i5_unlocked":                     "https://www.amazon.in/Intel-i5-13600K-Desktop-Processor-P-cores/product-reviews/B0BCDR9M33/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i5":                              "https://www.amazon.in/Intel%C2%AE-CoreTM-i5-13400F-Processor-Cache/product-reviews/B0BN61LYFB/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i3_1":                            "https://www.amazon.in/Intel%C2%AE-CoreTM-i3-13100F-Processor-Cache/product-reviews/B0BN5ZG6J4/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_13_i3_2":                            "https://www.amazon.in/i3-13100-Processor-FC-LGA16A-Generation-Processors/product-reviews/B0BN5Z8MDR/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",

    "intel_12_i9":                              "https://www.amazon.in/Intel-i9-12900K-Desktop-Processor-Unlocked/product-reviews/B09FXDLX95/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i7":                              "https://www.amazon.in/Intel-Generation-Processor-Warranty-Required/product-reviews/B09MDJDSGH/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i7_unlocked":                     "https://www.amazon.in/Intel-i7-12700K-Desktop-Processor-Unlocked/product-reviews/B09FXNVDBJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i7_unlocked_discrete_graphics":   "https://www.amazon.in/Intel-Generation-Processor-Warranty-Required/product-reviews/B09MDHZ2YQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i5_unlocked":                     "https://www.amazon.in/Intel-i5-12600K-Desktop-Processor-Unlocked/product-reviews/B09FX4D72T/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i5_unlocked_discrete_graphics_1": "https://www.amazon.in/Intel-Generation-Desktop-Processor-Warranty/product-reviews/B09MDFH5HY/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i5_unlocked_discrete_graphics_2": "https://www.amazon.in/Intel-i5-12400-Desktop-Processor-Cache/product-reviews/B09NMPD8V2/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    # NOTE(review): this URL's slug names an i5-13500 although the key says intel_12 — confirm.
    "intel_12_i5":                              "https://www.amazon.in/Intel%C2%AE-CoreTM-i5-13500-Processor-Cache/product-reviews/B0BN5YLBRG/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",

    "intel_12_i3_unlocked_discrete_graphics_1": "https://www.amazon.in/Intel-Generation-Desktop-Processor-Warranty/product-reviews/B09MDGKQLY/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i3_unlocked_discrete_graphics_2": "https://www.amazon.in/Intel-BX8071512100F-INTEL-I3-12100F-DESKTOP/product-reviews/B09NPJX7PV/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i3_1":                            "https://www.amazon.in/Intel-Core-i3-12100-Quad-core-Processor/product-reviews/B09NPHJLPT/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
    "intel_12_i3_2":                            "https://www.amazon.in/Intel%C2%AE-CoreTM-i3-12100-Processor-Cache/product-reviews/B09MDDX29R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews",
}

# Scrape each product into "<key>.csv", pausing 2-4 minutes between products.
for position, (key, val) in enumerate(product_url.items(), start=1):
    prod = f"{key}.csv"
    collect(prod, val)
    # Only pause when another product follows; sleeping after the last one
    # just delays the final summary.
    if position < len(product_url):
        time.sleep(random.uniform(120, 240))

logging.info(f"Total number of all reviews collected: {total_reviews_collected}")
