In [30]:
import time
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Parallel input lists, consumed pairwise by the scraping loop:
# desired_prods[i] is the URL loaded for the product labelled prod_names[i].
prod_names = []
desired_prods = []

# Single shared browser session (Chrome) used for every page load.
driver = webdriver.Chrome()

# Function to update the page number in the URL
def update_page_number(url, page_number):
    """Return *url* with its ``pageNumber`` query parameter set to *page_number*.

    An existing ``pageNumber`` value is replaced; if the parameter is absent
    it is appended after the other parameters.  Scheme, host, path and
    fragment are untouched; the query string is re-encoded.

    Args:
        url: URL string to rewrite.
        page_number: Page index to request; converted with ``str()``.

    Returns:
        The rewritten URL string.
    """
    parts = urlparse(url)
    # keep_blank_values=True: by default parse_qs drops parameters whose value
    # is empty, which would silently strip e.g. ``ref=`` from the rebuilt URL.
    params = parse_qs(parts.query, keep_blank_values=True)
    params['pageNumber'] = [str(page_number)]
    # doseq=True expands repeated parameters (a=1&a=2) instead of encoding
    # the Python list representation.
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))

# Helper: text of the first matching descendant, or None when it is missing
def _find_text(node, tag, attrs):
    """Return ``.text`` of the first *tag*/*attrs* match under *node*, or None."""
    match = node.find(tag, attrs)
    return None if match is None else match.text


# Function to scrape reviews for a single product
def scrape_reviews(url, product_name):
    """Scrape every review page for one product and save the result as CSV.

    Pages are loaded through the module-level ``driver``.  Starting from
    *url*, the ``pageNumber`` query parameter is advanced one page at a time
    until the parsed page has no ``li.a-last`` element containing a link, no
    reviews are found even after one retry, or an exception occurs.  Whatever
    was collected up to that point is written to ``{product_name}.csv``
    (*product_name* is used verbatim in the file name).

    A review that lacks one of the expected elements is kept, with ``None``
    stored for the missing field, so all columns always have equal length.

    Args:
        url: URL of the first review page.
        product_name: Label used in log messages, in the ``product_name``
            column and as the CSV file stem.

    Returns:
        None.  The output is the CSV file and the progress messages printed.
    """
    page = 0
    names = []
    ratings = []
    rating_dates = []
    titles = []
    reviews_text = []

    while url is not None:
        try:
            page += 1
            if page > 1:
                url = update_page_number(url, page)  # Update the URL with the correct pageNumber

            driver.get(url)
            time.sleep(5)  # Allow the page to load

            # A redirect to the sign-in page needs a human: wait, then retry.
            if "ap/signin" in driver.current_url:
                print("Login page detected. Please log in manually within 15 seconds.")
                time.sleep(15)  # Wait for 15 seconds for manual login
                driver.get(url)  # Reload the page after login
                time.sleep(5)  # Allow the page to load

            # Parse the page with BeautifulSoup
            html_data = BeautifulSoup(driver.page_source, 'html.parser')
            reviews = html_data.find_all('li', {'data-hook': 'review'})

            # If no reviews are found, pause for manual intervention and retry once
            if not reviews:
                print("No reviews found. Pausing for 15 seconds for manual intervention...")
                time.sleep(15)  # Wait for 15 seconds for manual intervention
                driver.get(url)  # Reload the page
                time.sleep(5)  # Allow the page to load
                html_data = BeautifulSoup(driver.page_source, 'html.parser')
                reviews = html_data.find_all('li', {'data-hook': 'review'})

                # If still no reviews, stop paging for this product
                if not reviews:
                    print(f"No reviews found after retry for {product_name}. Skipping...")
                    break

            for review in reviews:
                # Extract every field BEFORE appending anything: appending
                # field by field would leave the lists with different lengths
                # if a later lookup failed, and the DataFrame construction
                # below would then raise and lose all collected data.
                name = _find_text(review, 'span', {'class': 'a-profile-name'})
                rating = _find_text(review, 'span', {'class': 'a-icon-alt'})
                rating_date = _find_text(review, 'span', {'data-hook': 'review-date'})
                title = _find_text(review, 'a', {'data-hook': 'review-title'})
                review_text = _find_text(review, 'span', {'data-hook': 'review-body'})

                names.append(name.strip() if name is not None else None)
                ratings.append(rating)
                rating_dates.append(rating_date)
                titles.append(title)
                reviews_text.append(review_text)

            # Check for next page: an 'a-last' item without a link means the
            # "next" control is absent or disabled.
            url_check = html_data.find('li', {'class': 'a-last'})
            if url_check is None or url_check.find('a') is None:
                print(f"Reached last page ({page}) for {product_name}. Ending scraping.")
                url = None
            else:
                print(f"Successfully scraped page {page} for {product_name}")

        except Exception as e:
            # Best-effort boundary: report the failure, stop paging, and still
            # save the reviews gathered so far.
            print(f"Error occurred on page {page} for {product_name}: {str(e)}")
            url = None

    print(f"Total reviews collected for {product_name}: {len(names)}")

    # Create DataFrame and save to CSV
    data = pd.DataFrame({
        'profile_name': names,
        'rating': ratings,
        'rating_date': rating_dates,
        'title': titles,
        'review_text': reviews_text
    })
    data['product_name'] = product_name
    data.to_csv(f'{product_name}.csv', index=False)

# Iterate through each product and scrape reviews.
# zip() pairs each URL with its name and stops at the shorter list.
try:
    for url, product_name in zip(desired_prods, prod_names):
        print(f"Scraping reviews for {product_name}...")
        scrape_reviews(url, product_name)
        print(f"Finished scraping reviews for {product_name}.\n")
finally:
    # Close the WebDriver even when a scrape raises (e.g. the CSV cannot be
    # written), so the browser process is not left running.
    driver.quit()

Scraping reviews for Lenoevo Ideapad slim3...
Login page detected. Please log in manually within 15 seconds.
Successfully scraped page 1 for Lenoevo Ideapad slim3
Successfully scraped page 2 for Lenoevo Ideapad slim3
Successfully scraped page 3 for Lenoevo Ideapad slim3
Successfully scraped page 4 for Lenoevo Ideapad slim3
Successfully scraped page 5 for Lenoevo Ideapad slim3
Successfully scraped page 6 for Lenoevo Ideapad slim3
Successfully scraped page 7 for Lenoevo Ideapad slim3
Successfully scraped page 8 for Lenoevo Ideapad slim3
Successfully scraped page 9 for Lenoevo Ideapad slim3
Reached last page (10) for Lenoevo Ideapad slim3. Ending scraping.
Total reviews collected for Lenoevo Ideapad slim3: 100
Finished scraping reviews for Lenoevo Ideapad slim3.

Scraping reviews for Hp Pavilion 14-inch...
Successfully scraped page 1 for Hp Pavilion 14-inch
Reached last page (2) for Hp Pavilion 14-inch. Ending scraping.
Total reviews collected for Hp Pavilion 14-inch: 17
Finished scraping