# WebScrapping Baby Product Reviews

In [1]:
# -- beautifulsoup4    : for interacting with static content
# -- Selenium          : for automating web scraping through interacting with dynamic content, handling pages that load content dynamically via JS
#                        such as clicking buttons, to load more, navigating through page or filling out forms,
# -- webdriver manager : automation interface, browser control, bypassing browser restrictions, webinteractions
# -- pandas            : for storing, manipulating, and converting data to a structured format 


# ** Install these libraries, by running the code below, if you are not installed priorily**
# !pip install beautifulsoup4
# !pip install selenium
# !pip install webdriver-manager
# !pip install pandas 

# importing installed libraries
from bs4 import BeautifulSoup 
# used for parsing HTML content and extracting static elements from the webpages.

from selenium import webdriver
# automates browser actions, enabling interaction with dynamic content on webpages.

# By, WebDriverWait & expected_conditions from selenium
# helps manage, delays, locate elements on the page, wait for specific condition.
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,ElementClickInterceptedException   # Import TimeoutException

# Service from Selenium -- Handles the setup of WebDriver services for managing browser sessions.
from selenium.webdriver.chrome.service import Service

# ChromeDriver Manager -- Automatically manages and installs necessary browser driver for Selenium, simplifying browser setuo process.
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd # data manipulation library
import time
import regex as re

start_time = time.time()

product_url = "https://www.bestbuy.ca/en-ca/product/evenflo-gold-sensorsafe-shyft-smart-modular-travel-system-w-litemax-infant-car-seat-moonstone-grey/17661146"

def initialize_driver():
    """
    Initializes and configures a Chrome WebDriver instance.
    
    Returns:
        webdriver.Chrome : The Initialized WebDriver object.
    """
    option = webdriver.ChromeOptions()
    # Add desired options (ex :- headless mode, user-agent)
    driver = webdriver.Chrome( service = Service( ChromeDriverManager().install() ),options = option )
    driver.maximize_window()
    return driver


def navigate_to_reviews( driver ):
    """
    Navigates to the product page on the webpage.
    
    Args:
        driver( webdriver.Chrome ): The WebDriver instance.
    """
    driver.get( product_url )

    try:
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@id='rating-link']" ))
        ).click()
        print( "Reviews button clicked." )

        explore_button = WebDriverWait( driver, 20 ).until(
            EC.presence_of_element_located(( By.XPATH, "//button[@data-automation='pdp-explore-all-reviews-link']" ))
        )
        driver.execute_script( "arguments[0].scrollIntoView(true);", explore_button )
        time.sleep(2)
        while True:
            try:
                driver.execute_script("arguments[0].click();", explore_button)
                print("Explore all Reviews button clicked.")
                break
            except ElementClickInterceptedException:
                print("Click intercepted. Scrolling more...")
                driver.execute_script("window.scrollBy(0, 100);")
                time.sleep(2)

    except Exception as e:
        print( f"Error during navigation: { e }" )

    load_all_reviews(driver)


def load_all_reviews( driver ):
    """
    Clicks the 'Show More' button until all reviews are loaded.
    
    Args:
        driver( webdriver.Chrome): The WebDriver instance.
    """
    while True:
        try:
            # Wait until the "Show More" button is visible and clickable
            show_more_button = WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located((By.XPATH, "//button[contains(@class, 'loadMore_3AoXT')]"))
            )
            
            # Scroll to the element and click it using JavaScript for reliability
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            driver.execute_script("arguments[0].click();", show_more_button)
            time.sleep(5)  # Allow time for the content to load
            
        except TimeoutException:
            print("No more 'Show More' buttons. All reviews are loaded.")
            break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

def extract_review_data( driver ):
    """
    Extracts review data from the webpage and saves it to a CSV file.
    
    Args:
        driver ( webdriver.Chrome ): The WebDriver instance.
    
    Returns:
        None
    """
    try:
        reviews = {
            "Review_date" : [ date.get_attribute( 'data-date' ).split( 'T' )[0] for date in driver.find_elements( By.XPATH, "//span[@class='locationAndTime_3MA78']" )],
            "Reviewer_name" : [ name.text for name in driver.find_elements( By.XPATH, "//span[@class='author_20vgR']/span/span/span" )],
            "Review_title" : [ title.text for title in driver.find_elements( By.XPATH, "//div[@class='reviewTitle_1qq1j']/span" )],
            "Review_content" : [ content.text for content in driver.find_elements( By.XPATH, "//div[@class='reviewContent_XCspv']/p/span" )]
        }
        df = pd.DataFrame( reviews )
        print(df.shape)
        #df.to_csv( "reviews.csv", index = False )
        print( "Data saved to 'reviews.csv'" )
        
    except Exception as e:
        print( f"Error extracting review data: {e}" )

def main():
    """
    Main function to orchestrate the review extraction process.
    """
    driver = initialize_driver()
    navigate_to_reviews( driver )
    extract_review_data( driver )
    driver.quit()


if __name__ == "__main__":
    main()

end_time = time.time()
runtime = end_time - start_time
print(f"Runtime : {runtime} ")

Reviews button clicked.
Explore all Reviews button clicked.
No more 'Show More' buttons. All reviews are loaded.
(417, 4)
Data saved to 'reviews.csv'
Runtime : 367.3914313316345 


In [2]:
reviews_df = pd.read_csv('reviews.csv')
reviews_df.shape

(417, 4)

In [3]:
reviews_df.sample(10)

Unnamed: 0,Review Date,Reviewer Name,Review Title,Review Content
410,2019-04-26,MOMof3MMM,Great Product!,[This review was collected as part of a promot...
90,2019-05-02,Carolina28,Overall great!!!,[This review was collected as part of a promot...
42,2021-05-26,MelisaM,Best stroller / car seat ever!!,Amazing I love this stroller and car seat so m...
316,2020-12-04,Mimi99,Nice quality,"Very easy to install, nicely designed and feel..."
163,2020-03-16,Sualeh,Great features and great quality!,Just bought this last week and assembled it fo...
245,2020-08-01,mrymdrd,QUALITY & SAFETY FEATURES!!!,My husband and I just assembled this stroller ...
227,2020-09-03,BabyMama,Wonderful Product,I received this travel system at my baby showe...
371,2019-05-02,Rachel23,Awesome travel system!,[This review was collected as part of a promot...
301,2020-10-10,Cass2020,"Light weight, handles great, and SO easy to use !",I got this as a gift from our registry and i a...
374,2020-07-06,Alxer,Amazing Stroller,"After many reviews read, I purchased the strol..."
