In [None]:
# Uncomment to install the dependencies into the kernel's environment
# %pip install selenium
# %pip install beautifulsoup4
# %pip install pandas

In [None]:
from selenium.webdriver.edge.service import Service
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import os
import csv

In [None]:
def setup_driver(driver_path='../driver/msedgedriver.exe'):
    """Start a Microsoft Edge WebDriver session.

    Args:
        driver_path: Path to the msedgedriver executable. Defaults to
            ``../driver/msedgedriver.exe`` (relative to the working directory).

    Returns:
        The ``webdriver.Edge`` instance. The caller owns the session and is
        responsible for calling ``quit()`` on it.
    """
    service = Service(executable_path=driver_path)
    return webdriver.Edge(service=service)

## Extraction of product links

In [None]:
def get_product_links(category_url):
    """Collect the product-page URLs listed on one category page.

    Opens a new browser session, loads ``category_url``, waits up to 10
    seconds for the product anchors (``a.absolute-link.product-link``) and
    keeps the ``href`` of every anchor that ends in ``/pr/<slug>/<digits>``.

    Args:
        category_url: URL of a single category listing page.

    Returns:
        List of product URLs in page order. Empty when the wait timed out or
        no anchor matched the product pattern.
    """
    print(f"Fetching category links from: {category_url}")

    driver = setup_driver()
    try:
        driver.get(category_url)

        # Wait for the product links to be present
        wait = WebDriverWait(driver, 10)
        try:
            links = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.absolute-link.product-link'))
            )
        except TimeoutException:
            print("Timed out waiting for product links.")
            links = []

        # Use a regular expression to keep only links that end with a product ID
        product_links = []
        for link in links:
            href = link.get_attribute('href')
            if href and re.search(r'/pr/[^/]+\/\d+$', href):
                product_links.append(href)
    finally:
        # Close the browser even when loading or reading the page raised,
        # otherwise every failed page leaves an Edge process behind.
        driver.quit()

    print(f"Found {len(product_links)} product links.")
    return product_links

In [None]:
def get_all_category_links(base_url, page_size=48):
    """Walk the paginated category listing and gather every product link.

    Pages are requested as ``p=1, 2, ...`` via ``get_product_links`` until a
    page is empty, is shorter than ``page_size``, or only repeats links that
    were already collected.

    Args:
        base_url: Category URL without the page parameter.
        page_size: Number of products on a full listing page; a shorter page
            is treated as the last one. Defaults to 48.

    Returns:
        List of product URLs from all visited pages, in visiting order.
    """
    # Append the page parameter correctly whether or not base_url already has a query string
    separator = '&' if '?' in base_url else '?'
    page_number = 1
    links = []
    seen = set()
    while True:
        # Construct the URL for the current page
        category_url = f"{base_url}{separator}p={page_number}"
        print(f"Processing URL: {category_url}")
        page_links = get_product_links(category_url)

        if not page_links:  # If no links are found, break the loop
            print("No more product links found.")
            break

        # Guard against looping forever if out-of-range page numbers keep
        # returning a page that was already collected.
        if seen.issuperset(page_links):
            print("Page only repeats links that were already collected; stopping.")
            break

        links.extend(page_links)  # Concatenate the obtained links
        seen.update(page_links)

        if len(page_links) < page_size:  # A short page indicates the last page.
            print("It is the last page.")
            break

        page_number += 1

    return links

In [None]:
# Base category URL for one product category, filtered to 11 common brands
# (the `bids=` list) and to products rated 4 or 5 (`ratings=5%2C4`).
# Four such base links are used in total, one per chosen category:
# Shampoo, Conditioner, Treatments, Styling.
# NOTE(review): this URL is presumably the Shampoo listing (the links are saved
# to links_Shampoo.csv) — confirm before swapping in another category.
# This cell opens a browser for every listing page, so it is slow to re-run.
base_category_url = 'https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706'
all_links = get_all_category_links(base_category_url)
print(all_links)

Processing URL: https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706&p=1
Fetching category links from: https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706&p=1
Found 48 product links.
Processing URL: https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706&p=2
Fetching category links from: https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706&p=2
Found 48 product links.
Processing URL: https://hk.iherb.com/c/bath-personal-care?cids=2066&ratings=5%2C4&bids=MIE%2CSMT%2CCTU%2COKA%2CLOE%2CLBY%2CCMR%2CCFN%2CKTC%2CGIO%2CDVE&cbcr=100477%2C5706&p=3
Fetching category links from: https://hk.iherb.com/c/b

In [None]:
# Specify the filename where data will be saved
output_csv = '../data/links/links_Shampoo.csv'

# Create the target folder on first run so that open() does not fail
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Write the list to a CSV file: a one-column table with a 'URL' header
with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['URL'])
    writer.writerows([link] for link in all_links)

In [None]:
# Import the CSV file into the list variable all_links

filename = '../data/links/links_Shampoo.csv'

# Open the CSV file and read its content. The file is written as UTF-8,
# so decode it the same way instead of relying on the platform default.
with open(filename, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip the header row
    all_links = [row[0] for row in reader if row]  # blank rows are ignored

# Now all_links contains all the links from the CSV file, excluding the header
print(all_links)

['https://hk.iherb.com/pr/mielle-strengthening-shampoo-rosemary-mint-12-fl-oz-355-ml/108764', 'https://hk.iherb.com/pr/giovanni-smooth-as-silk-deep-moisture-shampoo-for-damaged-hair-8-5-fl-oz-250-ml/6419', 'https://hk.iherb.com/pr/giovanni-tea-tree-triple-treat-invigorating-shampoo-for-all-hair-types-8-5-fl-oz-250-ml/6412', 'https://hk.iherb.com/pr/sheamoisture-jamaican-black-castor-oil-strengthen-restore-shampoo-13-fl-oz-384-ml/100354', 'https://hk.iherb.com/pr/giovanni-50-50-balanced-hydrating-clarifying-shampoo-for-normal-to-dry-hair-8-5-fl-oz-250-ml/6398', 'https://hk.iherb.com/pr/sheamoisture-kids-extra-nourishing-shampoo-mango-carrot-8-fl-oz-237-ml/97575', 'https://hk.iherb.com/pr/sheamoisture-jamaican-black-castor-oil-strengthen-restore-shampoo-3-2-fl-oz-95-ml/99878', 'https://hk.iherb.com/pr/kitsch-strengthening-solid-shampoo-bar-rice-water-protein-white-tea-mandarin-1-bar-3-2-oz-91-g/122069', 'https://hk.iherb.com/pr/giovanni-root-66-max-volume-shampoo-for-limp-lifeless-hair-8

## Extraction of product information

In [None]:
def _digits_to_int(text):
    """Return the integer formed by the digits in ``text``, or None when it has none."""
    digits = re.sub(r'[^0-9]', '', text)
    return int(digits) if digits else None


def _to_float(text):
    """Parse ``text`` as a float, or return None when it is empty or not numeric."""
    # Drop thousands separators such as "1,234.50".
    # NOTE(review): assumes ',' is never used as the decimal mark on this site — confirm.
    cleaned = text.replace(',', '').strip()
    try:
        return float(cleaned)
    except ValueError:
        return None


def _input_attr(soup, attr_name):
    """Return the stripped ``attr_name`` of the first <input> that carries it, or ''."""
    element = soup.find('input', {attr_name: True})
    return element[attr_name].strip() if element is not None else ''


def _class_text(soup, tag_name, css_class):
    """Return the stripped text of the first ``tag_name`` with ``css_class``, or ''."""
    element = soup.find(tag_name, {'class': css_class})
    return element.text.strip() if element is not None else ''


def _extract_rankings(soup):
    """Build ``{'ranking_<category>': rank}`` from the best-selling-rank section."""
    rankings = {}
    rankings_div = soup.find('div', {'class': 'best-selling-rank'})
    if rankings_div is None:
        return rankings
    # find_all returns every descendant <div>, so some of them may not be rank
    # cards; skip those instead of failing the whole product.
    for rank_card in rankings_div.find_all('div'):
        rank_element = rank_card.find('strong', class_='rank')
        category_anchor = rank_card.find('a', class_='crumbs')
        if rank_element is None or category_anchor is None:
            continue
        rank = _digits_to_int(rank_element.text)
        if rank is None:
            continue
        category = category_anchor.text.strip()
        rankings[f'ranking_{category.lower().replace(" ", "_")}'] = rank
    return rankings


def _extract_stock_alert(soup):
    """Return 'out of stock', 'coming soon', 'unavailable for this country' or ''."""
    out_of_stock = soup.find('div', class_='text-danger stock-status-text')
    if out_of_stock is not None and 'Out of stock' in out_of_stock.text:
        return 'out of stock'

    coming_soon = soup.find('div', class_='coming-soon-text')
    if coming_soon is not None and 'Coming soon' in coming_soon.text:
        return 'coming soon'

    unavailable = soup.find('div', class_='product-summary-unavailable-message-content')
    heading = unavailable.find('h4') if unavailable is not None else None
    if heading is not None and 'This item is not available for purchase in your country' in heading.text:
        return 'unavailable for this country'

    return ''


def scrape_product_data(page_link, driver, category='Shampoo'):
    """Load one product page and extract its details into a one-row DataFrame.

    Args:
        page_link: URL of the product page.
        driver: WebDriver used to load the page; it is not closed here.
        category: Value stored in the ``category`` column. Defaults to
            ``'Shampoo'``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``product_id``,
        ``product_name``, ``brand_name``, ``category``, ``list_price``,
        ``discount_price``, ``sale_in_30days``, ``rating``, ``no_of_reviews``,
        ``first_available``, ``rankings`` (a dict) and ``stock_alert``.
        Numeric fields that are missing or unparseable are ``None``.
        Returns ``None`` when the page has no ``<input data-product-id>``.
    """
    driver.get(page_link)
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    product_id_element = soup.find('input', {'data-product-id': True})
    if product_id_element is None:
        print(f"Failed to fetch product details for {page_link}.")
        return None

    list_price = _input_attr(soup, 'data-list-price').replace('HK$', '')
    discount_price = _input_attr(soup, 'data-discounted-price').replace('HK$', '')

    # The banner reads like "<count>+ sold in 30 days"; remove the suffix first
    # so the "30" does not end up in the parsed number.
    sale_in_30days = ''
    sale_element = soup.find('div', class_='recent-activity-message')
    sale_span = sale_element.find('span') if sale_element is not None else None
    if sale_span is not None:
        sale_in_30days = sale_span.text.replace('+ sold in 30 days', '').strip()

    # Create a dictionary to store the scraped data
    item = {
        'product_id': product_id_element['data-product-id'].strip(),
        'product_name': _input_attr(soup, 'data-product-name'),
        'brand_name': _input_attr(soup, 'data-brand-name'),
        'category': category,
        'list_price': _to_float(list_price),
        'discount_price': _to_float(discount_price),
        'sale_in_30days': _digits_to_int(sale_in_30days),
        'rating': _to_float(_class_text(soup, 'span', 'review-average-rating')),
        'no_of_reviews': _digits_to_int(_class_text(soup, 'p', 'review-based-on')),
        'first_available': _class_text(soup, 'span', 'product-sale-date'),
        'rankings': _extract_rankings(soup),
        'stock_alert': _extract_stock_alert(soup),
    }

    return pd.DataFrame([item])

In [None]:
def main(all_links, output_csv='../data/iherb_hair_care_raw_dataset.csv'):
    """Scrape every product in ``all_links`` and append the rows to a CSV file.

    ``all_links`` is modified in place: each link whose data was saved is
    removed, so whatever remains afterwards are the links that failed and can
    be retried by calling ``main`` again with the same list.

    Args:
        all_links: Mutable list of product page URLs.
        output_csv: CSV file to append to. The header is written only when the
            file does not exist yet. Defaults to
            ``'../data/iherb_hair_care_raw_dataset.csv'``.
    """
    # Make sure the destination folder exists before the first append
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    driver = setup_driver()
    try:
        # Use an index to iterate over the list to support removal during iteration
        index = 0

        while index < len(all_links):
            page_link = all_links[index]

            try:
                item_df = scrape_product_data(page_link, driver)
                if item_df is not None:
                    # If data is successfully scraped, save it and remove the link from the list
                    item_df.to_csv(output_csv, mode='a', header=not os.path.exists(output_csv), index=False)
                    print(f"Data for {page_link} saved to {output_csv}.")
                    # The next link slides into this position, so index stays put
                    del all_links[index]
                else:
                    print(f"Failed to fetch product details for {page_link}.")
                    index += 1
            except Exception as e:
                print(f"An error occurred while scraping {page_link}: {e}")
                index += 1
    finally:
        # Also runs on KeyboardInterrupt, so an interrupted run does not leave
        # the browser open.
        driver.quit()

In [None]:
# Scrape every collected product link. main() removes each successfully saved
# link from all_links, so re-running this cell retries only the failures.
main(all_links)

## Extraction of product reviews

In [None]:
def scrape_reviews(url, max_pages):
    """Collect review texts from the first ``max_pages`` pages of a review listing.

    For every review container on a page the function scrolls it into view,
    waits up to 10 seconds for its text to be visible, clicks "Show more"
    when present, and then records the text. Reviews that time out or raise
    are reported and skipped.

    Args:
        url: Review listing URL without the page parameter.
        max_pages: Number of pages to visit, starting at page 1.

    Returns:
        List of review text strings in the order they were read.
    """
    review_text_locator = (
        By.CSS_SELECTOR,
        "div[data-testid='review-text'] span.__react-ellipsis-js-content",
    )
    # Append the page parameter correctly whether or not url already has a query string
    separator = '&' if '?' in url else '?'

    review_contents = []
    driver = setup_driver()
    try:
        driver.maximize_window()

        for page_num in range(1, max_pages + 1):
            review_link = f"{url}{separator}p={page_num}"
            print(review_link)
            driver.get(review_link)

            review_containers = driver.find_elements(By.CSS_SELECTOR, "div.MuiBox-root.css-1v71s4n")

            for index, container in enumerate(review_containers):
                print(f"Processing review {index + 1} on page {page_num}...")
                try:
                    # Scroll to the element to ensure it's in view
                    driver.execute_script("arguments[0].scrollIntoView();", container)

                    # Wait for the review text to be visible
                    WebDriverWait(container, 10).until(
                        EC.visibility_of_element_located(review_text_locator)
                    )

                    try:
                        show_more_button = container.find_element(By.XPATH, ".//span[contains(text(), 'Show more')]")
                    except NoSuchElementException:
                        print(f"No 'Show more' button found for review {index + 1} on page {page_num}.")
                    else:
                        print(f"There is 'Show more' for review {index + 1} on page {page_num}...")
                        # Using JavaScript to click the button
                        driver.execute_script("arguments[0].click();", show_more_button)
                        # Wait for the review text to be visible again after expanding
                        WebDriverWait(container, 10).until(
                            EC.visibility_of_element_located(review_text_locator)
                        )

                    # Re-read the element so the text reflects the expanded review
                    review_text = container.find_element(*review_text_locator).text
                    review_contents.append(review_text)
                    print(f"Review {index + 1} text on page {page_num}: {review_text}")
                except TimeoutException:
                    # Both waits above are for the review text, so report that
                    print(f"Timed out waiting for the text of review {index + 1} on page {page_num} to become visible.")
                except Exception as e:
                    print(f"Error processing review {index + 1} on page {page_num}: {e}")
    finally:
        # Close the browser even if loading a page raised
        driver.quit()

    return review_contents

In [None]:
# Review listing of a single product; the numeric path segment (138164) is the
# product id and `sort=6&ratings=1` are the site's query filters.
# NOTE(review): `ratings=1` presumably restricts the listing to 1-star reviews — confirm.
url = 'https://hk.iherb.com/r/mielle-clarifying-sugar-scalp-scrub-rosemary-mint-6-oz-170-g/138164?sort=6&ratings=1'
# Only the first two pages of the listing are visited
max_pages_to_scrape = 2
reviews = scrape_reviews(url, max_pages_to_scrape)

https://hk.iherb.com/r/mielle-clarifying-sugar-scalp-scrub-rosemary-mint-6-oz-170-g/138164?sort=6&ratings=1&p=1
Processing review 1 on page 1...
No 'Show more' button found for review 1 on page 1.
Review 1 text on page 1: I was disappointed with this scalp scrub. It didn't cleanse my scalp effectively, and despite having thin hair, I needed to use a lot of product, which means it won't last long. I've tried better scalp scrubs before, so I can't recommend this one and definitely won't be repurchasing
Processing review 2 on page 1...
No 'Show more' button found for review 2 on page 1.
Review 2 text on page 1: the scent is nice, but the package is so uncomfortable!!! the sugar gets stuck and the scrub doesnt flow out! I had to cut the package in order to make it comfortable to use. I also didnt see or feel any benefit to my hair. it's impossible to use on dry hair, and on wet hair the sugar dissolves way before I get a chance to scrub my scalp. and it makes my hair sticky. I don't like i

In [None]:
# The product id is the numeric path segment right before the query string
# (or at the very end when the URL has no query string).
product_id_match = re.search(r'/(\d+)(?:\?|$)', url)
if product_id_match is None:
    raise ValueError(f'Could not find a product id in review URL: {url}')
product_id = product_id_match.group(1)
output_csv = '../data/reviews_dataset.csv'
# Star rating stored with every review; keep it in sync with the `ratings=` filter in `url`
rating = 1

file_exists = os.path.isfile(output_csv)
# A header is needed when the file is new or exists but is still empty
needs_header = not file_exists or os.path.getsize(output_csv) == 0

# Append mode creates the file when it is missing, so one mode covers both cases
with open(output_csv, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(['product_id','rating', 'review_text'])

    # Write each review text as a row in the file with the rating
    writer.writerows([product_id, rating, review_text] for review_text in reviews)

print(f'Reviews saved to {output_csv}')

Reviews saved to ../data/reviews_dataset.csv
