In [1]:
import os
import re
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def random_wait(avg: float = 4, std: float = 1) -> None:
    """Sleep for a random, normally distributed number of seconds.

    The delay is drawn from ``np.random.normal(avg, std)`` and redrawn until it
    is non-negative and within two standard deviations of ``avg``.

    Args:
        avg: Mean of the delay, in seconds.
        std: Standard deviation of the delay, in seconds.

    Raises:
        ValueError: If no non-negative value lies within ``avg ± 2 * std``
            (for example a negative ``avg`` with a small ``std``). Without this
            check the redraw loop below could never terminate.
    """
    lower = max(0.0, avg - 2 * std)
    upper = avg + 2 * std
    # Written as "not (lower <= upper)" so that NaN bounds are rejected too.
    if not lower <= upper:
        raise ValueError(
            f"No valid wait time for avg={avg}, std={std}: "
            f"the allowed window [{lower}, {upper}] is empty"
        )

    # Rejection sampling: keep drawing until the value falls inside the window.
    while True:
        random_time = np.random.normal(avg, std)
        if lower <= random_time <= upper:
            break
    time.sleep(random_time)

In [3]:
# Start a Chrome WebDriver session. The chromedriver Service is built from
# whatever ChromeDriverManager().install() returns.
chrome_options = Options()
# chrome_options.add_argument("-start-maximized")
driver = webdriver.Chrome(
    options=chrome_options,
    service=ChromeService(ChromeDriverManager().install()),
)

In [4]:
# Load the site's landing page, then pause briefly before interacting with it.
url = "https://www.amazon.fr"
driver.get(url)
random_wait(avg=2, std=0.5)

In [5]:
# Dismiss the cookie-consent banner if one is shown.
try:
    # Wait until the button is clickable, not merely present in the DOM,
    # before clicking it.
    accept_cookies = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "sp-cc-accept"))
    )
    accept_cookies.click()

    random_wait(2, 0.5)

except TimeoutException:
    # No banner became clickable within the timeout: nothing to dismiss.
    pass

except Exception as e:
    # Still best-effort (the notebook continues), but report unexpected
    # failures instead of hiding them.
    print(f"Couldn't dismiss cookie banner: {e}")

In [6]:
search_term = "Laptop RTX 4060"
nb_pages = 1

# Type the query into the search box and submit it.
try:
    search_bar = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "twotabsearchtextbox")))
    search_bar.clear()

    # Send one character at a time with a short random pause between keystrokes.
    for letter in search_term:
        search_bar.send_keys(letter)
        random_wait(0.2, 0.05)

    random_wait(1, 0.2)
    search_bar.send_keys(Keys.RETURN)

    random_wait()

except Exception as e:
    print(f"Couldn't find search bar: {e}")
    # End the whole WebDriver session rather than only closing the window,
    # then re-raise so the following cells do not run against a dead driver.
    driver.quit()
    raise

In [13]:
def _has_class(*class_names: str) -> str:
    """Build an XPath predicate that is true when @class contains every given token."""
    return " and ".join(
        f"contains(concat(' ', normalize-space(@class), ' '), ' {class_name} ')"
        for class_name in class_names
    )


def _parse_product(product):
    """Extract one result card into a dict, or return None if it has no name.

    Each optional field (price, rating, number of ratings, link) is best-effort:
    when it cannot be found or parsed it is stored as an empty string.
    """
    # Name: cards without a readable name are skipped entirely.
    try:
        product_name = product.find_element(By.XPATH, ".//h2/a/span").text
    except Exception:
        return None
    if product_name == "":
        return None

    # Price.
    try:
        price_text = product.find_element(By.XPATH, f".//span[{_has_class('a-price-whole')}]").text
        # str.split() with no argument splits on any Unicode whitespace, so this
        # also removes no-break / narrow no-break spaces used as thousands
        # separators, which float() would otherwise reject.
        product_price = float("".join(price_text.split()).replace(",", "."))
    except Exception:
        product_price = ""

    # Rating: first whitespace-separated token of the star icon's inner text.
    try:
        rating_html = product.find_element(
            By.XPATH, f".//i[{_has_class('a-icon-star-small')}]/span"
        ).get_attribute("innerHTML")
        product_rating = float(rating_html.split()[0].replace(",", "."))
    except Exception:
        product_rating = ""

    # Number of ratings: first candidate whose cleaned aria-label starts with a digit.
    product_nb_ratings = ""
    try:
        nb_ratings_xpath = (
            f".//div[{_has_class('a-row', 'a-size-small')}]/span"
            f"/a[{_has_class('a-link-normal', 's-underline-text', 's-underline-link-text', 's-link-style')}]/.."
        )
        for candidate in product.find_elements(By.XPATH, nb_ratings_xpath):
            # A missing aria-label is treated as empty so the next candidate is tried.
            label = "".join((candidate.get_attribute("aria-label") or "").split())
            for unwanted in ("(", ")", ","):
                label = label.replace(unwanted, "")
            if label[:1].isdigit():
                product_nb_ratings = int(label)
                break
    except Exception:
        product_nb_ratings = ""

    # Link.
    try:
        product_link = product.find_element(By.XPATH, ".//h2/a").get_attribute("href")
    except Exception:
        product_link = ""

    return {
        "name": product_name,
        "price": product_price,
        "rating": product_rating,
        "nb_ratings": product_nb_ratings,
        "link": product_link,
        "scrape_date": datetime.now(),
    }


page = 1

# One dict per scraped product; turned into a DataFrame at the end of the cell.
products = []

# The WebElements the dicts above were built from.
products_list = []

while page <= nb_pages:
    print(f"Scraping page {page} of {nb_pages}")

    # Collect every candidate result container on the current page.
    try:
        products_list_raw = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, f"//div[{_has_class('a-section', 'a-spacing-small')}]")
            )
        )
    except Exception as e:
        print(f"Couldn't find products: {e}")
        driver.quit()
        # Stop here: products_list_raw is undefined (first page) or left over
        # from the previous page, so it must not be iterated.
        break

    for product in products_list_raw:
        record = _parse_product(product)
        if record is None:
            continue
        products_list.append(product)
        products.append(record)

    random_wait(1, 0.2)

    # Go to the next page unless this was the last requested one.
    if page == nb_pages:
        break

    try:
        next_page = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[text() = 'Suivant']")))
        next_page.click()
        random_wait(2, 0.5)

        page += 1

    except Exception:
        print("No more pages")
        break

# driver.close()

products_df = pd.DataFrame(products)
products_df

Scraping page 1 of 1


Unnamed: 0,name,price,rating,nb_ratings,link
0,Dell Inspiron 16 5620 Intel Core i7 Ordinateur...,849.0,4.6,8.0,https://www.amazon.fr/sspa/click?ie=UTF8&spc=M...
1,"PCSpecialist Portable Gamer 16"" QHD 240Hz - In...",1649.0,,,https://www.amazon.fr/sspa/click?ie=UTF8&spc=M...
2,"PCSpecialist Portable Gamer 16"" QHD 240Hz - In...",2849.0,,,https://www.amazon.fr/sspa/click?ie=UTF8&spc=M...
3,"Ordinateur Portable 15,6’’ Notebook，16Go DDR4 ...",379.0,,,https://www.amazon.fr/sspa/click?ie=UTF8&spc=M...
4,ASUS TUF F15-TUF507ZV4-LP121W PC Portable Game...,1199.99,4.1,98.0,https://www.amazon.fr/ASUS-TUF-F15-TUF507ZV4-L...
5,"HP OMEN 16-xd0000sf PC Portable Gaming 16.1"" F...",1399.0,3.8,8.0,https://www.amazon.fr/HP-16-xd0000sf-Portable-...
6,Lenovo LOQ 15IRH8 - Ordinateur Portable 15.6''...,1299.99,4.2,75.0,https://www.amazon.fr/Lenovo-LOQ-15IRH8-dExplo...
7,GIGABYTE G5 KF-E3FR313SH Ordinateur Portable G...,,,,https://www.amazon.fr/GIGABYTE-G5-KF-E3FR313SH...
8,ASUS TUF A17-TUF707NV-HX015W PC Portable Gamer...,1399.99,4.0,1.0,https://www.amazon.fr/ASUS-TUF-A17-TUF707NV-HX...
9,ASUS ROG STRIX-G17-G713PV-HX058W PC Portable G...,2499.99,3.9,36.0,https://www.amazon.fr/ASUS-ROG-STRIX-G17-G713P...
