In [None]:
# Smoke test: emits a fixed string so it is obvious the kernel executes cells
print("hello")

In [None]:
# standard python
import os
import json
import time
import random
import warnings
import csv
from datetime import datetime

# external libs
import pandas as pd
import requests
from tqdm.notebook import tqdm

# beautiful soup
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
# selenium with undetected tracking
import undetected_chromedriver as uc

# better visualization in terminal
from IPython.display import display, Image, HTML, clear_output
from PIL import Image as PILImage
from colorama import Fore, Style, init

# no warnings
warnings.filterwarnings("ignore", message="Pyarrow will become a required dependency")


# 0 - Web scraping demo

## A - Functions

In [None]:
# Type text into an element one keystroke at a time, with random pauses
def human_typing_effect(target_element, message, min_interval=0.05, max_interval=0.15):
    """Send ``message`` to ``target_element`` one character per ``send_keys`` call.

    After each keystroke the function sleeps for a duration drawn uniformly
    at random between ``min_interval`` and ``max_interval`` (seconds).

    Args:
        target_element: Object exposing ``send_keys`` (e.g. a Selenium element).
        message: Iterable of characters to type.
        min_interval: Lower bound of the pause after a keystroke, in seconds.
        max_interval: Upper bound of the pause after a keystroke, in seconds.
    """
    for keystroke in message:
        target_element.send_keys(keystroke)
        # Pause length is re-drawn for every keystroke
        time.sleep(random.uniform(min_interval, max_interval))

# Smooth scrolling on the page
def perform_smooth_scroll(browser, destination=None, total_time=2, scroll_steps=25):
    """Scroll the page vertically in evenly sized, evenly timed steps.

    Args:
        browser: Driver-like object exposing ``execute_script``.
        destination: Target vertical offset. When ``None``, the value of
            ``document.body.scrollHeight`` is used instead.
        total_time: Total duration of the scroll, in seconds; it is split
            equally between the steps.
        scroll_steps: Number of intermediate jumps. Values below 1 are
            treated as 1 (a single jump to the destination) instead of
            raising ``ZeroDivisionError``.
    """
    if destination is None:
        scroll_limit = browser.execute_script("return document.body.scrollHeight")
    else:
        scroll_limit = destination

    # At least one step is required: both the increment and the pause below
    # divide by the step count.
    steps = scroll_steps if scroll_steps >= 1 else 1

    current_scroll = browser.execute_script("return window.pageYOffset")
    increment = (scroll_limit - current_scroll) / steps
    pause = total_time / steps

    for step in range(1, steps + 1):
        position = current_scroll + increment * step
        browser.execute_script(f"window.scrollTo(0, {position})")
        time.sleep(pause)

# Take a screenshot, shrink it to a fixed width and render it inline
def capture_and_show_screenshot(browser, save_as="capture.png", target_width=800):
    """Save a screenshot to ``save_as``, resize it in place and display it.

    The image is rescaled to ``target_width`` pixels wide, with the height
    scaled by the same ratio (truncated to an integer), then written back
    over the same file and shown with ``IPython.display``.

    Args:
        browser: Driver-like object exposing ``save_screenshot``.
        save_as: Path of the image file to write (and overwrite).
        target_width: Width of the resized image, in pixels.
    """
    browser.save_screenshot(save_as)

    screenshot = PILImage.open(save_as)
    original_width, original_height = screenshot.size
    ratio = target_width / float(original_width)
    scaled_size = (target_width, int(float(original_height) * ratio))
    screenshot.resize(scaled_size, PILImage.LANCZOS).save(save_as)

    display(Image(filename=save_as, width=target_width))

# Flash a visual highlight on an element of the page
def emphasize_element(browser, web_element, highlight_duration=2):
    """Highlight ``web_element`` for ``highlight_duration`` seconds, then undo it.

    The element's current ``style`` attribute is read once; a red border and
    yellow background are appended to it through JavaScript, and after the
    pause the attribute is set back to the value that was read.

    Args:
        browser: Driver-like object exposing ``execute_script``.
        web_element: Element exposing ``get_attribute``.
        highlight_duration: How long the highlight stays visible, in seconds.
    """
    add_highlight_js = """
    arguments[0].setAttribute('style', arguments[1] + '; border: 2px solid red; background: yellow; color: black;');
    """
    reset_style_js = """
    arguments[0].setAttribute('style', arguments[1]);
    """

    inline_style = web_element.get_attribute("style")

    browser.execute_script(add_highlight_js, web_element, inline_style)
    time.sleep(highlight_duration)
    browser.execute_script(reset_style_js, web_element, inline_style)



## B - Example of Browser - Filling out a form

In [None]:
# Initialise colorama so the coloured messages below render properly
init()

# Browser configuration
print(f"{Fore.CYAN}Configuration du navigateur en cours...{Style.RESET_ALL}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")

# Purely cosmetic progress bar: nothing is loading during these sleeps
for _ in tqdm(range(10), desc="Initialisation WebDriver"):
    time.sleep(0.2)

# Start the browser with the options configured above
driver = webdriver.Chrome(options=chrome_options)

try:
    print(f"\n{Fore.CYAN}📝 Remplissage d'un formulaire{Style.RESET_ALL}")
    driver.get("https://httpbin.org/forms/post")
    time.sleep(2)
    capture_and_show_screenshot(driver, "form_initial.png")

    print("Chargement du formulaire terminé...")

    # Fill the form field by field; any failure is reported, not raised
    try:
        # Customer name
        name_field = driver.find_element(By.NAME, "custname")
        emphasize_element(driver, name_field)
        human_typing_effect(name_field, "Amaury Gellé")

        # Phone number
        phone_field = driver.find_element(By.NAME, "custtel")
        emphasize_element(driver, phone_field)
        human_typing_effect(phone_field, "0123456789")

        # E-mail address
        email_field = driver.find_element(By.NAME, "custemail")
        emphasize_element(driver, email_field)
        human_typing_effect(email_field, "amaurygelle@gmail.com")

        # Pizza size
        pizza_size = driver.find_element(By.CSS_SELECTOR, "input[value='medium']")
        emphasize_element(driver, pizza_size)
        pizza_size.click()

        # Topping: several inputs share the name "topping", so target the
        # one whose value is 'mushroom' and click it. Overwriting the
        # `value` of the first match would rename that option without
        # selecting anything.
        topping_field = driver.find_element(
            By.CSS_SELECTOR, "input[name='topping'][value='mushroom']"
        )
        emphasize_element(driver, topping_field)
        topping_field.click()

        # Delivery time
        delivery_time = driver.find_element(By.NAME, "delivery")
        emphasize_element(driver, delivery_time)
        human_typing_effect(delivery_time, "18:30")

        # Comments
        comments_field = driver.find_element(By.NAME, "comments")
        emphasize_element(driver, comments_field)
        human_typing_effect(comments_field, "Please leave it in front of the doorway - thanks!")

        capture_and_show_screenshot(driver, "form_filled.png")

        # Submit the form
        print("Envoi du formulaire en cours...")
        # NOTE(review): matching on `type='submit'` only works when the
        # attribute is spelled out in the markup; "form button" also matches
        # a button that relies on the default type — confirm against the page.
        submit_btn = driver.find_element(By.CSS_SELECTOR, "form button")
        emphasize_element(driver, submit_btn)
        submit_btn.click()

        time.sleep(2)
        capture_and_show_screenshot(driver, "form_submitted.png")

    except Exception as error:
        print(f"Une erreur est survenue lors du remplissage du formulaire : {error}")
finally:
    # Always release the Chrome process, even if navigation failed
    driver.quit()


## C - Mimicking a Google search

In [None]:
# Initialise colorama again for coloured output in this cell
init()

# Browser preparation
print(f"{Fore.CYAN}Initialisation du navigateur...{Style.RESET_ALL}")

browser_options = webdriver.ChromeOptions()
browser_options.add_argument("--start-maximized")
browser_options.add_argument("--disable-notifications")

# Purely cosmetic progress bar: nothing is loading during these sleeps
for _ in tqdm(range(10), desc="Démarrage WebDriver"):
    time.sleep(0.2)

# Launch the browser
driver = webdriver.Chrome(options=browser_options)

# Candidate locations for a result description, tried in order. Each one is
# looked up on its own so a miss on one XPath does not discard the others.
description_xpaths = (
    "./ancestor::*[3]//div[string-length(text()) > 50]",
    "./following::div[string-length(text()) > 50][1]",
    "./ancestor::*//div[contains(@class, 'desc') or contains(@class, 'snippet') or contains(@class, 'description')]",
)

gathered_results = []

try:
    print(f"{Fore.GREEN}Recherche interactive sur Google{Style.RESET_ALL}")
    print("Accès à Google et gestion des cookies...")

    # Open Google
    driver.get("https://www.google.com")
    time.sleep(2)

    # Cookie banner: best effort, the banner is not always shown
    try:
        accept_cookies_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Accept all')]"))
        )
        accept_cookies_btn.click()
        print("Cookies acceptés ✓")
        time.sleep(1)
    except Exception:
        print("Aucun cookie à accepter ou déjà accepté.")

    # Type the query into the search box
    search_input = driver.find_element(By.NAME, "q")
    emphasize_element(driver, search_input)
    query = "Albert School data courses"
    print(f"Recherche de : '{query}'")

    human_typing_effect(search_input, query)
    capture_and_show_screenshot(driver, "google_search_typing.png")

    # Submit the search
    search_input.send_keys(Keys.RETURN)
    time.sleep(3)
    capture_and_show_screenshot(driver, "google_results.png")

    # Scroll through the page and capture it
    print("Défilement de la page...")
    perform_smooth_scroll(driver, total_time=4)
    capture_and_show_screenshot(driver, "google_results_scrolled.png")

    # Result extraction
    print(f"{Fore.BLUE}Extraction des liens et descriptions...{Style.RESET_ALL}")

    # Wait until at least one absolute https link is present
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[href^='https://']"))
        )
        print("Liens détectés ✓")
    except Exception:
        print("Timeout pour les résultats.")

    time.sleep(1)

    # Select https links whose href does not contain 'google'
    found_links = driver.find_elements(By.CSS_SELECTOR, "a[href^='https://']:not([href*='google'])")

    print(f"Nombre de liens détectés : {len(found_links)}")

    for item in found_links:
        try:
            link_url = item.get_attribute("href")
            link_title = item.text

            # Links without visible text: fall back to a nearby <h3>
            if not link_title.strip():
                try:
                    h3_element = item.find_element(By.XPATH, "./ancestor::*//h3 | .//*//h3 | ./following::h3[1] | ./preceding::h3[1]")
                    link_title = h3_element.text
                except Exception:
                    link_title = "Titre non trouvé"

            # Keep the first candidate that yields non-empty text
            link_description = "Description indisponible"
            for description_xpath in description_xpaths:
                try:
                    candidate_text = item.find_element(By.XPATH, description_xpath).text.strip()
                except Exception:
                    continue
                if candidate_text:
                    link_description = candidate_text
                    break

            if link_title.strip() and link_url and "google" not in link_url.lower():
                gathered_results.append({
                    "Title": link_title,
                    "URL": link_url,
                    "Description": link_description
                })

        except Exception as error:
            print(f"Erreur d'extraction : {str(error)[:100]}...")
            continue
finally:
    # Everything needed has been copied into plain Python objects by now;
    # always release the Chrome process, even if a step above failed.
    driver.quit()

# Remove duplicates, keeping the first occurrence of each URL
final_results = []
urls_seen = set()
for entry in gathered_results:
    if entry["URL"] not in urls_seen:
        urls_seen.add(entry["URL"])
        final_results.append(entry)

# Preview of the first results
print(f"\n{Fore.YELLOW}Aperçu des premiers résultats ({len(final_results)} totalisés) :{Style.RESET_ALL}")
for idx, entry in enumerate(final_results[:3], 1):
    print(f"\nRésultat {idx}:")
    print(f"Title: {entry['Title']}")
    print(f"URL: {entry['URL']}")
    print(f"Description: {entry['Description'][:100]}...")

# Save the results to a timestamped CSV file
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output_csv = f"google_search_results_{current_time}.csv"

with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
    headers = ['Title', 'URL', 'Description']
    csv_writer = csv.DictWriter(csv_file, fieldnames=headers)
    csv_writer.writeheader()
    csv_writer.writerows(final_results)

print(f"\n{Fore.GREEN}Résultats exportés dans {output_csv} ({len(final_results)} entrées){Style.RESET_ALL}")

print("\nExportation terminée !")
# Purely cosmetic closing progress bar
for _ in tqdm(range(5), desc="Clôture"):
    time.sleep(0.3)



# I - Web scraping IMDb (Beautiful Soup)

## A - Making a request on the Top 250 movies page

In [None]:
# IMDb's crawling rules are published in its robots.txt:
# https://www.imdb.com/robots.txt

# Request to the IMDb Top 250 page
imdb_url = "https://www.imdb.com/chart/top/"
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

# Send the request. requests applies no timeout unless one is given, so
# without it this cell could block indefinitely on an unresponsive server.
imdb_response = requests.get(imdb_url, headers=request_headers, timeout=10)

print(f"Statut de la requête : {imdb_response.status_code}")
if imdb_response.status_code == 200:
    print("Connexion établie avec succès.")
else:
    print("Échec d'accès à la page IMDb.")
print('')


In [None]:
# Build a BeautifulSoup tree from the HTML text of the response,
# using the "html.parser" backend
page_soup = BeautifulSoup(markup=imdb_response.text, features="html.parser")

# Print the text of the page's <title> tag as a quick validation
print(page_soup.title.string)


## B - Starting small - retrieving the information of the first movie

In [None]:
# select_one() returns only the first element matching a CSS selector

top_movie_element = page_soup.select_one('.ipc-metadata-list-summary-item__c')
print(top_movie_element)

# Title
top_movie_title = top_movie_element.select_one('.ipc-title__text')
print(top_movie_title)

# Rating
rating_element = top_movie_element.select_one('.ipc-rating-star--rating')
movie_rating = float(rating_element.text)
print(movie_rating)

# Year: use the year selector from the whole-page extraction loop rather
# than the rating selector, which printed the rating a second time.
# NOTE(review): '.sc-5179a348-7' looks like a generated class name and may
# change between site builds — confirm against the live markup.
year_element = top_movie_element.select_one('.sc-5179a348-7')
print(year_element.text if year_element else None)

# Print the rating once more as a check; the value is already parsed above,
# so no second lookup is needed
print(movie_rating)


## C - Retrieve data for the whole page

In [None]:
# Parallel lists: index i of each list describes the same movie block
titles_list, years_list, ratings_list = [], [], []

# Every movie container found in the parsed page
movie_blocks = page_soup.select('.ipc-metadata-list-summary-item')

for block in movie_blocks:
    # Title (None when the block has no title element)
    title_node = block.select_one('.ipc-title__text')
    titles_list.append(title_node.text.strip() if title_node else None)

    # Year (None when the block has no year element)
    year_node = block.select_one('.sc-5179a348-7')
    years_list.append(year_node.text.strip() if year_node else None)

    # Rating, converted to float (None when the block has no rating element)
    rating_node = block.select_one('.ipc-rating-star--rating')
    ratings_list.append(float(rating_node.text.strip()) if rating_node else None)

print('Titres :', titles_list)
print('Années :', years_list)
print('Notes :', ratings_list)



## D - Creating a dataframe and saving the data

In [None]:
# Build the DataFrame from the three parallel lists

movies_dataframe = pd.DataFrame({
    'Title': titles_list,
    'Year': years_list,
    'Rating': ratings_list
})

# A bare expression is only rendered when it is the last line of a cell,
# so display() is required to actually show the preview here
display(movies_dataframe.head())

# Show the current working directory (the output path below is relative to it)
current_dir = os.getcwd()
print(f"Dossier de travail actuel : {current_dir}")

# Save the DataFrame in a dedicated folder

output_path = os.path.join('..', 'notebooks')
output_file = 'imdb_top_movies.csv'
# to_csv() does not create missing directories, so create the folder first
os.makedirs(output_path, exist_ok=True)
movies_dataframe.to_csv(os.path.join(output_path, output_file), index=False)
print(f"Données exportées vers {output_path}/{output_file}")


# II - Web scraping an Olist website

## A - Fetch one product

In [None]:
# Chrome configured to run headless
headless_options = Options()
headless_options.add_argument('--headless')
headless_options.add_argument('--no-sandbox')
headless_options.add_argument('--disable-dev-shm-usage')
headless_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36')

# Install (or reuse) a ChromeDriver binary and wrap it in a Service
browser_service = Service(ChromeDriverManager().install())

# Create the browser; it is left open because the next cell reuses it
browser = webdriver.Chrome(service=browser_service, options=headless_options)

product_url = "https://www.sunsetcosmeticos.com.br/produto/base-cobertura-total-maple-absolute-new-york-208"

browser.get(product_url)

# Extract the title, price and rating elements and print their text
product_title = browser.find_element(By.CSS_SELECTOR, 'h1.name')
print(product_title.text)

product_price = browser.find_element(By.CSS_SELECTOR, 'span.cmp-price-price')
print(product_price.text)

product_rating = browser.find_element(By.CSS_SELECTOR, 'p.value')
print(product_rating.text)


In [None]:
# Transformation du code en fonction

def scrape_product_info(browser, product_link):
    """Open ``product_link`` and read the product's title, price and rating.

    Each field is looked up independently by CSS selector. The text found is
    printed; when a lookup fails for any reason, an error message is printed
    and the field is set to ``None``.

    Args:
        browser: Driver-like object exposing ``get`` and ``find_element``.
        product_link: URL of the product page.

    Returns:
        dict with the keys ``'title'``, ``'price'`` and ``'rating'``.
    """
    browser.get(product_link)

    # (result key, CSS selector, message printed when the lookup fails)
    field_specs = (
        ('title', 'h1.name', 'Erreur lors de la récupération du titre'),
        ('price', 'span.cmp-price-price', 'Erreur lors de la récupération du prix'),
        ('rating', 'p.value', 'Erreur lors de la récupération de la note'),
    )

    product_data = {}
    for field_name, css_selector, failure_message in field_specs:
        try:
            field_value = browser.find_element('css selector', css_selector).text
            print(field_value)
        except Exception:
            field_value = None
            print(failure_message)
        product_data[field_name] = field_value

    return product_data

# Run the function as a test on the product page opened in the previous cell.
# Reusing `product_url` avoids retyping the address: the hand-typed copy here
# read "corbertura" where the previous cell has "cobertura".
test_product_info = scrape_product_info(browser, product_url)

# The following cells start their own browser, so release this one now
browser.quit()

test_product_info



## B - Fetch one brand

In [None]:
# Start by fetching the URL of the first product on the brand page,
# then pass it to scrape_product_info

# Chrome configured to run headless
headless_options = Options()
headless_options.add_argument('--headless')
headless_options.add_argument('--no-sandbox')
headless_options.add_argument('--disable-dev-shm-usage')
headless_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36')

# Install (or reuse) a ChromeDriver binary and wrap it in a Service
chrome_service = Service(ChromeDriverManager().install())

# Create the headless browser
browser = webdriver.Chrome(service=chrome_service, options=headless_options)

# NOTE(review): the rest of this notebook addresses this brand page with the
# hyphenated slug 'absolute-new-york' (underscores are only used for the CSV
# file name), so the underscore form looked like a typo — confirm on the site.
store_url = 'https://www.sunsetcosmeticos.com.br/absolute-new-york'

try:
    browser.get(store_url)

    # URL of the first product link found on the page
    first_product_link = browser.find_element(By.CSS_SELECTOR, 'div.product-block a.anchor')
    first_product_href = first_product_link.get_attribute('href')
    print("URL du premier produit :", first_product_href)

    # Scrape that product's information
    scrape_product_info(browser, first_product_href)
finally:
    # Close the browser even when a lookup above fails
    browser.quit()


In [None]:
# Generalise the extraction to every product on the page (no scrolling yet)

browser = webdriver.Chrome(service=chrome_service, options=headless_options)

try:
    # A freshly created browser starts on a blank page: load the brand page
    # first, otherwise no product link can be found
    browser.get(store_url)

    # Links of all the products currently present on the page
    product_links = browser.find_elements(By.CSS_SELECTOR, 'div.product-block a.anchor')
    # Read the hrefs up front: navigating away invalidates the elements
    all_product_urls = [link.get_attribute('href') for link in product_links]

    # Scrape the information of each product
    for product_link in all_product_urls:
        scrape_product_info(browser, product_link)
finally:
    # Close the browser even when a step above fails
    browser.quit()




In [None]:
# Scroll down repeatedly so that more products get loaded

def scroll_until_loaded(browser, max_attempts=20):
    '''
    Scroll to the bottom of the page until no new product appears.

    The number of "product-block" elements is counted before and after each
    scroll (with a 2-second wait in between). Scrolling stops as soon as the
    count no longer changes, or after max_attempts scrolls that each changed
    the count.
    '''
    def count_products():
        return len(browser.find_elements(By.CLASS_NAME, "product-block"))

    known_total = count_products()

    for _ in range(max_attempts):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give the page time to load

        latest_total = count_products()
        if latest_total == known_total:
            return
        known_total = latest_total

# Collect every product of one brand

def collect_brand_products(browser, brand_name):
    '''
    Scrape all the products listed on a brand page.

    Steps:
    - open the brand page built from brand_name
    - scroll until the product list stops growing
    - read the link of each product
    - visit each link and extract the product information

    Returns a list of product dictionaries.
    '''
    browser.get(f'https://www.sunsetcosmeticos.com.br/{brand_name}')
    scroll_until_loaded(browser)

    # Read the hrefs before navigating away from the listing page
    product_links = [
        anchor.get_attribute('href')
        for anchor in browser.find_elements(By.CSS_SELECTOR, 'div.product-block a.anchor')
    ]

    print(f"Nombre de produits trouvés pour la marque '{brand_name}' : {len(product_links)}")

    # Pages are visited lazily, in listing order; falsy results are dropped
    scraped = (scrape_product_info(browser, link) for link in product_links)
    return [product_info for product_info in scraped if product_info]


In [None]:
# Try the brand scraper on a single brand

browser = webdriver.Chrome(service=chrome_service, options=headless_options)

brand_name = 'absolute-new-york'
try:
    absolute_new_york_products = collect_brand_products(browser, brand_name)
finally:
    # Close the browser even when scraping fails
    browser.quit()

df_products = pd.DataFrame(absolute_new_york_products)

# A bare expression is only rendered when it is the last line of a cell,
# so display() is required to actually show the preview here
display(df_products.head(5))

# Save the DataFrame in the chosen folder

output_filename = brand_name.replace('-', '_') + '.csv'
print(output_filename)
output_directory = os.path.join('..', 'notebooks')
# to_csv() does not create missing directories, so create the folder first
os.makedirs(output_directory, exist_ok=True)
df_products.to_csv(os.path.join(output_directory, output_filename), index=False)
print(f"Données enregistrées dans {output_filename} ({len(df_products)} produits)")


## C - All brands

In [None]:
# Final function: scrape every brand listed on the brands index page

def scrape_all_brands(browser, start_url='https://www.sunsetcosmeticos.com.br/p/marcas'):
    """
    Scrape the products of every brand linked from ``start_url``.

    The brand links are read from the index page; the last path segment of
    each link is used as the brand slug and passed to
    ``collect_brand_products``. Every product dictionary returned gets a
    ``'brand'`` key holding that slug.

    A failure while processing one brand is printed and that brand is
    skipped; the remaining brands are still processed.

    Args:
        browser: Driver-like object exposing ``get`` and ``find_elements``.
        start_url: URL of the page listing all brands.

    Returns:
        list of product dictionaries across all brands.
    """
    # `time` comes from the notebook's import cell; re-importing it inside
    # the function was redundant.
    collected_products = []

    browser.get(start_url)
    time.sleep(2)

    brand_links = browser.find_elements(By.CSS_SELECTOR, 'div.tag a.anchor')
    # Read the hrefs up front: navigating away invalidates the elements
    brand_pages = [link.get_attribute('href') for link in brand_links]

    print(f"Nombre de marques détectées : {len(brand_pages)}")

    for brand_page in brand_pages:
        try:
            # Last path segment of the link, ignoring any trailing slash
            brand_slug = brand_page.strip('/').split('/')[-1]

            print(f"\n🔎 Traitement de la marque : {brand_slug}")
            products_list = collect_brand_products(browser, brand_slug)
            print(f"{len(products_list)} produits collectés pour {brand_slug}")

            for product in products_list:
                product['brand'] = brand_slug

            collected_products.extend(products_list)

        except Exception as error:
            print(f"Erreur lors du traitement de la marque '{brand_page}': {error}")
            continue

    return collected_products



In [None]:
# Full run: scrape every product of sunsetcosmeticos and save the result

browser = webdriver.Chrome(service=chrome_service, options=headless_options)

try:
    all_scraped_products = scrape_all_brands(browser)
finally:
    # Close the browser even when the (long) scraping run fails midway
    browser.quit()

df_all_brands = pd.DataFrame(all_scraped_products)

# Rich preview of the first rows instead of a plain-text print
display(df_all_brands.head(3))
print(f"\nTotal : {len(df_all_brands)} produits récupérés.")

df_all_brands.to_csv('all_brands.csv', index=False, encoding='utf-8-sig')

