# Phase 1 – Scraping Workflow and Strategy

With the HTML structure and data fields defined, the scraping workflow is designed to extract all job offers efficiently and reliably while respecting ethical standards.

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# --- Configuration ---
# Search-results URL that main() opens first (query string kept verbatim).
BASE_SEARCH_URL = (
    "https://www.hellowork.com/fr-fr/emploi/recherche.html"
    "?k=job+%C3%A9tudiant&st=relevance"
)
# Upper bound on result pages per sector; presumably consumed by the sector
# loop that is omitted from this notebook view.
MAX_PAGES_PER_SECTOR = 10

# (sector id, display name) pairs.
# ... (Add full list here if needed, truncated for brevity)
_SECTOR_PAIRS = (
    ("Agri_peche", "Agriculture • Pêche"),
    ("BTP", "BTP"),
    ("Transport_logist", "Transport • Logistique"),
)
SECTORS_LIST = [
    {"id": sector_id, "name": label} for sector_id, label in _SECTOR_PAIRS
]

def setup_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    The browser is started maximized and with a fixed desktop user-agent
    string. The chromedriver executable is obtained through
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in ("--start-maximized", f"user-agent={user_agent}"):
        chrome_options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    chrome_service = Service(chromedriver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def handle_cookies(driver):
    """Dismiss the cookie notice if it is shown (best-effort).

    Waits up to 4 seconds for the button with id
    ``hw-cc-notice-continue-without-accepting-btn`` to become clickable,
    clicks it, then pauses one second.

    This function never raises for Selenium or other ordinary errors: a
    missing banner is silently ignored and any other failure is printed.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed, so the
    user can still abort a run while the wait is in progress.

    Args:
        driver: Selenium WebDriver whose current page may show the notice.
    """
    try:
        cookie_btn = WebDriverWait(driver, 4).until(
            EC.element_to_be_clickable(
                (By.ID, "hw-cc-notice-continue-without-accepting-btn")
            )
        )
        cookie_btn.click()
        # Short pause after the click; presumably gives the banner time to
        # disappear before the page is used further.
        time.sleep(1)
    except TimeoutException:
        # Expected case: the button never became clickable within the wait,
        # i.e. no banner to dismiss.
        pass
    except Exception as exc:
        # Stay best-effort, but make unexpected failures visible instead of
        # hiding them.
        print(f"Could not dismiss cookie banner: {exc}")

def _extract_text(driver, css_selector, default="N/A"):
    """Return the stripped text of the first element matching *css_selector*.

    Returns *default* when the element is absent or Selenium fails while
    reading it, so a single missing field never aborts the whole record.
    """
    # Function-scope import keeps this block self-contained; selenium is
    # already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    try:
        return driver.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except WebDriverException:
        return default


def scrape_job_details(driver, url, sector_name):
    """Load a job-offer page and extract its fields into a dict.

    Every field starts as ``"N/A"`` and is overwritten only when the
    corresponding element can be read, so the returned dict always has the
    same keys. Selenium errors on an individual field leave that field at
    ``"N/A"``; any other error during extraction (including the initial wait
    for the ``h1`` timing out) is printed and the fields gathered so far are
    returned.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the job-offer page; also stored under ``"URL"``.
        sector_name: Label stored under ``"Sector"``.

    Returns:
        dict with keys ``Sector``, ``Job_Title``, ``Company``, ``Location``,
        ``Contract``, ``Salary``, ``Description``, ``Publication_Date`` and
        ``URL``.

    Raises:
        Whatever ``driver.get(url)`` raises; navigation is deliberately not
        guarded, matching the previous behaviour.
    """
    from selenium.common.exceptions import WebDriverException

    driver.get(url)
    data = {
        "Sector": sector_name,
        "Job_Title": "N/A",
        "Company": "N/A",
        "Location": "N/A",
        "Contract": "N/A",
        "Salary": "N/A",
        "Description": "N/A",
        "Publication_Date": "N/A",
        "URL": url
    }

    try:
        # Do not query fields until the page has rendered its <h1>.
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        data["Job_Title"] = _extract_text(driver, '[data-cy="jobTitle"]')
        data["Company"] = _extract_text(driver, 'h1 a')

        # The first two items of the tag list are read as location and
        # contract type, in that order; missing items keep their "N/A".
        try:
            tags = driver.find_elements(
                By.CSS_SELECTOR, 'ul.tw-flex.tw-flex-wrap.tw-gap-3 li'
            )
            for field, tag in zip(("Location", "Contract"), tags):
                data[field] = tag.text.strip()
        except WebDriverException:
            pass

        data["Salary"] = _extract_text(
            driver, '[data-cy="salary-tag-button"]'
        )

        # Flatten the description onto a single line.
        description = _extract_text(
            driver, '[data-truncate-text-target="content"]'
        )
        data["Description"] = description.replace("\n", " ").strip()

        data["Publication_Date"] = _extract_text(
            driver, '[data-cy="publishDate"]'
        )

    except Exception as e:
        print(f"Error extracting details for {url}: {e}")

    return data

def main():
    """Open the search page in a fresh browser and dismiss the cookie notice.

    The per-sector scraping loop is omitted in this notebook view; only the
    browser lifecycle is shown. The browser is always closed, even if an
    error occurs.
    """
    browser = setup_driver()
    collected = []  # placeholder for the results of the omitted sector loop
    try:
        browser.get(BASE_SEARCH_URL)
        handle_cookies(browser)
        # Loop sector logic omitted for brevity in notebook view, same as script
        # ...
    finally:
        # Always release the browser process.
        browser.quit()

if __name__ == "__main__":
    # The call below is intentionally disabled so that executing this
    # notebook cell does not start a scraping run; uncomment it to run.
    # main()
    pass