In [None]:
"""
Script de scraping The Motley Fool - Version Production (Locale + Tickers).
Cible : ID 'article-body-transcript'
"""

import os
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# Plus besoin de webdriver_manager car on utilise le driver local
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# --- CONFIGURATION ---
# Output CSV; it is also read back at start-up so an interrupted run can resume.
CSV_FILENAME = "cac40_transcripts_fool_2024.csv"
# Year injected into the Google query and required in each result title.
TARGET_YEAR = "2024"
# Quarters looked up for every company.
QUARTERS = ["Q1", "Q2", "Q3", "Q4"]

# Companies to scrape, one "Name (TICKER)" string per entry. The search code
# splits each entry on "(" to get the search name and the ticker.
# NOTE(review): despite the CAC40 name, the entries below appear to be
# US-listed large caps rather than CAC 40 constituents - confirm the intended
# universe before renaming anything (the CSV filename carries the same prefix).
CAC40_COMPANIES = [
    # --- Big Tech & technology (must-haves) ---
    "Apple Inc. (AAPL)",
    "Microsoft Corporation (MSFT)",
    "Nvidia Corporation (NVDA)",
    "Alphabet Inc. (GOOGL)",
    "Amazon.com Inc. (AMZN)",
    "Meta Platforms (META)",
    "Tesla Inc. (TSLA)",
    "Broadcom Inc. (AVGO)",
    "Adobe Inc. (ADBE)",
    "Salesforce (CRM)",
    "Cisco Systems (CSCO)",
    "Oracle Corporation (ORCL)",
    "Intel Corporation (INTC)",
    "AMD (AMD)",
    "Netflix (NFLX)",
    "IBM (IBM)",

    # --- Finance & banking ---
    "Berkshire Hathaway (BRK-B)",
    "JPMorgan Chase (JPM)",
    "Visa Inc. (V)",
    "Mastercard (MA)",
    "Bank of America (BAC)",
    "Wells Fargo (WFC)",
    "Goldman Sachs (GS)",
    "Morgan Stanley (MS)",
    "American Express (AXP)",

    # --- Consumer & luxury ---
    "Walmart (WMT)",
    "Procter & Gamble (PG)",
    "Costco Wholesale (COST)",
    "Coca-Cola Company (KO)",
    "PepsiCo (PEP)",
    "McDonald's (MCD)",
    "Nike (NKE)",
    "Starbucks (SBUX)",
    "Home Depot (HD)",
    "Walt Disney (DIS)",

    # --- Healthcare & pharma ---
    "Eli Lilly (LLY)",
    "UnitedHealth Group (UNH)",
    "Johnson & Johnson (JNJ)",
    "Merck & Co. (MRK)",
    "AbbVie (ABBV)",
    "Pfizer (PFE)",
    "Thermo Fisher Scientific (TMO)",
    "Abbott Laboratories (ABT)",

    # --- Industrials & energy ---
    "Exxon Mobil (XOM)",
    "Chevron (CVX)",
    "General Electric (GE)",
    "Caterpillar (CAT)",
    "Boeing (BA)",
    "Lockheed Martin (LMT)",
    "Honeywell (HON)"
]

def close_marketing_popup(driver):
    """Close an intrusive marketing overlay if one is covering the page.

    After a short wait, each candidate CSS selector is tried in order and the
    first matching element that is displayed gets clicked.

    Args:
        driver: Selenium WebDriver positioned on the page to clean up.

    Returns:
        True if a displayed close button was found and clicked, False otherwise.
    """
    print("[POPUP] Vérification des publicités intrusives...")

    # These overlays sometimes need 1-2 seconds to appear after page load.
    time.sleep(2)

    # Candidate selectors for the "close" control, tried in order.
    close_selectors = (
        "button[aria-label='Close']",           # most standard (accessibility label)
        "button[class*='close']",               # buttons whose class contains "close"
        "div[class*='modal'] button",           # any button inside a modal
        "svg[data-icon='times']",               # vector "X" icon
        ".inf-close-icon",                      # site-specific class seen occasionally
        "button#onetrust-close-btn-handler",    # sometimes tied to the cookie banner
    )

    for selector in close_selectors:
        try:
            close_btn = driver.find_element(By.CSS_SELECTOR, selector)
            if close_btn.is_displayed():
                print(f"[POPUP] Publicité détectée. Tentative de fermeture avec : {selector}")
                close_btn.click()
                time.sleep(1)  # let the closing animation finish
                return True
        except Exception:
            # Best effort: a missing or unclickable element just means this
            # selector does not apply. Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit propagate so Ctrl-C still stops the run.
            continue

    return False

def setup_driver():
    """Build a Chrome WebDriver backed by a chromedriver binary in the working directory.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        FileNotFoundError: if the chromedriver executable is not present in
            the current working directory.
    """
    chrome_options = Options()

    # Command-line switches, added in the same order as before.
    for switch in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "start-maximized",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    ):
        chrome_options.add_argument(switch)

    # Experimental options that tone down the automation fingerprint.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # The driver binary is expected next to wherever the script is launched from.
    working_dir = os.getcwd()
    binary_name = "chromedriver.exe" if os.name == 'nt' else "chromedriver"
    binary_path = os.path.join(working_dir, binary_name)

    if not os.path.exists(binary_path):
        raise FileNotFoundError(f"Le fichier {binary_name} est introuvable dans {working_dir} !")

    return webdriver.Chrome(
        service=Service(executable_path=binary_path),
        options=chrome_options,
    )

def load_existing_data():
    """Load previously scraped transcripts so an interrupted run can resume.

    Returns:
        A ``(records, keys)`` tuple: ``records`` is the list of row dicts read
        from ``CSV_FILENAME`` and ``keys`` is the set of ``"{company}_{quarter}"``
        strings already present. Both are empty when the file is missing,
        empty, unreadable, or lacks the required columns.
    """
    if not os.path.exists(CSV_FILENAME):
        return [], set()

    try:
        df = pd.read_csv(CSV_FILENAME)
    except pd.errors.EmptyDataError:
        # A zero-byte file simply means nothing was saved yet.
        return [], set()
    except Exception as e:
        # Still best effort, but no longer silent: the caller restarts from
        # scratch, so the operator must know the previous file was not reused.
        print(f"[ERREUR LECTURE] {CSV_FILENAME} illisible ({e}) - demarrage sans reprise.")
        return [], set()

    if df.empty:
        return [], set()

    # The resume key is built from these two columns; without them nothing
    # can be matched against pending tasks.
    missing_columns = {"company", "quarter"} - set(df.columns)
    if missing_columns:
        print(f"[ERREUR LECTURE] Colonnes manquantes dans {CSV_FILENAME}: {sorted(missing_columns)} - demarrage sans reprise.")
        return [], set()

    existing_records = df.to_dict('records')
    # The unique key uses the full company label (ticker included).
    existing_keys = {f"{row['company']}_{row['quarter']}" for row in existing_records}
    print(f"[INFO] Reprise : {len(existing_records)} transcripts deja recuperes.")
    return existing_records, existing_keys

def save_data_safely(data):
    """Write ``data`` to ``CSV_FILENAME`` through a temporary file.

    The rows are first dumped to ``CSV_FILENAME + ".tmp"`` and that file is
    then moved over the real one with ``os.replace``. Any failure is printed
    and otherwise ignored.

    Args:
        data: rows to persist, in any form accepted by ``pd.DataFrame``.
    """
    tmp_path = f"{CSV_FILENAME}.tmp"
    try:
        pd.DataFrame(data).to_csv(tmp_path, index=False)
        os.replace(tmp_path, CSV_FILENAME)
    except Exception as err:
        print(f"[ERREUR SAUVEGARDE] {err}")

def check_for_captcha(driver):
    """Pause for manual intervention when the current page looks like a captcha wall.

    The lower-cased page source is scanned for a few known marker phrases. On
    a hit, an alert is printed and the function blocks on ``input()`` until
    the operator presses ENTER.

    Args:
        driver: object exposing the current page HTML as ``page_source``.

    Returns:
        True if a marker was found and the operator confirmed, False otherwise
        (including when reading the page source fails).
    """
    markers = ("recaptcha", "unusual traffic", "security check", "verify you are human")
    try:
        html = driver.page_source.lower()
        for marker in markers:
            if marker in html:
                print("\n" + "!"*60)
                print("[ALERTE] CAPTCHA DETECTE - Intervention requise")
                input("Appuyez sur ENTREE une fois le site debloque...")
                time.sleep(2)
                return True
    except Exception:
        # Detection is best effort; any failure is treated as "no captcha".
        pass
    return False

def google_search_quarter_fool(driver, company_full_name, quarter):
    """Search Google for a company's fool.com earnings-call transcript.

    Args:
        driver: Selenium WebDriver used to run the Google search.
        company_full_name: label of the form ``"Name (TICKER)"``; a label
            without a ticker is accepted and matched on the name only.
        quarter: quarter label such as ``"Q1"``.

    Returns:
        A list holding at most one dict with the keys ``company``, ``quarter``,
        ``year``, ``title`` and ``url`` for the first result that passes every
        filter; an empty list when nothing matches or the search fails.
    """
    # Split "Name (TICKER)" once, up front, instead of on every result.
    label_parts = company_full_name.split('(')
    search_name = label_parts[0].strip()
    ticker = label_parts[1].replace(')', '').strip() if len(label_parts) > 1 else ""

    # Match the ticker as a whole token. A plain substring test would let
    # short tickers such as "V" or "MA" match almost any title.
    ticker_pattern = None
    if ticker:
        ticker_pattern = re.compile(
            rf"(?<![a-z0-9]){re.escape(ticker.lower())}(?![a-z0-9])"
        )

    # The company name is deliberately left unquoted: quoting it can be too
    # restrictive for Google, so strictness is enforced on the result titles.
    query = f'site:fool.com {search_name} {quarter} {TARGET_YEAR} "Earnings Call Transcript"'

    print(f"[RECHERCHE] {company_full_name} (Terme: {search_name}) {quarter}...")

    try:
        driver.get("https://www.google.com")
        time.sleep(random.uniform(2, 3))

        # Dismiss the Google cookie-consent dialog when present (best effort).
        try:
            buttons = driver.find_elements(By.TAG_NAME, "button")
            for btn in buttons:
                if "tout accepter" in btn.text.lower() or "accept all" in btn.text.lower():
                    driver.execute_script("arguments[0].click();", btn)
                    time.sleep(1)
                    break
        except Exception:
            pass

        search_box = driver.find_element(By.NAME, "q")
        search_box.clear()
        search_box.send_keys(query)
        search_box.send_keys(Keys.RETURN)

        time.sleep(random.uniform(3, 5))
        check_for_captcha(driver)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each candidate is an <h3> title wrapped in an <a href=...>.
        for h3 in soup.find_all('h3'):
            link_tag = h3.find_parent('a')
            if not link_tag or 'href' not in link_tag.attrs:
                continue

            url = link_tag['href']
            title = h3.get_text()

            if _is_target_transcript(url, title, quarter, search_name, ticker_pattern):
                # Only the first result passing every filter is used.
                return [{
                    "company": company_full_name,
                    "quarter": quarter,
                    "year": TARGET_YEAR,
                    "title": title,
                    "url": url
                }]

        return []

    except Exception as e:
        print(f"[ERREUR GOOGLE] {e}")
        return []


def _is_target_transcript(url, title, quarter, search_name, ticker_pattern):
    """Return True when a search result is the transcript being looked for.

    The URL must contain "fool.com", and the title must mention "transcript",
    the quarter, the target year, and either the company name or its ticker.
    ``ticker_pattern`` is a compiled whole-token pattern, or None when the
    company label carried no ticker.
    """
    if "fool.com" not in url:
        return False

    title_lower = title.lower()
    if "transcript" not in title_lower:
        return False
    if quarter.lower() not in title_lower:
        return False
    if TARGET_YEAR not in title:
        return False

    # Company check: the name as a substring, or the ticker as a whole token.
    if search_name and search_name.lower() in title_lower:
        return True
    return ticker_pattern is not None and ticker_pattern.search(title_lower) is not None

def extract_content_fool(driver, url):
    """Open a fool.com article and return the cleaned transcript text.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: address of the transcript article.

    Returns:
        The transcript text, cut just before the first "Should you invest"
        marker when present, or None if no article container is found or any
        step fails.
    """
    try:
        driver.get(url)
        # Give the page time to finish loading.
        time.sleep(random.uniform(3, 5))

        # 1. Captcha wall (blocks until the operator confirms).
        check_for_captcha(driver)

        # 2. Cookie banner: accept it when present, ignore it otherwise.
        try:
            consent_btn = driver.find_element(By.ID, "onetrust-accept-btn-handler")
            consent_btn.click()
            time.sleep(1)
        except Exception:
            pass

        # 3. Marketing overlay that may sit on top of the article.
        close_marketing_popup(driver)

        page = BeautifulSoup(driver.page_source, 'html.parser')

        # Prefer the dedicated transcript container, then the generic article body.
        body = (
            page.find('div', id='article-body-transcript')
            or page.find('div', class_='article-body')
        )
        if not body:
            return None

        # Drop "Image source:" caption strings.
        for caption in body.find_all(string=lambda s: s and "Image source:" in s):
            caption.extract()

        # Drop ads, widgets and non-text nodes.
        for widget in body.select(".ticker-widget, .promo, script, style"):
            widget.decompose()

        transcript = body.get_text(separator="\n", strip=True)

        # Keep only what precedes the trailing promotional section, if any.
        return transcript.partition("Should you invest")[0]

    except Exception as err:
        print(f"[ERREUR EXTRACTION] {err}")
    return None

def restart_driver(old_driver):
    """Hard-reset the browser: quit the given driver and start a new one.

    Args:
        old_driver: the WebDriver to shut down; errors while quitting are ignored.

    Returns:
        A new driver created by ``setup_driver()``.
    """
    banner = "\n" + "!"*40
    print(banner)
    print("[RESTART] Fermeture de Chrome pour nettoyer la session...")

    try:
        old_driver.quit()
    except Exception:
        # The old session may already be dead; nothing else to clean up.
        pass

    # Safety pause so the system can release the old browser's resources.
    time.sleep(5)

    print("[RESTART] Ouverture d'un nouveau navigateur...")
    new_driver = setup_driver()
    return new_driver



def main():
    """Run the scraping loop over every company and quarter.

    Already-saved (company, quarter) pairs are skipped. Each new transcript
    longer than 2000 characters is appended and the CSV is rewritten at once.
    Ctrl-C stops the loop cleanly, and the browser is always closed on exit.
    """
    print("--- Demarrage Scraping Fool (Local Driver + Tickers) ---")
    records, completed = load_existing_data()
    browser = setup_driver()

    try:
        for company in CAC40_COMPANIES:
            for quarter in QUARTERS:
                key = f"{company}_{quarter}"
                if key in completed:
                    continue

                hits = google_search_quarter_fool(browser, company, quarter)

                if hits:
                    record = hits[0]
                    transcript = extract_content_fool(browser, record['url'])

                    if transcript and len(transcript) > 2000:
                        record['content'] = transcript
                        records.append(record)
                        completed.add(key)
                        # Persist after every success so a crash loses nothing.
                        save_data_safely(records)
                        print(f"[OK] {company} {quarter} -> {len(transcript)} cars.")
                    else:
                        print(f"[WARN] Contenu vide/court pour {company} {quarter}")

                    # Randomised pause between two full lookups.
                    time.sleep(random.uniform(5, 8))
                else:
                    print(f"[SKIP] Rien trouve pour {company} {quarter}")
                    time.sleep(2)
            print("-" * 20)

    except KeyboardInterrupt:
        print("\n[STOP] Arret manuel.")
    finally:
        browser.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

--- Demarrage Scraping Fool (Local Driver + Tickers) ---
[RECHERCHE] Apple Inc. (AAPL) (Terme: Apple Inc.) Q1...
[SKIP] Rien trouve pour Apple Inc. (AAPL) Q1
[RECHERCHE] Apple Inc. (AAPL) (Terme: Apple Inc.) Q2...
[SKIP] Rien trouve pour Apple Inc. (AAPL) Q2
[RECHERCHE] Apple Inc. (AAPL) (Terme: Apple Inc.) Q3...
[SKIP] Rien trouve pour Apple Inc. (AAPL) Q3
[RECHERCHE] Apple Inc. (AAPL) (Terme: Apple Inc.) Q4...
[OK] Apple Inc. (AAPL) Q4 -> 48945 cars.
--------------------
[RECHERCHE] Microsoft Corporation (MSFT) (Terme: Microsoft Corporation) Q1...

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
[ALERTE] CAPTCHA DETECTE - Intervention requise
[OK] Microsoft Corporation (MSFT) Q1 -> 55837 cars.
[RECHERCHE] Microsoft Corporation (MSFT) (Terme: Microsoft Corporation) Q2...

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
[ALERTE] CAPTCHA DETECTE - Intervention requise
[OK] Microsoft Corporation (MSFT) Q2 -> 57637 cars.
[RECHERCHE] Microsoft Corporation (MSFT) (