# Enrichissement Hybride LLM - Seine-Saint-Denis Appels à Projets

Ce notebook scrape ET enrichit les données du site **seine-saint-denis.gouv.fr** avec Claude Sonnet 4.5.

**Filtrage:** Projets publiés en 2025, 2026 et ultérieurs (>= 01/01/2025)

**Approche:** Scraping complet + LLM Claude pour enrichissement structuré

## 1. Imports et configuration

In [None]:
import hashlib
import html
import itertools
import json
import os
import re
import tempfile
import time
import unicodedata
from datetime import datetime
from urllib.parse import urljoin, parse_qs, urlparse

import pandas as pd
import pypdf
import requests
from anthropic import Anthropic  # LLM client
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [None]:
# Load environment variables through python-dotenv (override=True is passed on).
load_dotenv(override=True)

# Report whether the Anthropic API key is available; only its first
# 10 characters are echoed.
claude_api_key = os.environ.get('ANTHROPIC_API_KEY')
if not claude_api_key:
    print("‚ùå ANTHROPIC_API_KEY non trouv√©e dans .env")
    print("   ‚ö†Ô∏è Vous devez ajouter: ANTHROPIC_API_KEY=sk-ant-xxxxxx")
else:
    print(f"‚úÖ ANTHROPIC_API_KEY trouv√©e: {claude_api_key[:10]}...")

## 2. Configuration scraper Seine-Saint-Denis

In [None]:
# Scraper settings: listing URL and the HTTP headers sent with each request.
BASE_URL = "https://www.seine-saint-denis.gouv.fr/Actualites/Appels-a-projets"
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36'
    ),
}

# Date filter: keep items published on or after 01/01/2025.
DATE_FILTER_START = datetime(year=2025, month=1, day=1)
CURRENT_YEAR = datetime.today().year

# One print call, one line per argument.
print(
    f"‚úÖ Configuration pr√™te",
    f"   Base URL: {BASE_URL}",
    f"   Filtre date: >= {DATE_FILTER_START.strftime('%d/%m/%Y')}",
    f"   Ann√©es accept√©es: 2025, 2026+",
    sep='\n',
)

## 3. Scraper les appels à projets de Seine-Saint-Denis

In [None]:
def scrape_seine_saint_denis_aap(base_url, max_pages=5, page_size=10):
    """Scrape the paginated listing of calls for projects (AAP).

    Pages are requested at ``{base_url}/(offset)/{offset}``, the offset
    growing by ``page_size`` after each page. Every candidate container is
    handed to ``extract_aap_item``; an item is kept when it has a publication
    date on or after ``DATE_FILTER_START`` and was not collected already.

    Args:
        base_url: URL of the listing; also used to resolve relative links.
        max_pages: Maximum number of listing pages to request.
        page_size: Offset increment between two consecutive pages.

    Returns:
        A list of dicts as produced by ``extract_aap_item``. The list may be
        partial: scraping stops at the first request/parsing error, when a
        page has no candidate container, or when a page after the first one
        yields no new item.
    """
    aap_list = []
    # The container pattern is loose, so a wrapper element and the element it
    # wraps can both match and yield the same record: de-duplicate on
    # (title, URL).
    seen = set()
    container_class = re.compile(r'(article|appel|item|news|post)', re.I)
    link_href = re.compile(r'appel|projet|actualite', re.I)

    for page_index in range(max_pages):
        url = f"{base_url}/(offset)/{page_index * page_size}"
        print(f"üîÑ Scraping page {page_index + 1}: {url}")

        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Look for article-like containers first, then fall back to links.
            containers = soup.find_all(['article', 'div'], class_=container_class)
            if not containers:
                containers = soup.find_all('a', href=link_href)
        except Exception as e:
            # Best-effort scraper: report the problem and return what we have.
            print(f"   ‚ùå Erreur: {str(e)[:80]}")
            break

        if not containers:
            print(f"   ‚ö†Ô∏è Aucun conteneur trouv√©, fin du scraping")
            break

        page_items = 0
        for container in containers:
            aap = extract_aap_item(container, base_url)
            if not aap or not aap.get('date_publication'):
                continue

            try:
                item_date = pd.to_datetime(aap['date_publication'])
            except (ValueError, TypeError):
                # Unusable date: the item cannot be filtered, skip it.
                continue
            if not (item_date >= DATE_FILTER_START):
                continue

            key = (aap.get('titre'), aap.get('url_source'))
            if key in seen:
                continue
            seen.add(key)
            aap_list.append(aap)
            page_items += 1

        print(f"   ‚úÖ {page_items} AAP valides trouv√©s")

        if page_items == 0 and page_index > 0:
            print(f"   ‚ÑπÔ∏è Aucun nouvel AAP, fin du scraping")
            break

        # Be polite with the remote server between two pages.
        time.sleep(1)

    return aap_list


# Dates written as DD/MM/YYYY or DD-MM-YYYY.
_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')

# An amount followed by a euro marker. The first alternative accepts any
# number of 3-digit groups ("10 000 000", "1.500.000,50"), the second a plain
# number ("5000", "12,5"). The look-behind prevents a match from starting in
# the middle of a longer number. The mis-encoded euro literal of the original
# pattern is kept next to the real euro sign (\u20ac).
_AMOUNT_RE = re.compile(
    r'(?<!\d)(\d{1,3}(?:[\s.]\d{3})+(?:,\d+)?|\d+(?:[.,]\d+)?)\s*(?:\u20ac|‚Ç¨|euros?)',
    re.I,
)


def _parse_fr_date(raw):
    """Return a ``datetime.date`` for a DD/MM/YYYY or DD-MM-YYYY string, else None."""
    try:
        return datetime.strptime(raw.replace('-', '/'), '%d/%m/%Y').date()
    except ValueError:
        return None


def _parse_fr_amount(raw):
    """Convert a French-formatted amount ("10 000,50", "1.500.000") to float, else None."""
    compact = re.sub(r'\s', '', raw)
    if ',' in compact:
        # Comma is the decimal separator; remaining dots are thousands separators.
        integer_part, _, decimals = compact.rpartition(',')
        compact = f"{integer_part.replace('.', '')}.{decimals}"
    elif re.fullmatch(r'\d{1,3}(?:\.\d{3})+', compact):
        # Dots used only as thousands separators.
        compact = compact.replace('.', '')
    try:
        return float(compact)
    except ValueError:
        return None


def extract_aap_item(container, base_url):
    """Extract the data of a single call for projects from a listing element.

    Args:
        container: Parsed HTML element (BeautifulSoup tag) wrapping one item.
        base_url: URL used to resolve the item's relative link.

    Returns:
        A dict with the keys ``titre``, ``url_source``, ``resume``,
        ``date_publication``, ``date_limite``, ``montant_max``, ``organisme``
        and ``id_record`` (values that could not be found are None), or None
        when the element has no usable title or extraction fails.
    """
    aap = {}

    try:
        # Title: first heading/link/span found in the container.
        title_elem = container.find(['h2', 'h3', 'h4', 'a', 'span'])
        if title_elem is None:
            return None
        titre = title_elem.get_text(strip=True)
        if not titre:
            # An element without any title text cannot describe an item.
            return None
        aap['titre'] = titre

        # URL: the container itself when it is a link, else its first link.
        link = container if container.name == 'a' else container.find('a', href=True)
        aap['url_source'] = urljoin(base_url, link.get('href', '')) if link is not None else None

        # Description/summary. The key is always present so that the
        # resulting DataFrame always has a 'resume' column.
        desc = container.find(['p', 'span'], class_=re.compile(r'(desc|summary|excerpt|chapeau)', re.I))
        aap['resume'] = desc.get_text(strip=True) if desc is not None else None

        text_content = container.get_text(' ')

        # First date found = publication date, second one = application deadline.
        dates = _DATE_RE.findall(text_content)
        aap['date_publication'] = _parse_fr_date(dates[0]) if dates else None
        aap['date_limite'] = _parse_fr_date(dates[1]) if len(dates) >= 2 else None

        # Amount: the last amount mentioned in the element.
        amounts = _AMOUNT_RE.findall(text_content)
        aap['montant_max'] = _parse_fr_amount(amounts[-1]) if amounts else None

        # Organisation
        aap['organisme'] = 'Seine-Saint-Denis (93)'

        # Record id: timestamp plus a digest of the title. hashlib is used
        # because the built-in hash() of a str is salted per process.
        title_digest = int(hashlib.md5(titre.encode('utf-8')).hexdigest(), 16) % 10000
        aap['id_record'] = f"ssd_{datetime.now().strftime('%Y%m%d%H%M%S')}_{title_digest}"

        return aap
    except Exception as e:
        # Best-effort extraction: one malformed element must not stop the scrape.
        print(f"  ‚ö†Ô∏è Erreur extraction: {str(e)[:50]}")
        return None

# Run the scraper on at most 5 listing pages and report how many items were kept.
print()
print("üîÑ Scraping Seine-Saint-Denis...")
print()
aap_data = scrape_seine_saint_denis_aap(BASE_URL, max_pages=5)
print(f"\n‚úÖ {len(aap_data)} appels √† projets extraits (filtr√©s 2025+)")

## 4. Créer et nettoyer le DataFrame

In [None]:
# Build the DataFrame from the scraped records; fall back to an empty frame
# when nothing was scraped.
if not aap_data:
    mapped_df_ssd = pd.DataFrame()
    print(f"‚ö†Ô∏è Aucune donn√©e √† traiter")
else:
    mapped_df_ssd = pd.DataFrame(aap_data)
    print(f"üìä DataFrame cr√©√©: {mapped_df_ssd.shape}")
    print(f"   Colonnes: {list(mapped_df_ssd.columns)}")

In [None]:
# Fonction de nettoyage du texte
def clean_text(text):
    """Strip HTML markup from a string and normalise its whitespace.

    Tags are removed, HTML character references are decoded with
    ``html.unescape`` (all named and numeric references, not just a
    hand-picked subset), and every run of whitespace is collapsed into a
    single space.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned, stripped string, or ``text`` itself when it is not a str.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'<[^>]+>', '', text)
    # This function has always mapped &rsquo; to a plain ASCII apostrophe;
    # do it before the generic decoding, which would yield U+2019 instead.
    text = text.replace('&rsquo;', "'")
    text = html.unescape(text)
    # \s also matches the non-breaking space produced by &nbsp;.
    return re.sub(r'\s+', ' ', text).strip()

# Clean every object-dtype column except 'categories'.
if not mapped_df_ssd.empty:
    object_columns = mapped_df_ssd.select_dtypes(include=['object']).columns
    for col in object_columns:
        if col in ['categories']:
            continue
        mapped_df_ssd[col] = mapped_df_ssd[col].apply(clean_text)
    print("‚úÖ Texte nettoy√©")

In [None]:
# Add the columns used by the following cells when the scraper did not
# provide them.
if not mapped_df_ssd.empty:
    # 'resume' is listed too: extract_aap_item only sets it when a summary
    # element is found, while the preview cell reads row['resume'] and the
    # enrichment loop only writes to columns that already exist.
    colonnes_requises = ['resume', 'public_cible', 'taux_financement', 'contact', 'modalite', 'demarches', 'mots_cles', 'objectif', 'montant_min', 'note', 'tags', 'categories']

    for col in colonnes_requises:
        if col not in mapped_df_ssd.columns:
            mapped_df_ssd[col] = None

    if 'perimetre_geo' not in mapped_df_ssd.columns:
        mapped_df_ssd['perimetre_geo'] = 'Seine-Saint-Denis'

    if 'fingerprint' not in mapped_df_ssd.columns:
        def create_fingerprint(row):
            """Return a 12-hex-character MD5 digest of title|organisation|deadline."""
            titre = str(row['titre']) if pd.notna(row['titre']) else ''
            organisme = str(row['organisme']) if pd.notna(row['organisme']) else ''
            date_limite = str(row.get('date_limite', '')) if pd.notna(row.get('date_limite')) else ''
            combined = f"{titre}|{organisme}|{date_limite}"
            return hashlib.md5(combined.encode()).hexdigest()[:12]

        mapped_df_ssd['fingerprint'] = mapped_df_ssd.apply(create_fingerprint, axis=1)

    print(f"‚úÖ DataFrame pr√©par√©: {mapped_df_ssd.shape}")
    print(f"   Colonnes finales: {list(mapped_df_ssd.columns)}")

## 5. Fonctions pour extraction PDF

In [None]:
def extract_pdf_text(pdf_url, max_pages=3):
    """Download a PDF and return the text of its first pages.

    Args:
        pdf_url: URL of the document to download.
        max_pages: Maximum number of pages (from the start) to read.

    Returns:
        The extracted text (one newline appended after each page), or None
        when the download/parsing fails or no text could be extracted.
    """
    tmp_path = None
    try:
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()

        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
            # Remember the path first so the file is removed even if the
            # write below fails.
            tmp_path = tmp.name
            tmp.write(response.content)

        reader = pypdf.PdfReader(tmp_path)
        # `or ''` guards against a page whose extraction yields no text object.
        text = ''.join(
            (page.extract_text() or '') + '\n'
            for page in reader.pages[:max_pages]
        )
        return text if text.strip() else None
    except Exception as e:
        # Best-effort: a broken or non-PDF document must not stop the enrichment.
        print(f"  ‚ö†Ô∏è Erreur PDF {pdf_url}: {str(e)[:50]}")
        return None
    finally:
        # Always delete the temporary file, including when parsing failed.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing useful can be done if the cleanup itself fails.
                pass


def _strip_accents(value):
    """Return ``value`` with diacritics removed (e.g. 'règlement' -> 'reglement')."""
    decomposed = unicodedata.normalize('NFKD', value)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def find_pdf_links(soup, base_url, max_links=2):
    """Find links to PDF-like documents in a parsed page.

    A link is selected when its ``href`` contains "pdf" or when its label,
    lower-cased and stripped of accents, contains one of the document
    keywords (so "Règlement" matches "reglement").

    Args:
        soup: Parsed page exposing ``find_all('a', href=True)``.
        base_url: URL used to resolve relative links.
        max_links: Maximum number of links returned.

    Returns:
        A list of absolute URLs, without duplicates, in page order.
    """
    keywords = ('reglement', 'document', 'cahier', 'guide', 'annexe')
    pdf_links = []
    for link in soup.find_all('a', href=True):
        if len(pdf_links) >= max_links:
            break
        href = link.get('href', '')
        label = _strip_accents(link.get_text().lower())

        if 'pdf' in href.lower() or any(keyword in label for keyword in keywords):
            full_url = urljoin(base_url, href)
            if full_url not in pdf_links:
                pdf_links.append(full_url)

    return pdf_links[:max_links]

print("‚úÖ Fonctions PDF cr√©√©es")

## 6. Classe LLMEnricher

In [None]:
class LLMEnricher:
    """Enrich call-for-projects (AAP) records with an Anthropic Claude model.

    The page text (plus optional PDF excerpts) is sent to the model together
    with a prompt asking for a JSON object describing the call for projects.
    """

    # Maximum number of characters of page/PDF text embedded in the prompt.
    MAX_CONTENT_CHARS = 16000

    def __init__(self, api_key=None, model='claude-sonnet-4-5'):
        """Create the API client.

        Args:
            api_key: Anthropic API key; defaults to the ANTHROPIC_API_KEY
                environment variable.
            model: Model identifier sent with each request.

        Raises:
            ValueError: If no API key is available.
        """
        self.api_key = api_key or os.getenv('ANTHROPIC_API_KEY')
        self.model = model
        self.client = Anthropic(api_key=self.api_key) if self.api_key else None
        self.max_retries = 3
        self.retry_delay = 1

        if not self.client:
            raise ValueError('‚ùå ANTHROPIC_API_KEY non trouv√©e')

    def _build_content(self, html_content, pdf_texts):
        """Return the page text plus PDF excerpts, capped at MAX_CONTENT_CHARS.

        When the total is too long, the page text is shortened first (but
        keeps at least half of the budget) so that the PDF excerpts, which
        are appended last, are not the part that gets cut off.
        """
        limit = self.MAX_CONTENT_CHARS
        page_text = BeautifulSoup(html_content, 'html.parser').get_text('\n')
        if not pdf_texts:
            return page_text[:limit]

        pdf_section = '\n\n--- DOCUMENTS PDF ---\n' + '\n\n'.join(pdf_texts)
        page_budget = max(limit - len(pdf_section), limit // 2)
        page_part = page_text[:page_budget]
        return page_part + pdf_section[:limit - len(page_part)]

    @staticmethod
    def _parse_json_response(response_text):
        """Parse the JSON document contained in a model reply.

        Markdown code fences are removed; if the remaining text is not valid
        JSON as a whole, the span from the first '{' to the last '}' is tried.

        Raises:
            json.JSONDecodeError: If no valid JSON could be parsed.
        """
        cleaned = response_text.replace('```json', '').replace('```', '').strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            start, end = cleaned.find('{'), cleaned.rfind('}')
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])

    def extract_full_page(self, url, html_content, pdf_texts=None):
        """Ask the model for the structured fields of one page.

        API errors are retried up to ``max_retries`` times with exponential
        backoff; a reply that cannot be parsed as JSON is not retried.

        Args:
            url: URL of the page (not used by the request itself).
            html_content: Raw HTML of the page.
            pdf_texts: Optional list of text excerpts from linked PDFs.

        Returns:
            The dict decoded from the model's JSON reply, or None when there
            is no client, the reply is not a JSON object, or all attempts
            failed.
        """
        if not self.client:
            return None

        text_content = self._build_content(html_content, pdf_texts)

        prompt = f"""Tu es un expert en analyse d'appels √† projets fran√ßais (Seine-Saint-Denis).
        
Analyse cette page et extrais les informations manquantes en JSON valide:

{{
   "resume": "R√©sum√© en 1-2 phrases (max 300 caract√®res)",
  "montant_max": montant maximum en euros (nombre ou null),
  "montant_min": montant minimum en euros (nombre ou null),
  "taux_financement": "pourcentage ou description (null si non trouv√©)",
  "categories": ["liste", "de", "cat√©gories"],
  "public_cible": ["associations", "PME", "collectivit√©s"],
  "mots_cles": ["mots-cl√©s", "pertinents"],
  "objectif": "Quel est l'objectif principal",
  "modalite": "Conditions principales",
  "demarches": "Comment candidater",
  "contact": "Email ou t√©l√©phone si trouv√© (ou null)"
}}

IMPORTANT:
- Retourne UNIQUEMENT du JSON valide
- Si une info n'existe pas, mets null
- Les montants doivent √™tre des nombres
- Les listes doivent √™tre des arrays JSON

Contenu:
{text_content}"""

        for attempt in range(self.max_retries):
            try:
                message = self.client.messages.create(
                    model=self.model,
                    max_tokens=1024,
                    messages=[{"role": "user", "content": prompt}]
                )

                data = self._parse_json_response(message.content[0].text)
                # Callers iterate over .items(): anything but an object is unusable.
                return data if isinstance(data, dict) else None
            except json.JSONDecodeError as e:
                print(f"    ‚ùå JSON parsing error: {str(e)[:50]}")
                return None
            except Exception as e:
                error_msg = str(e)
                print(f"    ‚ùå Attempt {attempt + 1}/{self.max_retries} - Error: {error_msg[:80]}")
                if attempt < self.max_retries - 1:
                    # Exponential backoff: retry_delay, 2x, 4x, ...
                    wait_time = self.retry_delay * (2 ** attempt)
                    print(f"    ‚è≥ Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    print(f"    ‚ùå Max retries exceeded")
                    return None

        return None

print("‚úÖ Classe LLMEnricher cr√©√©e")

## 7. Initialiser l'enrichisseur LLM

In [None]:
# Instantiate the enricher; a missing API key (ValueError) leaves `enricher` as None.
try:
    enricher = LLMEnricher()
except ValueError as err:
    print(f"‚ùå {str(err)}")
    enricher = None
else:
    print("‚úÖ LLMEnricher initialis√©")
    print(f"   Mod√®le: claude-sonnet-4-5")
    print(f"   Retry logic: Enabled (max 3 attempts with exponential backoff)")

## 8. Enrichir avec LLM

In [None]:
# Enrich every record: fetch its page, collect the text of up to two linked
# PDFs, ask the LLM for the structured fields and write them into the DataFrame.
if enricher and not mapped_df_ssd.empty:
    total = len(mapped_df_ssd)
    print(f"üîÑ Enrichissement de {total} appels √† projets...\n")

    # `position` is the 1-based progress counter; `idx` is the index label
    # used to write back, so progress stays correct for any index.
    for position, (idx, row) in enumerate(mapped_df_ssd.iterrows(), start=1):
        url = row.get('url_source')
        titre = str(row.get('titre', 'N/A'))[:50]

        if not url or pd.isna(url):
            print(f"‚è≠Ô∏è  [{position}/{total}] {titre}: pas d'URL")
            continue

        print(f"üîÑ [{position}/{total}] {titre}...", end=' ')

        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            pdf_links = find_pdf_links(soup, url)

            pdf_texts = []
            if pdf_links:
                print(f"(+{len(pdf_links)} PDFs)", end=' ')
                for pdf_url in pdf_links:
                    pdf_text = extract_pdf_text(pdf_url)
                    if pdf_text:
                        # Keep the prompt small: 3000 characters per document.
                        pdf_texts.append(pdf_text[:3000])

            extracted = enricher.extract_full_page(url, response.text, pdf_texts)

            if extracted:
                for key, value in extracted.items():
                    # A null from the LLM means "not found": do not erase a
                    # value that the scraper already extracted.
                    if value is None or key not in mapped_df_ssd.columns:
                        continue
                    mapped_df_ssd.at[idx, key] = value
                print("‚úÖ")
            else:
                print("‚ö†Ô∏è  Aucune donn√©e")

        except requests.exceptions.Timeout:
            print("‚è±Ô∏è  Timeout")
        except Exception as e:
            # Best-effort: report and move on to the next record.
            print(f"‚ùå {str(e)[:30]}")

        # Pause between records to spare the remote site and the API.
        time.sleep(1)

    print(f"\n‚úÖ Enrichissement termin√©!")
else:
    if enricher is None:
        print("‚ùå Enrichisseur LLM non disponible")
    if mapped_df_ssd.empty:
        print("‚ùå Pas de donn√©es")

## 9. Statistiques et aperçu

In [None]:
# Fill-rate report: for each key column, the number and share of non-null values.
if not mapped_df_ssd.empty:
    print("üìä Statistiques:")
    print(f"\n‚úÖ Total: {len(mapped_df_ssd)} enregistrements")
    print(f"\n‚úÖ Remplissage:")
    for col in ['titre', 'resume', 'montant_max', 'montant_min', 'categories', 'public_cible', 'mots_cles', 'objectif']:
        if col not in mapped_df_ssd.columns:
            continue
        filled = mapped_df_ssd[col].notna().sum()
        if len(mapped_df_ssd) > 0:
            pct = filled / len(mapped_df_ssd) * 100
        else:
            pct = 0
        print(f"   - {col}: {filled}/{len(mapped_df_ssd)} ({pct:.1f}%)")

In [None]:
# Preview of the first three records.
if not mapped_df_ssd.empty:
    print("\nüìã Aper√ßu:")
    print("\n" + "="*80)
    for idx in range(min(3, len(mapped_df_ssd))):
        row = mapped_df_ssd.iloc[idx]
        # str() guards against a non-string title.
        print(f"\nüìå {str(row['titre'])[:60]}")
        print(f"   Org: {row['organisme']}")
        if pd.notna(row.get('date_publication')):
            print(f"   Date pub: {row['date_publication']}")
        if pd.notna(row.get('montant_max')):
            print(f"   Montant max: {row['montant_max']}‚Ç¨")
        # .get(): the 'resume' column is absent when no summary was scraped.
        if pd.notna(row.get('resume')):
            print(f"   R√©sum√©: {str(row['resume'])[:100]}...")

In [None]:
# Pandas display settings for a full dump of the DataFrame.
if not mapped_df_ssd.empty:
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.width', None)

    print("\nüìä VUE COMPL√àTE DU DATAFRAME")
    print("="*80)

    # Tabular view. to_string() returns a str: print() renders the table,
    # whereas display() would show the repr of that string (quotes and
    # escaped newlines).
    print(mapped_df_ssd.to_string())

In [None]:
# HTML view: render the DataFrame as an HTML table in the notebook output.
if not mapped_df_ssd.empty:
    from IPython.display import HTML
    _table_markup = mapped_df_ssd.to_html()
    display(HTML(_table_markup))

In [None]:
# Transposed view: columns of the DataFrame shown as rows.
if not mapped_df_ssd.empty:
    display(mapped_df_ssd.transpose())

## 10. Exporter les données

In [None]:
# Export the enriched DataFrame to CSV and JSON; each export reports its own failure.
if not mapped_df_ssd.empty:
    # CSV export
    csv_output = '../data/seine_saint_denis_aap_enriched.csv'
    try:
        os.makedirs(os.path.dirname(csv_output), exist_ok=True)
        mapped_df_ssd.to_csv(csv_output, index=False, encoding='utf-8')
        print(f"‚úÖ Export√© en CSV: {csv_output}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Erreur CSV: {str(e)}")

    # JSON export
    json_output = '../data/seine_saint_denis_aap_enriched.json'
    try:
        os.makedirs(os.path.dirname(json_output), exist_ok=True)
        df_for_json = mapped_df_ssd.copy()
        for col in df_for_json.columns:
            if df_for_json[col].dtype == 'object':
                # Only date-like objects are converted (to ISO strings).
                # Casting the whole column to str would write missing values
                # as the text "None" and lists as Python reprs instead of
                # JSON null / arrays.
                df_for_json[col] = df_for_json[col].map(
                    lambda value: value.isoformat() if hasattr(value, 'isoformat') else value
                )

        df_for_json.to_json(
            json_output,
            orient='records',
            force_ascii=False,
            indent=2,
            default_handler=str,  # fallback for any other non-serialisable object
        )
        print(f"‚úÖ Export√© en JSON: {json_output}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Erreur JSON: {str(e)}")

## 11. Upload Airtable (optionnel)

In [None]:
# Optional: export a DataFrame named `df_projects` to a CSV file.
# NOTE(review): `df_projects` is not defined anywhere in the visible part of
# this notebook (the DataFrame built above is `mapped_df_ssd`), so this cell
# presumably raises NameError unless it is defined elsewhere; confirm which
# DataFrame is meant.
# NOTE(review): the destination is an absolute path on one specific Windows
# machine; confirm whether it should be made relative or configurable.
output_path = r'c:\Users\WALID\Documents\Code\appels-a-projets\filtered_projects.csv'
df_projects.to_csv(output_path, index=False, encoding='utf-8')
print(f"Results exported to: {output_path}")