# Scraping Paris.fr - Appels à Projets

Ce notebook scrape les appels à projets du site Paris.fr et les mappe au format `mapped_df`.

**Source :** https://www.paris.fr/appels-a-projets

## 1. Imports & Configuration

In [1]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import datetime
import json
from urllib.parse import urljoin
import time

## 2. Scraper les appels à projets de Paris.fr

In [None]:
# Configuration: listing page to scrape and the browser-like headers sent with it.
BASE_URL = "https://www.paris.fr/appels-a-projets"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Fetch the HTML of the first listing page. On a network or HTTP failure the
# error is reported and html_content is set to None so later cells can skip
# parsing instead of crashing.
print(f"üîÑ Fetching {BASE_URL}...")
try:
    response = requests.get(BASE_URL, headers=HEADERS, timeout=10)
    response.raise_for_status()  # turn 4xx/5xx responses into exceptions
except requests.RequestException as exc:
    # Only transport/HTTP errors are expected here; any other exception is a
    # programming error and is left to surface.
    print(f"‚ùå Erreur: {exc}")
    html_content = None
else:
    html_content = response.text
    print(f"‚úÖ Page r√©cup√©r√©e ({len(html_content)} caract√®res)")

ÔøΩÔøΩÔøΩ Fetching https://www.paris.fr/appels-a-projets...
‚úÖ Page r√©cup√©r√©e (312332 caract√®res)


In [3]:
# Parse the downloaded HTML; soup stays None when nothing was fetched
soup = BeautifulSoup(html_content, 'html.parser') if html_content else None

if soup is None:
    print("‚ö†Ô∏è Pas de contenu HTML √† parser")
else:
    print("‚úÖ HTML pars√© avec BeautifulSoup")
    print(f"   Titre de la page: {soup.title.string if soup.title else 'N/A'}")

‚úÖ HTML pars√© avec BeautifulSoup
   Titre de la page: Appels √† projets - Ville de Paris


In [None]:
# Fonction pour extraire les donn√©es des appels √† projets
# Euro amounts in French notation: "1 500 000", "12.500,50", "2,5", "300".
# The look-behind stops a match from starting in the middle of a longer number.
_AMOUNT_RE = re.compile(
    r'(?<![\w.,])(\d{1,3}(?:[\s.]\d{3})+(?:,\d+)?|\d+(?:[.,]\d+)?)\s*(?:‚Ç¨|euros?)',
    re.I,
)
_DATE_RE = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')


def _parse_fr_date(raw):
    """Convert a ``dd/mm/yyyy`` string to a ``date``; None if it is not a real date."""
    try:
        return datetime.strptime(raw, '%d/%m/%Y').date()
    except ValueError:
        return None


def _last_amount(text):
    """Return the last euro amount quoted in *text* as a float, or None."""
    found = _AMOUNT_RE.findall(text)
    if not found:
        return None
    compact = re.sub(r'\s', '', found[-1])  # \s also removes non-breaking spaces
    if ',' in compact:
        # Comma is the decimal mark, so any dots are thousands separators.
        compact = compact.replace('.', '').replace(',', '.')
    elif '.' in compact and len(compact.rsplit('.', 1)[-1]) == 3:
        # "1.500" style: a dot followed by exactly three digits groups thousands.
        compact = compact.replace('.', '')
    try:
        return float(compact)
    except ValueError:
        return None


def scrape_paris_aap(soup, base_url=None):
    """Extract the calls for projects (AAP) listed on a parsed Paris.fr page.

    Containers are ``article``/``div`` elements whose class matches
    appel/project/call/aap; when none exist, links whose href matches
    appel/project are used instead.

    Args:
        soup: Parsed BeautifulSoup document, or None/falsy when the page could
            not be fetched.
        base_url: URL used to resolve relative links. Defaults to the
            module-level ``BASE_URL``.

    Returns:
        A list of dicts with the keys ``titre``, ``url_source``, ``resume``,
        ``date_publication``, ``date_limite``, ``montant_max``, ``organisme``,
        ``categories`` and ``id_record``. Empty when *soup* is falsy.
    """
    import hashlib  # local import: not part of the notebook's import cell

    aap_list = []

    if not soup:
        return aap_list
    if base_url is None:
        base_url = BASE_URL

    # Look for common container patterns first
    article_containers = soup.find_all(['article', 'div'], class_=re.compile(r'(appel|project|call|aap)', re.I))

    if not article_containers:
        # Fallback: every link that may point to a call for projects
        article_containers = soup.find_all('a', href=re.compile(r'appel|project', re.I))

    print(f"üìù Trouv√© {len(article_containers)} conteneurs potentiels")

    # One timestamp per scrape so every record of a run shares the same prefix
    batch_stamp = datetime.now().strftime('%Y%m%d%H%M%S')

    for container in article_containers:
        try:
            aap = {}

            # Title: first heading or link inside the container. A bare <a>
            # container has no such child, so its own text is the title.
            title_elem = container.find(['h2', 'h3', 'h4', 'a'])
            if title_elem is None and container.name == 'a':
                title_elem = container
            title = title_elem.get_text(strip=True) if title_elem is not None else ''
            aap['titre'] = title or 'N/A'

            # URL, resolved against the listing page
            if container.name == 'a':
                href = container.get('href', '')
            else:
                link = container.find('a', href=True)
                href = link.get('href', '') if link is not None else None
            aap['url_source'] = urljoin(base_url, href) if href is not None else None

            # Description / summary
            desc = container.find(['p', 'span'], class_=re.compile(r'(desc|summary|excerpt)', re.I))
            aap['resume'] = desc.get_text(strip=True) if desc else None

            # Dates such as "01/01/2026": two dates are read as publication
            # then deadline, a single date as the deadline. Each date is
            # parsed on its own so one invalid value does not discard the other.
            text_content = container.get_text(' ')
            dates = _DATE_RE.findall(text_content)
            if len(dates) >= 2:
                aap['date_publication'] = _parse_fr_date(dates[0])
                aap['date_limite'] = _parse_fr_date(dates[1])
            elif len(dates) == 1:
                aap['date_publication'] = None
                aap['date_limite'] = _parse_fr_date(dates[0])
            else:
                aap['date_publication'] = None
                aap['date_limite'] = None

            # Amount: last euro figure quoted in the container text
            aap['montant_max'] = _last_amount(text_content)

            # Organisation, defaulting to the city itself
            org = container.find(['span', 'p'], class_=re.compile(r'(organisme|provider|publisher)', re.I))
            aap['organisme'] = org.get_text(strip=True) if org else 'Mairie de Paris'

            # Categories / themes
            categories_elem = container.find_all(['span', 'a'], class_=re.compile(r'(categor|tag|theme)', re.I))
            if categories_elem:
                aap['categories'] = [cat.get_text(strip=True) for cat in categories_elem]
            else:
                aap['categories'] = None

            # Record id: built-in hash() is salted per process for strings, so
            # a digest is used to keep the suffix stable across runs.
            digest = hashlib.md5(aap['titre'].encode('utf-8')).hexdigest()
            aap['id_record'] = f"paris_{batch_stamp}_{int(digest, 16) % 10000}"

            aap_list.append(aap)

        except Exception as e:
            # Best effort: a malformed container is reported and skipped
            print(f"  ‚ö†Ô∏è Erreur parsing: {str(e)}")
            continue

    return aap_list

# Run the scraper on the parsed page and report how many records came back
aap_data = scrape_paris_aap(soup)
print("\n‚úÖ {} appels √† projets extraits".format(len(aap_data)))

ÔøΩÔøΩÔøΩ Trouv√© 12 conteneurs potentiels

‚úÖ 12 appels √† projets extraits


## 3. Créer un DataFrame brut

In [None]:
# Build the raw DataFrame from the scraped records, then show its shape,
# its columns and (as the cell output) its first rows
df_paris = pd.DataFrame(aap_data)
print("üìä DataFrame cr√©√©: {}".format(df_paris.shape))
print("\nüìã Colonnes: {}".format(list(df_paris.columns)))
print("\nüìã Aper√ßu des donn√©es:")
df_paris.head()

ÔøΩÔøΩÔøΩ DataFrame cr√©√©: (12, 9)

ÔøΩÔøΩÔøΩ Colonnes: ['titre', 'url_source', 'resume', 'date_publication', 'date_limite', 'montant_max', 'organisme', 'categories', 'id_record']

ÔøΩÔøΩÔøΩ Aper√ßu des donn√©es:


Unnamed: 0,titre,url_source,resume,date_publication,date_limite,montant_max,organisme,categories,id_record
0,Les appels √† projets de la Ville de Paris,https://www.paris.fr/pages/repondre-a-un-appel...,,,,,Mairie de Paris,,paris_20260102130659_7133
1,Appel √† projets pour le partage de la culture ...,https://www.paris.fr/pages/appel-a-projets-202...,,2025-01-16,2025-03-10,,Mairie de Paris,,paris_20260102130659_7444
2,Appel √† projets ¬´¬†Pr√©venir les conduites √† ris...,https://www.paris.fr/pages/appel-a-projets-pre...,,2025-12-16,2026-02-16,,Mairie de Paris,,paris_20260102130659_3733
3,Appel √† projets ¬´¬†Paris Fabrik¬†¬ª¬†: formations ...,https://www.paris.fr/pages/appel-a-projets-par...,,2025-12-15,2026-02-19,,Mairie de Paris,,paris_20260102130659_6413
4,Appel √† projets immobiliers 2026 pour des lieu...,https://www.paris.fr/pages/appel-a-projets-imm...,,2026-12-10,2026-04-02,,Mairie de Paris,,paris_20260102130659_1238
5,Appel √† propositions pour un emplacement comme...,https://www.paris.fr/pages/appel-a-proposition...,,2025-12-05,2026-01-14,,Mairie de Paris,,paris_20260102130659_4100
6,Appel √† projets ¬´¬†Parcours linguistiques √† vis...,https://www.paris.fr/pages/appel-a-projets-par...,,2025-12-04,2026-02-03,,Mairie de Paris,,paris_20260102130659_6389
7,Appel √† projets¬†: Actions de pr√©vention de la ...,https://www.paris.fr/pages/appel-a-projets-act...,,2025-11-21,2026-01-09,,Mairie de Paris,,paris_20260102130659_9592
8,Appel √† projets des activit√©s p√©ri et extrasco...,https://www.paris.fr/pages/appel-a-projets-des...,,2025-11-03,2026-01-05,,Mairie de Paris,,paris_20260102130659_6449
9,Appel √† projets des temps d'activit√©s p√©riscol...,https://www.paris.fr/pages/appel-a-projets-des...,,2025-11-03,2026-01-05,,Mairie de Paris,,paris_20260102130659_8825


## 4. Nettoyer et normaliser les données

In [6]:
# Fonction de nettoyage du texte
def clean_text(text):
    """Strip HTML tags, decode HTML entities and collapse whitespace.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned
            unchanged, so the function can be applied to mixed columns.

    Returns:
        The cleaned string, or *text* itself when it is not a string.
    """
    import html  # local import: not part of the notebook's import cell

    if not isinstance(text, str):
        return text

    stripped = re.sub(r'<[^>]+>', '', text)  # remove HTML tags
    # The typographic apostrophe entity is flattened to a plain ASCII one;
    # this must happen before the generic decoding below.
    stripped = stripped.replace('&rsquo;', "'")
    # html.unescape covers every named and numeric entity, not just a fixed list.
    decoded = html.unescape(stripped)
    # \s+ also matches the non-breaking space produced by &nbsp;
    return re.sub(r'\s+', ' ', decoded).strip()

# Clean every object-typed column in place, leaving 'categories' untouched
columns_to_clean = [
    column_name
    for column_name in df_paris.select_dtypes(include=['object']).columns
    if column_name != 'categories'
]
for column_name in columns_to_clean:
    df_paris[column_name] = df_paris[column_name].apply(clean_text)

print("‚úÖ Texte nettoy√©")

‚úÖ Texte nettoy√©


In [None]:
# Extraire les montants maximums
def extract_max_amount(text):
    """Return the largest euro amount quoted in *text*.

    Amounts are read in French notation: spaces (including non-breaking ones)
    or dots group thousands and a comma marks decimals, e.g. "1 500 000",
    "12.500,50" or "2,5". A dot followed by exactly three digits is treated as
    a thousands separator. The currency word is matched case-insensitively.

    Args:
        text: Text to search. Non-string values yield None.

    Returns:
        The highest amount found as a float, or None when there is none.
    """
    if not isinstance(text, str):
        return None

    # Either a grouped number (1 500 000 / 12.500,50) or a plain one (300 / 2,5).
    # The look-behind prevents a match from starting inside a longer number,
    # which would otherwise drop the leading groups of figures over 999 999.
    pattern = r'(?<![\w.,])(\d{1,3}(?:[\s.]\d{3})+(?:,\d+)?|\d+(?:[.,]\d+)?)\s*(?:‚Ç¨|euros?)'

    values = []
    for raw in re.findall(pattern, text, re.I):
        compact = re.sub(r'\s', '', raw)  # \s also removes non-breaking spaces
        if ',' in compact:
            compact = compact.replace('.', '').replace(',', '.')
        elif '.' in compact and len(compact.rsplit('.', 1)[-1]) == 3:
            compact = compact.replace('.', '')
        try:
            values.append(float(compact))
        except ValueError:
            continue

    return max(values) if values else None

# Fill montant_max from the summary text for rows that have a summary but no amount yet
for row_label, record in df_paris.iterrows():
    if pd.notna(record['montant_max']) or pd.isna(record['resume']):
        continue
    df_paris.at[row_label, 'montant_max'] = extract_max_amount(record['resume'])

print("‚úÖ Montants extraits")

‚úÖ Montants extraits


## 5. Mapper au format mapped_df

In [None]:
# Build the mapped DataFrame
mapped_df_paris = pd.DataFrame()

# Columns copied as-is from the raw frame, in the order they appear in the output
direct_columns = (
    'id_record',
    'titre',
    'organisme',
    'date_publication',
    'date_limite',
    'categories',
    'montant_max',
    'url_source',
    'resume',
)
for column_name in direct_columns:
    if column_name in df_paris.columns:
        mapped_df_paris[column_name] = df_paris[column_name]

# Columns the scraper does not provide, with the default value each one gets
default_values = {
    'public_cible': None,
    'taux_financement': None,
    'contact': None,
    'modalite': None,
    'demarches': None,
    'mots_cles': None,
    'objectif': None,
    'montant_min': None,
    'note': None,
    'tags': None,
    'perimetre_geo': 'Paris',
}
for column_name, default_value in default_values.items():
    if column_name not in mapped_df_paris.columns:
        mapped_df_paris[column_name] = default_value

print(f"‚úÖ DataFrame mapp√© cr√©√©: {mapped_df_paris.shape}")
print(f"\nüìã Colonnes finales: {list(mapped_df_paris.columns)}")

‚úÖ DataFrame mapp√© cr√©√©: (12, 19)

ÔøΩÔøΩÔøΩ Colonnes finales: ['id_record', 'titre', 'organisme', 'date_publication', 'date_limite', 'categories', 'montant_max', 'url_source', 'resume', 'public_cible', 'taux_financement', 'contact', 'modalite', 'demarches', 'mots_cles', 'objectif', 'note', 'tags', 'perimetre_geo']


In [None]:
# Convertir les listes en format Airtable Multiple Select
def convert_pipe_to_list(value):
    """Normalise a multi-value field to a list, or None when it is empty.

    Args:
        value: A list, a string whose items are separated by ``|||``, or any
            other value (None, NaN, numbers, ...).

    Returns:
        * the list itself when *value* is a non-empty list;
        * the stripped, non-empty items when *value* is a string;
        * None for empty lists, strings without items, and everything else.
    """
    # Lists are checked by type rather than through pd.isna(): on a list,
    # pd.isna() returns an element-wise array whose truth value is ambiguous
    # (ValueError) as soon as it holds two or more items.
    if isinstance(value, list):
        return value or None
    if isinstance(value, str):
        items = [part.strip() for part in value.split('|||') if part.strip()]
        return items or None
    # None, NaN and any other type carry no usable items
    return None

# Run convert_pipe_to_list over each multi-value column that is present
for multi_value_column in ('categories', 'mots_cles', 'public_cible'):
    if multi_value_column not in mapped_df_paris.columns:
        continue
    mapped_df_paris[multi_value_column] = mapped_df_paris[multi_value_column].apply(convert_pipe_to_list)
    print(f"‚úÖ '{multi_value_column}' converti au format Multiple Select")

‚úÖ 'categories' converti au format Multiple Select
‚úÖ 'mots_cles' converti au format Multiple Select
‚úÖ 'public_cible' converti au format Multiple Select


## 6. Aperçu des données

In [None]:
# Overview: record and column counts, then the first rows as the cell output
record_count, column_count = mapped_df_paris.shape
print("üìä R√©sum√© des donn√©es scrap√©es:")
print(f"   Total: {record_count} enregistrements")
print(f"   Colonnes: {column_count}")
print("\nüìã Premiers enregistrements:")
mapped_df_paris.head()

ÔøΩÔøΩÔøΩ R√©sum√© des donn√©es scrap√©es:
   Total: 12 enregistrements
   Colonnes: 19

ÔøΩÔøΩÔøΩ Premiers enregistrements:


Unnamed: 0,id_record,titre,organisme,date_publication,date_limite,categories,montant_max,url_source,resume,public_cible,taux_financement,contact,modalite,demarches,mots_cles,objectif,note,tags,perimetre_geo
0,paris_20260102130659_7133,Les appels √† projets de la Ville de Paris,Mairie de Paris,,,,,https://www.paris.fr/pages/repondre-a-un-appel...,,,,,,,,,,,Paris
1,paris_20260102130659_7444,Appel √† projets pour le partage de la culture ...,Mairie de Paris,2025-01-16,2025-03-10,,,https://www.paris.fr/pages/appel-a-projets-202...,,,,,,,,,,,Paris
2,paris_20260102130659_3733,Appel √† projets ¬´ Pr√©venir les conduites √† ris...,Mairie de Paris,2025-12-16,2026-02-16,,,https://www.paris.fr/pages/appel-a-projets-pre...,,,,,,,,,,,Paris
3,paris_20260102130659_6413,Appel √† projets ¬´ Paris Fabrik ¬ª : formations ...,Mairie de Paris,2025-12-15,2026-02-19,,,https://www.paris.fr/pages/appel-a-projets-par...,,,,,,,,,,,Paris
4,paris_20260102130659_1238,Appel √† projets immobiliers 2026 pour des lieu...,Mairie de Paris,2026-12-10,2026-04-02,,,https://www.paris.fr/pages/appel-a-projets-imm...,,,,,,,,,,,Paris


In [None]:
# Fill-rate statistics: number of non-null values for each key column
print("üìä Statistiques des donn√©es:")
total_rows = len(mapped_df_paris)
fill_report = [
    ("\n‚úÖ Titres remplis", 'titre'),
    ("‚úÖ Organismes remplis", 'organisme'),
    ("‚úÖ Dates limites remplies", 'date_limite'),
    ("‚úÖ URLs sources remplies", 'url_source'),
    ("‚úÖ Montants extraits", 'montant_max'),
    ("‚úÖ Cat√©gories remplies", 'categories'),
]
for report_label, report_column in fill_report:
    print(f"{report_label}: {mapped_df_paris[report_column].notna().sum()} / {total_rows}")

ÔøΩÔøΩÔøΩ Statistiques des donn√©es:

‚úÖ Titres remplis: 12 / 12
‚úÖ Organismes remplis: 12 / 12
‚úÖ Dates limites remplies: 9 / 12
‚úÖ URLs sources remplies: 12 / 12
‚úÖ Montants extraits: 0 / 12
‚úÖ Cat√©gories remplies: 0 / 12


## 7. Créer des fingerprints pour la déduplication

In [None]:
# Pour fusionner avec d'autres sources, cr√©er une d√©duplication par fingerprint
def create_fingerprint(row):
    """Build a short deduplication key for one record.

    The key is the first 12 hex characters of the MD5 digest of
    ``titre|organisme|date_limite``; a missing (NA) field contributes an
    empty string.

    Args:
        row: Mapping-like record exposing 'titre', 'organisme' and 'date_limite'.

    Returns:
        A 12-character hexadecimal string.
    """
    import hashlib

    parts = [
        str(row[field]) if pd.notna(row[field]) else ''
        for field in ('titre', 'organisme', 'date_limite')
    ]
    digest = hashlib.md5('|'.join(parts).encode())
    return digest.hexdigest()[:12]

# One fingerprint per record: axis=1 hands each row to create_fingerprint
row_fingerprints = mapped_df_paris.apply(create_fingerprint, axis=1)
mapped_df_paris['fingerprint'] = row_fingerprints
print("‚úÖ Fingerprints cr√©√©s pour d√©dupplication")

## 8. Exporter les données

In [None]:
# Write the mapped data to CSV; a failure is reported without stopping the notebook
csv_output = '../data/paris_aap_scraped.csv'
try:
    mapped_df_paris.to_csv(csv_output, index=False, encoding='utf-8')
except Exception as export_error:
    print(f"‚ö†Ô∏è Erreur export CSV: {export_error}")
else:
    print(f"‚úÖ Export√© en CSV: {csv_output}")

In [None]:
# Export to JSON; a failure is reported without stopping the notebook
json_output = '../data/paris_aap_scraped.json'
try:
    # Work on a copy so the conversion below does not alter mapped_df_paris
    df_for_json = mapped_df_paris.copy()
    for col in df_for_json.columns:
        if df_for_json[col].dtype == 'object':
            # Convert only date-like values (anything exposing isoformat()) to
            # ISO strings. A blanket astype(str) would also write None as the
            # string "None" and lists as their Python repr instead of JSON
            # null / arrays.
            df_for_json[col] = df_for_json[col].map(
                lambda value: value.isoformat() if hasattr(value, 'isoformat') else value
            )

    df_for_json.to_json(json_output, orient='records', force_ascii=False, indent=2)
    print(f"‚úÖ Export√© en JSON: {json_output}")
except Exception as e:
    print(f"‚ö†Ô∏è Erreur export JSON: {str(e)}")

## 9. Upload vers Airtable (optionnel)

In [None]:
⚠️ Décommentez la cellule suivante pour envoyer les données vers Airtable (nécessite un fichier `.env` contenant les identifiants).

In [None]:
# Uncomment the lines below to upload mapped_df_paris to Airtable
# import sys
# sys.path.append('..')
# from connectors.airtable_connector import AirtableConnector
# 
# airtable = AirtableConnector()
# uploaded = airtable.upload_dataframe(mapped_df_paris)
# print(f"‚úÖ Uploaded {uploaded} records to Airtable!")

# While the upload code above stays commented out, this cell only prints a notice
print("‚è∏Ô∏è  Upload Airtable d√©sactiv√© (√† activer si n√©cessaire)")