In [2]:
"""
Script d'extraction des POIs touristiques OpenStreetMap
Génère un fichier JSON réutilisable pour l'analyse
"""

import requests
import pandas as pd
import json
from datetime import datetime
import time

class OSMExtractor:
    """Extract tourism-related POIs from OpenStreetMap through the Overpass API.

    The extractor resolves a bounding box for ``zone_name`` (via Nominatim
    unless ``bbox`` is given), runs one Overpass query per category and
    returns the flattened results as a pandas DataFrame.
    """

    # Bounding box used when Nominatim cannot resolve the zone
    # (south, west, north, east) -- Île-de-France.
    DEFAULT_BBOX = (48.12, 1.45, 49.24, 3.56)
    # Seconds to wait for the Nominatim lookup before falling back.
    NOMINATIM_TIMEOUT = 30
    # Server-side timeout (seconds) embedded in every Overpass query.
    QUERY_TIMEOUT = 180
    # HTTP statuses that trigger a wait-and-retry, with their console label.
    RETRYABLE_STATUS = {
        429: "Rate limit (429)",
        504: "Timeout serveur (504)",
    }
    # Column order of the DataFrame returned by extract(); also guarantees
    # that an empty extraction still exposes every column.
    POI_COLUMNS = [
        'osm_id', 'category', 'name', 'lat', 'lon', 'address', 'city',
        'phone', 'website', 'opening_hours', 'wikipedia', 'timestamp',
    ]

    def __init__(self, zone_name, bbox=None):
        """Create an extractor for a zone.

        Args:
            zone_name: Human-readable place name; used for the Nominatim
                lookup and for the output file name.
            bbox: Optional ``(south, west, north, east)`` tuple. When
                omitted (or falsy) the box is looked up from ``zone_name``.
        """
        self.zone_name = zone_name
        self.bbox = bbox or self._get_bbox(zone_name)

        # Alternative Overpass instance, picked by the original author as
        # more stable than http://overpass-api.de/api/interpreter.
        self.url = "https://overpass.kumi.systems/api/interpreter"
        # Client-side timeout and retry policy for Overpass requests.
        self.timeout = 300  # seconds (5 minutes)
        self.max_retries = 3
        self.retry_delay = 30  # seconds between attempts

        # Tourism categories: category name -> {OSM key: [accepted values]}.
        self.categories = {
            'monuments': {
                'tourism': ['attraction'],
                'historic': ['monument', 'memorial', 'castle', 'ruins', 'archaeological_site'],
            },
            'musees': {'tourism': ['museum', 'gallery']},
            'patrimoine_religieux': {
                'amenity': ['place_of_worship'],
                'historic': ['church', 'cathedral', 'monastery'],
            },
            'hebergement': {
                'tourism': ['hotel', 'hostel', 'guest_house', 'apartment', 'camp_site'],
            },
            'restaurants_cafes': {
                'amenity': ['restaurant', 'cafe', 'bar', 'pub', 'fast_food'],
            },
            'parcs_jardins': {
                'leisure': ['park', 'garden', 'nature_reserve'],
                'tourism': ['viewpoint'],
                'natural': ['peak', 'beach', 'wetland', 'water', 'wood', 'grassland', 'cliff', 'valley'],
            },
            'culture_loisirs': {
                'amenity': ['theatre', 'cinema', 'arts_centre', 'community_centre'],
                'tourism': ['theme_park', 'zoo', 'aquarium'],
                # NOTE(review): in OSM these facility values (stadium,
                # sports_centre, swimming_pool, ...) are normally tagged as
                # `leisure=*`, not `sport=*` -- confirm the intended key.
                # Left unchanged because fixing it alters the dataset.
                'sport': ['stadium', 'sports_centre', 'swimming_pool', 'fitness_centre', 'golf_course', 'climbing'],
            },
            'information_touristique': {
                'tourism': ['information'],
                'amenity': ['tourist_info'],
            },
            'commerces': {
                'shop': ['gift', 'souvenir', 'art', 'books', 'museum', 'craft', 'antiques', 'department_store', 'mall'],
            },
            'transports': {
                'public_transport': ['station', 'stop_position', 'platform', 'stop_area'],
                'railway': ['station', 'halt', 'tram_stop'],
                'amenity': ['ferry_terminal', 'bus_station', 'taxi'],
            },
        }

    def _get_bbox(self, zone_name):
        """Look up the bounding box of ``zone_name`` with Nominatim.

        Returns:
            ``(south, west, north, east)`` as floats. Falls back to
            ``DEFAULT_BBOX`` (with a console warning) when the request
            fails, times out, or returns no result.
        """
        nominatim_url = "https://nominatim.openstreetmap.org/search"
        params = {
            'q': zone_name,
            'format': 'json',
            'limit': 1
        }
        headers = {'User-Agent': 'OSM Tourism Analyzer'}

        try:
            response = requests.get(
                nominatim_url,
                params=params,
                headers=headers,
                timeout=self.NOMINATIM_TIMEOUT,
            )
            results = response.json() if response.status_code == 200 else []
        except (requests.RequestException, ValueError):
            # Network failure or non-JSON body: use the fallback below.
            results = []

        if results:
            # The code reads 'boundingbox' as [south, north, west, east].
            south, north, west, east = (float(v) for v in results[0]['boundingbox'])
            return (south, west, north, east)

        print(f"⚠ Bbox introuvable pour {zone_name}, bbox par défaut (Île-de-France) utilisée")
        return self.DEFAULT_BBOX

    def _build_query(self, tags):
        """Build the Overpass QL query for one category.

        Args:
            tags: Mapping ``{osm_key: [values]}``; one node/way/relation
                statement is emitted per key/value pair, all restricted to
                ``self.bbox``.

        Returns:
            The query text, requesting JSON output with element centers
            and metadata.
        """
        bbox_str = f"{self.bbox[0]},{self.bbox[1]},{self.bbox[2]},{self.bbox[3]}"
        queries = []
        for key, values in tags.items():
            for value in values:
                queries.append(f'  node["{key}"="{value}"]({bbox_str});')
                queries.append(f'  way["{key}"="{value}"]({bbox_str});')
                queries.append(f'  relation["{key}"="{value}"]({bbox_str});')

        # Statements are ';'-terminated, so no separator is needed.
        statements = ''.join(queries)
        query = f"""
                [out:json][timeout:{self.QUERY_TIMEOUT}];
                (
                {statements}
                );
                out center meta;
                """
        return query

    @staticmethod
    def _element_to_poi(elem, cat_name):
        """Flatten one Overpass element into a POI record.

        Nodes carry their own coordinates; other elements use the 'center'
        object produced by ``out center``.

        Returns:
            A dict keyed by ``POI_COLUMNS``, or ``None`` when the element
            has no usable position.
        """
        if elem.get('type') == 'node':
            position = elem
        elif 'center' in elem:
            position = elem['center']
        else:
            return None

        lat, lon = position.get('lat'), position.get('lon')
        elem_tags = elem.get('tags', {})

        return {
            # NOTE(review): OSM ids are presumably unique only per element
            # type (node/way/relation) -- confirm before using as a key.
            'osm_id': str(elem.get('id', '')),
            'category': str(cat_name),
            'name': str(elem_tags.get('name', 'Sans nom')),
            # Compare against None: 0.0 is a valid coordinate.
            'lat': float(lat) if lat is not None else None,
            'lon': float(lon) if lon is not None else None,
            'address': str(elem_tags.get('addr:street', '')),
            'city': str(elem_tags.get('addr:city', '')),
            'phone': str(elem_tags.get('phone', '')),
            'website': str(elem_tags.get('website', '')),
            'opening_hours': str(elem_tags.get('opening_hours', '')),
            'wikipedia': str(elem_tags.get('wikipedia', '')),
            'timestamp': str(elem.get('timestamp', '')),
        }

    def _wait_before_retry(self, attempt, reason):
        """Report a retryable failure and sleep only if a retry remains."""
        if attempt >= self.max_retries - 1:
            # Last attempt: sleeping would only delay the next category.
            print(f"✗ {reason}, abandon après {self.max_retries} tentatives")
            return
        print(f"⚠ {reason}, attente {self.retry_delay}s...")
        time.sleep(self.retry_delay)
        print(f"  Tentative {attempt + 2}/{self.max_retries}...", end=" ")

    def extract(self):
        """Extract the POIs of every category.

        Each category is queried up to ``max_retries`` times; rate limits
        (429), gateway timeouts (504) and client-side timeouts are retried
        after ``retry_delay`` seconds. Any other failure skips the category.

        Returns:
            A DataFrame with the ``POI_COLUMNS`` columns (possibly empty).
        """
        all_pois = []

        print(f"Extraction pour {self.zone_name}")
        print(f"Zone: {self.bbox}\n")

        for cat_name, tags in self.categories.items():
            print(f"Extraction {cat_name}...", end=" ")
            query = self._build_query(tags)

            for attempt in range(self.max_retries):
                try:
                    r = requests.post(
                        self.url,
                        data={'data': query},
                        timeout=self.timeout,
                        headers={'User-Agent': 'OSM Tourism Extractor'}
                    )
                except requests.Timeout:
                    self._wait_before_retry(attempt, "Timeout local")
                    continue
                except requests.RequestException as e:
                    print(f"✗ {str(e)}")
                    break

                if r.status_code in self.RETRYABLE_STATUS:
                    self._wait_before_retry(attempt, self.RETRYABLE_STATUS[r.status_code])
                    continue

                if r.status_code != 200:
                    print(f"✗ Erreur {r.status_code}")
                    break

                try:
                    payload = r.json()
                except ValueError as e:
                    print(f"✗ {str(e)}")
                    break
                if not isinstance(payload, dict):
                    print("✗ Réponse inattendue")
                    break

                converted = (
                    self._element_to_poi(elem, cat_name)
                    for elem in payload.get('elements', [])
                )
                pois = [poi for poi in converted if poi is not None]
                all_pois.extend(pois)
                print(f"✓ {len(pois)} POIs")

                # NOTE(review): Overpass appears to report server-side
                # problems (e.g. query timeout) in a 'remark' field of an
                # otherwise successful response -- surface it if present.
                remark = payload.get('remark')
                if remark:
                    print(f"⚠ Remarque Overpass: {remark}")
                break  # success: leave the retry loop

            # Pause between categories to go easy on the public server.
            time.sleep(5)

        return pd.DataFrame(all_pois, columns=self.POI_COLUMNS)

    def save(self, df, format='json', output_dir="../data/raw"):
        """Write the POIs to a timestamped file.

        Args:
            df: DataFrame to save.
            format: ``'json'`` (records, indented) or ``'csv'``.
            output_dir: Target directory, created if missing.

        Returns:
            The path of the written file.

        Raises:
            ValueError: If ``format`` is neither ``'json'`` nor ``'csv'``.
        """
        import os  # local import: the file's import block does not provide it

        if format not in ('json', 'csv'):
            raise ValueError(f"Format non supporté: {format!r} (attendu 'json' ou 'csv')")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"pois_{self.zone_name.replace(' ', '_')}_{timestamp}.{format}"

        os.makedirs(output_dir, exist_ok=True)
        path = os.path.join(output_dir, filename)

        if format == 'json':
            df.to_json(path, orient='records', indent=2)
        else:
            df.to_csv(path, index=False)

        print(f"\n✓ Sauvegardé: {filename}")
        return path


# ==== UTILISATION ====
if __name__ == "__main__":
    # Script entry point: extract every category for the zone, print a
    # short summary, then save the result as JSON.

    # Configuration: the bounding box is resolved from the zone name.
    extractor = OSMExtractor(zone_name="Île-de-France")
    # Or with a custom bbox: OSMExtractor("Paris", bbox=(48.815, 2.225, 48.902, 2.420))

    # Extraction
    df = extractor.extract()

    print(f"\n{'='*50}")
    print(f"Total: {len(df)} POIs extraits")

    if df.empty:
        # With nothing extracted the frame may have no 'category' column,
        # so the summary would raise KeyError; also avoid writing an empty file.
        print("Aucun POI extrait, rien à sauvegarder")
    else:
        print(f"Catégories: {df['category'].nunique()}")

        # Save (pass format='csv' for a CSV export instead).
        extractor.save(df, format='json')

Extraction pour Île-de-France
Zone: (48.1201456, 1.4462445, 49.241431, 3.5592208)

Extraction monuments... ✓ 6738 POIs
Extraction musees... ✓ 1058 POIs
Extraction patrimoine_religieux... ✓ 3632 POIs
Extraction hebergement... ✓ 4231 POIs
Extraction restaurants_cafes... ✓ 32335 POIs
Extraction parcs_jardins... ✓ 47481 POIs
Extraction culture_loisirs... ✓ 4121 POIs
Extraction information_touristique... ✓ 11507 POIs
Extraction commerces... ✓ 2880 POIs
Extraction transports... ✓ 66947 POIs

Total: 180930 POIs extraits
Catégories: 10

✓ Sauvegardé: pois_Île-de-France_20251215_135925.json
