# OSM POI extraction and cleaning

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'
import matplotlib.pyplot as plt
# import leafmap.maplibregl as leafmap
from shapely import Point, LineString, Polygon
import geopandas as gpd
import h3
import networkx as nx
import pandas as pd
from rasterstats import zonal_stats, point_query
import rasterio
import osmnx as ox
import duckdb
from rasterio.plot import show
import numpy as np
from rasterio import features
import time
from tqdm.notebook import tqdm

In [None]:
# Administrative boundaries
# Load the canton and commune layers from the swissBOUNDARIES3D shapefiles (read via pyogrio).
cantons = gpd.read_file(
    '../../SeroCOV/data/geo_units/swissBOUNDARIES3D_1_3_TLM_KANTONSGEBIET.shp', engine='pyogrio')
communes = gpd.read_file('../../SeroCOV/data/geo_units/swissBOUNDARIES3D_1_3_TLM_HOHEITSGEBIET.shp', engine='pyogrio')
# Only retain communes that are in the canton of Geneva
# (KANTONSNUM 25 is taken to be Geneva, per the comment above)
communes = communes[communes.KANTONSNUM == 25]

# Reproject both layers to EPSG:2056 so they share one CRS
cantons = cantons.to_crs(2056)
communes = communes.to_crs(2056)
# Single-canton subset selected by name
canton_ge = cantons[cantons.NAME=='Genève']

## Define buffered area within which we will extract amenities

In [None]:
from shapely.geometry import box

# Get the geometry of the place
# NOTE(review): `place` is not assigned anywhere in the visible part of this notebook;
# it must be defined before this cell runs, otherwise this line raises NameError.
gdf = ox.geocode_to_gdf(place)

# Create a buffer around the geometry
# The buffer distance (4000) is expressed in the units of EPSG:2056; the result is
# projected back to the CRS of the geocoded geometry.
buffered_gdf = gdf.to_crs(epsg=2056).buffer(4000).to_crs(gdf.crs)
buffered_gdf.plot()
# plt.savefig('./results/canton_ge_buffered.png', dpi=80)
# NOTE(review): presumably a no-op if the geocoded geometry is already in EPSG:4326 — confirm.
buffered_gdf = buffered_gdf.to_crs(4326)
# Get the bounding box of the buffered area
bounds = buffered_gdf.total_bounds
bbox = box(*bounds)

# Extract the network using the buffered area
# Note that the rectangular bounding box (not the buffered outline itself) is what is queried.
G_buffered = ox.graph_from_polygon(bbox, network_type='walk')

In [None]:
# Wrap the buffered geometries in a GeoDataFrame (EPSG:4326), using them as the geometry column.
gdf_buffered= gpd.GeoDataFrame(buffered_gdf, crs = 4326, geometry = buffered_gdf)

In [None]:
# tags = {"amenity": True, "landuse": ["retail", "commercial"], "highway": "bus_stop"}
# gdf_retail_bus = ox.features_from_place(place, tags)

## Based on [Bruno et al.](https://static-content.springer.com/esm/art%3A10.1038%2Fs44284-024-00119-4/MediaObjects/44284_2024_119_MOESM1_ESM.pdf)

In [None]:
# Healthcare POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
healthcare_pois_dict = {
    'amenity': ['clinic', 'dentist', 'doctors', 'hospital', 'pharmacy', 'physiokinesitherapy'],
    'healthcare': True,  # All healthcare tags
    'healthcare:speciality': True,  # All healthcare specialities
    'government': ['healthcare'],
    'office': ['medical', 'physician'],
    'social_facility': ['ambulatory_care', 'assisted_living', 'day_care', 'daycare', 'healthcare']
}

# Services POIs
# Keys are OSM tag keys; each value is the list of accepted tag values for that key.
# Spelling variants of the same value (e.g. 'photo_booth' / 'photobooth') are listed side by side.
services_pois_dict = {
    'amenity': [
        'animal_boarding', 'animal_breeding', 'animal_shelter', 'atm', 'bank', 'bicycle_parking', 
        'bicycle_rental', 'bicycle_repair_station', 'boat_rental', 'boat_storage', 'bureau_de_change', 
        'car_rental', 'car_sharing', 'car_wash', 'casino', 'childcare', 'club', 'community_centre', 
        'community_centres', 'driving_school', 'internet_cafe', 'kindergarten', 'left_luggage', 
        'luggage_storage', 'marketplace', 'ministry', 'money_transfer', 'mortuary', 'newsagent', 
        'nightclub', 'nursing_home', 'parcel_locker', 'payment_terminal', 'photo_booth', 'photobooth', 
        'post_office', 'recording_studio', 'rescue_station', 'social_centre', 'social_facility', 'toilets'
    ],
    'bicycle_parking': ['building', 'lockers', 'rack', 'stands', 'wall_loops', 'wave'],
    'building': ['kindergarten', 'nursing_home', 'post_office', 'public_administration', 'public_bath', 'service'],
    'building:part': ['transportation'],
    'craft': [
        'camera_repair', 'car_repair', 'carpenter', 'caterer', 'cleaning', 'coachbuilder', 'confectionery',
        'electrician', 'electronics_repair', 'gardener', 'handicraft', 'heating_engineer', 'hvac', 'joiner',
        'key_cutter', 'laboratory', 'locksmith', 'optician', 'photographer', 'photographic_laboratory',
        'plumber', 'printer', 'sculptor', 'shoemaker', 'stonemason', 'tailor', 'tiler', 'turner', 'upholsterer',
        'watchmaker', 'window_construction'
    ],
    'leisure': ['arcade_hall', 'amusement_arcade', 'adult_gaming_centre', 'escape_game', 'recreation_center', 'sauna', 'tanning_salon'],
    'office': [
        'architect', 'architect;engineer', 'bank', 'employment_agency', 'employment_consultant', 'energy_supplier',
        'engineer', 'engineering', 'estate_agent', 'financial', 'financial_advisor', 'foundation', 'government',
        'insurance', 'labour_union', 'lawyer', 'legal', 'moving_company', 'ngo', 'oil;gas', 'public_administration',
        'service', 'tax_advisor', 'taxi', 'telecommunication', 'travel_agent', 'wedding_planner'
    ],
    'service': [
        'dealer; repair', 'electrical', 'parts', 'parts;repair', 'repair', 'tyres;repair'
    ],
    'service:bicycle:pump': ['yes'],
    'service:bicycle:repair': ['yes'],
    'service:vehicle:inspection': ['yes'],
    'service:vehicle:car_repair': ['yes'],
    'service:vehicle:tyres': ['yes'],
    'shop': [
        'dog_spa', 'dry_cleaning', 'electrical_repair', 'electronics_repair', 'estate_agent', 'finance',
        'funeral_directors', 'hairdresser', 'internet', 'laundry', 'massage', 'model', 'money_lender',
        'nails', 'newsagent', 'newspaper_agent', 'pet_grooming', 'photo', 'printing', 'rental', 'scrap_yard',
        'shoe_repair', 'tattoo', 'travel_agency'
    ],
    'social_facility': ['food_bank', 'group_home', 'leisure', 'nursing_home', 'senior_citizen_centre'],
    'street_cabinet': ['postal_service'],
    'tourism': ['spa_resort', 'theme_park']
}

# Transport POIs
# Keys are OSM tag keys; each value is the list of accepted tag values for that key.
transport_pois_dict = {
    'amenity': ['bus_station', 'ferry_terminal'],
    'building': ['subway_station', 'transportation'],
    'building:part': ['transportation'],
    'bus': ['urban', 'yes'],
    'car_sharing': ['yes'],
    'construction': ['subway_station'],
    'departures_board': ['no', 'realtime', 'timetable', 'yes'],
    'government': ['transportation'],
    'public_transport': ['platform', 'station', 'stop', 'stop_area', 'stop_position'],
    'railway': ['platform', 'station', 'stop', 'stop;station', 'subway_entrance', 'tram_stop'],
    'shelter_type': ['public_transport'],
    'station': ['light_rail', 'rail', 'subway', 'train'],
    'subway': ['yes'],
    'suspended:highway': ['bus_stop'],
    'taxi': ['designated', 'yes'],
    'train': ['yes'],
    'tram': ['yes'],
    'trolleybus': ['yes'],
    'waterway': ['boatyard', 'dock']
}

# Outdoor POIs
# Keys are OSM tag keys; each value is the list of accepted tag values for that key.
outdoor_pois_dict = {
    'amenity': ['bench', 'watering_place'],
    'attraction': [
        'amusement_ride', 'animal', 'big_wheel', 'bumper_boats', 'bumper_car', 'bumper_cars',
        'carousel', 'carriage_ride', 'dark_ride', 'drop_tower', 'haunted_house', 'maze',
        'playset', 'pneumatic_slide', 'roller_coaster', 'slide', 'swing_carousel', 'trampoline',
        'water_ride', 'water_slide'
    ],
    'bench': ['yes'],
    'denotation': ['park'],
    'landuse': ['recreation_ground'],
    'leisure': [
        'beach_resort', 'bleachers', 'dog_park', 'fishing', 'garden', 'ice_rink', 'marina',
        'miniature_golf', 'nature_reserve', 'park', 'picnic_table', 'pitch', 'playground',
        'recreation_ground', 'water_park'
    ],
    'natural': ['beach', 'cave', 'cave_entrance', 'cliff', 'park'],
    'playground': [
        'roundabout', 'seesaw', 'slide', 'slide;spinny;swing', 'springy', 'structure'
    ],
    'playground:theme': ['playground'],
    'shelter_type': ['picnic_shelter'],
    'site_type': ['stadium'],
    'tourism': ['picnic_site', 'viewpoint', 'zoo'],
    'water': ['Piscina', 'fishpond', 'lake', 'pond']
}

# Supplies POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
supplies_pois_dict = {
    'amenity': ['compressed_air', 'fuel', 'gambling;vending_machine', 'marketplace', 'vending_machine'],
    'building': ['mall', 'market', 'retail', 'retail_outlet', 'shopping_mall', 'supermarket'],
    'building:part': ['retail', 'supermarket'],
    'building:use': ['commercial'],
    'craft': ['distillery', 'dressmaker', 'glaziery', 'goldsmith', 'parquet_layer', 'pottery', 'winery'],
    'fuel:cng': ['yes'],
    'fuel:diesel': ['yes'],
    'fuel:octane_95': ['yes'],
    'fuel:octane_98': ['yes'],
    'goods': ['yes'],
    'landuse': ['retail'],
    'second_hand': ['no', 'only', 'yes'],
    'service': ['parts', 'parts;repair'],
    'service:bicycle:retail': ['yes'],
    'shop': [
        'accessories', 'alcohol', 'anime', 'antiques', 'appliance', 'art', 'artist', 'baby_goods',
        'bag', 'bakery', 'bathroom_furnishing', 'beauty', 'beauty; piercing', 'bed', 'beverages',
        'bicycle', 'bookmaker', 'books', 'boutique', 'building_supplies', 'butcher', 'camera',
        'cannabis', 'car', 'car_hifi', 'car_parts', 'caravan', 'carpet', 'charity', 'cheese',
        'chemist', 'chocolate', 'clothes', 'clothes;tailor', 'coffee', 'collector', 'comics',
        'communications', 'computer', 'confectionery', 'construction_supplies', 'convenience',
        'copyshop', 'cosmetics', 'costumes', 'craft', 'curtain', 'dairy', 'deli', 'dental_supplies',
        'department_store', 'doityourself', 'doors', 'duty_free', 'e-cigarette', 'electric_supplies',
        'electrical', 'electronics', 'engraver', 'erotic', 'fabric', 'fair_trade', 'farm', 'fashion',
        'fashion_accessories', 'fireworks', 'fishing', 'fishmonger', 'flooring', 'florist', 'food',
        'frame', 'frozen_food', 'fuel', 'furniture', 'games', 'general', 'gift', 'glazery', 'glaziery',
        'greengrocer', 'grocery', 'haberdashery', 'hairdresser_supply', 'hardware', 'hat', 'health_food',
        'hearing_aids', 'heating', 'heating_cooling', 'herbalist', 'hifi', 'hobby', 'household_goods',
        'household_linen', 'houseware', 'hunting', 'interior_decoration', 'jewelry', 'kiosk', 'kitchen',
        'leather', 'lighting', 'locksmith', 'lottery', 'mall', 'medical_supply', 'metal construction craft',
        'mobile_phone', 'motorcycle', 'motorcycle_repair', 'music', 'musical_instrument', 'negozio_di_lampade',
        'noodles', 'nutrition_supplements', 'optician', 'orthopedics', 'outdoor', 'paint', 'party', 'pasta',
        'pastry', 'pawnbroker', 'perfumery', 'pet', 'pottery', 'printer_ink', 'pyrotechnics', 'radiotechnics',
        'religion', 'scuba_diving', 'seafood', 'second_hand', 'security', 'sewing', 'shoes', 'sports',
        'stationery', 'supermarket', 'tailor', 'tea', 'telecommunication', 'ticket', 'tiles', 'tobacco',
        'toys', 'trade', 'truck', 'tyres', 'vacuum_cleaner', 'variety_store', 'video', 'video_games',
        'watches', 'weapons', 'wholesale', 'window_blind', 'wine', 'wood', 'workshop', 'yes'
    ],
    'use': ['market'],
    'vending': True  # All vending tags
}

# Fast-food POIs
# Values are lists, like in every other tag dictionary of this file, so the dictionary can be
# used for tag queries as well as for value matching with `Series.isin` (which rejects a bare str).
fast_food_pois_dict = {
    'amenity': ['fast_food'],
    'building': ['fast_food_restaurant']
}
# Restaurant POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
# Note that 'fast_food' is included here as well.
restaurant_pois_dict = {
    'amenity': [
        'bbq', 'bar', 'biergarten', 'cafe', 'canteen', 'fast_food', 'food_court', 'ice_cream',
        'kitchen', 'pub', 'restaurant'
    ],
    'bar': True,  # All bar tags
    'building': ['fast_food_restaurant'],
    'cuisine': True,  # All cuisine tags
    'cuisine_1': True,  # All cuisine_1 tags
    'cuisine_2': True,  # All cuisine_2 tags
    'diet': True,  # All diet tags
    'diet:vegan': True,  # All diet:vegan tags
    'diet:vegetarian': True,  # All diet:vegetarian tags
    'food': ['ice_cream'],
    'ice_cream': ['artisanal', 'industrial', 'yes'],
    'ice_cream:type': True,  # All ice_cream:type tags
    'oven': ['electrical', 'wood_fired'],
    'shop': ['cafe', 'ice_cream', 'pizza'],
    'takeaway': ['no', 'only', 'yes'],
    'tourism': ['agritourism']
}

# Culture POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
culture_pois_dict = {
    'amenity': [
        'art_gallery', 'arts_centre', 'baptistery', 'cinema', 'conference_centre', 'convention_centre',
        'events_venue', 'library', 'monastery', 'music_school', 'planetarium', 'public_bookcase',
        'theatre', 'ticket_office', 'toy_library'
    ],
    'attraction': ['historic', 'train'],
    'artwork_type': True,  # All artwork_type tags
    'building': [
        'abbey', 'arena', 'baptistery', 'basilica', 'castle', 'cinema', 'congress_centre', 'gasometer',
        'library', 'mausoleum', 'monastery', 'museum', 'obelisk', 'place_of_worship', 'propylaea',
        'pyramid', 'quadrifrons', 'sports_arena', 'stadium', 'temple', 'theatre'
    ],
    'building:part': ['castle', 'theatre'],
    'club': ['culture'],
    'fountain': ['sarcophagus', 'scenic'],
    'government': ['culture'],
    'historic': True,  # All historic tags
    'historic:civilization': True,  # All historic:civilization tags
    'historic:period': True,  # All historic:period tags
    'landuse': ['observatory'],
    'leisure': ['stadium'],
    'memorial': [
        'bust', 'column', 'fountain', 'ghost_bike', 'obelisk', 'plaque', 'sculpture', 'statue',
        'stele', 'stolperstein', 'stopelstein', 'war', 'war_memorial', 'yes'
    ],
    'museum': True,  # All museum tags
    'period': ['aurelian'],
    'ruins': [
        'acqueduct', 'baths', 'castle', 'cemetery', 'crepidoma', 'mausoleum', 'temple', 'thermae', 'tomb'
    ],
    'site_type': [
        'castle', 'catacomb', 'catacombs', 'domus', 'fortification', 'necropolis', 'nymphaeum',
        'roman_circus', 'roman_road', 'roman_villa', 'ruins', 'temple'
    ],
    'tomb': ['mausoleum', 'memorial', 'sarcophagus', 'war_grave'],
    'tourism': ['aquarium', 'artwork', 'attraction', 'gallery', 'museum']
}

# Education POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
education_pois_dict = {
    'amenity': ['college', 'fablab', 'language_school', 'school', 'university'],
    'building': ['college', 'foreign_school', 'school', 'school;yes', 'university'],
    'building:part': ['school'],
    'distretto_scolastico': True,  # All distretto_scolastico tags
    'government': ['education'],
    'landuse': ['education'],
    'office': ['educational_institution', 'research', 'research_institute', 'school']
}

# Physical POIs
# Keys are OSM tag keys; each value is the list of accepted tag values,
# or True to accept any value of that key.
physical_pois_dict = {
    'amenity': ['Lasergame', 'dancing_school', 'dive_centre', 'dojo'],
    'boules': ['bocce', 'boccia', 'yes'],
    'building': ['gym', 'gymnasium', 'sports_centre', 'sports_hall'],
    'club': ['sport'],
    'dance:teaching': ['yes'],
    'leisure': [
        'bowling_alley', 'dance', 'fitness_centre', 'fitness_station', 'golf_course', 'horse_riding',
        'racetrack', 'sports_centre', 'sports_hall', 'swimming_pool', 'track', 'yoga'
    ],
    'natural': ['peak'],
    'playground': ['climbingframe', 'climbingwall', 'zipwire'],
    'shop': ['yoga'],
    'sport': True,  # All sport tags
    'swimming_pool': ['covered', 'indoor', 'outdoor', 'yes']
}

In [None]:
def fetch_pois_from_tag_dict(place_name, tag_dict, timeout=15, pause=2):
    """
    Fetch points of interest by querying one tag key at a time and concatenating the results.

    This name is defined a second time further down in the file (a variant with
    retries and chunking); whichever definition was executed last is the one in effect.

    Parameters
    ----------
    place_name : str
        The name of the place to query (e.g., 'Grand Genève')
    tag_dict : dict
        Dictionary of tag categories and their values
    timeout : int, optional
        Timeout for API requests in seconds, defaults to 15
    pause : int, optional
        Pause after each API request in seconds, defaults to 2

    Returns
    -------
    gdf : GeoDataFrame or None or list
        Combined GeoDataFrame of all POIs, with a 'poi_category' column holding the
        tag key that returned each row. None if the place could not be geocoded.
        If the final concatenation fails, the list of per-category GeoDataFrames.
    """
    # Set custom timeout for API requests
    ox.settings.requests_timeout = timeout

    # Get place geometry once to reuse
    try:
        place_gdf = ox.geocode_to_gdf(place_name)
        place_geoms = place_gdf.geometry
        # `unary_union` is deprecated in recent GeoPandas in favour of `union_all()`
        if hasattr(place_geoms, "union_all"):
            place_polygon = place_geoms.union_all()
        else:
            place_polygon = place_geoms.unary_union
        print(f"Retrieved polygon for {place_name}")
    except Exception as e:
        print(f"Error retrieving place geometry: {e}")
        return None

    # Per-category results and bookkeeping for the final report
    gdfs = []
    successful_categories = []
    failed_categories = []

    for category, values in tqdm(tag_dict.items(), desc="Processing categories"):
        try:
            # One request per tag key keeps each Overpass query small
            gdf = ox.features_from_polygon(place_polygon, tags={category: values})
            gdf = gdf.reset_index()
            if gdf.empty:
                print(f"No {category} features found")
            else:
                # Remember which tag key produced the row
                gdf['poi_category'] = category

                # Stringify list/dict cells wherever they occur in a column (not only
                # in its first row); other cells, including missing values, are untouched.
                for col in gdf.columns:
                    if col == 'geometry' or gdf[col].dtype != object:
                        continue
                    column = gdf[col]
                    if column.map(lambda v: isinstance(v, (list, dict))).any():
                        gdf[col] = column.map(
                            lambda v: str(v) if isinstance(v, (list, dict)) else v)

                gdfs.append(gdf)
                successful_categories.append(category)
                print(f"Added {len(gdf)} {category} features")
        except Exception as e:
            # Recent OSMnx releases raise InsufficientResponseError instead of returning an
            # empty frame when nothing matches; that is an empty result, not a failure.
            # The class sits in a private OSMnx module, hence the match by name.
            if type(e).__name__ == "InsufficientResponseError":
                print(f"No {category} features found")
            else:
                print(f"Error fetching {category}: {e}")
                failed_categories.append(category)
        finally:
            # Pause after every request, failed ones included, to avoid overwhelming the API
            time.sleep(pause)

    # Report on processing results
    print(f"\nSuccessfully processed {len(successful_categories)} categories")
    if failed_categories:
        print(f"Failed to process {len(failed_categories)} categories: {', '.join(failed_categories)}")

    if not gdfs:
        print("No features collected")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")

    # Combine all GeoDataFrames
    try:
        combined_gdf = pd.concat(gdfs, ignore_index=True)
        # An OSM id is only unique within its element type (node / way / relation), so
        # de-duplicate on the (type, id) pair. Both index namings are supported:
        # ('element_type', 'osmid') and ('element', 'id').
        for id_cols in (['element_type', 'osmid'], ['element', 'id'], ['osmid']):
            if all(col in combined_gdf.columns for col in id_cols):
                combined_gdf = combined_gdf.drop_duplicates(subset=id_cols)
                break
        print(f"Final dataset contains {len(combined_gdf)} unique features")
        return combined_gdf
    except Exception as e:
        print(f"Error combining GeoDataFrames: {e}")
        return gdfs  # Return the list of GeoDataFrames if concatenation fails

In [None]:
# gdf_culture_pois = fetch_pois_from_tag_dict("Grand Genève", culture_pois_dict)

In [None]:
# # Culture POIs
# gdf_culture_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", culture_pois_dict)

# # Education POIs
# gdf_education_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", education_pois_dict)

# # Healthcare POIs
# gdf_healthcare_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", healthcare_pois_dict)

# # Services POIs
# gdf_services_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", services_pois_dict)

# # Transport POIs
# gdf_transport_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", transport_pois_dict)

# # Outdoor POIs
# gdf_outdoor_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", outdoor_pois_dict)

# # Supplies POIs
# gdf_supplies_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", supplies_pois_dict)

# # Restaurant POIs
# gdf_restaurant_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", restaurant_pois_dict)

# # Physical POIs
# gdf_physical_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", physical_pois_dict)

In [None]:
import backoff

def _giveup_on_permanent_error(exc):
    """Return True for errors that a retry cannot fix, so backoff re-raises them at once."""
    # InsufficientResponseError is what recent OSMnx releases raise when a query matches
    # no features (matched by name because the class lives in a private OSMnx module).
    # TypeError covers an unusable polygon argument. Both are deterministic.
    return isinstance(exc, TypeError) or type(exc).__name__ == "InsufficientResponseError"


# Add exponential backoff decorator for API resilience.
# Transient failures are retried up to 5 times within 300 s; permanent ones are not retried.
@backoff.on_exception(backoff.expo,
                      Exception,
                      max_tries=5,
                      max_time=300,
                      giveup=_giveup_on_permanent_error)
def fetch_features_with_retry(polygon, tags):
    """Fetch the OSM features matching `tags` inside `polygon`, retrying on transient errors."""
    return ox.features_from_polygon(polygon, tags=tags)

def _is_no_features_error(exc):
    """Return True when `exc` is the OSMnx error raised for a query that matched nothing."""
    # Matched by class name because the exception class lives in a private OSMnx module.
    return type(exc).__name__ == "InsufficientResponseError"


def _grid_chunks(polygon, cell_deg):
    """Cut `polygon` along a regular grid of `cell_deg`-wide cells; return the polygonal pieces."""
    from shapely.geometry import box
    from shapely.ops import unary_union

    polygonal = ("Polygon", "MultiPolygon")
    minx, miny, maxx, maxy = polygon.bounds
    pieces = []
    curr_x = minx
    while curr_x < maxx:
        curr_y = miny
        while curr_y < maxy:
            cell = box(curr_x, curr_y,
                       min(curr_x + cell_deg, maxx),
                       min(curr_y + cell_deg, maxy))
            piece = cell.intersection(polygon)
            # A cell that merely touches the boundary yields points/lines (possibly mixed
            # with polygons); keep only the polygonal part, which is what can be queried.
            if piece.geom_type == "GeometryCollection":
                parts = [g for g in piece.geoms if g.geom_type in polygonal]
                piece = unary_union(parts) if parts else None
            if piece is not None and piece.geom_type in polygonal and not piece.is_empty:
                pieces.append(piece)
            curr_y += cell_deg
        curr_x += cell_deg
    return pieces


def _stringify_container_cells(gdf):
    """Turn list/dict cells into strings, wherever in a column they occur; return `gdf`."""
    for col in gdf.columns:
        if col == 'geometry' or gdf[col].dtype != object:
            continue
        column = gdf[col]
        if column.map(lambda v: isinstance(v, (list, dict))).any():
            # Only container cells are converted; missing values stay missing
            gdf[col] = column.map(lambda v: str(v) if isinstance(v, (list, dict)) else v)
    return gdf


def _drop_duplicate_osm_elements(gdf):
    """Drop rows that describe the same OSM element, identified by (element type, id)."""
    # An OSM id is only unique within its element type, so the type is part of the key.
    # Both index namings are supported: ('element_type', 'osmid') and ('element', 'id').
    for id_cols in (['element_type', 'osmid'], ['element', 'id'], ['osmid']):
        if all(col in gdf.columns for col in id_cols):
            return gdf.drop_duplicates(subset=id_cols)
    return gdf


def fetch_pois_from_tag_dict(place_name, tag_dict, timeout=180, pause=5, chunk_size=None,
                             chunk_deg=0.15):
    """
    Fetch points of interest by iterating through tag categories with improved error handling.

    Parameters
    ----------
    place_name : str
        The name of the place to query (e.g., 'Grand Genève')
    tag_dict : dict
        Dictionary of tag categories and their values
    timeout : int, optional
        Timeout for API requests in seconds, defaults to 180
    pause : int, optional
        Pause in seconds after each request and after each category, defaults to 5
    chunk_size : float, optional
        Area threshold in sq km. When the place is larger than this, it is split
        along a regular grid and every grid cell is queried separately. None
        (default) disables chunking. Note that this is a threshold, not the size
        of the chunks; that is controlled by `chunk_deg`.
    chunk_deg : float, optional
        Width and height of a grid cell in degrees, defaults to 0.15

    Returns
    -------
    gdf : GeoDataFrame or None or list
        Combined GeoDataFrame of all POIs, with a 'poi_category' column holding the
        tag key that returned each row. None if the place could not be geocoded.
        If the final concatenation fails, the list of per-category GeoDataFrames.

    Raises
    ------
    ValueError
        If `chunk_deg` is not positive.
    """
    if chunk_deg <= 0:
        # A non-positive step would make the grid loop run forever
        raise ValueError("chunk_deg must be positive")

    # Set custom timeout for API requests
    ox.settings.requests_timeout = timeout
    ox.settings.overpass_rate_limit = True  # Enable rate limiting

    # Get place geometry once to reuse
    try:
        place_gdf = ox.geocode_to_gdf(place_name)
        place_geoms = place_gdf.geometry
        # `unary_union` is deprecated in recent GeoPandas in favour of `union_all()`
        if hasattr(place_geoms, "union_all"):
            place_polygon = place_geoms.union_all()
        else:
            place_polygon = place_geoms.unary_union
        print(f"Retrieved polygon for {place_name}")

        polygons = [place_polygon]
        if chunk_size is not None:
            # Measure the area in the local UTM zone; Web Mercator (EPSG:3857) inflates
            # areas away from the equator and would trigger chunking too early.
            area_km2 = place_gdf.to_crs(place_gdf.estimate_utm_crs()).area.sum() / 1e6
            if area_km2 > chunk_size:
                print(f"Large area detected. Splitting into chunks...")
                polygons = _grid_chunks(place_polygon, chunk_deg)
                print(f"Area split into {len(polygons)} chunks")

    except Exception as e:
        print(f"Error retrieving place geometry: {e}")
        return None

    # Per-category results and bookkeeping for the final report
    gdfs = []
    successful_categories = []
    failed_categories = []

    for category, values in tqdm(tag_dict.items(), desc="Processing categories"):
        # Create a single-key dictionary for this category
        single_tag_dict = {category: values}

        category_gdfs = []
        failed_chunks = 0
        try:
            # Process each polygon (or chunk)
            for i, poly in enumerate(polygons, start=1):
                if len(polygons) > 1:
                    print(f"Processing chunk {i}/{len(polygons)} for {category}")

                try:
                    # Use the retry-enabled function
                    gdf = fetch_features_with_retry(poly, single_tag_dict)

                    if not gdf.empty:
                        gdf = gdf.reset_index()
                        # Remember which tag key produced the row
                        gdf['poi_category'] = category
                        category_gdfs.append(_stringify_container_cells(gdf))
                        print(f"  Added {len(gdf)} {category} features from chunk {i}")

                except Exception as e:
                    # "Nothing matched" is a normal outcome for a chunk, not a failure.
                    # Anything else is counted, and the next chunk is still processed.
                    if not _is_no_features_error(e):
                        failed_chunks += 1
                        print(f"  Error in chunk {i} for {category}: {e}")
                finally:
                    # Pause after every chunk request, failed ones included
                    time.sleep(pause)

            # Combine all chunks for this category
            if category_gdfs:
                category_combined = pd.concat(category_gdfs, ignore_index=True)
                gdfs.append(category_combined)
                print(f"Added total of {len(category_combined)} {category} features")
            else:
                print(f"No {category} features found in any chunk")

            if failed_chunks:
                # Partial or missing data: report the category as failed so it can be re-run
                print(f"  {failed_chunks} chunk(s) failed for {category}; results may be incomplete")
                failed_categories.append(category)
            elif category_gdfs:
                successful_categories.append(category)

            # Add pause between categories
            time.sleep(pause)

        except Exception as e:
            print(f"Error processing {category}: {e}")
            failed_categories.append(category)
            continue

    # Report on processing results
    print(f"\nSuccessfully processed {len(successful_categories)} categories")
    if failed_categories:
        print(f"Failed to process {len(failed_categories)} categories: {', '.join(failed_categories)}")

    if not gdfs:
        print("No features collected")
        return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")

    # Combine all GeoDataFrames
    try:
        combined_gdf = pd.concat(gdfs, ignore_index=True)
        # Features that straddle several chunks, or match several tag keys, come back
        # more than once; keep the first occurrence of each OSM element.
        combined_gdf = _drop_duplicate_osm_elements(combined_gdf)
        print(f"Final dataset contains {len(combined_gdf)} unique features")
        return combined_gdf
    except Exception as e:
        print(f"Error combining GeoDataFrames: {e}")
        return gdfs  # Return the list of GeoDataFrames if concatenation fails

# Install backoff library if not already installed
# !pip install backoff

In [None]:
# Usage: one query run per POI group, all for the same place.
# Culture POIs
gdf_culture_pois_2025 = fetch_pois_from_tag_dict(
    "Grand Genève", 
    culture_pois_dict, 
    timeout=180,     # request timeout in seconds
    pause=10,        # seconds to sleep between requests
    chunk_size=50    # query in chunks when the place is larger than this many sq km
)
# Education POIs
gdf_education_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", education_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=10,        # seconds to sleep between requests
    chunk_size=25 )   # query in chunks when the place is larger than this many sq km

# Healthcare POIs
gdf_healthcare_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", healthcare_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25 )   # query in chunks when the place is larger than this many sq km

# Services POIs
gdf_services_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", services_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

# Transport POIs
gdf_transport_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", transport_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

# Outdoor POIs
gdf_outdoor_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", outdoor_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

# Supplies POIs
gdf_supplies_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", supplies_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=10,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

# Restaurant POIs
gdf_restaurant_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", restaurant_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

# Physical POIs
gdf_physical_pois_2025 = fetch_pois_from_tag_dict("Grand Genève", physical_pois_dict, 
    timeout=90,     # request timeout in seconds
    pause=5,        # seconds to sleep between requests
    chunk_size=25)    # query in chunks when the place is larger than this many sq km

In [None]:
# Persist the raw query results, one parquet file per POI group (each group written once).
_raw_pois_by_group = {
    'culture': gdf_culture_pois_2025,
    'education': gdf_education_pois_2025,
    'healthcare': gdf_healthcare_pois_2025,
    'physical': gdf_physical_pois_2025,
    'services': gdf_services_pois_2025,
    'transport': gdf_transport_pois_2025,
    'outdoor': gdf_outdoor_pois_2025,
    'supplies': gdf_supplies_pois_2025,
    'restaurant': gdf_restaurant_pois_2025,
}
for _group, _pois in _raw_pois_by_group.items():
    # The 'nodes' / 'ways' columns are dropped before writing. Not every result has both
    # of them, so a missing column is ignored instead of raising KeyError.
    _pois.drop(columns=['nodes', 'ways'], errors='ignore').to_parquet(
        f'../data/15min_city/raw_osm_{_group}_pois.parquet')

In [None]:
# Reload the raw POI tables written above (each file is read once).
# These are read with pandas, so the results are plain DataFrames; `fix_geometries`
# below turns them back into GeoDataFrames.
gdf_culture_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_culture_pois.parquet')
gdf_education_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_education_pois.parquet')
gdf_healthcare_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_healthcare_pois.parquet')
gdf_physical_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_physical_pois.parquet')
gdf_services_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_services_pois.parquet')
gdf_transport_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_transport_pois.parquet')
gdf_outdoor_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_outdoor_pois.parquet')
gdf_supplies_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_supplies_pois.parquet')
gdf_restaurant_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_restaurant_pois.parquet')

## OSM data cleaning

In [None]:
def fix_geometries(df):
    """
    Return `df` as a GeoDataFrame (EPSG:4326), decoding WKB-encoded geometries if needed.

    Parameters:
    -----------
    df : DataFrame
        Table with a 'geometry' column holding geometry objects and/or WKB `bytes`

    Returns:
    --------
    GeoDataFrame
        A copy of `df` whose 'geometry' column is the active geometry; the input is
        left unmodified
    """
    from shapely import wkb

    # Work on a copy so the caller's frame keeps its original column
    fixed = df.copy()
    # Decode cell by cell: this also works for an empty frame and for columns whose
    # first entry is not bytes while later ones are.
    fixed['geometry'] = fixed['geometry'].apply(
        lambda geom: wkb.loads(geom) if isinstance(geom, bytes) else geom
    )
    return gpd.GeoDataFrame(fixed, geometry='geometry', crs=4326)
    
def add_categories(gdf, category_name, category_dict):
    """
    Label POIs with a category / subcategory and reduce them to centroid points.

    Each row receives ``category_name`` as 'category' and, as 'subcategory',
    the first key of ``category_dict`` (in dictionary order) whose column
    matches the row. Rows matching no key keep ``None`` in both columns.

    Parameters:
    -----------
    gdf : GeoDataFrame
        The GeoDataFrame to categorize. It must have a CRS set, since it is
        reprojected below.
    category_name : str
        The main category name (e.g., 'physical')
    category_dict : dict
        Dictionary where keys are column names and values are either a list of
        values to match, a single string value to match, or True to match any
        non-null value. Keys that are not columns of ``gdf`` are skipped.

    Returns:
    --------
    GeoDataFrame
        A copy of the input, reprojected to EPSG:2056, with:
        - 'category' and 'subcategory' columns added,
        - 'geometry' replaced by each feature's centroid (computed in
          EPSG:2056),
        - 'lon' and 'lat' columns holding the centroid coordinates in
          EPSG:4326,
        - the index reset to a fresh RangeIndex.
    """
    # Work on a copy so the caller's GeoDataFrame is left untouched
    result_gdf = gdf.copy()
    result_gdf['category'] = None
    result_gdf['subcategory'] = None

    # Track rows that have been categorized (first matching key wins)
    categorized_mask = pd.Series(False, index=result_gdf.index)

    for column_name, values in category_dict.items():
        # Keys without a matching column cannot categorize anything
        if column_name not in result_gdf.columns:
            continue

        column = result_gdf[column_name]
        if values is True:
            # True matches any non-null value in the column
            matches = column.notna()
        else:
            # A bare string is a single value to match; Series.isin only
            # accepts list-like input, so wrap it.
            if isinstance(values, str):
                values = [values]
            matches = column.isin(values)

        # Only label rows that no earlier key has already claimed
        mask = matches & ~categorized_mask
        result_gdf.loc[mask, 'category'] = category_name
        result_gdf.loc[mask, 'subcategory'] = column_name

        categorized_mask = categorized_mask | mask

    # Compute centroids in the projected CRS, then derive lon/lat from a
    # single reprojection of those centroids back to EPSG:4326.
    result_gdf = result_gdf.to_crs(2056)
    result_gdf['geometry'] = result_gdf['geometry'].centroid
    centroids_wgs84 = result_gdf.geometry.to_crs(4326)
    result_gdf['lon'] = centroids_wgs84.x
    result_gdf['lat'] = centroids_wgs84.y
    return result_gdf.reset_index(drop=True)
    

In [None]:
# Load the raw POI tables from parquet, one read per category (the physical
# table used to be read twice). They are read with plain pandas, so the
# 'geometry' column may hold WKB bytes; fix_geometries() converts the tables
# to GeoDataFrames.
gdf_culture_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_culture_pois.parquet')
gdf_education_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_education_pois.parquet')
gdf_healthcare_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_healthcare_pois.parquet')
gdf_physical_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_physical_pois.parquet')
gdf_services_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_services_pois.parquet')
gdf_transport_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_transport_pois.parquet')
gdf_outdoor_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_outdoor_pois.parquet')
gdf_supplies_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_supplies_pois.parquet')
gdf_restaurant_pois_2025 = pd.read_parquet('../data/15min_city/raw_osm_restaurant_pois.parquet')

In [None]:
# Fix geometries if needed
# Turn every raw table into a GeoDataFrame via fix_geometries(); targets and
# inputs are listed in the same order, so each table is rebound to its own
# converted version.
(
    gdf_culture_pois_2025,
    gdf_education_pois_2025,
    gdf_healthcare_pois_2025,
    gdf_physical_pois_2025,
    gdf_services_pois_2025,
    gdf_transport_pois_2025,
    gdf_outdoor_pois_2025,
    gdf_supplies_pois_2025,
    gdf_restaurant_pois_2025,
) = map(
    fix_geometries,
    (
        gdf_culture_pois_2025,
        gdf_education_pois_2025,
        gdf_healthcare_pois_2025,
        gdf_physical_pois_2025,
        gdf_services_pois_2025,
        gdf_transport_pois_2025,
        gdf_outdoor_pois_2025,
        gdf_supplies_pois_2025,
        gdf_restaurant_pois_2025,
    ),
)

In [None]:
# Label each table with its category / subcategory and reduce it to centroid
# points, using the tag dictionary that belongs to that category.
gdf_physical_pois_2025_clean = add_categories(
    gdf=gdf_physical_pois_2025, category_name='physical', category_dict=physical_pois_dict)
gdf_culture_pois_2025_clean = add_categories(
    gdf=gdf_culture_pois_2025, category_name='culture', category_dict=culture_pois_dict)
gdf_education_pois_2025_clean = add_categories(
    gdf=gdf_education_pois_2025, category_name='education', category_dict=education_pois_dict)
gdf_healthcare_pois_2025_clean = add_categories(
    gdf=gdf_healthcare_pois_2025, category_name='healthcare', category_dict=healthcare_pois_dict)
gdf_services_pois_2025_clean = add_categories(
    gdf=gdf_services_pois_2025, category_name='services', category_dict=services_pois_dict)
gdf_transport_pois_2025_clean = add_categories(
    gdf=gdf_transport_pois_2025, category_name='transport', category_dict=transport_pois_dict)
gdf_outdoor_pois_2025_clean = add_categories(
    gdf=gdf_outdoor_pois_2025, category_name='outdoor', category_dict=outdoor_pois_dict)
gdf_supplies_pois_2025_clean = add_categories(
    gdf=gdf_supplies_pois_2025, category_name='supplies', category_dict=supplies_pois_dict)
gdf_restaurant_pois_2025_clean = add_categories(
    gdf=gdf_restaurant_pois_2025, category_name='restaurant', category_dict=restaurant_pois_dict)

In [None]:
# Extra step : exclude private swimming pools
# Extra step: drop swimming pools from the physical POIs.
# NOTE(review): this removes every row with leisure == 'swimming_pool', not
# only pools flagged as private - presumably intended as a proxy; confirm.
gdf_physical_pois_2025_clean = (
    gdf_physical_pois_2025_clean
    .loc[lambda pois: pois.leisure != 'swimming_pool']
    .reset_index(drop=True)
)
# Extra step: drop gardens from the outdoor POIs.
# NOTE(review): likewise removes every row with leisure == 'garden',
# whether or not the garden is private - confirm.
gdf_outdoor_pois_2025_clean = (
    gdf_outdoor_pois_2025_clean
    .loc[lambda pois: pois.leisure != 'garden']
    .reset_index(drop=True)
)

In [None]:
# Combine all the cleaned dataframes (optional)
# Columns kept in every processed POI table
poi_columns = ['osmid', 'category', 'subcategory', 'lon', 'lat', 'geometry']

# Cleaned tables keyed by the category label used in the output file names
clean_by_category = {
    'physical': gdf_physical_pois_2025_clean,
    'culture': gdf_culture_pois_2025_clean,
    'education': gdf_education_pois_2025_clean,
    'healthcare': gdf_healthcare_pois_2025_clean,
    'services': gdf_services_pois_2025_clean,
    'transport': gdf_transport_pois_2025_clean,
    'outdoor': gdf_outdoor_pois_2025_clean,
    'supplies': gdf_supplies_pois_2025_clean,
    'restaurant': gdf_restaurant_pois_2025_clean,
}

# Trim each cleaned table to the shared columns (optional)
gdfs_clean = [pois[poi_columns] for pois in clean_by_category.values()]

# Stack all categories into a single table with a fresh index (optional)
gdf_all_pois_clean = pd.concat(gdfs_clean, ignore_index=True)

# Write one parquet file per category (optional)
for category, gdf_clean in zip(clean_by_category, gdfs_clean):
    gdf_clean.to_parquet(f'../data/15min_city/processed_osm_{category}_pois.parquet')

# Write the combined table (optional)
gdf_all_pois_clean.to_parquet('../data/15min_city/processed_osm_all_pois.parquet')