In [1]:
# Import Libraries
import sys
from pathlib import Path

import geopandas as gpd
import pandas as pd

# Add src to path
sys.path.insert(0, str(Path.cwd() / 'src'))

# Import our library functions
from src.config import CityConfig
from src.building_processor import load_buildings, process_buildings
from src.poi_processor import load_pois, process_pois
from src.spatial_matcher import match_pois_to_buildings, join_matches_to_pois
from src.heuristics import apply_heuristics_to_pois
from src.trip_generator import (
    create_trip_generators,
    save_trip_generators,
    print_trip_gen_summary
)

# --- Configuration ---
# City key: passed to CityConfig and embedded in every cache/output file name below.
CITY = 'brooklyn'
config = CityConfig(city_name=CITY)
# Place string handed to load_buildings / load_pois as `place_name`.
place_name = "Brooklyn, New York, USA"

USE_CACHE = True  # Set to False to force re-download

# Working directories (relative to the notebook's working directory).
# Both are created here so that a fresh checkout survives Restart & Run All:
# the raw downloads are cached under CACHE_DIR and the last cell writes the
# processed buildings and trip generator files under DATA_DIR.
CACHE_DIR = Path("cache")
DATA_DIR = Path("data")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Cache and output file paths
BUILDINGS_CACHE = CACHE_DIR / f"buildings_raw_{CITY}.geojson"
POIS_CACHE = CACHE_DIR / f"pois_raw_{CITY}.geojson"
BUILDINGS_OUT = DATA_DIR / f"buildings_{CITY}.geojson"
TRIP_GENERATORS_OUT_GEOJSON = DATA_DIR / f"trip_generators_{CITY}.geojson"
TRIP_GENERATORS_OUT_CSV = DATA_DIR / f"trip_generators_{CITY}.csv"

In [2]:
# Load and Process Buildings

# Fetch the raw building layer; the cache path and USE_CACHE switch are forwarded to the loader
buildings_raw = load_buildings(place_name=place_name, cache_path=BUILDINGS_CACHE, use_cache=USE_CACHE)

# Derive the square-footage columns from the raw buildings
buildings_gdf = process_buildings(buildings_raw, config)

# Preview the first ten rows of the key columns
preview_columns = ['building_id', 'building', 'footprint_sqft', 'estimated_floors', 'total_sqft']
print(buildings_gdf[preview_columns].head(10))

Loading buildings from cache: cache/buildings_raw_brooklyn.geojson
Loaded 331,538 buildings from cache

Buildings ready: 331,538
   building_id    building  footprint_sqft  estimated_floors     total_sqft
0            0         yes    33487.362541                 3  100462.087623
1            1         yes    10160.424650                 5   50802.123248
2            2         yes    10294.479685                 5   51472.398427
3            3         yes     9558.502166                 5   47792.510831
4            4         yes     6745.779850                 6   40474.679101
5            5         yes    13402.631808                11  147428.949889
6            6  apartments     8986.979593                 6   53921.877561
7            7         yes    16507.082635                 6   99042.495808
8            8      public    30301.440341                15  454521.605118
9            9         yes     5634.379375                 5   28171.896876


In [3]:
# Load and Process POIs

# Fetch the raw POI layer; the cache path and USE_CACHE switch are forwarded to the loader
pois_raw = load_pois(place_name=place_name, cache_path=POIS_CACHE, use_cache=USE_CACHE)

# Process POIs with the non-trip-generator filter switched on
pois_gdf = process_pois(pois_raw, filter_non_trip_generators=True)

# Show POI type distribution: the ten most frequent values of each tag column
for col in ('amenity', 'shop', 'office', 'leisure', 'tourism', 'public_transport'):
    if col not in pois_gdf.columns:
        continue  # tag column not present in this dataset
    counts = pois_gdf[col].dropna().value_counts().head(10)
    if counts.empty:
        continue  # column present but holds no values
    print(f"\nTop 10 {col} types:")
    print(counts)

Downloading POIs from OSM...
Downloaded 56,636 POIs, saved to: cache/pois_raw_brooklyn.geojson
POIs before filtering: 56,636
POIs after filtering: 29,743

POIs ready: 29,743

Top 10 amenity types:
amenity
restaurant          1806
fast_food           1416
place_of_worship     984
cafe                 763
school               593
bar                  469
pharmacy             291
bank                 251
dentist              211
clinic               190
Name: count, dtype: int64

Top 10 shop types:
shop
convenience      1145
hairdresser       642
beauty            557
clothes           487
supermarket       351
laundry           350
alcohol           283
car_repair        253
deli              231
variety_store     222
Name: count, dtype: int64

Top 10 office types:
office
company         293
estate_agent    188
lawyer          136
insurance        73
tax_advisor      73
government       58
accountant       57
financial        39
construction     35
travel_agent     35
Name: count, dtype:

In [4]:
# Match POIs to Buildings

# Spatially match the processed POIs against the processed buildings
matches_df = match_pois_to_buildings(buildings_gdf, pois_gdf)

# Attach the match results to the POI table
pois_matched = join_matches_to_pois(pois_gdf, matches_df)

Matching POIs to buildings using spatial join...
Matched 18,694 POIs to buildings (62.9%)
POIs with building matches: 18,694


In [5]:
# Apply Brooklyn Heuristics for Space Allocation

# Apply heuristics to allocate space
processed_pois_df = apply_heuristics_to_pois(pois_matched, buildings_gdf)

# Summary statistics
# Rows flagged `is_remaining` are reported separately from the other POI rows.
# The column is optional: when it is absent, every row counts as an actual POI.
remaining_flags = processed_pois_df.get('is_remaining')
if isinstance(remaining_flags, pd.Series):
    # Convert to the nullable "boolean" dtype before filling the gaps: calling
    # fillna(False) directly on an object-dtype column triggers pandas'
    # object-downcasting FutureWarning. Missing flags are treated as False.
    remaining_mask = remaining_flags.astype('boolean').fillna(False).astype(bool)
else:
    remaining_mask = pd.Series(False, index=processed_pois_df.index)

actual_pois = processed_pois_df[~remaining_mask]
remaining_pois = processed_pois_df[remaining_mask]

print("\nSpace Allocation Summary:")
print(f"  Actual POI sqft: {actual_pois['poi_sqft'].sum():,.0f}")
print(f"  Remaining/upper floor sqft: {remaining_pois['poi_sqft'].sum():,.0f}")
print(f"  Total allocated: {processed_pois_df['poi_sqft'].sum():,.0f}")

# Verify no double-counting
print("\nDouble-count check (sample buildings with multiple POIs):")
# Vectorised row count per building; rows without a building_id get NaN and
# are therefore excluded by the `> 1` comparison.
rows_per_building = processed_pois_df.groupby('building_id')['building_id'].transform('size')
multi_poi_buildings = processed_pois_df[rows_per_building > 1]
if len(multi_poi_buildings) > 0:
    # For the first five such buildings, compare the summed POI allocation
    # with the building total carried on the rows.
    sample_check = multi_poi_buildings.groupby('building_id').agg({
        'poi_sqft': 'sum',
        'building_total_sqft': 'first'
    }).head(5)
    sample_check['ratio'] = sample_check['poi_sqft'] / sample_check['building_total_sqft']
    print(sample_check)

Processed POIs (including inferred remaining): 27,928

Space Allocation Summary:
  Actual POI sqft: 238,461,724
  Remaining/upper floor sqft: 166,276,695
  Total allocated: 404,738,418

Double-count check (sample buildings with multiple POIs):
                 poi_sqft  building_total_sqft  ratio
building_id                                          
8            4.545216e+05         4.545216e+05    1.0
12           8.454686e+04         8.454686e+04    1.0
22           4.348655e+04         4.348655e+04    1.0
28           1.299619e+06         1.299619e+06    1.0
33           9.902837e+04         9.902837e+04    1.0


  remaining_mask = remaining_flags.fillna(False).astype(bool)


In [6]:
# Create and Save Final Trip Generator Dataset

# Build the unified trip generator table (creation and unit conversion happen inside this call)
trip_generators_gdf = create_trip_generators(processed_pois_df, buildings_gdf, config)

# Report the summary, including trip generation units
print_trip_gen_summary(trip_generators_gdf)

# Write the trip generators to both output paths (GeoJSON and CSV)
save_trip_generators(trip_generators_gdf, geojson_path=TRIP_GENERATORS_OUT_GEOJSON, csv_path=TRIP_GENERATORS_OUT_CSV)

# Also persist the processed buildings as GeoJSON
buildings_gdf.to_file(BUILDINGS_OUT, driver="GeoJSON")
print(f"Saved processed buildings to: {BUILDINGS_OUT}")

print("\nWorkflow complete!")

Creating trip generators using optimized processing...
Buildings with POIs: 14,201
Buildings without POI: 317,337


  is_remaining = remaining_series.fillna(False).astype(bool)


Converting to trip generation units...

=== Unified Trip Generator Dataset ===
Total generators: 345,265
Total sqft: 2,058,645,066

By Source:
  building_inferred: 317,337.0 generators, 1,653,906,648 sqft
  inferred_remaining: 10,398.0 generators, 166,276,695 sqft
  osm_poi: 17,530.0 generators, 238,461,724 sqft

Top 15 Trip Generation Categories:
                                                    count   total_units  \
trip_gen_category                                                         
Residential (3 or more floors)                     325914  1.724654e+06   
Public School (Students)                              772  5.902861e+05   
Cineplex                                               55  3.981816e+05   
Local Retail                                        11450  1.098750e+05   
Office (multi-tenant type building)                  1276  3.667163e+04   
Medical Office                                        565  2.616197e+04   
Hotel                                             