In [12]:
# Imports
import html
from collections import Counter
from itertools import cycle
from pathlib import Path

import folium
import geopandas as gpd
import pandas as pd
from branca.element import Element
from shapely.geometry import box

In [13]:
# Paths and map settings
# All inputs are resolved relative to DATA_DIR so the notebook can be pointed
# at a different data folder by editing a single line.
DATA_DIR = Path("data")
TRIP_GEN_PATH = DATA_DIR.joinpath("trip_generators_brooklyn.geojson")
BUILDINGS_PATH = DATA_DIR.joinpath("buildings_brooklyn.geojson")

# NOTE(review): ZOOM_START is not referenced by the cells visible in this
# section (the flagged-buildings map passes zoom_start=12 directly) - confirm
# whether another cell still uses it.
ZOOM_START = 20

In [14]:
# Load data and ensure CRS

# Fail fast, naming the first input file that is absent.
for path in (TRIP_GEN_PATH, BUILDINGS_PATH):
    if path.exists():
        continue
    raise FileNotFoundError(f"Missing input: {path}")

trip_generators_gdf = gpd.read_file(TRIP_GEN_PATH)
buildings_gdf = gpd.read_file(BUILDINGS_PATH)

# Schema checks: buildings first, then trip generators.
building_required = {"building_id", "total_sqft", "estimated_floors"}
b_missing = building_required.difference(buildings_gdf.columns)
if b_missing:
    raise ValueError(f"Buildings data missing required columns: {b_missing}")

required_cols = {"building_id", "sqft", "land_use_type", "source"}
missing = required_cols.difference(trip_generators_gdf.columns)
if missing:
    raise ValueError(f"Trip generator data missing required columns: {missing}")


def _as_wgs84(gdf):
    """Return ``gdf`` in EPSG:4326.

    A frame without a CRS is labelled as EPSG:4326 (coordinates untouched);
    a frame in any other CRS is reprojected; a frame already in EPSG:4326 is
    returned as-is.
    """
    if gdf.crs is None:
        return gdf.set_crs("EPSG:4326")
    if gdf.crs != "EPSG:4326":
        return gdf.to_crs("EPSG:4326")
    return gdf


buildings_gdf = _as_wgs84(buildings_gdf)
trip_generators_gdf = _as_wgs84(trip_generators_gdf)

print(f"Loaded {len(buildings_gdf):,} buildings and {len(trip_generators_gdf):,} generators")

Loaded 331,538 buildings and 345,265 generators


In [15]:
# Analyze Buildings with Warnings
#
# Summarises the buildings flagged via the `has_warning` / `warning_flags`
# columns: a count per warning type, the largest flagged buildings, and basic
# sqft statistics. The cell always defines `warning_buildings` (possibly
# empty) and `buildings_ready`, so later cells can run on a fresh kernel.

# Display labels for the warning codes this notebook knows about. Any other
# code found in the data is printed verbatim instead of being dropped.
WARNING_LABELS = {
    'large_sqft': 'Buildings > 1M sqft',
    'very_large_sqft': 'Buildings > 5M sqft',
    'high_floors': 'Buildings > 100 floors',
}

# Both columns are read below, so require both before proceeding.
if {'has_warning', 'warning_flags'} <= set(buildings_gdf.columns):
    # Get all buildings with warnings (explicit comparison with True keeps
    # rows whose flag is missing or non-boolean out of the selection).
    warning_buildings = buildings_gdf[buildings_gdf['has_warning'].eq(True)].copy()

    print(f"Total buildings with warnings: {len(warning_buildings):,}")

    if len(warning_buildings) > 0:
        # `warning_flags` is a ';'-separated list of codes; count each code,
        # ignoring surrounding whitespace and empty fragments.
        warning_counts = Counter(
            flag.strip()
            for flags in warning_buildings['warning_flags'].dropna()
            for flag in str(flags).split(';')
            if flag.strip()
        )

        print("\nWarning breakdown:")
        for warning_type, count in warning_counts.most_common():
            label = WARNING_LABELS.get(warning_type, warning_type)
            print(f"  - {label}: {count:,}")

        # Show sample of flagged buildings. 'building' is not among the
        # validated columns, so only request the columns that exist.
        print("\nSample of flagged buildings:")
        wanted_cols = ['building_id', 'building', 'total_sqft', 'estimated_floors', 'warning_flags']
        sample_cols = [col for col in wanted_cols if col in warning_buildings.columns]
        display_df = warning_buildings[sample_cols].sort_values('total_sqft', ascending=False).head(15)
        print(display_df.to_string())

        # Statistics
        print("\n" + "="*60)
        print("STATISTICS")
        print("="*60)
        print(f"Average sqft in flagged buildings: {warning_buildings['total_sqft'].mean():,.0f}")
        print(f"Median sqft in flagged buildings: {warning_buildings['total_sqft'].median():,.0f}")
        print(f"Max sqft: {warning_buildings['total_sqft'].max():,.0f}")
        print(f"Min sqft: {warning_buildings['total_sqft'].min():,.0f}")

        buildings_ready = True
    else:
        print("No buildings with warnings found!")
        buildings_ready = False
else:
    print("Warning flags not found in building data. Please re-run process_buildings_pois.ipynb to generate warning flags.")
    # Keep the name defined (as an empty frame) so downstream cells that test
    # len(warning_buildings) do not hit a NameError.
    warning_buildings = buildings_gdf.iloc[0:0].copy()
    buildings_ready = False


  - Buildings > 1M sqft: 63

Sample of flagged buildings:
282394       282394      hospital  4.670733e+06                25    large_sqft
234970       234970    commercial  3.211228e+06                 7    large_sqft
330650       330650    apartments  2.938562e+06                23    large_sqft
613             613    apartments  2.877641e+06                52    large_sqft
53025         53025    apartments  2.798366e+06                55    large_sqft
331526       331526  construction  2.638899e+06                47    large_sqft
593             593         civic  2.592974e+06                12    large_sqft
291638       291638           yes  2.541941e+06                38    large_sqft
617             617    apartments  2.273278e+06                40    large_sqft
258243       258243      hospital  2.206908e+06                20    large_sqft
330733       330733    apartments  2.172580e+06                25    large_sqft
89892         89892    industrial  2.051893e+06              

In [16]:
# Check POI associations with flagged buildings using the merged trip generator data
#
# Reports how many flagged buildings carry trip generators, which sources and
# land uses dominate, the ten flagged buildings with the most generators, and
# the flagged buildings that have no generators at all.
# building_id is compared as a string throughout, so the two layers agree even
# if their ID dtypes differ.

if 'has_warning' in buildings_gdf.columns and len(warning_buildings) > 0:
    print("Analyzing POI associations with flagged buildings...")
    print("="*60)

    # Get building IDs of flagged buildings
    warning_building_ids = set(warning_buildings['building_id'].astype(str))

    # The trip_generators dataset already has building_id field from the processing pipeline
    # Filter for POIs in flagged buildings
    trip_gen_with_warnings = trip_generators_gdf[
        trip_generators_gdf['building_id'].astype(str).isin(warning_building_ids)
    ].copy()

    print(f"Total flagged buildings: {len(warning_buildings):,}")
    print(f"Trip generators in flagged buildings: {len(trip_gen_with_warnings):,}")

    # Count buildings with and without POIs
    buildings_with_pois = trip_gen_with_warnings['building_id'].nunique()
    buildings_without_pois = len(warning_buildings) - buildings_with_pois

    print(f"\nFlagged buildings WITH trip generators: {buildings_with_pois:,}")
    print(f"Flagged buildings WITHOUT trip generators: {buildings_without_pois:,}")
    print(f"Percentage with generators: {buildings_with_pois/len(warning_buildings)*100:.1f}%")

    if len(trip_gen_with_warnings) > 0:
        # Analyze generator sources in these large buildings
        print("\n" + "="*60)
        print("TRIP GENERATOR SOURCES IN LARGE BUILDINGS")
        print("="*60)

        source_breakdown = trip_gen_with_warnings.groupby('source').agg({
            'building_id': 'count',
            'sqft': 'sum'
        }).rename(columns={'building_id': 'generator_count'})
        source_breakdown = source_breakdown.sort_values('generator_count', ascending=False)

        print("\nGenerators by Source:")
        for idx, row in source_breakdown.iterrows():
            # iterrows() upcasts the integer count to float next to the sqft
            # sum; cast back so it prints as "76", not "76.0".
            print(f"  {idx}: {int(row['generator_count']):,} generators, {row['sqft']:,.0f} total sqft")

        # Analyze land use types
        print("\n" + "="*60)
        print("LAND USE BREAKDOWN FOR LARGE BUILDINGS")
        print("="*60)

        land_use_breakdown = trip_gen_with_warnings.groupby('land_use_type').agg({
            'building_id': 'count',
            'sqft': 'sum'
        }).rename(columns={'building_id': 'generator_count'})
        land_use_breakdown = land_use_breakdown.sort_values('sqft', ascending=False).head(15)

        print("\nTop 15 Land Use Types by Square Footage:")
        for idx, row in land_use_breakdown.iterrows():
            print(f"  {idx}: {int(row['generator_count']):,} generators, {row['sqft']:,.0f} sqft")

        # Sample of buildings with most generators
        print("\n" + "="*60)
        print("TOP 10 FLAGGED BUILDINGS BY GENERATOR COUNT")
        print("="*60)

        generators_per_building = trip_gen_with_warnings.groupby('building_id').agg({
            'land_use_type': 'count',
            'sqft': 'sum',
            # Skip missing sources and stringify so the join cannot fail on NaN.
            'source': lambda x: '; '.join(x.dropna().astype(str).unique())
        }).rename(columns={'land_use_type': 'generator_count', 'sqft': 'generator_total_sqft'})

        # Join with building info on the string form of building_id, the same
        # key used for the filter above. 'building' is optional, so only the
        # columns that exist are pulled in.
        info_cols = [
            col for col in ['building_id', 'building', 'total_sqft', 'estimated_floors']
            if col in warning_buildings.columns
        ]
        building_info = (
            warning_buildings[info_cols]
            .assign(_building_key=lambda df: df['building_id'].astype(str))
            .drop(columns='building_id')
        )
        top_buildings = (
            generators_per_building.reset_index()
            .assign(_building_key=lambda df: df['building_id'].astype(str))
            .merge(building_info, on='_building_key', how='left')
            .drop(columns='_building_key')
            .sort_values('generator_count', ascending=False)
            .head(10)
        )

        for idx, row in top_buildings.iterrows():
            print(f"\nBuilding {row['building_id']} ({row.get('building', 'unknown')}):")
            print(f"  - Building Total SqFt: {row['total_sqft']:,.0f}")
            print(f"  - Generator Count: {row['generator_count']:,}")
            print(f"  - Generator Total SqFt: {row['generator_total_sqft']:,.0f}")
            print(f"  - Coverage: {row['generator_total_sqft']/row['total_sqft']*100:.1f}%")
            print(f"  - Sources: {row['source']}")

            # Show land use breakdown for this building
            building_generators = trip_gen_with_warnings[
                trip_gen_with_warnings['building_id'] == row['building_id']
            ].groupby('land_use_type')['sqft'].agg(['count', 'sum']).sort_values('sum', ascending=False).head(5)

            if len(building_generators) > 0:
                print("  - Top Land Uses:")
                for land_use, data in building_generators.iterrows():
                    # Same float upcast as above: print the count as an integer.
                    print(f"    • {land_use}: {int(data['count'])} generators ({data['sum']:,.0f} sqft)")

    # Check for buildings with no generators
    print("\n" + "="*60)
    print("FLAGGED BUILDINGS WITHOUT TRIP GENERATORS")
    print("="*60)

    buildings_no_generators = warning_buildings[
        ~warning_buildings['building_id'].astype(str).isin(
            trip_gen_with_warnings['building_id'].astype(str)
        )
    ]

    if len(buildings_no_generators) > 0:
        print(f"\n{len(buildings_no_generators)} large buildings have NO trip generators:")

        # These are likely all building_inferred since they have no POIs
        sample_cols = [
            col for col in ['building_id', 'building', 'total_sqft', 'estimated_floors']
            if col in buildings_no_generators.columns
        ]
        no_gen_sample = buildings_no_generators[sample_cols].sort_values('total_sqft', ascending=False).head(10)
        print("\nTop 10 by size:")
        print(no_gen_sample.to_string())

        # Check building types without generators
        if 'building' in buildings_no_generators.columns:
            print("\nBuilding types without generators:")
            no_gen_types = buildings_no_generators['building'].value_counts()
            for btype, count in no_gen_types.items():
                print(f"  - {btype}: {count} buildings")

        print("\nNote: These buildings likely have 'building_inferred' residential generators")
        print("in the trip_generators dataset that aren't being captured in this analysis.")
else:
    print("Cannot analyze generator associations - warning data not available")

Analyzing POI associations with flagged buildings...
Total flagged buildings: 63
Trip generators in flagged buildings: 120

Flagged buildings WITH trip generators: 63
Flagged buildings WITHOUT trip generators: 0
Percentage with generators: 100.0%

TRIP GENERATOR SOURCES IN LARGE BUILDINGS

Generators by Source:
  osm_poi: 76.0 generators, 36,834,589 total sqft
  building_inferred: 28.0 generators, 39,320,843 total sqft
  inferred_remaining: 16.0 generators, 22,989,471 total sqft

LAND USE BREAKDOWN FOR LARGE BUILDINGS

Top 15 Land Use Types by Square Footage:
  residential: 34.0 generators, 46,947,439 sqft
  hospital: 6.0 generators, 12,090,501 sqft
  kindergarten: 3.0 generators, 6,620,200 sqft
  office: 3.0 generators, 5,506,584 sqft
  warehouse: 3.0 generators, 3,523,302 sqft
  industrial: 2.0 generators, 3,089,801 sqft
  department_store: 4.0 generators, 2,967,587 sqft
  estate_agent: 1.0 generators, 2,798,366 sqft
  school: 2.0 generators, 2,295,773 sqft
  museum: 1.0 generators, 

In [18]:
# Create Interactive Map of Flagged Buildings

if buildings_ready and len(warning_buildings) > 0:
    print(f"Creating interactive map of {len(warning_buildings):,} flagged buildings...")
    print("Click on buildings to see details")
    print("-" * 60)

    # Centre the map on the midpoint of the flagged buildings' bounding box.
    # total_bounds is ordered (minx, miny, maxx, maxy) per the geopandas docs,
    # while folium wants [lat, lon].
    bounds = warning_buildings.total_bounds
    west, south, east, north = bounds
    map_center = [(south + north) / 2, (west + east) / 2]

    # Base map
    map_options = {
        'location': map_center,
        'zoom_start': 12,
        'tiles': "CartoDB positron",
        'width': '100%',
        'height': '600px',
    }
    warning_map = folium.Map(**map_options)
    
    # Color code by warning severity
    def get_warning_color(flags):
        """Map a building's warning flags to a fill colour (hex string).

        ``flags`` is stringified and searched for known warning codes:
        red if it mentions ``very_large_sqft`` or ``high_floors``, orange if
        it mentions ``large_sqft``, yellow for anything else (including
        missing values).
        """
        text = str(flags)
        severe_codes = ('very_large_sqft', 'high_floors')
        if any(code in text for code in severe_codes):
            return '#ff0000'  # Red for severe warnings
        if 'large_sqft' in text:
            return '#ff8800'  # Orange for moderate warnings
        return '#ffff00'  # Yellow for other warnings
    
    warning_buildings['warning_color'] = warning_buildings['warning_flags'].apply(get_warning_color)

    # Prepare POI data for each building.
    # IDs are compared in string form so both layers agree regardless of the
    # dtype each file was read with.
    warning_building_ids = set(warning_buildings['building_id'].astype(str))
    trip_gen_building_keys = trip_generators_gdf['building_id'].astype(str)
    in_flagged_building = trip_gen_building_keys.isin(warning_building_ids)
    trip_gen_for_map = trip_generators_gdf[in_flagged_building]

    # Aggregate POI data by building in one groupby pass instead of
    # re-filtering (and re-casting) the whole generator frame per building.
    # Grouping by a plain array keeps the keys positional, and each key is
    # the string ID that create_tooltip looks up.
    poi_summary_by_building = {}
    pois_by_building = trip_gen_for_map.groupby(
        trip_gen_building_keys[in_flagged_building].to_numpy(), sort=False
    )
    for building_id, building_pois in pois_by_building:
        # Generator count and total sqft per land use type
        land_use_summary = building_pois.groupby('land_use_type').agg({
            'sqft': ['count', 'sum']
        }).round(0)

        # Total sqft per generator source
        source_summary = building_pois.groupby('source')['sqft'].sum().round(0)

        poi_summary_by_building[building_id] = {
            'total_generators': len(building_pois),
            'total_poi_sqft': building_pois['sqft'].sum(),
            'land_use_types': land_use_summary,
            'sources': source_summary
        }
    
    # Add warning details to popup
    def _build_warning_popup(row):
        """Return the click-popup HTML for one flagged building.

        Tolerates data gaps that would otherwise abort the whole map: a
        missing or null ``building`` / ``footprint_sqft`` value and null
        ``warning_flags``. Text taken from the data is HTML-escaped before
        being embedded in the markup.
        """
        building_type = row.get('building')
        building_type = 'unknown' if pd.isna(building_type) else html.escape(str(building_type))

        footprint = row.get('footprint_sqft')
        footprint_text = f'{footprint:,.0f} sqft' if pd.notna(footprint) else 'n/a'

        # Split on ';' before escaping so the entity terminators that
        # html.escape adds are never mistaken for flag separators.
        raw_flags = row.get('warning_flags')
        flag_labels = [] if pd.isna(raw_flags) else [
            html.escape(flag.strip().replace('_', ' ').title())
            for flag in str(raw_flags).split(';')
            if flag.strip()
        ]
        warnings_html = '<br>• '.join(flag_labels) if flag_labels else 'Unspecified'

        return f'''
        <div style="font-size: 14px; width: 250px;">
        <b>Building ID:</b> {html.escape(str(row.building_id))}<br>
        <b>Type:</b> {building_type}<br>
        <b>Total SqFt:</b> {row.total_sqft:,.0f}<br>
        <b>Floors:</b> {row.estimated_floors}<br>
        <b>Footprint:</b> {footprint_text}<br>
        <hr>
        <b style="color:red">WARNINGS:</b><br>
        • {warnings_html}
        </div>
        '''

    warning_buildings['warning_popup'] = warning_buildings.apply(_build_warning_popup, axis=1)
    
    # Create comprehensive tooltip showing all attributes including POI data
    def create_tooltip(row, poi_summary=None):
        """Build the HTML hover tooltip for one flagged building.

        Parameters
        ----------
        row : pandas.Series
            One building row. Every non-null attribute except the geometry
            and the map helper columns is listed; ``building_id`` and
            ``total_sqft`` are also read for the trip-generator section.
        poi_summary : dict, optional
            Maps ``str(building_id)`` to a dict with the keys
            ``total_generators``, ``total_poi_sqft``, ``land_use_types`` and
            ``sources``. Defaults to the notebook-level
            ``poi_summary_by_building``.

        Returns
        -------
        str
            HTML fragment. Attribute names and text values taken from the
            data are HTML-escaped.
        """
        if poi_summary is None:
            poi_summary = poi_summary_by_building

        # Skip geometry and popup columns in tooltip
        skip_cols = {'geometry', 'warning_popup', 'warning_color'}
        parts = [
            '<div style="font-size: 12px; max-width: 500px;">',
            '<b style="font-size: 14px;">Building Details</b><br><hr style="margin: 3px 0;">',
            '<div style="margin-bottom: 8px;">',
        ]

        # Add building attributes
        for col in row.index:
            if col in skip_cols or not pd.notna(row[col]):
                continue

            col_key = str(col)
            col_name = html.escape(col_key.replace('_', ' ').title())

            # Format value based on type
            val = row[col]
            if isinstance(val, bool):
                # bool is a subclass of int; format it as text so a flag
                # shows "True" rather than "1".
                val_str = str(val)
            elif isinstance(val, (int, float)):
                if 'sqft' in col_key.lower() or 'area' in col_key.lower():
                    val_str = f'{val:,.0f}'
                elif isinstance(val, float):
                    val_str = f'{val:,.2f}'
                else:
                    val_str = f'{val:,}'
            else:
                val_str = html.escape(str(val))

            # Highlight warning-related fields
            if col == 'warning_flags':
                parts.append(f'<b style="color: red;">{col_name}:</b> {val_str}<br>')
            elif col == 'has_warning':
                parts.append(f'<b>{col_name}:</b> <span style="color: red;">{val_str}</span><br>')
            else:
                parts.append(f'<b>{col_name}:</b> {val_str}<br>')
        parts.append('</div>')

        # Add POI/Trip Generator information
        poi_data = poi_summary.get(str(row.building_id))
        if poi_data is not None:
            parts.append('<hr style="margin: 5px 0;">')
            parts.append('<b style="font-size: 13px; color: #0066cc;">Trip Generators (POIs)</b><br>')
            parts.append(f'<b>Total Generators:</b> {poi_data["total_generators"]:,}<br>')
            parts.append(f'<b>Total POI SqFt:</b> {poi_data["total_poi_sqft"]:,.0f}<br>')

            # Coverage is only meaningful for a positive, known building area.
            total_sqft = row.get('total_sqft')
            if pd.notna(total_sqft) and total_sqft > 0:
                coverage = f'{poi_data["total_poi_sqft"] / total_sqft * 100:.1f}%'
            else:
                coverage = 'n/a'
            parts.append(f'<b>Coverage:</b> {coverage}<br>')

            # Add source breakdown
            if len(poi_data['sources']) > 0:
                parts.append('<br><b>By Source:</b><br>')
                for source, sqft in poi_data['sources'].items():
                    parts.append(f'  • {html.escape(str(source))}: {sqft:,.0f} sqft<br>')

            # Add top land use types (five largest by sqft)
            if len(poi_data['land_use_types']) > 0:
                parts.append('<br><b>Top Land Uses:</b><br>')
                land_use_sorted = poi_data['land_use_types'].sort_values(('sqft', 'sum'), ascending=False).head(5)
                for land_use, data in land_use_sorted.iterrows():
                    count = int(data[('sqft', 'count')])
                    total = data[('sqft', 'sum')]
                    parts.append(f'  • {html.escape(str(land_use))}: {count} ({total:,.0f} sqft)<br>')
        else:
            parts.append('<hr style="margin: 5px 0;">')
            parts.append('<b style="font-size: 13px; color: #999;">No POI data found</b><br>')
            parts.append('<i style="color: #666;">Building likely has inferred residential generators</i><br>')

        parts.append('</div>')
        return ''.join(parts)
    
    # Collect every flagged footprint in a single feature group, then attach
    # the group to the map in one go.
    feature_group = folium.FeatureGroup(name='Warning Buildings')

    def _footprint_style(fill_color):
        """Return a folium style function that paints a footprint with ``fill_color``."""
        def _style(_feature):
            return {
                'fillColor': fill_color,
                'color': 'black',
                'weight': 2,
                'fillOpacity': 0.8,
            }
        return _style

    for idx, row in warning_buildings.iterrows():
        building_popup = folium.Popup(row['warning_popup'], max_width=300)
        building_tooltip = folium.Tooltip(create_tooltip(row), sticky=True)
        building_layer = folium.GeoJson(
            row.geometry,
            style_function=_footprint_style(row['warning_color']),
            popup=building_popup,
            tooltip=building_tooltip,
        )
        building_layer.add_to(feature_group)

    feature_group.add_to(warning_map)
    
    # Add legend
    legend_html = '''
    <div style='position: fixed; 
                bottom: 30px; 
                left: 10px; 
                z-index: 9999; 
                background: white; 
                padding: 10px; 
                border: 2px solid #ccc; 
                border-radius: 5px; 
                font-size: 14px;
                box-shadow: 2px 2px 5px rgba(0,0,0,0.3);'>
    <b>Warning Severity</b><br>
    <div style='margin-top: 5px;'>
        <span style='display:inline-block;width:15px;height:15px;background:#ff0000;margin-right:8px;border:1px solid black;'></span>
        Severe (>5M sqft or >100 floors)
    </div>
    <div style='margin-top: 3px;'>
        <span style='display:inline-block;width:15px;height:15px;background:#ff8800;margin-right:8px;border:1px solid black;'></span>
        Moderate (>1M sqft)
    </div>
    </div>
    '''
    warning_map.get_root().html.add_child(Element(legend_html))
    
    # Add layer control
    folium.LayerControl().add_to(warning_map)
    
    # Display the map - this must be the last expression evaluated
    display(warning_map)
else:
    print("No map to display - no warning buildings found or data not loaded")

Creating interactive map of 63 flagged buildings...
Click on buildings to see details
------------------------------------------------------------
