In [1]:
import ee
import osmnx as ox
import geopandas as gpd
import geemap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from shapely.geometry import Polygon, MultiPolygon
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Earth Engine project id handed to ee.Initialize below.
project_id = 'bengaluru-lakes-485612'

# First try to initialize directly; only when that raises do we run the
# ee.Authenticate() flow and initialize a second time.
try:
    ee.Initialize(project=project_id)
except Exception:
    ee.Authenticate()
    ee.Initialize(project=project_id)
else:
    print("Successfully initialized!")

Successfully initialized!


In [3]:
# Place name that osmnx resolves to the search area
place_name = 'Bengaluru, Karnataka, India'

# OSM tag filter passed to osmnx.
# NOTE(review): osmnx appears to treat a multi-key tag dict as a union
# (natural=water OR water=lake), so ponds/tanks come back too — confirm.
tags = {'natural': 'water', 'water': 'lake'}

print(f'Searching for lakes in {place_name}...')
try:
    # Download every OSM feature matching the tag filter inside the place
    lakes_gdf = ox.features_from_place(place_name, tags)

    # Only areal features can be used as lake outlines
    is_areal = lakes_gdf.geometry.geom_type.isin(['Polygon', 'MultiPolygon'])
    lakes_gdf = lakes_gdf[is_areal]

    # Reduce to the two columns used downstream; unnamed features are discarded
    lakes_gdf = lakes_gdf[['name', 'geometry']].dropna(subset=['name'])

    print(f'Found {len(lakes_gdf)} lakes with names.')
    display(lakes_gdf.head())
except Exception as err:
    print(f'Error retrieving lakes: {err}')


Searching for lakes in Bengaluru, Karnataka, India...
Found 202 lakes with names.


Unnamed: 0_level_0,Unnamed: 1_level_0,name,geometry
element,id,Unnamed: 2_level_1,Unnamed: 3_level_1
relation,1332093,NCBS Pond,"POLYGON ((77.5791 13.07125, 77.57909 13.07121,..."
relation,1853330,Vengayyana Lake,"POLYGON ((77.70218 13.01708, 77.70235 13.017, ..."
relation,1857615,Halasuru lake,"POLYGON ((77.62261 12.98202, 77.6227 12.98193,..."
relation,2310400,Chelekere,"POLYGON ((77.64527 13.02519, 77.64512 13.02543..."
relation,2310417,Madiwala Lake,"MULTIPOLYGON (((77.61159 12.90261, 77.61165 12..."


In [5]:
def get_lake_area_fast(lake_row, start_year=2020, end_year=2025):
    """Compute yearly open-water and weed-covered area for one lake on Earth Engine.

    For every year in ``start_year..end_year`` a cloud-filtered median composite
    is built (Sentinel-2 SR for years >= 2017, Landsat 8 C2 L2 otherwise). MNDWI
    and NDVI are thresholded into water / weed masks and the masked pixel areas
    are summed twice: inside the lake polygon and inside a 50 m buffer of it.

    Args:
        lake_row: One row of the notebook-level ``lakes_gdf``; must provide a
            ``name`` value and a shapely ``geometry``.
        start_year: First calendar year to process (inclusive).
        end_year: Last calendar year to process (inclusive).

    Returns:
        pandas.DataFrame with one row per year and the columns ``name``,
        ``year``, ``lat``, ``lon``, ``potential_ha``, ``static_water_ha``,
        ``static_weed_ha``, ``static_total_ha``, ``buffer_water_ha``,
        ``buffer_weed_ha`` and ``buffer_total_ha`` (areas in hectares).

    Raises:
        Any exception raised by the single ``getInfo()`` round trip (or by the
        geometry conversion) is propagated to the caller.
    """
    lake_name = lake_row['name']
    print(f"processing {lake_name}...")
    # Centroid in the GeoDataFrame's own coordinates; computed once and reused.
    centroid = lake_row.geometry.centroid
    lon, lat = centroid.x, centroid.y

    # Pre-process geometries
    lake_geom_ee = geemap.gdf_to_ee(gpd.GeoDataFrame([lake_row], crs=lakes_gdf.crs))
    strict_boundary = lake_geom_ee.geometry()
    buffered_boundary = strict_boundary.buffer(50)

    # CALCULATE POTENTIAL CAPACITY (static area of the polygon):
    # sum of per-pixel areas (m2) over the lake polygon.
    potential_area_m2 = ee.Image.pixelArea().reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=strict_boundary,
        scale=10,  # Sentinel resolution for precision
        maxPixels=1e9
    ).get('area')

    # Convert to ha on the server side
    potential_ha = ee.Number(potential_area_m2).divide(10000)

    years = ee.List.sequence(start_year, end_year)

    def scale_landsat_sr(img):
        # Landsat Collection 2 Level-2 reflectance is stored as scaled integers
        # (reflectance = DN * 0.0000275 - 0.2). Without removing the additive
        # offset, normalized differences are compressed towards zero and the
        # NDVI / MNDWI thresholds below do not mean what they mean for Sentinel-2.
        # Clamping keeps slightly negative reflectances (dark water) from being
        # masked out by normalizedDifference, which masks negative inputs.
        scaled = img.multiply(0.0000275).add(-0.2).clamp(0, 1)
        return scaled.copyProperties(img, ['system:time_start'])

    def calculate_annual_stats(year):
        year = ee.Number(year)

        # Both candidate collections are described lazily; only the selected one
        # is evaluated. The month range 11..2 wraps around, so together with the
        # single-year filter it keeps Jan, Feb, Nov and Dec of that calendar year.
        s2_col = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
                  .filterBounds(buffered_boundary)
                  .filter(ee.Filter.calendarRange(year, year, 'year'))
                  .filter(ee.Filter.calendarRange(11, 2, 'month'))
                  .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
                  .select(['B3', 'B11', 'B8', 'B4'], ['Green', 'SWIR1', 'NIR', 'Red']))

        l8_col = (ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')
                  .filterBounds(buffered_boundary)
                  .filter(ee.Filter.calendarRange(year, year, 'year'))
                  .filter(ee.Filter.calendarRange(11, 2, 'month'))
                  .filter(ee.Filter.lt('CLOUD_COVER', 20))
                  .select(['SR_B3', 'SR_B6', 'SR_B5', 'SR_B4'], ['Green', 'SWIR1', 'NIR', 'Red'])
                  .map(scale_landsat_sr))

        # Sentinel-2 (10 m) from 2017 onwards, Landsat 8 (30 m) before that
        collection = ee.ImageCollection(ee.Algorithms.If(year.gte(2017), s2_col, l8_col))
        scale = ee.Number(ee.Algorithms.If(year.gte(2017), 10, 30))

        image = collection.median()

        # Indices
        mndwi = image.normalizedDifference(['Green', 'SWIR1']).rename('water')
        ndvi = image.normalizedDifference(['NIR', 'Red']).rename('weed')

        # Masks: open water, vegetated-but-damp pixels (weeds), and their union
        water_mask = mndwi.gt(0)
        weed_mask = ndvi.gt(0.4).And(mndwi.gt(-0.5))
        combined_mask = water_mask.Or(weed_mask).rename('total')

        # Area image: each 0/1 mask band multiplied by the pixel area in m2
        area_img = ee.Image.cat([water_mask, weed_mask, combined_mask]).multiply(ee.Image.pixelArea())

        # REDUCTION 1: Static Boundary
        stats_static = area_img.reduceRegion(
            reducer=ee.Reducer.sum(),
            geometry=strict_boundary,
            scale=scale,
            maxPixels=1e9
        )

        # REDUCTION 2: Buffered Boundary (50m)
        stats_buffer = area_img.reduceRegion(
            reducer=ee.Reducer.sum(),
            geometry=buffered_boundary,
            scale=scale,
            maxPixels=1e9
        )

        # Helper: read a band sum (0 when the key is absent) and convert m2 -> ha
        def get_ha(stats, key):
            return ee.Number(stats.get(key, 0)).divide(10000)

        return ee.Feature(None, {
            'name': lake_name,
            'year': year,
            'lat': lat,
            'lon': lon,
            'potential_ha': potential_ha,  # This stays constant for all years of a lake
            # Static Results
            'static_water_ha': get_ha(stats_static, 'water'),
            'static_weed_ha': get_ha(stats_static, 'weed'),
            'static_total_ha': get_ha(stats_static, 'total'),
            # Buffered Results
            'buffer_water_ha': get_ha(stats_buffer, 'water'),
            'buffer_weed_ha': get_ha(stats_buffer, 'weed'),
            'buffer_total_ha': get_ha(stats_buffer, 'total')
        })

    # One server round trip for all years of this lake
    annual_features = ee.FeatureCollection(years.map(calculate_annual_stats))
    return pd.DataFrame([f['properties'] for f in annual_features.getInfo()['features']])

In [6]:
import os

# 1. Ensure the output directory exists (no error if it is already there)
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

# 2. Per-lake result frames, plus the names of lakes that could not be processed
all_lakes_results = []
failed_lakes = []

print(f"Starting batch processing for {len(lakes_gdf)} lakes...")

# 3. Loop through every lake in lakes_gdf
for idx in range(len(lakes_gdf)):
    lake_row = lakes_gdf.iloc[idx]

    try:
        # Call the fast server-side function
        # Using 2020-2025 as the default range
        df_lake = get_lake_area_fast(lake_row, start_year=2020, end_year=2025)

        # Add to our collection
        all_lakes_results.append(df_lake)

    except Exception as e:
        # Best effort: one failing lake must not abort the whole batch
        failed_lakes.append(lake_row['name'])
        print(f"Error processing {lake_row['name']} (Index {idx}): {e}")

print("-" * 30)
if all_lakes_results:
    # 4. Concatenate all individual DataFrames into one master table
    master_df = pd.concat(all_lakes_results, ignore_index=True)

    # 5. Save the final result to the data folder
    csv_filename = os.path.join(output_dir, 'bengaluru_lakes_master_2020_2025.csv')
    master_df.to_csv(csv_filename, index=False)

    print("Processing Complete!")
    print(f"Total rows generated: {len(master_df)}")
    print(f"File saved to: {csv_filename}")
else:
    # pd.concat raises ValueError on an empty list, so report instead of crashing
    print("No lake was processed successfully; nothing was saved.")

if failed_lakes:
    print(f"Lakes skipped because of errors: {len(failed_lakes)}")

Starting batch processing for 202 lakes...
processing NCBS Pond...
processing Vengayyana Lake...
processing Halasuru lake...
processing Chelekere...
processing Madiwala Lake...
processing Iblur Lake...
processing Benniganahalli Lake...
processing Puttenhalli Lake...
processing Mathikere Lake...
processing Anchepalya Lake...
processing Sankey Tank...
processing Rachenahalli Lake...
processing Chinnappanahalli Lake...
processing Herohalli Kere...
processing Kaikondrahalli Lake...
processing Lal Bahadur Shastri Nagar Lake...
processing Agara Lake...
processing Hebbal Lake...
processing Sarakki Kere...
processing Jakkur Lake...
processing Yediyur Lake...
processing Seegehalli Lake...
processing Yelahanka Lake...
processing Hemmigepura Kere...
processing Thubarahalli Lake...
processing Agrahara Lake...
processing Puttenahalli Lake...
processing Attur Lake...
processing Allalasandra Lake...
processing Kodigehalli Lake...
processing Narsipura Lake...
processing Dasarahalli Tank...
processing 

In [7]:
def get_land_cover_stats(lake_row, start_year=2020, end_year=2025):
    """Measure yearly green cover and built-up area in two 50 m rings of a lake.

    The rings are the strip between the lake boundary and its -50 m buffer
    ("in") and the strip between the boundary and its +50 m buffer ("out").
    For each year a Sentinel-2 median composite is classified with NDVI / NDBI
    thresholds and the classified pixel areas are summed per ring.

    Args:
        lake_row: One row of the notebook-level ``lakes_gdf`` with ``name`` and
            ``geometry``.
        start_year: First calendar year (inclusive).
        end_year: Last calendar year (inclusive).

    Returns:
        pandas.DataFrame with one row per year and the columns ``name``,
        ``year``, ``in_green_ha``, ``in_build_ha``, ``out_green_ha`` and
        ``out_build_ha`` (hectares).
    """
    lake_name = lake_row['name']
    print(f"processing {lake_name}...")

    # Lake outline as an Earth Engine geometry
    single_lake_gdf = gpd.GeoDataFrame([lake_row], crs=lakes_gdf.crs)
    boundary = geemap.gdf_to_ee(single_lake_gdf).geometry()

    # Grow / shrink the outline by 50 m to build the two rings
    grown = boundary.buffer(50)
    shrunk = boundary.buffer(-50)
    ring_inside = boundary.difference(shrunk)    # shoreline strip within the lake
    ring_outside = grown.difference(boundary)    # strip just beyond the lake

    def summarize_year(year):
        year = ee.Number(year)

        # Median composite of the year's low-cloud Sentinel-2 scenes
        composite = (
            ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
            .filterBounds(grown)
            .filter(ee.Filter.calendarRange(year, year, 'year'))
            .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10))
            .median()
        )

        # NDVI = (NIR - Red) / (NIR + Red); NDBI = (SWIR - NIR) / (SWIR + NIR)
        ndvi = composite.normalizedDifference(['B8', 'B4']).rename('ndvi')
        ndbi = composite.normalizedDifference(['B11', 'B8']).rename('ndbi')

        # Threshold classification, then turn each 0/1 band into m2 per pixel
        is_green = ndvi.gt(0.4).rename('green')
        is_built = ndbi.gt(0.0).And(ndvi.lt(0.2)).rename('buildings')
        class_area = ee.Image.cat([is_green, is_built]).multiply(ee.Image.pixelArea())

        def ring_sums(ring):
            return class_area.reduceRegion(
                reducer=ee.Reducer.sum(),
                geometry=ring,
                scale=10,
                maxPixels=1e9
            )

        def to_ha(sums, band):
            # 0 is used when the band is missing from the reducer output
            return ee.Number(sums.get(band, 0)).divide(10000)

        sums_in = ring_sums(ring_inside)
        sums_out = ring_sums(ring_outside)

        properties = {
            'name': lake_name,
            'year': year,
            'in_green_ha': to_ha(sums_in, 'green'),
            'in_build_ha': to_ha(sums_in, 'buildings'),
            'out_green_ha': to_ha(sums_out, 'green'),
            'out_build_ha': to_ha(sums_out, 'buildings'),
        }
        return ee.Feature(None, properties)

    year_list = ee.List.sequence(start_year, end_year)
    yearly_fc = ee.FeatureCollection(year_list.map(summarize_year))
    records = [feature['properties'] for feature in yearly_fc.getInfo()['features']]
    return pd.DataFrame(records)

In [8]:
# Per-lake land-cover frames collected by the loop below
all_land_results = []

for position in range(len(lakes_gdf)):
    lake_row = lakes_gdf.iloc[position]
    try:
        df_land = get_land_cover_stats(lake_row)
    except Exception as err:
        # A failing lake is reported and left out; the batch keeps going
        print(f"Skipped {lake_row['name']}: {err}")
    else:
        all_land_results.append(df_land)

# Write the combined land-cover table only when at least one lake succeeded
if len(all_land_results) > 0:
    master_land_df = pd.concat(all_land_results, ignore_index=True)
    master_land_df.to_csv('data/bengaluru_lakes_land_cover.csv', index=False)

processing NCBS Pond...
processing Vengayyana Lake...
processing Halasuru lake...
processing Chelekere...
processing Madiwala Lake...
processing Iblur Lake...
processing Benniganahalli Lake...
processing Puttenhalli Lake...
processing Mathikere Lake...
processing Anchepalya Lake...
processing Sankey Tank...
processing Rachenahalli Lake...
processing Chinnappanahalli Lake...
processing Herohalli Kere...
processing Kaikondrahalli Lake...
processing Lal Bahadur Shastri Nagar Lake...
processing Agara Lake...
processing Hebbal Lake...
processing Sarakki Kere...
processing Jakkur Lake...
processing Yediyur Lake...
processing Seegehalli Lake...
processing Yelahanka Lake...
processing Hemmigepura Kere...
processing Thubarahalli Lake...
processing Agrahara Lake...
processing Puttenahalli Lake...
processing Attur Lake...
processing Allalasandra Lake...
processing Kodigehalli Lake...
processing Narsipura Lake...
processing Dasarahalli Tank...
processing Subramanyapura Lake...
processing Panathuru

In [9]:
# Inputs: water/weed areas per lake-year and land cover per lake-year
df_master = pd.read_csv('data/bengaluru_lakes_master_2020_2025.csv')
df_land = pd.read_csv('data/bengaluru_lakes_land_cover.csv')

# Join the two tables on lake name and year, keeping only pairs present in both.
# NOTE(review): `name` is presumably not unique across lakes, which would explain
# why duplicates have to be dropped afterwards — confirm against the inputs.
df_combined = df_master.merge(df_land, how='inner', on=['name', 'year'])
df_combined.to_csv('data/bengaluru_lakes_combined_data_2020_2025.csv', index=False)

# Re-read the consolidated file and keep the first row of every (name, year) pair
df = pd.read_csv('data/bengaluru_lakes_combined_data_2020_2025.csv')
df_cleaned = df.drop_duplicates(keep='first', subset=['name', 'year'])
df_cleaned.to_csv('data/bengaluru_lakes.csv', index=False)

print(f"Cleaned dataset saved. Remaining rows: {len(df_cleaned)}")

Cleaned dataset saved. Remaining rows: 1086


In [10]:
# Column order used for the cleaned per-year table
ordered_columns = [
    'name', 'lat', 'lon', 'year', 'potential_ha',
    'static_total_ha', 'static_water_ha', 'static_weed_ha',
    'buffer_total_ha', 'buffer_water_ha', 'buffer_weed_ha',
    'in_build_ha', 'out_build_ha', 'in_green_ha', 'out_green_ha',
]

# Largest lakes first (latest year first within a lake size), then drop
# polygons of 0.5 ha or less
df = (
    pd.read_csv('data/bengaluru_lakes.csv')[ordered_columns]
    .sort_values(['potential_ha', 'year'], ascending=False)
    .loc[lambda frame: frame['potential_ha'] > 0.5]
)

# The index is written as well (no index=False), so the file gains an unnamed
# leading column
df.to_csv('data/bengaluru_lakes_cleaned_gt_0.5ha.csv')

# Average every numeric column over the years of each lake
df_mean = (
    df.groupby(['name', 'lat', 'lon'])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by='in_build_ha', ascending=False)
    .drop(columns=['year'])
)

# Built-up area in the inner 50 m ring as a percentage of the polygon area
df_mean['encroachment_pct'] = df_mean['in_build_ha'].div(df_mean['potential_ha']).mul(100)
df_mean = df_mean.sort_values(by='encroachment_pct', ascending=False)

df_mean.to_csv('data/bengaluru_lakes_mean.csv')
df_mean.head()

Unnamed: 0,name,lat,lon,potential_ha,static_total_ha,static_water_ha,static_weed_ha,buffer_total_ha,buffer_water_ha,buffer_weed_ha,in_build_ha,out_build_ha,in_green_ha,out_green_ha,encroachment_pct
32,Chikkabettahalli Lake,13.091797,77.554721,1.863068,0.521525,0.495941,0.025583,0.912038,0.616391,0.295648,1.523316,2.523841,0.025976,0.290142,81.763834
119,Panathur Chikka Kere,12.93147,77.708637,0.904825,0.473972,0.469141,0.006389,0.852702,0.485996,0.36832,0.734791,0.989899,0.002957,0.509506,81.208109
35,Chokkanahalli Lake,13.084613,77.629221,2.02232,0.455899,0.414647,0.041252,1.028474,0.417931,0.610543,1.526276,0.954972,0.054326,1.146548,75.471547
120,Panathuru Kere,12.931616,77.706855,4.14942,2.738649,2.690679,0.052478,4.448214,2.743214,1.711458,2.83831,1.195444,0.034464,2.042395,68.402574
95,Krishna Nagara Lake,12.874035,77.579749,4.421779,2.770499,2.670207,0.100292,4.383426,2.693766,1.68966,2.799416,0.941187,0.134707,3.034444,63.309715


In [None]:
import ee
import geemap

# 1. Setup
#Initialize Project

def map_lake_health(lake_name, start_year='2020', end_year='2025'):
    """Build an interactive map of spectral indices clipped to one lake.

    The lake outline is looked up in OSM as ``"<lake_name>, Bengaluru"``. A
    Sentinel-2 median composite for the requested period is clipped to it and
    shown as NDBI, refined NDBI (water and vegetation masked out), NDTI, NDVI
    and MNDWI layers together with the outline itself.

    Args:
        lake_name: Lake name as it should be searched in OSM.
        start_year: First year of the composite; interpolated into
            ``'<start_year>-01-01'``.
        end_year: Last year of the composite; interpolated into
            ``'<end_year>-12-31'``.

    Returns:
        A ``geemap.Map`` with the layers added, or ``None`` when the boundary
        lookup fails.
    """
    # Fetch actual irregular boundary from OSM
    try:
        lake_feature = geemap.osm_to_ee(f"{lake_name}, Bengaluru")
        lake_boundary = lake_feature.geometry()
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # cell, and show the real cause next to the hint.
        print(f"Boundary for {lake_name} not found. Check spelling.")
        print(f"  Underlying error: {exc}")
        return None

    # Load Sentinel-2 and clip to the JAGGED boundary
    s2_img = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
              .filterBounds(lake_boundary)
              .filterDate(f'{start_year}-01-01', f'{end_year}-12-31')
              .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10))
              .median()
              .clip(lake_boundary))

    # Calculate Indices
    mndwi = s2_img.normalizedDifference(['B3', 'B11']).rename('Water')
    ndvi = s2_img.normalizedDifference(['B8', 'B4']).rename('Vegetation')
    ndbi = s2_img.normalizedDifference(['B11', 'B8']).rename('Buildings')

    # NDTI (Turbidity/Silt Index)
    # High values = Muddy water / Silt deposits
    ndti = s2_img.normalizedDifference(['B4', 'B3']).rename('Silt')

    # Create a mask to remove water and vegetation
    non_water_mask = mndwi.lt(0)  # Only areas that are NOT water
    non_veg_mask = ndvi.lt(0.2)   # Only areas that are NOT healthy plants

    # Calculate the refined Built-up Index
    refined_encroachment = ndbi.updateMask(non_water_mask).updateMask(non_veg_mask)

    # Visualize
    Map = geemap.Map()
    Map.add_basemap('SATELLITE')
    Map.centerObject(lake_boundary, 17)

    # Add Layers (later layers are drawn on top of earlier ones)
    Map.addLayer(ndbi, {'min': -0.1, 'max': 0.3, 'palette': ['white', 'red']}, 'Encroachment (Red)')
    Map.addLayer(refined_encroachment, {'min': 0, 'max': 0.4, 'palette': ['white', 'darkred']}, 'Refined Encroachment')
    Map.addLayer(ndti, {'min': -0.1, 'max': 0.2, 'palette': ['white', 'brown']}, '4. Silt/Turbidity (Brown)')
    Map.addLayer(ndvi, {'min': 0, 'max': 0.6, 'palette': ['white', 'green']}, 'Weeds (Green)')
    Map.addLayer(mndwi, {'min': -0.5, 'max': 0.2, 'palette': ['white', 'blue']}, 'Water (Blue)')
    Map.addLayer(lake_boundary, {'color': 'yellow'}, 'Legal Boundary')

    Map.add_layer_control()
    return Map

# --- EXECUTION ---
# Just change the name here to visualize any lake!
# map_lake_health returns None when the OSM boundary lookup fails; in that case
# this cell renders nothing instead of a map.
my_lake_map = map_lake_health("Panathur Chikka Kere")
my_lake_map

---
---

## Slope and Elevation
This script performs a **lake-specific topographic analysis** using **exact lake boundary geometries** (not circular proxies). It converts lake polygons stored in a CSV into **Earth Engine FeatureCollections**, then extracts **mean elevation and mean slope** within each lake’s true spatial footprint.

---

#### Step 1: Preparing Lake Attributes and Boundaries
- Two CSV files are loaded:
  - One containing **lake attributes** (names, metadata).
  - Another containing **exact lake boundary geometries** stored as **WKT (Well-Known Text)**.
- Duplicate lake names in the boundary file are removed to ensure a **one-to-one join**.
- The attribute table and boundary table are **merged on lake name**, attaching polygon geometry to each lake record.

**Key idea:** this step upgrades the analysis from *approximate circular buffers* to **true lake outlines**.

---

#### Step 2: Converting WKT Geometries into Earth Engine Features
- Each lake’s geometry is read from the WKT string using **Shapely**.
- Two cases are handled explicitly:
  - **Polygon** → a single contiguous lake boundary.
  - **MultiPolygon** → lakes with multiple disconnected basins or islands.
- Coordinates are extracted from Shapely objects and converted into:
  - **ee.Geometry.Polygon** or
  - **ee.Geometry.MultiPolygon**
- Each geometry is wrapped as an **ee.Feature** with the lake name as metadata.
- All features are combined into a single **ee.FeatureCollection**.

**Key idea:** Earth Engine cannot read WKT directly, so this step bridges **local vector geometry** → **cloud-based geospatial analysis**.

---

#### Step 3: Preparing Topographic Layers
- **SRTM elevation data** (~30 m resolution) is loaded.
- Two topographic variables are derived:
  - **Elevation** → absolute height of the lake basin.
  - **Slope** → steepness of terrain inside the lake footprint.
- These layers are stacked into a single **multi-band image** for efficient processing.

**Key idea:** elevation gives **hydrological position**, slope gives **geomorphic integrity**.

---

#### Step 4: Extracting Mean Topography within Exact Lake Boundaries
- `reduceRegions` is used with the **lake polygon FeatureCollection**.
- For each lake polygon:
  - Mean **elevation** is computed.
  - Mean **slope** is computed.
- Extraction is done at **30 m scale**, matching the SRTM resolution.

**Key idea:** statistics are computed **only inside the real lake boundaries**, not across buffers or surrounding land.

---

#### Step 5: Exporting Results
- The Earth Engine results are converted into a **Pandas DataFrame**.
- The final table is saved as `lake_slope_elevation.csv`.

Each row in the output represents:
- one lake
- its **mean elevation**
- its **mean slope**

---

#### Conceptual Significance
This workflow measures **how flat or bowl-shaped each lake actually is**, using its **true spatial extent**. Flat, low-slope lakes are more likely to be **filled, encroached, or hydrologically compromised**, while steeper basins indicate **better-preserved lake morphology**.

---

#### One-line takeaway
We are extracting **physically meaningful topographic indicators** (slope and elevation) **directly from exact lake boundaries**, enabling robust analysis of lake degradation and flood vulnerability.


In [12]:
import ee
import geemap
import pandas as pd
from shapely import wkt # To parse the WKT geometry
from shapely.geometry import Polygon, MultiPolygon

# 1. Attach the WKT boundary of each lake to its attribute row
df = pd.read_csv('data/bengaluru_lakes_mean.csv')
df_boundary = pd.read_csv('data/lake_polygon_boundaries.csv')
df_boundary = df_boundary.drop_duplicates(subset='name')  # one boundary per name
df = df.merge(df_boundary, on = 'name', how = 'left')


def _polygon_rings(polygon):
    """Return the rings of a shapely Polygon as [exterior, hole_1, hole_2, ...].

    Interior rings (islands) are kept so that statistics are taken over the
    water footprint only, not over land enclosed by the outline.
    """
    return [list(polygon.exterior.coords)] + [list(ring.coords) for ring in polygon.interiors]


# 2. Convert Pandas DataFrame (with geometries) to EE FeatureCollection
features = []
skipped_lakes = []
for i, row in df.iterrows():
    # Lakes without a boundary after the left join have nothing to convert
    if pd.isnull(row['geometry']):
        continue

    # Parse the WKT string
    poly = wkt.loads(row['geometry'])

    if poly.is_empty:
        skipped_lakes.append(row['name'])
        continue

    if isinstance(poly, Polygon):
        geom = ee.Geometry.Polygon(_polygon_rings(poly))
    elif isinstance(poly, MultiPolygon):
        # One ring list per constituent polygon
        geom = ee.Geometry.MultiPolygon([_polygon_rings(part) for part in poly.geoms])
    else:
        # Any other geometry type used to fall through and silently reuse the
        # previous lake's `geom` (or raise NameError on the first row)
        skipped_lakes.append(row['name'])
        continue

    features.append(ee.Feature(geom, {'name': row['name']}))

if skipped_lakes:
    print(f"Skipped {len(skipped_lakes)} lakes with empty or unsupported geometry: {skipped_lakes}")

lake_polygons = ee.FeatureCollection(features)

# 3. Topographic Analysis
srtm = ee.Image("USGS/SRTMGL1_003")
elevation = srtm.select('elevation')
slope = ee.Terrain.slope(elevation).rename('slope')
topo_stack = ee.Image.cat([elevation, slope])

# 4. Extract Stats (Mean values within the EXACT boundaries)
stats = topo_stack.reduceRegions(
    collection=lake_polygons,
    reducer=ee.Reducer.mean(),
    scale=30
)

# 5. Export to CSV
df_results = geemap.ee_to_df(stats)
df_results.to_csv('data/lake_slope_elevation.csv', index=False)

## Extracting Hydrological Context for Bengaluru’s Lakes

This section of the code prepares the **spatial boundary**, **lake locations**, and **hydrological flow layers** needed to understand how water moves across Bengaluru and interacts with its lakes.

---

### 1. Defining the Bengaluru Urban Boundary

```python
bengaluru_boundary = ee.FeatureCollection("FAO/GAUL/2015/level2") \
    .filter(ee.Filter.eq('ADM2_NAME', 'Bangalore Urban'))
```

### 2. Loading Lake Locations as Point Features

```python 
df = pd.read_csv('data/bengaluru_lakes_mean.csv')
features = [
    ee.Feature(
        ee.Geometry.Point([row['lon'], row['lat']]),
        {'name': row['name']}
    ) 
    for i, row in df.iterrows()
]
lake_points = ee.FeatureCollection(features)
```
**What is happening**

* Reads a CSV file containing lake centroids.

* Converts each latitude–longitude pair into:

    * An Earth Engine Point geometry

    * With lake name as metadata

* All points are combined into a FeatureCollection.

**Why this matters**
* Lake points act as anchors to sample hydrological properties.

* Enables point-based queries such as:

    * Upstream contributing area

    * Flow direction at the lake location

---

### 3. Loading MERIT Hydro Datasets
`merit = ee.Image("MERIT/Hydro/v1_0_1")`

**What this dataset is**

* **MERIT Hydro** is a globally corrected hydrological dataset.

* Built on improved DEMs with:

    * Reduced striping errors

    * Corrected river networks

* It is especially useful for urban flood and drainage analysis.

---

### 4. Extracting Flow Accumulation
```python
flow_acc = merit.select('upa')
flow_acc_viz = flow_acc.log10()
```

**Key concepts**

* `upa` (Upstream Accumulation Area)

* Represents the total upstream area draining into each pixel.

* High values indicate major drains and valleys.

**Why log-transform**

* Raw flow accumulation values span several orders of magnitude.

* Log transformation:

    * Enhances visibility of small urban streams

    * Prevents large rivers from dominating the visualization

**Hydrological meaning**

* Pixels with high values indicate where runoff naturally converges.

* Lakes located on high upa pixels are structurally flood-prone.

---

### 5. Extracting Flow Direction
`flow_dir = merit.select('dir')`

**What dir represents**

* Indicates the direction water flows out of each pixel.

* Encoded using a D8 flow model (8 possible directions).

**Flow direction explains**:

* How water moves between lakes

* Which lakes are upstream or downstream

* Essential for understanding Bengaluru’s historic cascade lake system.

---

### 6. Preparing the Map for Visualisation
```python
Map = geemap.Map()
Map.centerObject(bengaluru_boundary, 11)
```

**What is happening**

* Initializes an interactive map.

* Centers the map over Bangalore Urban at a city-scale zoom level.

---

### 7. Flow Accumulation Visualisation Parameters
```python
acc_params = {
    'min': 0, 
    'max': 5, 
    'palette': ['#000000', '#023858', '#0570b0', '#74a9cf', '#fff7fb']
}
```

**Interpretation**

* Dark colors → low or negligible drainage

* Light colors → strong drainage pathways

**Highlights**:

* Natural valleys

* Stormwater drains

* Low-lying convergence zones

---

### 8. Flow Direction Visualisation Parameters

```python
dir_params = {
    'min': 1, 
    'max': 128, 
    'palette': ['red', 'orange', 'yellow', 'green', 'blue', 'cyan', 'magenta', 'black']
}
```


**Interpretation**

* Each color corresponds to a specific flow direction.

* Together, they reveal the directional logic of runoff across the city.

* Helps visually verify:

    * Whether lakes align with natural flow paths

    * Where drainage has been disrupted by urban development

---

In [None]:
# 2. Get Boundary and Lakes
bengaluru_boundary = ee.FeatureCollection("FAO/GAUL/2015/level2") \
    .filter(ee.Filter.eq('ADM2_NAME', 'Bangalore Urban'))

# Load your lake points (one point per lake, built from the lon/lat columns)
df = pd.read_csv('data/bengaluru_lakes_mean.csv')
features = [ee.Feature(ee.Geometry.Point([row['lon'], row['lat']]), {'name': row['name']}) for i, row in df.iterrows()]
lake_points = ee.FeatureCollection(features)

# 3. Load MERIT Hydro Datasets with CORRECT BAND NAMES
merit = ee.Image("MERIT/Hydro/v1_0_1")

# 'upa' is the band for Upstream Accumulation Area
flow_acc = merit.select('upa')
# We log-transform it for better visualization of small streams
flow_acc_viz = flow_acc.log10()

# 'dir' is the band for Flow Direction (kept unchanged: it is sampled later on)
flow_dir = merit.select('dir')
# The direction codes are powers of two (1, 2, 4, ... 128). Stretching them
# linearly between 1 and 128 squeezes 1..16 into nearly the same colour, so for
# display only they are remapped to 0..7, one palette entry per direction.
# Values outside the list are masked by remap.
flow_dir_viz = flow_dir.remap([1, 2, 4, 8, 16, 32, 64, 128], [0, 1, 2, 3, 4, 5, 6, 7])

# 4. Visualization on the Map
Map = geemap.Map()
Map.centerObject(bengaluru_boundary, 11)

# Palette for Flow Accumulation (black -> blue -> near-white with growing log10 area)
acc_params = {
    'min': 0,
    'max': 5,
    'palette': ['#000000', '#023858', '#0570b0', '#74a9cf', '#fff7fb']
}

# Palette for Direction: eight colours for the eight remapped classes 0..7
dir_params = {
    'min': 0,
    'max': 7,
    'palette': ['red', 'orange', 'yellow', 'green', 'blue', 'cyan', 'magenta', 'black']
}

Map.addLayer(flow_dir_viz.clip(bengaluru_boundary), dir_params, '1. Flow Direction (Compass)')
Map.addLayer(flow_acc_viz.clip(bengaluru_boundary), acc_params, '2. Flow Accumulation (Drainage Network)')
Map.addLayer(lake_points, {'color': 'red'}, '3. Lake Locations')

Map



In [14]:
# 5. Sample flow accumulation / direction at every lake point and save the table
print("Extracting flow stats for lakes...")

# Two single-band images, renamed so the output columns are self-describing
flow_bands = [
    flow_acc.rename('flow_accumulation_km2'),
    flow_dir.rename('flow_direction_code'),
]
topo_image = ee.Image.cat(flow_bands)

stats = topo_image.reduceRegions(
    reducer=ee.Reducer.mean(),
    collection=lake_points,
    scale=90,
)

try:
    df_results = geemap.ee_to_df(stats)
    if not os.path.exists('data'):
        os.makedirs('data')
    df_results.to_csv('data/lake_flow_analysis.csv', index=False)
    print("Success! Data saved to data/lake_flow_analysis.csv")
    # Five lakes with the largest accumulation value
    top_by_accumulation = (
        df_results[['name', 'flow_accumulation_km2']]
        .sort_values(by='flow_accumulation_km2', ascending=False)
        .head()
    )
    print(top_by_accumulation)
except Exception as err:
    print(f"Error saving CSV: {err}")

Extracting flow stats for lakes...
Success! Data saved to data/lake_flow_analysis.csv
                          name  flow_accumulation_km2
152  Yellamallappa Chetty Lake             267.274902
114              Ramapura Kere             224.145035
136             Bellandur Lake             114.820732
154                  Saul Kere              23.875479
85                    Hosakere              16.710138


---
---

### Lake-wise SAR (Synthetic Aperture Radar) Flood Frequency Extraction

This script computes **observed flood frequency** around each Bengaluru lake using **Sentinel-1 SAR radar data** for the period **2020–2025**. The output is a lake-level dataset showing **how often flooding was actually detected**, based on satellite observations rather than model assumptions.

---

**Initialization and Data Loading**

- Earth Engine is initialized so all geospatial processing runs on Google’s servers.
- A CSV containing lake names and coordinates is loaded into a Pandas DataFrame.

---

**Building a City-wide Flood Frequency Image (Done Once)**

- All lake coordinates are combined into a single **MultiPoint geometry**, then buffered by **500 m** to define a city-wide region of interest.
- Sentinel-1 SAR images are loaded and filtered:
  - Spatially: only images covering the buffered lake region
  - Temporally: 2020–2025
  - Polarisation: **VH** (best for water detection in urban areas)
  - Mode: **IW** (standard land observation mode)

- Each SAR image is converted into a **binary water map** using a −20 dB threshold:
  - Values below −20 dB → likely water/flooded
  - Values above −20 dB → land or built-up
- All binary water maps are stacked over time.
- **Flood frequency (%)** is computed per pixel as:  
  *(number of times water was detected ÷ number of observations) × 100*  
  This produces a single raster (`sar_flood_freq_pct`) showing how often each pixel was inundated over the 2020–2025 study period.

---

**Lake-wise Extraction Loop**

- The script loops through lakes **client-side (Python)** for progress monitoring.
- For each lake:
  - A **200 m buffer** around the lake centroid is created to capture spillover and nearby waterlogging.
  - The **mean flood frequency** within this buffer is extracted from the precomputed SAR flood-frequency image using `reduceRegion`.
  - This yields one number per lake:  
    *“On average, what percentage of satellite passes detected flooding here?”*

- Results are stored in a list with lake name and flood frequency.
- Errors for individual lakes are caught so the loop continues uninterrupted.

---

**Saving the Output**

- The collected results are converted to a DataFrame.
- A CSV file is written containing:
  - `name` → lake name  
  - `sar_flood_freq_pct` → observed flood frequency (2020–2025)

---

**Conceptual Meaning**

- This workflow provides an **empirical, observation-based measure of flooding**, not a simulated one.
- It captures:
  - chronic waterlogging
  - repeated lake overflows
  - drainage failures
- The output is ideal for:
  - validating flood models
  - identifying flood hotspots
  - serving as a **ground-truth target** for machine-learning flood-risk models

---

**One-line takeaway**

This code converts the 2020–2025 archive of Sentinel-1 radar imagery into a lake-wise measure of how often flooding actually occurred around Bengaluru’s lakes.


In [15]:
import ee
import geemap
import pandas as pd
import time

# 1. Initialize
ee.Initialize(project='bengaluru-lakes-485612')

# 2. Load Data
# The 'lon', 'lat' and 'name' columns of this table are used below.
df_lakes = pd.read_csv('data/bengaluru_lakes_mean.csv')

# 3. Create the Base Frequency Image (Do this ONCE outside the loop)
# A single region covering every lake point (each buffered by 500) lets the
# Sentinel-1 collection be filtered once instead of once per lake.
roi_all = ee.Geometry.MultiPoint(df_lakes[['lon', 'lat']].values.tolist()).buffer(500)
s1_collection = (
    ee.ImageCollection('COPERNICUS/S1_GRD')
    .filterBounds(roi_all)
    # filterDate treats its end date as exclusive, so the window ends on
    # 2026-01-01 to keep acquisitions from 31 Dec 2025 in the 2020-2025 range.
    .filterDate('2020-01-01', '2026-01-01')
    .filter(ee.Filter.listContains('transmitterReceiverPolarisation', 'VH'))
    .filter(ee.Filter.eq('instrumentMode', 'IW'))
)

def identify_water(image):
    """Threshold one Sentinel-1 scene into a binary water band.

    Takes the 'VH' band of ``image``, marks pixels below -20 as 1 (all
    others 0), names the result 'is_water', and copies the
    'system:time_start' property over from the input scene.
    """
    vh_band = image.select('VH')
    water_mask = vh_band.lt(-20).rename('is_water')
    return water_mask.copyProperties(image, ['system:time_start'])

# Per-pixel share of observations flagged as water, expressed as a percentage.
water_ts = s1_collection.map(identify_water)
flood_freq_img = water_ts.sum().divide(water_ts.count()).multiply(100).rename('sar_flood_freq_pct')

# 4. Processing Loop with Prints
results = []
total_lakes = len(df_lakes)
print(f"Starting extraction for {total_lakes} lakes...")

# Width of the longest progress message printed so far. Each message is padded
# to this width so a shorter lake name fully overwrites the previous line
# instead of leaving trailing characters behind after the carriage return.
progress_width = 0

# The counter comes from enumerate rather than the DataFrame index, so it stays
# correct even if the table does not carry a default 0..n-1 index.
for position, (index, row) in enumerate(df_lakes.iterrows(), start=1):
    lake_name = row['name']
    progress_msg = f"[{position}/{total_lakes}] Processing: {lake_name}..."
    progress_width = max(progress_width, len(progress_msg))
    print(progress_msg.ljust(progress_width), end="\r")

    # Define local geometry: the lake point buffered by 200
    point = ee.Geometry.Point([row['lon'], row['lat']]).buffer(200)

    # Extract mean frequency for this specific lake
    try:
        # reduceRegion (singular) handles one geometry; getInfo() pulls the
        # result to the client, so each lake costs one server round-trip.
        stat = flood_freq_img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=point,
            scale=10,
            maxPixels=1e9
        ).getInfo()

        results.append({
            'name': lake_name,
            'sar_flood_freq_pct': stat.get('sar_flood_freq_pct')
        })
    except Exception as e:
        # Best effort: report the failure and continue. A lake that fails
        # here is absent from the output CSV.
        print(f"\nError processing {lake_name}: {e}")

# 5. Save results
results_df = pd.DataFrame(results)
results_df.to_csv('data/lake_sar_flood_frequency_2025.csv', index=False)
print("\nExtraction Complete! File saved.")

Starting extraction for 162 lakes...
[162/162] Processing: Vidyaranyapura Kere...ake...e...
Extraction Complete! File saved.


## What this code does and why it is useful

### Purpose  
This code computes a **true flood-frequency metric around lakes** using **Sentinel-1 SAR data**, by distinguishing **episodic flooding** from **permanent lake water**. It produces a lake-wise percentage indicating how often areas around each lake experience flooding over a multi-year period (2020–2025).

---

### Step-by-step explanation  

#### 1. Lake input and region of interest  
- Reads a CSV containing lake names and their latitude–longitude coordinates.  
- Combines all lake points into a **single buffered region of interest (ROI)**.  
- This optimisation ensures Sentinel-1 data are loaded only once for the entire study area, improving efficiency.

---

#### 2. Sentinel-1 SAR data selection  
- Loads **Sentinel-1 GRD** imagery within the ROI and time window (2020–2025).  
- Filters for:
  - **IW mode** (standard for land applications)  
  - **VH polarisation**, which is sensitive to open water  
- Selects only the VH band to reduce data volume.

---

#### 3. Water detection  
- Converts each SAR image into a **binary water mask** using a backscatter threshold (VH < −20 dB).  
- This exploits the physical property that smooth water surfaces return very low radar backscatter.  
- Temporal metadata is preserved for time-series analysis.

---

#### 4. Dry-season baseline (normal lake extent)  
- Restricts the water masks to **January–March**, when flooding is minimal.  
- Computes the mean water occurrence during this dry season.  
- Pixels classified as water in ≥70% of dry-season observations are treated as **permanent lake water**.  
- This baseline represents the *normal, non-flooded lake extent*.

---

#### 5. Flood detection (core logic)  
- For every SAR acquisition, identifies **flood pixels** as:
  - Water present **outside** the dry-season baseline.  
- This step removes permanent lake water and isolates **anomalous inundation**, which is the defining feature of flooding.

---

#### 6. Flood frequency calculation  
- Sums all flood detections across time.  
- Divides by the number of valid observations.  
- Converts the result into a **percentage flood frequency image**, indicating how often flooding occurred at each pixel.

---

#### 7. Lake-wise flood-frequency extraction  
- Iterates over each lake.  
- Uses a local buffer around the lake centre (proxy for surrounding flood-prone area).  
- Computes the **mean flood frequency (%)** within that buffer.  
- Stores results lake by lake for further analysis or export.

---

### Utility of this approach  

- **Conceptually correct**: Measures flooding as a deviation from normal conditions, not simple water presence.  
- **SAR-based**: Works reliably during monsoon months and under cloud cover.  
- **Comparative**: Enables ranking of lakes by chronic flood exposure.  
- **Policy-relevant**: Useful for urban planning, drainage prioritisation, lake rejuvenation, and resilience studies.  
- **Scalable**: Can be extended to polygons, rainfall conditioning, or adaptive thresholds.

In short, this code transforms raw SAR imagery into a defensible, event-based **flood-frequency indicator** around urban lakes.


In [16]:
import ee
import geemap
import pandas as pd

# -------------------------------------------------------------------
# 1. INITIALISE EARTH ENGINE
# -------------------------------------------------------------------
ee.Initialize(project='bengaluru-lakes-485612')

# -------------------------------------------------------------------
# 2. LOAD LAKE POINT DATA
# -------------------------------------------------------------------
# The 'lon', 'lat' and 'name' columns of this table are used below.
df_lakes = pd.read_csv('data/bengaluru_lakes_mean.csv')

# Combined ROI for efficiency: every lake point, buffered by 500, so the
# Sentinel-1 collection is filtered once for the whole study area.
roi_all = ee.Geometry.MultiPoint(
    df_lakes[['lon', 'lat']].values.tolist()
).buffer(500)

# -------------------------------------------------------------------
# 3. LOAD SENTINEL-1 SAR DATA
# -------------------------------------------------------------------
s1 = (
    ee.ImageCollection('COPERNICUS/S1_GRD')
    .filterBounds(roi_all)
    # filterDate treats its end date as exclusive, so the window ends on
    # 2026-01-01 to keep acquisitions from 31 Dec 2025 in the 2020-2025 range.
    .filterDate('2020-01-01', '2026-01-01')
    .filter(ee.Filter.eq('instrumentMode', 'IW'))
    .filter(ee.Filter.listContains(
        'transmitterReceiverPolarisation', 'VH'
    ))
    .select('VH')
)

# -------------------------------------------------------------------
# 4. WATER DETECTION FUNCTION
# -------------------------------------------------------------------
def detect_water(image):
    """Classify one scene into a binary 'water' band.

    Pixels of ``image`` below the heuristic -20 threshold become 1, all
    others 0. The 'system:time_start' property is copied from the input.
    """
    below_threshold = image.lt(-20)  # heuristic threshold
    labelled = below_threshold.rename('water')
    return labelled.copyProperties(image, ['system:time_start'])

water_series = s1.map(detect_water)

# -------------------------------------------------------------------
# 5. DRY-SEASON BASELINE WATER MASK (NORMAL LAKE EXTENT)
# -------------------------------------------------------------------
dry_season = water_series.filter(
    ee.Filter.calendarRange(1, 3, 'month')  # Jan–Mar
)

# Permanent water = water in ≥70% of dry-season observations.
# The mean of the 0/1 water band is the fraction of dry-season scenes in
# which a pixel was water; gte (not gt) is used so that a pixel at exactly
# 70% is included, matching the "≥70%" definition.
baseline_water = (
    dry_season.mean()
    .gte(0.7)
    .rename('baseline_water')
)

# -------------------------------------------------------------------
# 6. FLOOD DETECTION (KEY STEP)
# -------------------------------------------------------------------
def detect_flood(image):
    """Flag water that lies outside the dry-season baseline.

    Combines the binary water ``image`` with the inverse of the module-level
    ``baseline_water`` mask, names the result 'flood', and copies the
    'system:time_start' property from the input.
    """
    outside_baseline = baseline_water.Not()
    flooded = image.And(outside_baseline).rename('flood')
    return flooded.copyProperties(image, ['system:time_start'])

flood_series = water_series.map(detect_flood)

# -------------------------------------------------------------------
# 7. FLOOD FREQUENCY IMAGE (% OF OBSERVATIONS)
# -------------------------------------------------------------------
# Per pixel: number of flood detections over number of unmasked observations,
# scaled to a percentage.
flood_detections = flood_series.sum()
valid_observations = flood_series.count()
flood_fraction = flood_detections.divide(valid_observations)
flood_freq_img = flood_fraction.multiply(100).rename('flood_freq_pct')

# -------------------------------------------------------------------
# 8. LAKE-WISE EXTRACTION
# -------------------------------------------------------------------
results = []
total = len(df_lakes)

print(f"Starting flood-frequency extraction for {total} lakes...")

# Width of the longest progress message printed so far. Each message is padded
# to this width so a shorter lake name fully overwrites the previous line
# instead of leaving trailing characters behind after the carriage return.
progress_width = 0

# The counter comes from enumerate rather than the DataFrame index, so it stays
# correct even if the table does not carry a default 0..n-1 index.
for position, (i, row) in enumerate(df_lakes.iterrows(), start=1):
    lake_name = row['name']
    progress_msg = f"[{position}/{total}] Processing {lake_name}"
    progress_width = max(progress_width, len(progress_msg))
    print(progress_msg.ljust(progress_width), end="\r")

    # Local analysis buffer (replace with polygon if available)
    lake_geom = ee.Geometry.Point(
        [row['lon'], row['lat']]
    ).buffer(200)

    try:
        # getInfo() pulls the reduced value to the client, so each lake
        # costs one server round-trip.
        stat = flood_freq_img.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=lake_geom,
            scale=10,
            maxPixels=1e9
        ).getInfo()

        results.append({
            'name': lake_name,
            'flood_freq_pct': stat.get('flood_freq_pct')
        })

    except Exception as e:
        # Best effort: report the failure and continue. A lake that fails
        # here is absent from the output CSV.
        print(f"\nError processing {lake_name}: {e}")

# -------------------------------------------------------------------
# 9. SAVE OUTPUT
# -------------------------------------------------------------------
results_df = pd.DataFrame(results)
results_df.to_csv(
    'data/lake_true_flood_frequency_2025.csv',
    index=False
)

print("\nFlood-frequency extraction complete.")


Starting flood-frequency extraction for 162 lakes...
[162/162] Processing Vidyaranyapura Kerey LakeLake
Flood-frequency extraction complete.


---
---

## Measuring Rainfall Intensity and Timing
* For urban flooding in Bengaluru, "**Total Rainfall**" is less important than "**Intensity**" (how much rain falls in a short window). 
* We use the **GPM (Global Precipitation Measurement) IMERG dataset**, which provides data every 30 minutes.
    * **Metric 1 (Intensity): Max Daily Rainfall (mm/day)**.
    * **Metric 2 (Timing)**: The month of the peak rainfall event (to correlate with your SAR flood observations).

---

## Measuring Imperviousness
* "Imperviousness" refers to surfaces like concrete, asphalt, and rooftops that prevent water from soaking into the ground. 
* A high impervious percentage in the **200m buffer** around a lake leads to rapid runoff and higher flood risk.
    * Dataset: **Dynamic World (10m) or ESA WorldCover**. Dynamic World is preferred because it's at **10m resolution (same as Sentinel-2).**

---

In [None]:
import ee
import pandas as pd

# 1. Initialize
ee.Initialize(project='bengaluru-lakes-485612')


def _buffered_lake_feature(lake_row):
    """Build one ee.Feature: the lake point buffered by 200, tagged with its name."""
    centre = ee.Geometry.Point([lake_row['lon'], lake_row['lat']])
    return ee.Feature(centre.buffer(200), {'name': lake_row['name']})


# 2. Load Geometries
df_lakes = pd.read_csv('data/bengaluru_lakes_mean.csv')
features = [_buffered_lake_feature(row) for _, row in df_lakes.iterrows()]
lake_fc = ee.FeatureCollection(features)

def export_hydrology_year(year):
    """Start a Drive export of per-lake rainfall and land-cover statistics for one year.

    Three bands are built for the calendar year ``year`` and each is averaged
    over every feature of the module-level ``lake_fc``:

    - ``peak_30min_intensity_mm``: per-pixel maximum of the IMERG
      'precipitation' band multiplied by 0.5 (presumably converting an hourly
      rate to a 30-minute depth; confirm against the dataset's units).
    - ``max_3day_rain_mm``: per-pixel maximum of a rolling three-day sum of
      daily totals.
    - ``impervious_fraction``: 1 where the modal Dynamic World 'label' for the
      year equals 6 (presumably the built-up class; confirm), else 0. Its
      mean over a lake buffer is therefore the flagged share of the buffer.

    Args:
        year: Calendar year to process, e.g. 2020.

    Returns:
        The started ``ee.batch.Task``. Callers that ignore the return value
        behave exactly as before.
    """
    print(f"Submitting Task for {year}...")
    start_date = ee.Date.fromYMD(year, 1, 1)
    # filterDate excludes its end date, and the day sequence below stops one
    # day short of end_date. Ending on 1 Jan of the following year (rather
    # than 31 Dec) is what keeps 31 Dec inside every window in this function.
    end_date = start_date.advance(1, 'year')

    # --- 1. RAINFALL: DAILY AGGREGATION ---
    gpm = ee.ImageCollection("NASA/GPM_L3/IMERG_V07") \
        .filterDate(start_date, end_date) \
        .select('precipitation')

    # Day offsets 0 .. (days in year - 1), i.e. 365 or 366 entries.
    days = ee.List.sequence(0, end_date.difference(start_date, 'day').subtract(1))

    def calc_daily(d):
        # Sum every image falling on one day, scaled by 0.5, and stamp the
        # result with that day's start time.
        date = start_date.advance(d, 'day')
        return gpm.filterDate(date, date.advance(1, 'day')) \
                  .sum().multiply(0.5) \
                  .set('system:time_start', date.millis())

    daily_col = ee.ImageCollection.fromImages(days.map(calc_daily))
    daily_list = daily_col.toList(366)  # 366 covers a leap year

    # --- 2. VECTORIZED ROLLING 3-DAY SUM (Faster) ---
    # We sum Image(i) + Image(i-1) + Image(i-2); indices start at 2 so both
    # look-back days exist.
    indices = ee.List.sequence(2, daily_list.length().subtract(1))

    def sum_3days(i):
        i = ee.Number(i)
        img1 = ee.Image(daily_list.get(i))
        img2 = ee.Image(daily_list.get(i.subtract(1)))
        img3 = ee.Image(daily_list.get(i.subtract(2)))
        return img1.add(img2).add(img3).set('system:time_start', img1.get('system:time_start'))

    max_3day_img = ee.ImageCollection.fromImages(indices.map(sum_3days)).max().rename('max_3day_rain_mm')

    # --- 3. PEAK INTENSITY & IMPERVIOUSNESS ---
    peak_30min_img = gpm.max().multiply(0.5).rename('peak_30min_intensity_mm')

    dw = ee.ImageCollection("GOOGLE/DYNAMICWORLD/V1") \
        .filterDate(start_date, end_date).select('label').mode()
    impervious_img = dw.eq(6).rename('impervious_fraction')

    # --- 4. BATCH EXTRACTION ---
    combined = peak_30min_img.addBands([max_3day_img, impervious_img])

    stats = combined.reduceRegions(
        collection=lake_fc,
        reducer=ee.Reducer.mean(),
        scale=10,
        tileScale=4 # Splits the job into smaller tiles to avoid memory errors
    )

    # --- 5. EXPORT TO DRIVE ---
    task = ee.batch.Export.table.toDrive(
        collection=stats,
        description=f'Hydrology_Stats_{year}',
        folder='EE_Exports', # Folder name in your Google Drive
        fileNamePrefix=f'lake_stats_{year}',
        fileFormat='CSV'
    )
    task.start()
    return task

# Run for all years (2020 through 2025 inclusive); each call submits one export task
first_year, last_year = 2020, 2025
for yr in range(first_year, last_year + 1):
    export_hydrology_year(yr)

print(
    "All tasks submitted! Check your Google Earth Engine 'Tasks' tab "
    "or your Google Drive 'EE_Exports' folder."
)

---
---

### Recorded data cleaning for further processing


In [None]:
import pandas as pd

# 1. Load all datasets
df_hydro = pd.read_csv('data/lake_stats_summary_2020_2025.csv')
df_landuse = pd.read_csv('data/bengaluru_lakes_cleaned_gt_0.5ha.csv')
df_flow = pd.read_csv('data/lake_flow_analysis.csv')
df_flood = pd.read_csv('data/lake_sar_flood_frequency_2025.csv')
df_encroach = pd.read_csv('data/bengaluru_lakes_mean.csv')

# 2. Average the Yearly Data (Hydro & Land Use)
# We drop 'year' and 'Unnamed: 0' before averaging.
# numeric_only=True restricts the mean to numeric columns, so any text column
# in these tables cannot raise a TypeError on recent pandas versions.
hydro_mean = (
    df_hydro.drop(columns=['year', 'Unnamed: 0'], errors='ignore')
    .groupby('name')
    .mean(numeric_only=True)
    .reset_index()
)

# For landuse, we keep lat/lon as they are constant, but average the areas
landuse_mean = (
    df_landuse.drop(columns=['year', 'Unnamed: 0'], errors='ignore')
    .groupby('name')
    .mean(numeric_only=True)
    .reset_index()
)

# 3. Merge into a single "Representative" DataFrame
# Start with landuse_mean as it contains lat/lon
ml_dataset = pd.merge(landuse_mean, hydro_mean, on='name', how='inner')

# Add static flow data
ml_dataset = pd.merge(ml_dataset, df_flow, on='name', how='left')

# Add pre-calculated encroachment data
ml_dataset = pd.merge(ml_dataset, df_encroach[['name', 'encroachment_pct']], on='name', how='left')

# Add the TARGET variable (Flood Frequency)
ml_dataset = pd.merge(ml_dataset, df_flood, on='name', how='left')

# 4. Final Cleanup
# A lake with no SAR value has an unknown target, not a 0% flood frequency.
# Filling it with 0 would label it "never flooded", so such rows are dropped
# before the remaining gaps (predictors only) are filled with 0.
target_col = 'sar_flood_freq_pct'
missing_target = ml_dataset[target_col].isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} lakes with no '{target_col}' value.")
ml_dataset = ml_dataset.loc[~missing_target].fillna(0).reset_index(drop=True)

# 5. Save for ML
ml_dataset.to_csv('data/lake_flood_ml_ready.csv', index=False)

print(f"ML Dataset Created: {ml_dataset.shape[0]} lakes and {ml_dataset.shape[1]} features.")
print("Sample of predictors:", ml_dataset[['name', 'impervious_fraction', 'flow_accumulation_km2', 'sar_flood_freq_pct']].head())

---
---

### ML-Based Flood Risk Classification

This script builds and evaluates a **lake-level flood risk classification model** for Bengaluru using **observed SAR flood frequency** as the outcome and a set of **physically meaningful flood drivers** as predictors. The goal is to classify lakes into **Low Risk** and **High Risk** flood categories in a way that is interpretable and actionable.

---

**Data Loading**

- A pre-processed, lake-level dataset (`lake_flood_ml_ready.csv`) is loaded.
- Each row represents one lake, with rainfall, land-cover, drainage, and observed flood-frequency metrics already aggregated spatially.

---

**Feature Selection: The Three Pillars Framework**

- **Hydrological Drivers (Trigger):**
  - `max_3day_rain_mm` → cumulative wetness / system saturation  
  - `peak_30min_intensity_mm` → short-duration storm intensity

- **Land-Cover Vulnerability (Resistance):**
  - `impervious_fraction` → runoff efficiency  
  - `in_build_ha` → built-up pressure near lakes  
  - `encroachment_pct` → loss of natural buffer and storage

- **Landscape Topology (Gravity):**
  - `flow_accumulation_km2` → upstream drainage pressure  
  - `potential_ha` → lake basin scale

These features reflect **physical flood processes**, not just statistical convenience.

---

**Target Variable Construction**

- The continuous SAR-derived flood frequency (`sar_flood_freq_pct`) is converted into a binary risk label.
- Lakes with flood frequency **greater than 25%** are labelled as **High Risk** (`1`); others as **Low Risk** (`0`).
- This threshold produces a policy-friendly flood-risk classification while preserving an observational basis.

---

**Data Cleaning and Train–Test Split**

- Rows with missing feature or label values are removed.
- The dataset is split into:
  - **80% training data**
  - **20% testing data**
- A fixed random seed ensures reproducibility.

---

**Model Training**

- A **Random Forest Classifier** with 200 decision trees is trained.
- Random Forests are well suited here because:
  - flood drivers interact non-linearly
  - features operate at different scales
  - the model remains interpretable via feature importance

---

**Model Evaluation**

- Predictions are generated for the test set.
- Performance is assessed using:
  - **Accuracy** → overall correctness
  - **Classification report** → precision, recall, and F1-score for Low and High Risk classes
- This evaluates how well physical drivers explain observed flooding.

---

**Feature Importance Analysis**

- The contribution of each feature to the model’s decisions is extracted.
- Features are grouped by category (Rain, Buildings, Topology) to assess:
  - which physical processes dominate flood risk
- A bar plot visualizes relative importance for intuitive interpretation.

---

**Saving Final Predictions**

- Model predictions are mapped back to lake names.
- The output CSV contains:
  - lake name
  - observed flood frequency
  - true risk label
  - predicted risk class
- This enables direct comparison between observed and modelled flood risk.

---

**Conceptual Meaning**

- This workflow translates **observed flooding patterns** into a **predictive, interpretable risk classification**.
- It does not simulate floods; instead, it learns which combinations of rainfall, urbanisation, and drainage characteristics are associated with repeated inundation.
- The results are suitable for:
  - prioritising flood-prone lakes
  - policy and planning discussions
  - downstream regression or risk-index development

---

**One-line takeaway**

This code uses a Random Forest classifier to learn how rainfall, urban encroachment, and drainage topology jointly determine whether Bengaluru’s lakes are repeatedly flood-prone.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. LOAD AND MERGE
df_ml = pd.read_csv('data/lake_flood_ml_ready.csv')
df_topo = pd.read_csv('data/lake_slope_elevation.csv')
df = df_ml.merge(df_topo[['name', 'elevation', 'slope']], on='name', how='left')

# 2. FEATURE ENGINEERING
# A zero 'potential_ha' would make the ratio below infinite (or NaN), and an
# infinite feature makes the model fit fail. Non-positive values are turned
# into NaN so those rows are removed by the dropna further down instead.
potential_ha_positive = df['potential_ha'].where(df['potential_ha'] > 0)
# WIBI Proxy: Detects hard debris in dry lake beds
df['wibi_proxy'] = df['encroachment_pct'] * (1 - (df['static_water_ha'] / potential_ha_positive))
# Urban Stress: Runoff pressure (Imperviousness x Catchment Flow)
df['urban_stress'] = df['impervious_fraction'] * df['flow_accumulation_km2']
# CSR: Catchment-to-Storage Ratio (the 0.01 offset avoids division by zero)
df['csr_ratio'] = df['flow_accumulation_km2'] / (df['potential_ha'] + 0.01)

# 3. DEFINE FEATURE PILLARS
rain_feats = ['max_3day_rain_mm', 'peak_30min_intensity_mm']
modification_feats = ['impervious_fraction', 'wibi_proxy', 'urban_stress']
topology_feats = ['potential_ha', 'flow_accumulation_km2', 'csr_ratio', 'elevation', 'slope']

X_cols = rain_feats + modification_feats + topology_feats
# High Risk (1) when the observed flood frequency exceeds 25, else Low Risk (0).
df['risk_label'] = (df['sar_flood_freq_pct'] > 25).astype(int)
# A missing flood frequency compares as False and would silently become
# label 0, so the raw target column is included in the dropna as well.
df_ml_final = df.dropna(subset=X_cols + ['sar_flood_freq_pct', 'risk_label'])

# 4. TRAIN MODEL
X = df_ml_final[X_cols]
y = df_ml_final['risk_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# 5. EXPORT PREDICTIONS
# X_test keeps the original row labels, so they can be used to look up the
# lake name and observed values for each held-out row.
test_results = df_ml_final.loc[X_test.index, ['name', 'sar_flood_freq_pct', 'risk_label']].copy()
test_results['predicted_risk'] = model.predict(X_test)
test_results.to_csv('data/final_flood_risk_with_topo.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. LOAD AND PREPARE DATA
df_ml = pd.read_csv('data/lake_flood_ml_ready.csv')
df_topo = pd.read_csv('data/lake_slope_elevation.csv')
df = df_ml.merge(df_topo[['name', 'elevation', 'slope']], on='name', how='left')

# Feature Engineering
# A zero 'potential_ha' would make the ratio below infinite (or NaN), and an
# infinite feature makes the model fit fail. Non-positive values are turned
# into NaN so those rows are removed by the dropna further down instead.
potential_ha_positive = df['potential_ha'].where(df['potential_ha'] > 0)
df['wibi_proxy'] = df['encroachment_pct'] * (1 - (df['static_water_ha'] / potential_ha_positive))
df['urban_stress'] = df['impervious_fraction'] * df['flow_accumulation_km2']
# The 0.01 offset avoids division by zero.
df['csr_ratio'] = df['flow_accumulation_km2'] / (df['potential_ha'] + 0.01)

# Categorization for Analysis
rain_feats = ['max_3day_rain_mm', 'peak_30min_intensity_mm']
modification_feats = ['impervious_fraction', 'wibi_proxy', 'urban_stress']
topology_feats = ['potential_ha', 'flow_accumulation_km2', 'csr_ratio', 'elevation', 'slope']

X_cols = rain_feats + modification_feats + topology_feats
# Display category for each feature, used to colour the importance plot.
category_map = {
    'max_3day_rain_mm': 'Rainfall (Trigger)',
    'peak_30min_intensity_mm': 'Rainfall (Trigger)',
    'impervious_fraction': 'Urban Modification',
    'wibi_proxy': 'Infilling Proxy (WIBI)',
    'urban_stress': 'Urban Modification',
    'potential_ha': 'Topology (Gravity)',
    'flow_accumulation_km2': 'Topology (Gravity)',
    'csr_ratio': 'Topology (Gravity)',
    'elevation': 'Topology (Gravity)',
    'slope': 'Topology (Gravity)'
}

# 2. MODEL TRAINING
# High Risk (1) when the observed flood frequency exceeds 25, else Low Risk (0).
df['risk_label'] = (df['sar_flood_freq_pct'] > 25).astype(int)
# A missing flood frequency compares as False and would silently become
# label 0, so the raw target column is included in the dropna as well.
df_final = df.dropna(subset=X_cols + ['sar_flood_freq_pct', 'risk_label'])
X = df_final[X_cols]
y = df_final['risk_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# 3. VISUALIZATION: Feature Importance
feat_imp = pd.DataFrame({'Feature': X_cols, 'Importance': model.feature_importances_})
feat_imp['Category'] = feat_imp['Feature'].map(category_map)
feat_imp = feat_imp.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=feat_imp, x='Importance', y='Feature', hue='Category', dodge=False)
plt.title('Drivers of Flood Risk: Feature Contribution')
plt.savefig('flood_feature_importance.png')

# 4. VISUALIZATION: Correlation Heatmap
plt.figure(figsize=(12, 10))
corr_matrix = df_final[X_cols + ['sar_flood_freq_pct']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation: Topography, Urbanization & Flooding')
plt.savefig('flood_correlation_heatmap.png')

# 5. VISUALIZATION: Topographic Impact
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
sns.scatterplot(data=df_final, x='elevation', y='sar_flood_freq_pct', hue='risk_label', ax=ax1)
ax1.set_title('Elevation vs Flood Frequency')
sns.scatterplot(data=df_final, x='slope', y='sar_flood_freq_pct', hue='risk_label', ax=ax2)
ax2.set_title('Slope vs Flood Frequency')
plt.savefig('topography_impact.png')

# Precision, recall and F1 per class on the held-out 20% of lakes.
print(classification_report(y_test, model.predict(X_test)))