# Flooding Event Dataset Generation 

This notebook generates a comprehensive JSON dataset from flooding event data sources with optimized code structure.

## Data Sources:
1. **Unified_Peak_Data_2016_2017_with_ID(1006).csv** - Gage data for 2016-2017
2. **Unified_Peak_Data_2018_and_later_with_ID(1006).csv** - Gage data for 2018+
3. **matched_records_1947_with_ID_2016_2017(1006).csv** - HWM data for 2016-2017
4. **matched_records_698_with_ID_2018_and_later(1006).csv** - HWM data for 2018+

## File Mapping:
- **Numpy files (embedding)**: 
  - embedding_1Y_later/ - for 2016-2017 data
  - embedding_1Y_early/ - for 2018+ data
- **Weather files**: 
  - gage_2016_2017_24h/ and hwm_2016_2017_24h/ - for 2016-2017 data
  - gage_2018_later_24h/ and hwm_2018_later_24h/ - for 2018+ data

## Naming Convention:
- **Gage data**: gage_[ID].npz, gage_[ID].csv
- **HWM data**: HWM_[ID].npz, HWM_[ID].csv


## Import Required Libraries


In [1]:
import csv
import json
import os
import re
from pathlib import Path
import pandas as pd
from datetime import datetime


## Configuration and File Paths


In [2]:
# Project root on disk. Every path below is derived from it, so the notebook can be
# pointed at another checkout by setting the FLOOD_PROJECT_ROOT environment variable
# instead of editing each entry. The default reproduces the original locations exactly.
PROJECT_ROOT = os.environ.get("FLOOD_PROJECT_ROOT", "/u/wz53/alphaearth/Flooding_event_").rstrip("/")
DATA_ROOT = f"{PROJECT_ROOT}/Flood_dataset"

# Input CSV files, keyed by "<data_type>_<period>"
csv_files = {
    'gage_2016_2017': f"{DATA_ROOT}/Unified_Peak_Data_2016_2017_with_ID(1006).csv",
    'gage_2018_later': f"{DATA_ROOT}/Unified_Peak_Data_2018_and_later_with_ID(1006).csv",
    'hwm_2016_2017': f"{DATA_ROOT}/matched_records_1947_with_ID_2016_2017(1006).csv",
    'hwm_2018_later': f"{DATA_ROOT}/matched_records_698_with_ID_2018_and_later(1006).csv"
}

# Directories holding the numpy (.npz) embedding files
numpy_dirs = {
    'embedding_1Y_later': f"{DATA_ROOT}/embedding_1Y_later",
    'embedding_1Y_early': f"{DATA_ROOT}/embedding_1Y_early"
}

# Directories holding the weather CSV files; keys mirror those of csv_files
weather_dirs = {
    'gage_2016_2017': f"{DATA_ROOT}/gage_2016_2017_24h",
    'gage_2018_later': f"{DATA_ROOT}/gage_2018_later_24h",
    'hwm_2016_2017': f"{DATA_ROOT}/hwm_2016_2017_24h",
    'hwm_2018_later': f"{DATA_ROOT}/hwm_2018_later_24h"
}

# Output directory for the generated JSON files
output_dir = f"{PROJECT_ROOT}/Flood_dataset_example"

# Echo the resolved configuration so the run is self-describing
print("File paths configured:")
for heading, mapping in (
    ("Input CSV files", csv_files),
    ("Numpy directories", numpy_dirs),
    ("Weather directories", weather_dirs),
):
    print(f"\n{heading}:")
    for key, path in mapping.items():
        print(f"  {key}: {path}")

print(f"\nOutput directory: {output_dir}")


File paths configured:

Input CSV files:
  gage_2016_2017: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/Unified_Peak_Data_2016_2017_with_ID(1006).csv
  gage_2018_later: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/Unified_Peak_Data_2018_and_later_with_ID(1006).csv
  hwm_2016_2017: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/matched_records_1947_with_ID_2016_2017(1006).csv
  hwm_2018_later: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/matched_records_698_with_ID_2018_and_later(1006).csv

Numpy directories:
  embedding_1Y_later: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/embedding_1Y_later
  embedding_1Y_early: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/embedding_1Y_early

Weather directories:
  gage_2016_2017: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/gage_2016_2017_24h
  gage_2018_later: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/gage_2018_later_24h
  hwm_2016_2017: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/hwm_2016_2017_24h
  hwm_2018_l

## Utility Functions


In [3]:
def scan_directory_files(directory):
    """
    Scan a directory and return a dictionary mapping file IDs to full paths.

    Only names ending in ``.npz`` or ``.csv`` are considered. The ID is the part
    of the name (extension removed) that follows the first underscore, e.g.
    ``gage_123.npz`` -> ``"123"``. Names without an underscore are ignored.

    Entries are visited in sorted name order, so when two files share an ID
    (for example ``gage_5.csv`` and ``gage_5.npz``) the one that sorts last is
    kept and the result does not depend on the OS listing order.

    Args:
        directory (str): Directory path to scan

    Returns:
        dict: {file_id: full_path} mapping; empty if `directory` is missing or
        is not a directory (a warning is printed in that case)
    """
    file_map = {}
    # isdir (not exists): a path pointing at a regular file would otherwise
    # reach os.listdir and raise NotADirectoryError.
    if not os.path.isdir(directory):
        print(f"Warning: Directory not found: {directory}")
        return file_map

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(('.npz', '.csv')):
            continue
        # Format: gage_[ID].ext or HWM_[ID].ext
        basename = filename.rsplit('.', 1)[0]  # Remove extension
        _prefix, separator, file_id = basename.partition('_')
        if separator:  # an underscore was present
            file_map[file_id] = os.path.join(directory, filename)

    return file_map

def find_matching_files(record_id, data_type, period):
    """
    Locate the embedding (.npz) and weather (.csv) files that belong to a record.

    The embedding directory is chosen from the period alone ('2016_2017' maps to
    'embedding_1Y_later', anything else to 'embedding_1Y_early'); the weather
    directory is looked up under the key "<data_type>_<period>".

    Args:
        record_id (str): Record ID to match
        data_type (str): 'gage' or 'hwm' (any value other than 'gage' uses the HWM_ prefix)
        period (str): '2016_2017' or '2018_later'

    Returns:
        tuple: (numpy_file_path, weather_file_path). Each element is the full path
        if that file exists on disk, otherwise None. (None, None) is returned
        when no weather directory is configured for the type/period pair.
    """
    # Embedding directory depends only on the period
    embedding_key = 'embedding_1Y_later' if period == '2016_2017' else 'embedding_1Y_early'
    numpy_dir = numpy_dirs[embedding_key]

    # Weather directory depends on both data type and period
    weather_dir = weather_dirs.get(f"{data_type}_{period}")
    if not weather_dir:
        return None, None

    # Expected file names: gage files are lower-case prefixed, HWM files upper-case
    if data_type == 'gage':
        expected_names = (f"gage_{record_id}.npz", f"gage_{record_id}.csv")
    else:  # hwm
        expected_names = (f"HWM_{record_id}.npz", f"HWM_{record_id}.csv")

    numpy_path = os.path.join(numpy_dir, expected_names[0])
    weather_path = os.path.join(weather_dir, expected_names[1])

    # Report only the paths that are actually present on disk
    return (
        numpy_path if os.path.exists(numpy_path) else None,
        weather_path if os.path.exists(weather_path) else None,
    )

def safe_float(value):
    """
    Safely convert value to a finite float, return None if invalid.

    "Invalid" covers None, empty/non-numeric strings, missing-value markers in
    any spelling accepted by float() ('nan', 'NaN', 'NAN', ...), infinities, and
    objects that cannot be converted at all (e.g. lists). Non-finite results are
    rejected because they cannot be written as standard JSON numbers.

    Args:
        value: Value to convert (can be number, string, or None)

    Returns:
        float: Converted finite value, or None if invalid
    """
    import math  # local import: the notebook's import cell does not provide math

    if value is None:
        return None
    try:
        result = float(value)
    except (ValueError, TypeError):
        # '' / 'None' / arbitrary text raise ValueError; unsupported types raise TypeError
        return None
    # float() happily parses 'nan' and 'inf' in any letter case; treat those as missing
    return result if math.isfinite(result) else None

def extract_record(row, data_type):
    """
    Extract the fields kept in the JSON dataset from one CSV row.

    Numeric fields go through safe_float, so missing or unparsable values become
    None. Text fields are copied as-is, except that a float NaN (how pandas
    represents an empty CSV cell) is replaced by "" -- the same value used when
    the column is absent -- so that json.dump does not emit a bare NaN token,
    which is not valid standard JSON.

    No unit conversion happens here; the *_m columns are read as provided:
    - gage rows: elevation_m and height_above_gnd_m are read from the columns
      of the same name.
    - hwm rows: the elev_ft_m column supplies both elevation_m and peak_stage;
      height_above_gnd_m is read from the column of the same name.

    Args:
        row (dict): CSV row data
        data_type (str): 'gage' or 'hwm' (any value other than 'gage' is treated as hwm)

    Returns:
        dict: Processed record
    """
    def text_field(key):
        # NaN is the only float that is not equal to itself
        value = row.get(key, "")
        if isinstance(value, float) and value != value:
            return ""
        return value

    if data_type == 'gage':
        # Gage data (Unified_Peak_Data)
        return {
            "ID": text_field("ID"),
            "latitude": safe_float(row.get("latitude")),
            "longitude": safe_float(row.get("longitude")),
            "Ground_Elevation_m": safe_float(row.get("Ground_Elevation_m")),
            "site_no": text_field("site_no"),
            "station_id": text_field("station_id"),
            "peak_date": text_field("peak_date"),
            "elevation_m": safe_float(row.get("elevation_m")),
            "peak_stage": safe_float(row.get("peak_stage")),
            "event": text_field("event"),
            "source": text_field("source"),
            "height_above_gnd_m": safe_float(row.get("height_above_gnd_m")),
            "data_type": "gage"
        }
    else:
        # HWM data (matched_records)
        return {
            "ID": text_field("ID"),
            "latitude": safe_float(row.get("latitude")),
            "longitude": safe_float(row.get("longitude")),
            "site_no": text_field("site_no"),
            "elevation_m": safe_float(row.get("elev_ft_m")),  # elev_ft_m column supplies elevation_m
            "peak_stage": safe_float(row.get("elev_ft_m")),   # HWM uses the same value for peak_stage
            "height_above_gnd_m": safe_float(row.get("height_above_gnd_m")),
            "data_type": "hwm"
        }

# Confirmation output: shows that this cell ran and the helper functions above are defined.
print("Utility functions defined successfully!")
print("CORRECTED VERSION: Now properly extracts elevation_m and height_above_gnd_m from CSV files")


Utility functions defined successfully!
CORRECTED VERSION: Now properly extracts elevation_m and height_above_gnd_m from CSV files


## Load Available Files


In [4]:
def _count_directory_files(label, directories, suffix):
    """
    Count the files ending in `suffix` in each configured directory and print a report.

    Args:
        label (str): Word used in the printed report ("Numpy" or "Weather")
        directories (dict): {key: directory_path}
        suffix (str): File-name suffix to count, e.g. '.npz'

    Returns:
        dict: {key: number_of_matching_files}; 0 for a path that is not a directory
    """
    counts = {}
    for key, directory in directories.items():
        # isdir (not exists): os.listdir would raise on a path that is a regular file
        if os.path.isdir(directory):
            # Sorted so the printed sample does not depend on the OS listing order
            files = sorted(f for f in os.listdir(directory) if f.endswith(suffix))
            counts[key] = len(files)
            print(f"{label} directory: {key}")
            print(f"  Path: {directory}")
            print(f"  Files: {len(files)}")
            if files:
                print(f"  Sample: {', '.join(files[:3])}")
        else:
            counts[key] = 0
            print(f"{label} directory: {key} - NOT FOUND")
    return counts


# Count files in each directory
print("Scanning directories and counting files...")
print("=" * 60)

# Scan numpy directories
numpy_file_counts = _count_directory_files("Numpy", numpy_dirs, '.npz')

print()

# Scan weather directories
weather_file_counts = _count_directory_files("Weather", weather_dirs, '.csv')

print()
print("Directory scanning completed!")


Scanning directories and counting files...
Numpy directory: embedding_1Y_later
  Path: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/embedding_1Y_later
  Files: 3896
  Sample: HWM_1.npz, HWM_10.npz, HWM_100.npz
Numpy directory: embedding_1Y_early
  Path: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/embedding_1Y_early
  Files: 2995
  Sample: HWM_1949.npz, HWM_1950.npz, HWM_1951.npz

Weather directory: gage_2016_2017
  Path: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/gage_2016_2017_24h
  Files: 1949
  Sample: gage_10001.csv, gage_10002.csv, gage_10003.csv
Weather directory: gage_2018_later
  Path: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/gage_2018_later_24h
  Files: 2297
  Sample: gage_12002.csv, gage_12003.csv, gage_12004.csv
Weather directory: hwm_2016_2017
  Path: /u/wz53/alphaearth/Flooding_event_/Flood_dataset/hwm_2016_2017_24h
  Files: 1947
  Sample: HWM_1.csv, HWM_10.csv, HWM_100.csv
Weather directory: hwm_2018_later
  Path: /u/wz53/alphaearth/Flooding_event_/

## Process All CSV Files


In [5]:
def calculate_comprehensive_stats(all_data):
    """
    Summarise the processed records in a single pass.

    Args:
        all_data (list): Record dicts. Each must have 'data_type', 'numpy_file'
            and 'weather_file'; 'elevation_m', 'height_above_gnd_m' and 'period'
            are optional.

    Returns:
        dict: Counters for record types, file availability, elevation
        availability, records per known period ('by_period') and records per
        "<data_type>_<period>" combination ('by_type_period'). A record without
        a 'period' key is counted under "<data_type>_unknown" and is not added
        to 'by_period'.
    """
    total = len(all_data)
    n_gage = n_hwm = 0
    n_numpy = n_weather = n_both_files = 0
    n_elev = n_height = n_both_elev = 0
    period_counts = {'2016_2017': 0, '2018_later': 0}
    combo_counts = {}

    for rec in all_data:
        # Anything that is not 'gage' is counted as HWM
        if rec['data_type'] == 'gage':
            n_gage += 1
        else:
            n_hwm += 1

        # File availability (a path string is truthy, None is not)
        has_numpy = bool(rec['numpy_file'])
        has_weather = bool(rec['weather_file'])
        n_numpy += has_numpy
        n_weather += has_weather
        n_both_files += has_numpy and has_weather

        # Elevation availability (None means missing; 0.0 is a valid value)
        has_elev = rec.get('elevation_m') is not None
        has_height = rec.get('height_above_gnd_m') is not None
        n_elev += has_elev
        n_height += has_height
        n_both_elev += has_elev and has_height

        # Only the two known periods are tallied in by_period
        period = rec.get('period', 'unknown')
        if period in period_counts:
            period_counts[period] += 1

        # Every type/period combination is tallied, known or not
        combo = f"{rec['data_type']}_{period}"
        combo_counts[combo] = combo_counts.get(combo, 0) + 1

    return {
        'total_records': total,
        'gage_records': n_gage,
        'hwm_records': n_hwm,
        'records_with_numpy': n_numpy,
        'records_with_weather': n_weather,
        'records_with_both': n_both_files,
        'records_with_elevation': n_elev,
        'records_with_height': n_height,
        'records_with_both_elev_height': n_both_elev,
        'by_period': period_counts,
        'by_type_period': combo_counts
    }

def _normalize_record_id(raw_id):
    """
    Turn a raw CSV ID into the string used in file names, or '' if it is missing.

    pandas reads an ID column that contains blanks as float, so 123 arrives as
    123.0; str() of that would give "123.0" and never match "gage_123.npz".
    A missing ID arrives as NaN, whose str() is the truthy text 'nan'.
    """
    if raw_id is None or pd.isna(raw_id):
        return ''
    if isinstance(raw_id, float) and raw_id.is_integer():
        return str(int(raw_id))
    return str(raw_id).strip()


print("Processing all CSV files...")
print("=" * 60)

all_data = []

# Process each CSV file
for file_key, csv_path in csv_files.items():
    print(f"\nProcessing: {file_key}")

    if not os.path.exists(csv_path):
        print(f"  ERROR: File not found!")
        continue

    # Data type and period are encoded in the csv_files key
    data_type = 'gage' if 'gage' in file_key else 'hwm'
    period = '2016_2017' if '2016_2017' in file_key else '2018_later'

    # Read and process CSV file
    df = pd.read_csv(csv_path, low_memory=False)
    print(f"  Records: {len(df)}")

    # Process each record; rows without a usable ID cannot be matched to files and are skipped
    kept = 0
    for _, row in df.iterrows():
        record_id = _normalize_record_id(row.get('ID'))
        if not record_id:
            continue

        record = extract_record(row.to_dict(), data_type)

        # Find matching files and add metadata
        numpy_file, weather_file = find_matching_files(record_id, data_type, period)
        record.update({
            'numpy_file': numpy_file,
            'weather_file': weather_file,
            'period': period
        })

        all_data.append(record)
        kept += 1

    # Report what was actually kept, not the raw row count
    print(f"  Processed: {kept} records")
    if kept != len(df):
        print(f"  Skipped (missing ID): {len(df) - kept} records")

# Calculate all statistics once
stats = calculate_comprehensive_stats(all_data)

# `or 1` keeps the percentages defined (0.0%) when nothing was loaded
percent_base = stats['total_records'] or 1

print(f"\n{'=' * 60}")
print(f"Processing completed: {stats['total_records']} total records")
print(f"  Gage: {stats['gage_records']}, HWM: {stats['hwm_records']}")
print(f"  With numpy: {stats['records_with_numpy']} ({stats['records_with_numpy']/percent_base*100:.1f}%)")
print(f"  With weather: {stats['records_with_weather']} ({stats['records_with_weather']/percent_base*100:.1f}%)")
print(f"  With both files: {stats['records_with_both']} ({stats['records_with_both']/percent_base*100:.1f}%)")
print(f"  With both elevations: {stats['records_with_both_elev_height']} ({stats['records_with_both_elev_height']/percent_base*100:.1f}%)")


Processing all CSV files...

Processing: gage_2016_2017
  Records: 1949
  Processed: 1949 records

Processing: gage_2018_later
  Records: 2297
  Processed: 2297 records

Processing: hwm_2016_2017
  Records: 1947
  Processed: 1947 records

Processing: hwm_2018_later
  Records: 698
  Processed: 698 records

Processing completed: 6891 total records
  Gage: 4246, HWM: 2645
  With numpy: 6891 (100.0%)
  With weather: 6891 (100.0%)
  With both files: 6891 (100.0%)
  With both elevations: 3720 (54.0%)


## Data Statistics and Summary


In [6]:
# Human-readable report of the statistics computed by calculate_comprehensive_stats.
# `or 1` keeps the percentages defined (0.0%) when the dataset is empty.
percent_base = stats['total_records'] or 1

print("Data Statistics Summary")
print("=" * 60)

print(f"\nOverall Statistics:")
print(f"  Total records: {stats['total_records']}")
print(f"  Gage records: {stats['gage_records']}")
print(f"  HWM records: {stats['hwm_records']}")

print(f"\nDetailed Statistics by Type-Period:")
for key, count in stats['by_type_period'].items():
    print(f"  {key}: {count} records")

# Each section lists (label, stats key); every line shows the count and its share of the total
summary_sections = (
    ("File Matching Summary", (
        ("numpy files", 'records_with_numpy'),
        ("weather files", 'records_with_weather'),
        ("both files", 'records_with_both'),
    )),
    ("Elevation Data Summary", (
        ("elevation_m", 'records_with_elevation'),
        ("height_above_gnd_m", 'records_with_height'),
        ("both elevations", 'records_with_both_elev_height'),
    )),
)
for section_title, rows in summary_sections:
    print(f"\n{section_title}:")
    for label, stat_key in rows:
        print(f"  Records with {label}: {stats[stat_key]} ({stats[stat_key]/percent_base*100:.1f}%)")

print(f"\nData Completeness:")
print(f"  Records ready for analysis: {stats['records_with_both']}/{stats['total_records']}")


Data Statistics Summary

Overall Statistics:
  Total records: 6891
  Gage records: 4246
  HWM records: 2645

Detailed Statistics by Type-Period:
  gage_2016_2017: 1949 records
  gage_2018_later: 2297 records
  hwm_2016_2017: 1947 records
  hwm_2018_later: 698 records

File Matching Summary:
  Records with numpy files: 6891 (100.0%)
  Records with weather files: 6891 (100.0%)
  Records with both files: 6891 (100.0%)

Elevation Data Summary:
  Records with elevation_m: 6855 (99.5%)
  Records with height_above_gnd_m: 3743 (54.3%)
  Records with both elevations: 3720 (54.0%)

Data Completeness:
  Records ready for analysis: 6891/6891


## Generate JSON Dataset


In [7]:
def create_dataset_metadata(stats, numpy_file_counts, weather_file_counts):
    """
    Build the "metadata" section of the JSON dataset from pre-calculated statistics.

    Args:
        stats (dict): Result of calculate_comprehensive_stats
        numpy_file_counts (dict): {directory_key: number of .npz files}
        weather_file_counts (dict): {directory_key: number of .csv files}

    Returns:
        dict: Metadata including record counts, the configured source files and
        directories (module-level csv_files, numpy_dirs, weather_dirs), file
        counts, matching rates formatted as percentage strings, and notes.
    """
    total = stats['total_records']

    def rate(count):
        # An empty dataset reports 0.00% instead of raising ZeroDivisionError
        share = count / total * 100 if total else 0.0
        return f"{share:.2f}%"

    return {
        "description": "Comprehensive flooding event dataset with file paths (Optimized Version)",
        "version": "5.0",
        "generated_timestamp": datetime.now().isoformat(),
        "total_records": total,
        "gage_records": stats['gage_records'],
        "hwm_records": stats['hwm_records'],
        "data_sources": csv_files,
        "file_directories": {
            "numpy": numpy_dirs,
            "weather": weather_dirs
        },
        "file_counts": {
            "numpy_files": numpy_file_counts,
            "weather_files": weather_file_counts
        },
        "statistics": {
            "records_with_numpy": stats['records_with_numpy'],
            "records_with_weather": stats['records_with_weather'],
            "records_with_both": stats['records_with_both'],
            "records_with_elevation_m": stats['records_with_elevation'],
            "records_with_height_above_gnd_m": stats['records_with_height'],
            "records_with_both_elev_height": stats['records_with_both_elev_height'],
            "matching_rates": {
                "numpy": rate(stats['records_with_numpy']),
                "weather": rate(stats['records_with_weather']),
                "both": rate(stats['records_with_both']),
                "both_elev_height": rate(stats['records_with_both_elev_height'])
            }
        },
        "by_type_period": stats['by_type_period'],
        "notes": [
            "numpy_file and weather_file fields contain full absolute paths",
            "Gage data uses prefix 'gage_' for files",
            "HWM data uses prefix 'HWM_' for files",
            "2016-2017 data uses embedding_1Y_later directory",
            "2018+ data uses embedding_1Y_early directory",
            "All elevation values are in meters",
            "Optimized version with consolidated statistics"
        ]
    }

# Create output filename with timestamp so repeated runs never overwrite each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = os.path.join(output_dir, f"flooding_dataset_optimized_{timestamp}.json")

print(f"Generating JSON dataset...")
print(f"Output file: {output_file}")

# Create dataset structure using pre-calculated statistics
dataset = {
    "metadata": create_dataset_metadata(stats, numpy_file_counts, weather_file_counts),
    "data": all_data
}

# `or 1` keeps the percentages defined (0.0%) when the dataset is empty
percent_base = stats['total_records'] or 1

# Write JSON file
try:
    # The output directory may not exist yet on a fresh checkout
    os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    file_size_mb = os.path.getsize(output_file) / 1024 / 1024
    print(f"✓ JSON dataset generated successfully!")
    print(f"  File: {output_file}")
    print(f"  Size: {file_size_mb:.2f} MB")
    print(f"  Records: {stats['total_records']:,}")
    print(f"  Records with both files: {stats['records_with_both']:,} ({stats['records_with_both']/percent_base*100:.1f}%)")
    print(f"  Records with both elevations: {stats['records_with_both_elev_height']:,} ({stats['records_with_both_elev_height']/percent_base*100:.1f}%)")

except Exception as e:
    print(f"Error generating JSON file: {e}")
    # Re-raise: later cells read output_file and would otherwise report success
    # (or fail with a less helpful error) after a failed write.
    raise


Generating JSON dataset...
Output file: /u/wz53/alphaearth/Flooding_event_/Flood_dataset_example/flooding_dataset_optimized_20251020_233912.json
✓ JSON dataset generated successfully!
  File: /u/wz53/alphaearth/Flooding_event_/Flood_dataset_example/flooding_dataset_optimized_20251020_233912.json
  Size: 3.98 MB
  Records: 6,891
  Records with both files: 6,891 (100.0%)
  Records with both elevations: 3,720 (54.0%)


## Verification and Summary


In [8]:
def verify_sample_records(all_data):
    """
    Print one sample gage record and one sample HWM record with file-existence checks.

    For each data type, the first record that has a numpy file path is shown:
    its ID, site, coordinates, and whether the numpy and weather files exist on
    disk. A type with no such record is silently skipped.

    Args:
        all_data (list): Record dicts as produced by the processing cell

    Returns:
        None
    """
    def report_first(heading, candidates):
        # Nothing to show for this data type
        if not candidates:
            return
        sample = candidates[0]
        print(heading)
        print(f"  ID: {sample['ID']}, Site: {sample['site_no']}")
        print(f"  Coordinates: ({sample['latitude']}, {sample['longitude']})")
        print(f"  Files exist: numpy={os.path.exists(sample['numpy_file'])}, weather={os.path.exists(sample['weather_file']) if sample['weather_file'] else False}")

    print("Sample Record Verification")
    print("=" * 60)

    # Check a gage record
    report_first(
        "\n1. Sample Gage Record:",
        list(filter(lambda r: r['data_type'] == 'gage' and r['numpy_file'], all_data)),
    )

    # Check a HWM record
    report_first(
        "\n2. Sample HWM Record:",
        list(filter(lambda r: r['data_type'] == 'hwm' and r['numpy_file'], all_data)),
    )

# Verify sample records
verify_sample_records(all_data)

# `or 1` keeps the percentages defined (0.0%) when the dataset is empty
percent_base = stats['total_records'] or 1

print("\n" + "=" * 60)
print("Dataset Generation Summary")
print("=" * 60)
print(f"✓ Total records processed: {stats['total_records']:,}")
print(f"✓ Gage records: {stats['gage_records']:,}")
print(f"✓ HWM records: {stats['hwm_records']:,}")
print(f"✓ Records with both files: {stats['records_with_both']:,} ({stats['records_with_both']/percent_base*100:.1f}%)")
print(f"✓ Records with both elevations: {stats['records_with_both_elev_height']:,} ({stats['records_with_both_elev_height']/percent_base*100:.1f}%)")

# Only claim success if the JSON file is really on disk; os.path.getsize would
# otherwise raise FileNotFoundError after a failed write.
if os.path.exists(output_file):
    print(f"\n✓ JSON file generated: {output_file}")
    print(f"✓ File size: {os.path.getsize(output_file) / 1024 / 1024:.2f} MB")

    print("\nDataset is ready for analysis! 🎉")
else:
    print(f"\n✗ JSON file not found: {output_file}")


Sample Record Verification

1. Sample Gage Record:
  ID: 10001, Site: IAFAY24003
  Coordinates: (42.95753, -91.62403)
  Files exist: numpy=True, weather=True

2. Sample HWM Record:
  ID: 1, Site: IAFAY24003
  Coordinates: (42.95753, -91.62403)
  Files exist: numpy=True, weather=True

Dataset Generation Summary
✓ Total records processed: 6,891
✓ Gage records: 4,246
✓ HWM records: 2,645
✓ Records with both files: 6,891 (100.0%)
✓ Records with both elevations: 3,720 (54.0%)

✓ JSON file generated: /u/wz53/alphaearth/Flooding_event_/Flood_dataset_example/flooding_dataset_optimized_20251020_233912.json
✓ File size: 3.98 MB

Dataset is ready for analysis! 🎉


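## Usage Example

The usage example is printed by the code cell that follows the optional filtered-dataset section below.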


## Generate Filtered Version (Optional)

Generate a filtered version of the dataset that keeps only the records in which both `elevation_m` and `height_above_gnd_m` are non-null.


In [9]:
def generate_filtered_dataset(all_data, stats, output_dir, source_file):
    """
    Write a JSON dataset containing only records that have both elevation values.

    A record is kept when both 'elevation_m' and 'height_above_gnd_m' are not
    None. The file is named after the number of kept records and written into
    `output_dir`, which is created if it does not exist.

    Args:
        all_data (list): All processed record dicts
        stats (dict): Pre-calculated statistics; only 'total_records' is read
        output_dir (str): Directory to write the filtered JSON file into
        source_file (str): Path of the unfiltered dataset, recorded in the metadata

    Returns:
        str: Path of the filtered JSON file
    """
    print("Generating filtered version...")
    print("=" * 60)

    # Keep records where both elevation values are present (0.0 counts as present)
    filtered_records = [r for r in all_data
                        if r.get('elevation_m') is not None
                        and r.get('height_above_gnd_m') is not None]

    # Per-type counts of the kept records
    gage_filtered = sum(1 for r in filtered_records if r['data_type'] == 'gage')
    hwm_filtered = sum(1 for r in filtered_records if r['data_type'] == 'hwm')

    total_records = stats['total_records']
    kept_records = len(filtered_records)

    print(f"Filtered records: {kept_records} / {total_records}")
    print(f"  Gage: {gage_filtered}, HWM: {hwm_filtered}")

    # Create filtered dataset
    filtered_dataset = {
        "metadata": {
            "description": "Filtered flooding event dataset with both elevation_m and height_above_gnd_m values (Optimized Version)",
            "version": "5.0_filtered",
            "generated_timestamp": datetime.now().isoformat(),
            "source_file": source_file,
            "filter_criteria": "Records with both elevation_m and height_above_gnd_m not null",
            "original_total_records": total_records,
            "filtered_total_records": kept_records,
            "total_records": kept_records,
            "gage_records": gage_filtered,
            "hwm_records": hwm_filtered,
            "notes": [
                f"Filtered from {total_records} records to {kept_records} records",
                "All records have both elevation_m and height_above_gnd_m values",
                "Optimized version with consolidated statistics"
            ]
        },
        "data": filtered_records
    }

    # Save filtered file; the output directory may not exist yet
    os.makedirs(output_dir, exist_ok=True)
    filtered_output = os.path.join(output_dir, f"flooding_dataset_filtered_with_both_elevations_{kept_records}items.json")

    with open(filtered_output, 'w', encoding='utf-8') as f:
        json.dump(filtered_dataset, f, indent=2, ensure_ascii=False)

    file_size_mb = os.path.getsize(filtered_output) / 1024 / 1024
    # An empty source dataset reports 0.0% instead of raising ZeroDivisionError
    kept_share = kept_records / total_records * 100 if total_records else 0.0

    print(f"\n{'=' * 60}")
    print(f"✓ Filtered version generated!")
    print(f"  File: {filtered_output}")
    print(f"  Size: {file_size_mb:.2f} MB")
    print(f"  Records: {kept_records:,}")
    print(f"  Percentage: {kept_share:.1f}% of total")

    return filtered_output

# Generate filtered version: writes a second JSON file next to the full dataset and
# keeps its path; output_file is passed so the metadata records which file it came from.
filtered_output = generate_filtered_dataset(all_data, stats, output_dir, output_file)


Generating filtered version...
Filtered records: 3720 / 6891
  Gage: 1448, HWM: 2272

✓ Filtered version generated!
  File: /u/wz53/alphaearth/Flooding_event_/Flood_dataset_example/flooding_dataset_filtered_with_both_elevations_3720items.json
  Size: 1.98 MB
  Records: 3,720
  Percentage: 54.0% of total


In [10]:
# Example: How to load and use the generated JSON dataset.
# The snippet is printed, not executed. The file name matches the pattern written by
# this notebook (flooding_dataset_optimized_<timestamp>.json); replace TIMESTAMP with
# the value shown in the generation output.
print("How to Use the Generated Dataset")
print("=" * 60)

print("""
# Load the JSON dataset
import json
import numpy as np
import pandas as pd

# Load JSON file
with open('flooding_dataset_optimized_TIMESTAMP.json', 'r') as f:
    dataset = json.load(f)

# Access metadata
metadata = dataset['metadata']
print(f"Total records: {metadata['total_records']}")
print(f"Gage records: {metadata['gage_records']}")
print(f"HWM records: {metadata['hwm_records']}")

# Access data records
data_records = dataset['data']

# Filter records with both files
complete_records = [r for r in data_records if r['numpy_file'] and r['weather_file']]
print(f"Complete records: {len(complete_records)}")

# Example: Load a specific record's data
record = complete_records[0]
print(f"\\nRecord ID: {record['ID']}")
print(f"Type: {record['data_type']}")
print(f"Location: ({record['latitude']}, {record['longitude']})")

# Load numpy embedding
embedding = np.load(record['numpy_file'])
print(f"Embedding shape: {embedding['image_data'].shape}")

# Load weather data
weather_df = pd.read_csv(record['weather_file'])
print(f"Weather data shape: {weather_df.shape}")
print(f"Weather columns: {list(weather_df.columns)}")

# Example: Batch processing
for record in complete_records[:10]:  # Process first 10 records
    # Load embedding
    embedding = np.load(record['numpy_file'])
    
    # Load weather
    weather = pd.read_csv(record['weather_file'])
    
    # Your processing logic here
    pass
""")

print("\nKey Features:")
print("  • All file paths are absolute paths")
print("  • Easy to filter by data_type (gage/hwm) and period (2016_2017/2018_later)")
print("  • Metadata provides complete statistics and directory information")
print("  • Ready for machine learning and analysis workflows")

print("\n" + "=" * 60)
print("Dataset generation completed successfully! 🎉")
print("=" * 60)


How to Use the Generated Dataset

# Load the JSON dataset
import json
import numpy as np
import pandas as pd

# Load JSON file
with open('flooding_dataset_updated_TIMESTAMP.json', 'r') as f:
    dataset = json.load(f)

# Access metadata
metadata = dataset['metadata']
print(f"Total records: {metadata['total_records']}")
print(f"Gage records: {metadata['gage_records']}")
print(f"HWM records: {metadata['hwm_records']}")

# Access data records
data_records = dataset['data']

# Filter records with both files
complete_records = [r for r in data_records if r['numpy_file'] and r['weather_file']]
print(f"Complete records: {len(complete_records)}")

# Example: Load a specific record's data
record = complete_records[0]
print(f"\nRecord ID: {record['ID']}")
print(f"Type: {record['data_type']}")
print(f"Location: ({record['latitude']}, {record['longitude']})")

# Load numpy embedding
embedding = np.load(record['numpy_file'])
print(f"Embedding shape: {embedding['image_data'].shape}")

# Load weather