In [2]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from shapely.wkt import loads
from shapely.geometry import Polygon
import cv2
import glob
from tqdm import tqdm
import re
import geopandas as gpd
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import matplotlib.patheffects as PathEffects

In [3]:
# Define the base paths
BASE_DIR = "data/xBD"
OUTPUT_DIR = "output/analysis"

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Get all disaster type directories
disaster_dirs = [d for d in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, d))]
print(f"Found {len(disaster_dirs)} disaster types: {', '.join(disaster_dirs)}")

Found 19 disaster types: guatemala-volcano, nepal-flooding, mexico-earthquake, hurricane-matthew, pinery-bushfire, midwest-flooding, lower-puna-volcano, moore-tornado, hurricane-harvey, joplin-tornado, portugal-wildfire, sunda-tsunami, palu-tsunami, santa-rosa-wildfire, woolsey-fire, hurricane-michael, tuscaloosa-tornado, hurricane-florence, socal-fire


In [4]:
# Analyze the dataset structure
dataset_stats = {}

for disaster in disaster_dirs:
    disaster_path = os.path.join(BASE_DIR, disaster)
    
    # Count images
    pre_images = len(glob.glob(os.path.join(disaster_path, "images", "*_pre_disaster.png")))
    post_images = len(glob.glob(os.path.join(disaster_path, "images", "*_post_disaster.png")))
    
    # Count labels
    pre_labels = len(glob.glob(os.path.join(disaster_path, "labels", "*_pre_disaster.json")))
    post_labels = len(glob.glob(os.path.join(disaster_path, "labels", "*_post_disaster.json")))
    
    dataset_stats[disaster] = {
        "pre_images": pre_images,
        "post_images": post_images,
        "pre_labels": pre_labels,
        "post_labels": post_labels
    }

# Convert to DataFrame for better visualization
stats_df = pd.DataFrame.from_dict(dataset_stats, orient='index')
stats_df['total_images'] = stats_df['pre_images'] + stats_df['post_images']
stats_df['total_labels'] = stats_df['pre_labels'] + stats_df['post_labels']

print(stats_df)

                     pre_images  post_images  pre_labels  post_labels  \
guatemala-volcano            18           18          18           18   
nepal-flooding              619          619         619          619   
mexico-earthquake           121          121         121          121   
hurricane-matthew           238          238         238          238   
pinery-bushfire            1845         1845        1845         1845   
midwest-flooding            279          279         279          279   
lower-puna-volcano          291          291         291          291   
moore-tornado               227          227         227          227   
hurricane-harvey            319          319         319          319   
joplin-tornado              149          149         149          149   
portugal-wildfire          1869         1869        1869         1869   
sunda-tsunami               148          148         148          148   
palu-tsunami                113          113       

In [5]:
# Parse JSON files to extract damage classes and count buildings
damage_stats = defaultdict(lambda: defaultdict(int))
building_counts = defaultdict(int)
total_buildings = 0

# Define damage class mapping
damage_class_map = {
    'no-damage': 0,
    'minor-damage': 1,
    'major-damage': 2,
    'destroyed': 3,
    'un-classified': 4
}

# Process each disaster type
for disaster in tqdm(disaster_dirs, desc="Processing disaster types"):
    label_path = os.path.join(BASE_DIR, disaster, "labels")
    post_labels = glob.glob(os.path.join(label_path, "*_post_disaster.json"))
    
    for label_file in tqdm(post_labels, desc=f"Processing {disaster} labels", leave=False):
        with open(label_file, 'r') as f:
            try:
                label_data = json.load(f)
                
                # Extract building polygons and their damage types from features
                if 'features' in label_data:
                    features = label_data['features']
                    
                    # Check both lng_lat and xy fields for building data
                    building_features = []
                    if 'lng_lat' in features:
                        building_features.extend(features['lng_lat'])
                    if 'xy' in features:
                        building_features.extend(features['xy'])
                    
                    for feature in building_features:
                        if 'properties' in feature and 'subtype' in feature['properties']:
                            subtype = feature['properties']['subtype']
                            damage_stats[disaster][subtype] += 1
                            building_counts[disaster] += 1
                            total_buildings += 1
            except json.JSONDecodeError:
                print(f"Error reading JSON file: {label_file}")
                continue

# Convert to DataFrame
damage_df = pd.DataFrame(damage_stats)
damage_df = damage_df.fillna(0)
damage_df = damage_df.astype(int)

print(f"Total buildings in dataset: {total_buildings}")
print("\nDamage class distribution by disaster type:")
print(damage_df)

# Calculate percentages
damage_percentages = {}
for disaster in damage_stats:
    total = sum(damage_stats[disaster].values())
    damage_percentages[disaster] = {
        cls: (count / total) * 100 
        for cls, count in damage_stats[disaster].items()
    }

damage_pct_df = pd.DataFrame(damage_percentages)
damage_pct_df = damage_pct_df.fillna(0)

print("\nDamage class distribution percentages by disaster type:")
print(damage_pct_df.round(2))

Processing disaster types: 100%|██████████| 19/19 [00:02<00:00,  8.05it/s]

Total buildings in dataset: 632228

Damage class distribution by disaster type:
               guatemala-volcano  nepal-flooding  mexico-earthquake  \
no-damage                   1278           62450              64132   
major-damage                  18            9442                 36   
destroyed                     50            1004                  4   
minor-damage                  18           10268                220   
un-classified                348            3366                150   

               hurricane-matthew  pinery-bushfire  midwest-flooding  \
no-damage                   5030            10054             16256   
major-damage                3890              198               238   
destroyed                   4294              458               150   
minor-damage               13096              164               298   
un-classified               1568             1048               570   

               lower-puna-volcano  moore-tornado  hurricane-harvey


