In [58]:
import os
import glob
import geopandas as gpd
import pandas as pd

def load_all_combined_geojson(base_dir):
    """
    Load every ``*_combined.geojson`` file found one directory level below *base_dir*.

    Each loaded frame gets a ``tile_id`` column holding the name of the
    subdirectory the file came from, and all frames are concatenated into one.
    Files that fail to load are reported on stdout and skipped (best effort).

    Args:
        base_dir: Directory whose immediate subdirectories contain the files.

    Returns:
        The concatenation of all successfully loaded frames, re-indexed 0..n-1.

    Raises:
        FileNotFoundError: If no matching file exists, or if matching files
            exist but none of them could be loaded.
    """
    pattern = os.path.join(base_dir, "*", "*_combined.geojson")
    # glob returns matches in arbitrary order; sort so the row order of the
    # combined frame is reproducible between runs and machines.
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError("No *_combined.geojson files found in subfolders.")

    all_data = []
    for file in files:
        try:
            gdf = gpd.read_file(file)
            gdf["tile_id"] = os.path.basename(os.path.dirname(file))  # tag rows with their tile folder
        except Exception as e:
            # Deliberately broad: one unreadable tile must not abort the whole load.
            print(f"⚠️ Failed to load {file}: {e}")
        else:
            all_data.append(gdf)

    if not all_data:
        # Files were present, so "not found" would be misleading; say what happened.
        raise FileNotFoundError(
            f"Found {len(files)} *_combined.geojson file(s) in subfolders, "
            "but none could be loaded."
        )
    return pd.concat(all_data, ignore_index=True)

# Text spellings that mean "false" when a boolean property arrives as a string.
_FALSE_STRINGS = frozenset({"", "false", "0", "no", "n", "f", "none", "null", "nan"})


def _is_missing(value):
    """Return True if *value* is a scalar missing marker (None, NaN, pd.NA, NaT)."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _as_flag(value):
    """Interpret a property value as a boolean, treating missing values as False."""
    if _is_missing(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    return bool(value)


def _as_confidence(value):
    """Convert a property value to a float, falling back to 0.0 when unusable."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares False against every threshold; treat it like any other
    # missing confidence so such rows are handled consistently.
    return 0.0 if pd.isna(number) else number


def _classify_record(issue, confidence, pedestrian_access, name, kind):
    """Return the case label for one record; the first matching rule wins."""
    has_issue = _as_flag(issue)
    score = _as_confidence(confidence)

    # === Case 1: High-confidence real-time issue ===
    if has_issue and score >= 0.5:
        return "Case 1"
    # === Case 2: Pedestrian access area with issue ===
    if has_issue and _as_flag(pedestrian_access):
        return "Case 2"
    # === Case 3: No visible issue but low confidence ===
    if not has_issue and score <= 0.2:
        return "Case 3"
    # === Case 4: Pedestrian bridge ===
    if "bridge" in str(name).lower() or "bridge" in str(kind).lower():
        return "Case 4"
    return "No Issue"


def classify_cases(df):
    """
    Classify each row into one of the signage misplacement cases.

    Reads the columns ``issue``, ``confidence``, ``pedestrianAccess``, ``name``
    and ``type``; a column that is absent is treated as ``False``/``0``/``""``
    for every row. Missing values (None/NaN) in ``issue`` and
    ``pedestrianAccess`` count as False, and a missing or unparseable
    ``confidence`` counts as 0.0.

    NOTE(review): with those defaults, a frame that has none of the expected
    columns classifies every row as "Case 3" -- verify the column names against
    the actual data schema if the result is uniform.

    Args:
        df: DataFrame (or GeoDataFrame) to classify. It is modified in place.

    Returns:
        The same *df* object with a ``case`` column holding one of
        "Case 1", "Case 2", "Case 3", "Case 4" or "No Issue".
    """
    row_count = len(df)

    def values(column, default):
        # Pull the column out once as plain Python objects; building a Series
        # per row via DataFrame.apply(axis=1) is far slower on large frames.
        if column in df.columns:
            return df[column].tolist()
        return [default] * row_count

    df["case"] = [
        _classify_record(*record)
        for record in zip(
            values("issue", False),
            values("confidence", 0),
            values("pedestrianAccess", False),
            values("name", ""),
            values("type", ""),
        )
    ]
    return df

def run_validation_pipeline(base_dir):
    """
    Load, classify and export the signage records found under *base_dir*.

    Progress messages are printed along the way. The classified records are
    written to ``classified_signage_issues.geojson`` directly inside
    *base_dir*, and the number of rows per case is printed at the end.
    """
    print("📥 Loading GeoJSON files...")
    records = load_all_combined_geojson(base_dir)
    record_count = len(records)
    print(f"✅ Loaded {record_count} records.")

    print("🧠 Running classification...")
    labelled = classify_cases(records)

    # Write the result next to the tile folders, in GeoJSON format.
    destination = os.path.join(base_dir, "classified_signage_issues.geojson")
    labelled.to_file(destination, driver="GeoJSON")
    print(f"💾 Results saved to: {destination}")

    print("\n📊 Summary of Cases Detected:")
    case_counts = labelled["case"].value_counts()
    print(case_counts)

# Script entry: run the whole pipeline against one base folder.
# base_dir must be the directory whose subdirectories hold the
# *_combined.geojson files; the classified output is written into it as well.
# NOTE: this is a machine-specific absolute path -- replace it with your actual base folder.
base_dir = "/Users/kush/Chicago_Hackathon_base_datasets"
run_validation_pipeline(base_dir)

📥 Loading GeoJSON files...
✅ Loaded 295963 records.
🧠 Running classification...
💾 Results saved to: /Users/kush/Chicago_Hackathon_base_datasets/classified_signage_issues.geojson

📊 Summary of Cases Detected:
case
Case 3    295963
Name: count, dtype: int64
