# SoV Data Exploration

This notebook explores datasets in `data/SoV_data` for quick validation and profiling.

In [None]:
import json
from collections import Counter
from pathlib import Path

import pandas as pd

# geopandas is treated as optional: when the import fails, `gpd` is set to
# None so that later code can check `gpd is not None` before using it.
try:
    import geopandas as gpd
except ImportError:
    gpd = None

# Folder that holds the datasets explored below. The path is relative, so it
# is interpreted against the current working directory.
DATA_DIR = Path('../data/SoV_data')
# Bare expression: evaluates to the absolute form of the path without
# modifying DATA_DIR (presumably shown as the cell's output in a notebook).
DATA_DIR.resolve()

In [None]:
# Recursively gather every `*.geojson` file below DATA_DIR, then order the
# paths so the listing is deterministic.
geojson_files = list(DATA_DIR.rglob('*.geojson'))
geojson_files.sort()

# Cell output: total number of files and the names of the first ten.
(len(geojson_files), [geojson.name for geojson in geojson_files[:10]])

In [None]:
def quick_geojson_summary(path: Path):
    """Profile a single GeoJSON file using only the JSON parser.

    The file is parsed as JSON and the entries of its top-level ``features``
    member are scanned to tally geometry types and to collect the union of
    property names.

    Args:
        path: Location of the GeoJSON file. It must lie under ``DATA_DIR``
            because the reported file name is computed relative to it.

    Returns:
        A dict with these keys:

        * ``file`` -- ``path`` relative to ``DATA_DIR``, as a string.
        * ``feature_count`` -- number of entries in ``features``.
        * ``geometry_types`` -- plain dict mapping geometry type to the
          number of features having it; features whose geometry is missing
          or null are tallied under the string ``'None'``.
        * ``property_count`` -- number of distinct property names seen
          across all features.
        * ``sample_properties`` -- the first 20 property names in sorted
          order.

    Raises:
        ValueError: If ``path`` is not located under ``DATA_DIR``.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with path.open('r', encoding='utf-8') as f:
        doc = json.load(f)

    # Treat `"features": null` like a missing key, in line with the null
    # tolerance applied to `geometry` and `properties` below.
    features = doc.get('features') or []

    geom_types = Counter()
    prop_keys = set()
    for feat in features:
        geometry = feat.get('geometry') or {}
        geom_types[geometry.get('type', 'None')] += 1
        # Updating a set from a dict adds the dict's keys.
        prop_keys.update(feat.get('properties') or {})

    return {
        'file': str(path.relative_to(DATA_DIR)),
        'feature_count': len(features),
        # Hand back a plain dict so the value renders like an ordinary mapping.
        'geometry_types': dict(geom_types),
        'property_count': len(prop_keys),
        'sample_properties': sorted(prop_keys)[:20],
    }

In [None]:
# Profile every discovered file, keeping the full per-file summaries around.
summaries = list(map(quick_geojson_summary, geojson_files))

# Tabulate the summaries, leaving out the long `sample_properties` list so the
# table stays compact.
summary_df = pd.DataFrame(
    [
        {field: value for field, value in summary.items() if field != 'sample_properties'}
        for summary in summaries
    ]
)
summary_df

In [None]:
# Pick a file to inspect in detail
# Fail with an explicit message instead of a bare IndexError when the search
# for GeoJSON files came back empty.
if not geojson_files:
    raise FileNotFoundError(f'No .geojson files found under {DATA_DIR.resolve()}')
target = geojson_files[0]
target

In [None]:
# Load the selected file with geopandas, when it is available, and print a
# quick profile of the resulting frame.
if gpd is None:
    print('geopandas is not installed. Install with: pip install geopandas')
else:
    gdf = gpd.read_file(target)
    display(gdf.head())
    print('Rows:', gdf.shape[0])
    print('Columns:', gdf.columns.tolist())
    # Only report geometry types when the frame exposes a `geometry` attribute.
    if hasattr(gdf, 'geometry'):
        print(
            'Geometry types:',
            gdf.geometry.geom_type.value_counts(dropna=False).to_dict(),
        )

In [None]:
# Optional: inspect missing values for the loaded file
if gpd is not None:
    # Per-column count of missing values, largest first.
    missing = gdf.isna().sum().sort_values(ascending=False)
    # A notebook only auto-renders a cell's final top-level expression; this
    # one is nested inside `if`, so it has to be displayed explicitly.
    display(missing[missing > 0].head(25))