6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Change log

### 0.3.0
- Default to OSW 0.3 dataset-specific schemas (edges, lines, nodes, points, polygons, zones) with filename-driven selection (see the sketch after this list); removed legacy monolithic/geometry schema files.
- Enforce the six canonical OSW 0.3 filenames inside datasets; reject non-standard names and detect duplicates/missing required files (with new unit tests).
- Validation now ignores `$schema` hints and does not fall back to geometry typing; line schema is the final fallback when filenames give no hint.
- Expanded test coverage for extension read failures, invalid extension ID extraction, `_w_id` missing in zones, cleanup edge cases, and required-file detection.
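A minimal sketch of the filename-driven selection described above. Illustrative only: `dataset.zip` is a placeholder, the remaining constructor parameters are assumed to keep their defaults, and the constructor is assumed not to read the archive before validation runs.

```python
from python_osw_validation import OSWValidation

v = OSWValidation(zipfile_path="dataset.zip")  # placeholder path; not opened here

# Canonical OSW 0.3 filenames resolve to their dataset-specific schemas
print(v.pick_schema_for_file("opensidewalks.nodes.geojson", {}))
# -> .../schema/opensidewalks.nodes.schema-0.3.json

# Names with no recognizable hint fall back to the line schema path
# (which this release defaults to the edges dataset schema)
print(v.pick_schema_for_file("unrecognized.geojson", {}))
```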

### 0.2.15
- Update the base schema to make the `$schema` key required
- Added unit test cases for that change
4 changes: 2 additions & 2 deletions README.md
@@ -12,7 +12,7 @@ This package validates the OSW geojson file. Package requires a OSW zip file pat

- It unzips the provided zip files
- Check for the required nodes and edges geojson files inside the unzipped folder
- Validate each file (nodes, edges and points) against schema, schema can be found here
- Validate each file (edges, lines, nodes, points, polygons and zones) against the matching schema (0.3 defaults live in `src/python_osw_validation/schema`)
- Return true or false according to the validation result
- You can check the errors if it returns false (see the usage sketch below).
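A minimal usage sketch, assuming the package exposes a `validate()` entry point returning the `ValidationResult` shown in this PR (adjust if the actual API differs):

```python
from python_osw_validation import OSWValidation

validation = OSWValidation(zipfile_path='path/to/osw-dataset.zip')  # placeholder path
result = validation.validate()   # assumed entry point name
if not result.is_valid:
    print(result.errors)         # human-readable validation errors
```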

@@ -127,4 +127,4 @@ To use the library locally, use the [example.py](./src/example.py) code
- Choose `main` branch for release
- Publish the release.
- This release triggers a workflow to generate the new version of the package.
- The new package will be available at https://pypi.org/project/python-osw-validation/
2 changes: 1 addition & 1 deletion src/example.py
@@ -7,7 +7,7 @@
INVALID_ZIP_FILE = os.path.join(ASSETS_DIR, '4151.zip')
INVALID_VANCOUVER_ZIP_FILE = os.path.join(ASSETS_DIR, 'vancouver-dataset.zip')
SCHEMA_DIR = os.path.join(PARENT_DIR, 'src/python_osw_validation/schema')
SCHEMA_FILE_PATH = os.path.join(SCHEMA_DIR, 'opensidewalks.schema.json')
SCHEMA_FILE_PATH = os.path.join(SCHEMA_DIR, 'opensidewalks.schema-0.3.json')


def valid_test_with_provided_schema():
103 changes: 77 additions & 26 deletions src/python_osw_validation/__init__.py
@@ -12,6 +12,14 @@
from .helpers import _feature_index_from_error, _pretty_message, _rank_for

SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
DEFAULT_DATASET_SCHEMAS = {
"edges": os.path.join(SCHEMA_PATH, 'opensidewalks.edges.schema-0.3.json'),
"lines": os.path.join(SCHEMA_PATH, 'opensidewalks.lines.schema-0.3.json'),
"nodes": os.path.join(SCHEMA_PATH, 'opensidewalks.nodes.schema-0.3.json'),
"points": os.path.join(SCHEMA_PATH, 'opensidewalks.points.schema-0.3.json'),
"polygons": os.path.join(SCHEMA_PATH, 'opensidewalks.polygons.schema-0.3.json'),
"zones": os.path.join(SCHEMA_PATH, 'opensidewalks.zones.schema-0.3.json'),
}


class ValidationResult:
@@ -33,17 +41,18 @@ def __init__(self, is_valid: bool, errors: Optional[List[str]] = None,


class OSWValidation:
default_schema_file_path = os.path.join(SCHEMA_PATH, 'opensidewalks.schema.json')
default_schema_file_path_03 = os.path.join(SCHEMA_PATH, 'opensidewalks.schema-0.3.json')

# per-geometry defaults
default_point_schema = os.path.join(SCHEMA_PATH, 'Point_schema.json')
default_line_schema = os.path.join(SCHEMA_PATH, 'Linestring_schema.json')
default_polygon_schema = os.path.join(SCHEMA_PATH, 'Polygon_schema.json')
default_point_schema = DEFAULT_DATASET_SCHEMAS['points']
default_line_schema = DEFAULT_DATASET_SCHEMAS['edges']
default_polygon_schema = DEFAULT_DATASET_SCHEMAS['zones']

def __init__(
self,
zipfile_path: str,
schema_file_path=None,
schema_paths: Optional[Dict[str, str]] = None,
point_schema_path: Optional[str] = None,
line_schema_path: Optional[str] = None,
polygon_schema_path: Optional[str] = None,
@@ -57,10 +66,15 @@ def __init__(
# Legacy single schema (if set, used for all)
self.schema_file_path = schema_file_path # may be None

# Dataset-specific schemas (override via schema_paths)
self.dataset_schema_paths = {**DEFAULT_DATASET_SCHEMAS}
if schema_paths:
self.dataset_schema_paths.update({k: v for k, v in schema_paths.items() if v})

# Per-geometry schemas (with defaults)
self.point_schema_path = point_schema_path or self.default_point_schema
self.line_schema_path = line_schema_path or self.default_line_schema
self.polygon_schema_path = polygon_schema_path or self.default_polygon_schema
self.point_schema_path = point_schema_path or self.dataset_schema_paths['points']
self.line_schema_path = line_schema_path or self.dataset_schema_paths['edges']
self.polygon_schema_path = polygon_schema_path or self.dataset_schema_paths['zones']

# ----------------------------
# Utilities & helpers
@@ -92,6 +106,45 @@ def _get_colset(self, gdf: Optional[gpd.GeoDataFrame], col: str, filekey: str) -
self.log_errors(f"Could not create set for column '{col}' in {filekey}.", filekey, None)
return set()

def _schema_key_from_text(self, text: Optional[str]) -> Optional[str]:
"""Return dataset key (edges/nodes/points/lines/polygons/zones) if mentioned in text."""
if not text:
return None
lower = text.lower()
aliases = {
"edges": ("edge", "edges"),
"lines": ("line", "lines", "linestring"),
"nodes": ("node", "nodes"),
"points": ("point", "points"),
"polygons": ("polygon", "polygons", "area"),
"zones": ("zone", "zones"),
}
for key, variants in aliases.items():
if any(alias in lower for alias in variants):
return key
return None

def _contains_disallowed_features_for_02(self, geojson_data: Dict[str, Any]) -> bool:
"""Detect Tree coverage or Custom Point/Line/Polygon in legacy 0.2 datasets."""
for feat in geojson_data.get("features", []):
props = feat.get("properties") or {}
val = props.get("natural")
if isinstance(val, str) and val.strip().lower() in {"tree", "wood"}:
return True
if any(k in props for k in ("leaf_cycle", "leaf_type")):
return True
for k, v in props.items():
target = ""
if isinstance(v, str):
target = v.lower()
elif isinstance(k, str):
target = k.lower()
if any(tok in target for tok in ["custom point", "custom_point", "custompoint",
"custom line", "custom_line", "customline",
"custom polygon", "custom_polygon", "custompolygon"]):
return True
return False

# ----------------------------
# Schema selection
# ----------------------------
@@ -118,25 +171,12 @@ def are_ids_unique(self, gdf):
def pick_schema_for_file(self, file_path: str, geojson_data: Dict[str, Any]) -> str:
if self.schema_file_path:
return self.schema_file_path
try:
features = geojson_data.get('features', [])
if features:
gtype = (features[0].get('geometry') or {}).get('type')
if gtype == 'Point':
return self.point_schema_path
if gtype == 'LineString':
return self.line_schema_path
if gtype == 'Polygon':
return self.polygon_schema_path
except Exception:
pass
lower = os.path.basename(file_path).lower()
if 'node' in lower or 'point' in lower:
return self.point_schema_path
if 'edge' in lower or 'line' in lower:
return self.line_schema_path
if 'zone' in lower or 'polygon' in lower or 'area' in lower:
return self.polygon_schema_path

basename = os.path.basename(file_path)
schema_key = self._schema_key_from_text(basename)
if schema_key and schema_key in self.dataset_schema_paths:
return self.dataset_schema_paths[schema_key]

return self.line_schema_path

# ----------------------------
@@ -432,6 +472,17 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
return False
except OSError:
return False

schema_url = geojson_data.get('$schema')
if isinstance(schema_url, str) and '0.2/schema.json' in schema_url:
if self._contains_disallowed_features_for_02(geojson_data):
self.log_errors(
message="0.2 schema does not support Tree coverage, Custom Point, Custom Line, and Custom Polygon",
filename=os.path.basename(file_path),
feature_index=None,
)
return False

schema_path = self.pick_schema_for_file(file_path, geojson_data)
schema = self.load_osw_schema(schema_path)
validator = jsonschema_rs.Draft7Validator(schema)
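For reference, a hypothetical feature collection that the new 0.2 guard would reject. The `$schema` URL and zip path are placeholders; the guard only requires the URL to contain `0.2/schema.json` plus a disallowed property such as `natural: tree`, and the private helper is called directly here purely for illustration:

```python
from python_osw_validation import OSWValidation

legacy_like = {
    "$schema": "https://example.com/opensidewalks/0.2/schema.json",  # placeholder URL
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [-122.33, 47.61]},
            "properties": {"natural": "tree"},  # not allowed under the 0.2 path
        }
    ],
}

v = OSWValidation(zipfile_path="dataset.zip")  # placeholder; the zip is not read here
print(v._contains_disallowed_features_for_02(legacy_like))  # True -> validation fails with the 0.2 message
```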
54 changes: 49 additions & 5 deletions src/python_osw_validation/extracted_data_validator.py
@@ -29,6 +29,24 @@
}
}

ALLOWED_OSW_03_FILENAMES = (
"opensidewalks.edges.geojson",
"opensidewalks.lines.geojson",
"opensidewalks.nodes.geojson",
"opensidewalks.points.geojson",
"opensidewalks.polygons.geojson",
"opensidewalks.zones.geojson",
)

_FILENAME_TO_KEY = {
"opensidewalks.edges.geojson": "edges",
"opensidewalks.lines.geojson": "lines",
"opensidewalks.nodes.geojson": "nodes",
"opensidewalks.points.geojson": "points",
"opensidewalks.polygons.geojson": "polygons",
"opensidewalks.zones.geojson": "zones",
}


class ExtractedDataValidator:
def __init__(self, extracted_dir: str):
@@ -45,15 +63,41 @@ def is_valid(self) -> bool:

# Look for required files at the root level
geojson_files = glob.glob(os.path.join(self.extracted_dir, '*.geojson'))

# If not found at the root, check inside folders
if not geojson_files:
geojson_files = glob.glob(os.path.join(self.extracted_dir, '*', '*.geojson'))
nested_files = glob.glob(os.path.join(self.extracted_dir, '*', '*.geojson'))
for f in nested_files:
if f not in geojson_files:
geojson_files.append(f)

if not geojson_files:
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
return False

basenames = [os.path.basename(f) for f in geojson_files]
is_osw_03 = any(name.startswith("opensidewalks.") for name in basenames)

if is_osw_03:
invalid_basenames = [bn for bn in basenames if bn not in ALLOWED_OSW_03_FILENAMES]
if invalid_basenames:
allowed_fmt = ", ".join(ALLOWED_OSW_03_FILENAMES)
self.error = f'Dataset contains non-standard file names. The only allowed file names are {{{allowed_fmt}}}'
return False

duplicate_keys = []
for filename in ALLOWED_OSW_03_FILENAMES:
occurrences = [f for f in geojson_files if os.path.basename(f) == filename]
if len(occurrences) > 1:
duplicate_keys.append(_FILENAME_TO_KEY.get(filename, filename))
elif len(occurrences) == 1:
self.files.append(occurrences[0])

if duplicate_keys:
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_keys)}.'
return False

self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
gc.collect()
return True

required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
missing_files = []
@@ -106,7 +150,7 @@ def is_valid(self) -> bool:

finally:
# Cleanup large lists and call garbage collector
del geojson_files, required_files, optional_files, missing_files, duplicate_files
del geojson_files, basenames, required_files, optional_files, missing_files, duplicate_files
gc.collect()

return True
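Finally, a short sketch of driving the extracted-data check directly on an already-unzipped dataset (the directory path is a placeholder):

```python
from python_osw_validation.extracted_data_validator import ExtractedDataValidator

validator = ExtractedDataValidator('/tmp/osw_extracted')  # placeholder directory
if validator.is_valid():
    print(validator.files)               # canonical opensidewalks.*.geojson files found
    print(validator.externalExtensions)  # any other .geojson files kept as extensions
else:
    print(validator.error)               # non-standard names, duplicates, or missing files
```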