6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Change log

### 0.2.14
- Improved GeoJSON parse error reporting with detailed file, line, and column context.
- Added unit tests covering JSON parsing and file read failure scenarios.
- Capped duplicate `_id` validation messages to the first 20 values by default, while still reporting the total duplicate count, to avoid excessively large issue payloads.
- Added a new unit test that verifies duplicate ID logging trims the displayed IDs to 20 while reporting the total number of duplicates.

### 0.2.13
- Updated Schema

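For context, here is a minimal runnable sketch of the capping behavior the 0.2.14 entries describe; the file name `nodes.geojson` and the sample IDs are illustrative, not taken from the PR:

```python
# Sketch of the capped duplicate-_id message (mirrors the diff below).
duplicates = [f"id{i}" for i in range(25)]  # pretend 25 duplicate _id values were found
max_errors = 20
displayed = ', '.join(map(str, duplicates[:max_errors]))
if len(duplicates) > max_errors:
    message = (f"Duplicate _id's found in nodes.geojson: showing first {max_errors} "
               f"of {len(duplicates)} duplicates: {displayed}")
else:
    message = f"Duplicate _id's found in nodes.geojson: {displayed}"
print(message)
```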
60 changes: 46 additions & 14 deletions src/python_osw_validation/__init__.py
@@ -202,8 +202,15 @@ def validate(self, max_errors=20) -> ValidationResult:
continue
is_valid, duplicates = self.are_ids_unique(gdf)
if not is_valid:
total_duplicates = len(duplicates)
displayed = ', '.join(map(str, duplicates[:max_errors]))
if total_duplicates > max_errors:
message = (f"Duplicate _id's found in {osw_file}: showing first {max_errors} "
f"of {total_duplicates} duplicates: {displayed}")
else:
message = f"Duplicate _id's found in {osw_file}: {displayed}"
self.log_errors(
message=f"Duplicate _id's found in {osw_file} : {duplicates}",
message=message,
filename=osw_file,
feature_index=None
)
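`are_ids_unique` is not shown in this diff; a plausible pandas-based sketch of the contract the call site above relies on, returning `(is_valid, duplicates)`, is:

```python
import geopandas as gpd

def are_ids_unique(gdf: gpd.GeoDataFrame):
    """Hypothetical sketch: collect _id values that occur more than once."""
    dup_mask = gdf['_id'].duplicated(keep='first')  # True for 2nd and later occurrences
    duplicates = gdf.loc[dup_mask, '_id'].unique().tolist()
    return len(duplicates) == 0, duplicates
```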
@@ -238,11 +245,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _u_id's in edges should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _u_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
@@ -253,11 +260,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _v_id's in edges should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _v_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
@@ -268,11 +275,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _w_id's in zones should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _w_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
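The limit-and-display pattern above is repeated for `_u_id`, `_v_id`, `_w_id`, and the invalid-geometry checks that follow; a hedged sketch of a helper (not part of this PR) that would centralize the formatting:

```python
def format_capped(prefix: str, values, max_errors: int) -> str:
    """Hypothetical helper: cap displayed IDs at max_errors, as the messages above do."""
    values = list(values)
    shown = ', '.join(map(str, values[:max_errors]))
    count_word = str(max_errors) if len(values) > max_errors else 'all'
    return f"{prefix} Showing {count_word} out of {len(values)}: {shown}"
```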
@@ -295,10 +302,10 @@ def validate(self, max_errors=20) -> ValidationResult:
ids_series = invalid_geojson['_id'] if '_id' in invalid_geojson.columns else invalid_geojson.index
invalid_ids = list(set(ids_series))
num_invalid = len(invalid_ids)
limit = min(num_invalid, 20)
limit = min(num_invalid, max_errors)
displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
self.log_errors(
message=(f"Showing {'20' if num_invalid > 20 else 'all'} out of {num_invalid} "
message=(f"Showing {max_errors if num_invalid > max_errors else 'all'} out of {num_invalid} "
f"invalid {osw_file} geometries, id's of invalid geometries: {displayed_invalid}"),
filename='All',
feature_index=None
@@ -323,11 +330,11 @@ def validate(self, max_errors=20) -> ValidationResult:
try:
invalid_ids = list(set(invalid_geojson.get('_id', invalid_geojson.index)))
num_invalid = len(invalid_ids)
limit = min(num_invalid, 20)
limit = min(num_invalid, max_errors)
displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
self.log_errors(
message=(f"Invalid geometries found in extension file `{file_name}`. "
f"Showing {limit if num_invalid > 20 else 'all'} of {num_invalid} "
f"Showing {max_errors if num_invalid > max_errors else 'all'} of {num_invalid} "
f"invalid geometry IDs: {displayed_invalid}"),
filename=file_name,
feature_index=None
@@ -388,8 +395,28 @@ def validate(self, max_errors=20) -> ValidationResult:
gc.collect()

def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
with open(graph_geojson_path, 'r') as file:
return json.load(file)
try:
with open(graph_geojson_path, 'r') as file:
return json.load(file)
except json.JSONDecodeError as e:
filename = os.path.basename(graph_geojson_path)
self.log_errors(
message=(
f"Failed to parse '{filename}' as valid JSON. "
f"{e.msg} (line {e.lineno}, column {e.colno}, char {e.pos})."
),
filename=filename,
feature_index=None,
)
raise
except OSError as e:
filename = os.path.basename(graph_geojson_path)
self.log_errors(
message=f"Unable to read file '{filename}': {e.strerror or e}",
filename=filename,
feature_index=None,
)
raise
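As a rough usage sketch, assuming `broken.geojson` exists but contains invalid JSON (the parser message in the comment is illustrative), the new handler both logs and re-raises, so callers can inspect `errors` afterwards:

```python
import json
from python_osw_validation import OSWValidation

validator = OSWValidation(zipfile_path="dummy.zip")
try:
    validator.load_osw_file("broken.geojson")
except json.JSONDecodeError:
    # validator.errors now ends with something like:
    # "Failed to parse 'broken.geojson' as valid JSON. Expecting value (line 1, column 18, char 17)."
    print(validator.errors[-1])
```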

def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
"""Validate one OSW GeoJSON against the appropriate schema (streaming).
@@ -399,7 +426,12 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
before returning, pushes a single human-friendly message per feature
into `self.issues` (like your sample: "must include one of: ...").
"""
geojson_data = self.load_osw_file(file_path)
try:
geojson_data = self.load_osw_file(file_path)
except json.JSONDecodeError:
return False
except OSError:
return False
schema_path = self.pick_schema_for_file(file_path, geojson_data)
schema = self.load_osw_schema(schema_path)
validator = jsonschema_rs.Draft7Validator(schema)
2 changes: 1 addition & 1 deletion src/python_osw_validation/version.py
@@ -1 +1 @@
__version__ = '0.2.13'
__version__ = '0.2.14'
109 changes: 108 additions & 1 deletion tests/unit_tests/test_osw_validation_extras.py
@@ -1,4 +1,6 @@
import json
import os
import tempfile
import unittest
from unittest.mock import patch, MagicMock
import geopandas as gpd
@@ -172,6 +174,83 @@ def _rf(path):
shown_ids = [x.strip() for x in displayed.split(",")]
self.assertLessEqual(len(shown_ids), 20)

def test_load_osw_file_logs_json_decode_error(self):
"""Invalid JSON should surface a detailed message with location context."""
validator = OSWValidation(zipfile_path="dummy.zip")
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
tmp.write('{"features": [1, }')
bad_path = tmp.name
try:
with self.assertRaises(json.JSONDecodeError):
validator.load_osw_file(bad_path)
finally:
os.unlink(bad_path)

self.assertTrue(any("Failed to parse" in e for e in (validator.errors or [])),
f"Errors were: {validator.errors}")
message = validator.errors[-1]
basename = os.path.basename(bad_path)
self.assertIn(basename, message)
self.assertIn("line", message)
self.assertIn("column", message)

issue = validator.issues[-1]
self.assertEqual(issue["filename"], basename)
self.assertIsNone(issue["feature_index"])
self.assertEqual(issue["error_message"], message)

def test_load_osw_file_logs_os_error(self):
"""Missing files should log a readable OS error message."""
validator = OSWValidation(zipfile_path="dummy.zip")
missing_path = os.path.join(tempfile.gettempdir(), "nonexistent_osw_file.geojson")
if os.path.exists(missing_path):
os.unlink(missing_path)

with self.assertRaises(OSError):
validator.load_osw_file(missing_path)

self.assertTrue(any("Unable to read file" in e for e in (validator.errors or [])),
f"Errors were: {validator.errors}")
message = validator.errors[-1]
basename = os.path.basename(missing_path)
self.assertIn(basename, message)
self.assertIn("Unable to read file", message)

issue = validator.issues[-1]
self.assertEqual(issue["filename"], basename)
self.assertIsNone(issue["feature_index"])
self.assertEqual(issue["error_message"], message)

def test_validate_reports_json_decode_error(self):
"""Full validation flow should surface parse errors before schema checks."""
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
tmp.write('{"type": "FeatureCollection", "features": [1, }')
bad_path = tmp.name

try:
with patch(_PATCH_ZIP) as PZip, patch(_PATCH_EV) as PVal:
z = MagicMock()
z.extract_zip.return_value = "/tmp/extracted"
z.remove_extracted_files.return_value = None
PZip.return_value = z

PVal.return_value = self._fake_validator([bad_path])

result = OSWValidation(zipfile_path="dummy.zip").validate()

self.assertFalse(result.is_valid)
message = next((e for e in (result.errors or []) if "Failed to parse" in e), None)
self.assertIsNotNone(message, f"Expected parse error message. Errors: {result.errors}")
basename = os.path.basename(bad_path)
self.assertIn(basename, message)
self.assertIn("line", message)

issue = next((i for i in (result.issues or []) if i["filename"] == basename), None)
self.assertIsNotNone(issue, f"Issues were: {result.issues}")
self.assertEqual(issue["error_message"], message)
finally:
os.unlink(bad_path)

def test_duplicate_ids_detection(self):
"""Duplicates inside a single file are reported."""
fake_files = ["/tmp/nodes.geojson"]
@@ -190,7 +269,35 @@ def test_duplicate_ids_detection(self):

res = OSWValidation(zipfile_path="dummy.zip").validate()
self.assertFalse(res.is_valid)
self.assertTrue(any("Duplicate _id's found in nodes" in e for e in (res.errors or [])))
msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
self.assertEqual(msg, "Duplicate _id's found in nodes: 2")

def test_duplicate_ids_detection_is_limited_to_20(self):
"""Duplicate messages cap the number of displayed IDs."""
fake_files = ["/tmp/nodes.geojson"]
duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)] # 25 unique duplicates
nodes = self._gdf_nodes(duplicate_ids)

with patch(_PATCH_ZIP) as PZip, \
patch(_PATCH_EV) as PVal, \
patch(_PATCH_VALIDATE, return_value=True), \
patch(_PATCH_READ_FILE, return_value=nodes), \
patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
z = MagicMock()
z.extract_zip.return_value = "/tmp/extracted"
PZip.return_value = z
PVal.return_value = self._fake_validator(fake_files)

res = OSWValidation(zipfile_path="dummy.zip").validate()
self.assertFalse(res.is_valid)
msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
self.assertIsNotNone(msg, "Expected duplicate-id error not found")
self.assertIn("showing first 20 of 25 duplicates", msg)
displayed = msg.split("duplicates:")[-1].strip()
shown_ids = [part.strip() for part in displayed.split(",") if part.strip()]
self.assertLessEqual(len(shown_ids), 20)
expected_ids = [f"id{i}" for i in range(20)]
self.assertEqual(shown_ids, expected_ids)
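The fixture comprehension in this test interleaves each value twice; a quick illustration of what it produces:

```python
duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)]
assert len(duplicate_ids) == 50
assert duplicate_ids[:4] == ["id0", "id0", "id1", "id1"]  # 25 unique values, each twice
```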

def test_pick_schema_by_geometry_and_by_filename(self):
"""Point/LineString/Polygon ⇒ proper schema; filename fallback when features empty."""