From 0bcbe6def149c0451ab008de04549f21a060baa9 Mon Sep 17 00:00:00 2001
From: sujata-m
Date: Mon, 29 Sep 2025 14:39:48 +0530
Subject: [PATCH] [0.2.14] Fixed Task 2365

## Dev Board Ticket
https://dev.azure.com/TDEI-UW/TDEI/_workitems/edit/2365

## Changes
- Improved GeoJSON parse error reporting with detailed file, line, and column context.
- Added unit tests covering JSON parsing and file read failure scenarios.
- Capped duplicate `_id` validation messages to the first 20 values by default, while still reporting the total duplicate count, to avoid excessively large issue payloads.
- Added a new unit test that verifies duplicate ID logging trims the displayed IDs to 20 while reporting the total number of duplicates.

## Testing
- Added new unit test cases to support the changes (see the usage sketch below).

---
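Reviewer note (not part of the commit, placed in the git notes section so it is
dropped on apply): a minimal sketch of how the capped messages surface to
callers, assuming the import name matches the `src/python_osw_validation`
layout and that `sample.zip` stands in for a real OSW dataset:

    from python_osw_validation import OSWValidation

    validator = OSWValidation(zipfile_path='sample.zip')  # hypothetical dataset path
    result = validator.validate()  # max_errors defaults to 20 displayed IDs per message
    if not result.is_valid:
        for error in result.errors:
            # e.g. "Duplicate _id's found in nodes: showing first 20 of 25 duplicates: ..."
            print(error)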
" - f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} " + f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} " f"unmatched _v_id's: {displayed_unmatched}"), filename='All', feature_index=None @@ -268,11 +275,11 @@ def validate(self, max_errors=20) -> ValidationResult: if unmatched: unmatched_list = list(unmatched) num_unmatched = len(unmatched_list) - limit = min(num_unmatched, 20) + limit = min(num_unmatched, max_errors) displayed_unmatched = ', '.join(map(str, unmatched_list[:limit])) self.log_errors( message=(f"All _w_id's in zones should be part of _id's mentioned in nodes. " - f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} " + f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} " f"unmatched _w_id's: {displayed_unmatched}"), filename='All', feature_index=None @@ -295,10 +302,10 @@ def validate(self, max_errors=20) -> ValidationResult: ids_series = invalid_geojson['_id'] if '_id' in invalid_geojson.columns else invalid_geojson.index invalid_ids = list(set(ids_series)) num_invalid = len(invalid_ids) - limit = min(num_invalid, 20) + limit = min(num_invalid, max_errors) displayed_invalid = ', '.join(map(str, invalid_ids[:limit])) self.log_errors( - message=(f"Showing {'20' if num_invalid > 20 else 'all'} out of {num_invalid} " + message=(f"Showing {max_errors if num_invalid > max_errors else 'all'} out of {num_invalid} " f"invalid {osw_file} geometries, id's of invalid geometries: {displayed_invalid}"), filename='All', feature_index=None @@ -323,11 +330,11 @@ def validate(self, max_errors=20) -> ValidationResult: try: invalid_ids = list(set(invalid_geojson.get('_id', invalid_geojson.index))) num_invalid = len(invalid_ids) - limit = min(num_invalid, 20) + limit = min(num_invalid, max_errors) displayed_invalid = ', '.join(map(str, invalid_ids[:limit])) self.log_errors( message=(f"Invalid geometries found in extension file `{file_name}`. " - f"Showing {limit if num_invalid > 20 else 'all'} of {num_invalid} " + f"Showing {max_errors if num_invalid > max_errors else 'all'} of {num_invalid} " f"invalid geometry IDs: {displayed_invalid}"), filename=file_name, feature_index=None @@ -388,8 +395,28 @@ def validate(self, max_errors=20) -> ValidationResult: gc.collect() def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]: - with open(graph_geojson_path, 'r') as file: - return json.load(file) + try: + with open(graph_geojson_path, 'r') as file: + return json.load(file) + except json.JSONDecodeError as e: + filename = os.path.basename(graph_geojson_path) + self.log_errors( + message=( + f"Failed to parse '{filename}' as valid JSON. " + f"{e.msg} (line {e.lineno}, column {e.colno}, char {e.pos})." + ), + filename=filename, + feature_index=None, + ) + raise + except OSError as e: + filename = os.path.basename(graph_geojson_path) + self.log_errors( + message=f"Unable to read file '{filename}': {e.strerror or e}", + filename=filename, + feature_index=None, + ) + raise def validate_osw_errors(self, file_path: str, max_errors: int) -> bool: """Validate one OSW GeoJSON against the appropriate schema (streaming). @@ -399,7 +426,12 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool: before returning, pushes a single human-friendly message per feature into `self.issues` (like your sample: "must include one of: ..."). 
""" - geojson_data = self.load_osw_file(file_path) + try: + geojson_data = self.load_osw_file(file_path) + except json.JSONDecodeError: + return False + except OSError: + return False schema_path = self.pick_schema_for_file(file_path, geojson_data) schema = self.load_osw_schema(schema_path) validator = jsonschema_rs.Draft7Validator(schema) diff --git a/src/python_osw_validation/version.py b/src/python_osw_validation/version.py index eeb48b4..4a8a4ed 100644 --- a/src/python_osw_validation/version.py +++ b/src/python_osw_validation/version.py @@ -1 +1 @@ -__version__ = '0.2.13' \ No newline at end of file +__version__ = '0.2.14' \ No newline at end of file diff --git a/tests/unit_tests/test_osw_validation_extras.py b/tests/unit_tests/test_osw_validation_extras.py index 863ddd1..6e1e1c1 100644 --- a/tests/unit_tests/test_osw_validation_extras.py +++ b/tests/unit_tests/test_osw_validation_extras.py @@ -1,4 +1,6 @@ +import json import os +import tempfile import unittest from unittest.mock import patch, MagicMock import geopandas as gpd @@ -172,6 +174,83 @@ def _rf(path): shown_ids = [x.strip() for x in displayed.split(",")] self.assertLessEqual(len(shown_ids), 20) + def test_load_osw_file_logs_json_decode_error(self): + """Invalid JSON should surface a detailed message with location context.""" + validator = OSWValidation(zipfile_path="dummy.zip") + with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp: + tmp.write('{"features": [1, }') + bad_path = tmp.name + try: + with self.assertRaises(json.JSONDecodeError): + validator.load_osw_file(bad_path) + finally: + os.unlink(bad_path) + + self.assertTrue(any("Failed to parse" in e for e in (validator.errors or [])), + f"Errors were: {validator.errors}") + message = validator.errors[-1] + basename = os.path.basename(bad_path) + self.assertIn(basename, message) + self.assertIn("line", message) + self.assertIn("column", message) + + issue = validator.issues[-1] + self.assertEqual(issue["filename"], basename) + self.assertIsNone(issue["feature_index"]) + self.assertEqual(issue["error_message"], message) + + def test_load_osw_file_logs_os_error(self): + """Missing files should log a readable OS error message.""" + validator = OSWValidation(zipfile_path="dummy.zip") + missing_path = os.path.join(tempfile.gettempdir(), "nonexistent_osw_file.geojson") + if os.path.exists(missing_path): + os.unlink(missing_path) + + with self.assertRaises(OSError): + validator.load_osw_file(missing_path) + + self.assertTrue(any("Unable to read file" in e for e in (validator.errors or [])), + f"Errors were: {validator.errors}") + message = validator.errors[-1] + basename = os.path.basename(missing_path) + self.assertIn(basename, message) + self.assertIn("Unable to read file", message) + + issue = validator.issues[-1] + self.assertEqual(issue["filename"], basename) + self.assertIsNone(issue["feature_index"]) + self.assertEqual(issue["error_message"], message) + + def test_validate_reports_json_decode_error(self): + """Full validation flow should surface parse errors before schema checks.""" + with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp: + tmp.write('{"type": "FeatureCollection", "features": [1, }') + bad_path = tmp.name + + try: + with patch(_PATCH_ZIP) as PZip, patch(_PATCH_EV) as PVal: + z = MagicMock() + z.extract_zip.return_value = "/tmp/extracted" + z.remove_extracted_files.return_value = None + PZip.return_value = z + + PVal.return_value = self._fake_validator([bad_path]) + + result = 
+                result = OSWValidation(zipfile_path="dummy.zip").validate()
+
+            self.assertFalse(result.is_valid)
+            message = next((e for e in (result.errors or []) if "Failed to parse" in e), None)
+            self.assertIsNotNone(message, f"Expected parse error message. Errors: {result.errors}")
+            basename = os.path.basename(bad_path)
+            self.assertIn(basename, message)
+            self.assertIn("line", message)
+
+            issue = next((i for i in (result.issues or []) if i["filename"] == basename), None)
+            self.assertIsNotNone(issue, f"Issues were: {result.issues}")
+            self.assertEqual(issue["error_message"], message)
+        finally:
+            os.unlink(bad_path)
+
     def test_duplicate_ids_detection(self):
         """Duplicates inside a single file are reported."""
         fake_files = ["/tmp/nodes.geojson"]
@@ -190,7 +269,35 @@ def test_duplicate_ids_detection(self):
             res = OSWValidation(zipfile_path="dummy.zip").validate()
             self.assertFalse(res.is_valid)
-            self.assertTrue(any("Duplicate _id's found in nodes" in e for e in (res.errors or [])))
+            msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
+            self.assertEqual(msg, "Duplicate _id's found in nodes: 2")
+
+    def test_duplicate_ids_detection_is_limited_to_20(self):
+        """Duplicate messages cap the number of displayed IDs."""
+        fake_files = ["/tmp/nodes.geojson"]
+        duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)]  # 25 distinct IDs, each appearing twice
+        nodes = self._gdf_nodes(duplicate_ids)
+
+        with patch(_PATCH_ZIP) as PZip, \
+             patch(_PATCH_EV) as PVal, \
+             patch(_PATCH_VALIDATE, return_value=True), \
+             patch(_PATCH_READ_FILE, return_value=nodes), \
+             patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
+            z = MagicMock()
+            z.extract_zip.return_value = "/tmp/extracted"
+            PZip.return_value = z
+            PVal.return_value = self._fake_validator(fake_files)
+
+            res = OSWValidation(zipfile_path="dummy.zip").validate()
+            self.assertFalse(res.is_valid)
+            msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
+            self.assertIsNotNone(msg, "Expected duplicate-id error not found")
+            self.assertIn("showing first 20 of 25 duplicates", msg)
+            displayed = msg.split("duplicates:")[-1].strip()
+            shown_ids = [part.strip() for part in displayed.split(",") if part.strip()]
+            self.assertLessEqual(len(shown_ids), 20)
+            expected_ids = [f"id{i}" for i in range(20)]
+            self.assertEqual(shown_ids, expected_ids)
 
     def test_pick_schema_by_geometry_and_by_filename(self):
         """Point/LineString/Polygon ⇒ proper schema; filename fallback when features empty."""