6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Change log

### 0.2.14
- Improved GeoJSON parse error reporting with detailed file, line, and column context.
- Added unit tests covering JSON parsing and file read failure scenarios.
- Capped duplicate `_id` validation messages to the first 20 values by default, while still reporting the total duplicate count, to avoid excessively large issue payloads.
- Added a new unit test that verifies duplicate ID logging trims the displayed IDs to 20 while reporting the total number of duplicates.

### 0.2.13
- Updated Schema

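For context, here is a minimal runnable sketch of the capping behavior the 0.2.14 entries describe; the file name `nodes.geojson` and the sample IDs are illustrative, not taken from the PR:

```python
# Sketch of the capped duplicate-_id message (mirrors the diff below).
duplicates = [f"id{i}" for i in range(25)]  # pretend 25 duplicate _id values were found
max_errors = 20
displayed = ', '.join(map(str, duplicates[:max_errors]))
if len(duplicates) > max_errors:
    message = (f"Duplicate _id's found in nodes.geojson: showing first {max_errors} "
               f"of {len(duplicates)} duplicates: {displayed}")
else:
    message = f"Duplicate _id's found in nodes.geojson: {displayed}"
print(message)
```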
60 changes: 46 additions & 14 deletions src/python_osw_validation/__init__.py
@@ -202,8 +202,15 @@ def validate(self, max_errors=20) -> ValidationResult:
continue
is_valid, duplicates = self.are_ids_unique(gdf)
if not is_valid:
total_duplicates = len(duplicates)
displayed = ', '.join(map(str, duplicates[:max_errors]))
if total_duplicates > max_errors:
message = (f"Duplicate _id's found in {osw_file}: showing first {max_errors} "
f"of {total_duplicates} duplicates: {displayed}")
else:
message = f"Duplicate _id's found in {osw_file}: {displayed}"
self.log_errors(
message=f"Duplicate _id's found in {osw_file} : {duplicates}",
message=message,
filename=osw_file,
feature_index=None
)
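`are_ids_unique` is not shown in this diff; a plausible pandas-based sketch of the contract the call site above relies on, returning `(is_valid, duplicates)`, is:

```python
import geopandas as gpd

def are_ids_unique(gdf: gpd.GeoDataFrame):
    """Hypothetical sketch: collect _id values that occur more than once."""
    dup_mask = gdf['_id'].duplicated(keep='first')  # True for 2nd and later occurrences
    duplicates = gdf.loc[dup_mask, '_id'].unique().tolist()
    return len(duplicates) == 0, duplicates
```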
@@ -238,11 +245,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _u_id's in edges should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _u_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
@@ -253,11 +260,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _v_id's in edges should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _v_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
@@ -268,11 +275,11 @@ def validate(self, max_errors=20) -> ValidationResult:
if unmatched:
unmatched_list = list(unmatched)
num_unmatched = len(unmatched_list)
limit = min(num_unmatched, 20)
limit = min(num_unmatched, max_errors)
displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
self.log_errors(
message=(f"All _w_id's in zones should be part of _id's mentioned in nodes. "
f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
f"unmatched _w_id's: {displayed_unmatched}"),
filename='All',
feature_index=None
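The limit-and-display pattern above is repeated for `_u_id`, `_v_id`, `_w_id`, and the invalid-geometry checks that follow; a hedged sketch of a helper (not part of this PR) that would centralize the formatting:

```python
def format_capped(prefix: str, values, max_errors: int) -> str:
    """Hypothetical helper: cap displayed IDs at max_errors, as the messages above do."""
    values = list(values)
    shown = ', '.join(map(str, values[:max_errors]))
    count_word = str(max_errors) if len(values) > max_errors else 'all'
    return f"{prefix} Showing {count_word} out of {len(values)}: {shown}"
```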
@@ -295,10 +302,10 @@ def validate(self, max_errors=20) -> ValidationResult:
ids_series = invalid_geojson['_id'] if '_id' in invalid_geojson.columns else invalid_geojson.index
invalid_ids = list(set(ids_series))
num_invalid = len(invalid_ids)
limit = min(num_invalid, 20)
limit = min(num_invalid, max_errors)
displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
self.log_errors(
message=(f"Showing {'20' if num_invalid > 20 else 'all'} out of {num_invalid} "
message=(f"Showing {max_errors if num_invalid > max_errors else 'all'} out of {num_invalid} "
f"invalid {osw_file} geometries, id's of invalid geometries: {displayed_invalid}"),
filename='All',
feature_index=None
@@ -323,11 +330,11 @@ def validate(self, max_errors=20) -> ValidationResult:
try:
invalid_ids = list(set(invalid_geojson.get('_id', invalid_geojson.index)))
num_invalid = len(invalid_ids)
limit = min(num_invalid, 20)
limit = min(num_invalid, max_errors)
displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
self.log_errors(
message=(f"Invalid geometries found in extension file `{file_name}`. "
f"Showing {limit if num_invalid > 20 else 'all'} of {num_invalid} "
f"Showing {max_errors if num_invalid > max_errors else 'all'} of {num_invalid} "
f"invalid geometry IDs: {displayed_invalid}"),
filename=file_name,
feature_index=None
@@ -388,8 +395,28 @@ def validate(self, max_errors=20) -> ValidationResult:
gc.collect()

def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
with open(graph_geojson_path, 'r') as file:
return json.load(file)
try:
with open(graph_geojson_path, 'r') as file:
return json.load(file)
except json.JSONDecodeError as e:
filename = os.path.basename(graph_geojson_path)
self.log_errors(
message=(
f"Failed to parse '{filename}' as valid JSON. "
f"{e.msg} (line {e.lineno}, column {e.colno}, char {e.pos})."
),
filename=filename,
feature_index=None,
)
raise
except OSError as e:
filename = os.path.basename(graph_geojson_path)
self.log_errors(
message=f"Unable to read file '{filename}': {e.strerror or e}",
filename=filename,
feature_index=None,
)
raise
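As a rough usage sketch, assuming `broken.geojson` exists but contains invalid JSON (the parser message in the comment is illustrative), the new handler both logs and re-raises, so callers can inspect `errors` afterwards:

```python
import json
from python_osw_validation import OSWValidation

validator = OSWValidation(zipfile_path="dummy.zip")
try:
    validator.load_osw_file("broken.geojson")
except json.JSONDecodeError:
    # validator.errors now ends with something like:
    # "Failed to parse 'broken.geojson' as valid JSON. Expecting value (line 1, column 18, char 17)."
    print(validator.errors[-1])
```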

def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
"""Validate one OSW GeoJSON against the appropriate schema (streaming).
@@ -399,7 +426,12 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
before returning, pushes a single human-friendly message per feature
into `self.issues` (like your sample: "must include one of: ...").
"""
geojson_data = self.load_osw_file(file_path)
try:
geojson_data = self.load_osw_file(file_path)
except json.JSONDecodeError:
return False
except OSError:
return False
schema_path = self.pick_schema_for_file(file_path, geojson_data)
schema = self.load_osw_schema(schema_path)
validator = jsonschema_rs.Draft7Validator(schema)
2 changes: 1 addition & 1 deletion src/python_osw_validation/version.py
@@ -1 +1 @@
__version__ = '0.2.13'
__version__ = '0.2.14'
109 changes: 108 additions & 1 deletion tests/unit_tests/test_osw_validation_extras.py
@@ -1,4 +1,6 @@
import json
import os
import tempfile
import unittest
from unittest.mock import patch, MagicMock
import geopandas as gpd
@@ -172,6 +174,83 @@ def _rf(path):
shown_ids = [x.strip() for x in displayed.split(",")]
self.assertLessEqual(len(shown_ids), 20)

def test_load_osw_file_logs_json_decode_error(self):
"""Invalid JSON should surface a detailed message with location context."""
validator = OSWValidation(zipfile_path="dummy.zip")
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
tmp.write('{"features": [1, }')
bad_path = tmp.name
try:
with self.assertRaises(json.JSONDecodeError):
validator.load_osw_file(bad_path)
finally:
os.unlink(bad_path)

self.assertTrue(any("Failed to parse" in e for e in (validator.errors or [])),
f"Errors were: {validator.errors}")
message = validator.errors[-1]
basename = os.path.basename(bad_path)
self.assertIn(basename, message)
self.assertIn("line", message)
self.assertIn("column", message)

issue = validator.issues[-1]
self.assertEqual(issue["filename"], basename)
self.assertIsNone(issue["feature_index"])
self.assertEqual(issue["error_message"], message)

def test_load_osw_file_logs_os_error(self):
"""Missing files should log a readable OS error message."""
validator = OSWValidation(zipfile_path="dummy.zip")
missing_path = os.path.join(tempfile.gettempdir(), "nonexistent_osw_file.geojson")
if os.path.exists(missing_path):
os.unlink(missing_path)

with self.assertRaises(OSError):
validator.load_osw_file(missing_path)

self.assertTrue(any("Unable to read file" in e for e in (validator.errors or [])),
f"Errors were: {validator.errors}")
message = validator.errors[-1]
basename = os.path.basename(missing_path)
self.assertIn(basename, message)
self.assertIn("Unable to read file", message)

issue = validator.issues[-1]
self.assertEqual(issue["filename"], basename)
self.assertIsNone(issue["feature_index"])
self.assertEqual(issue["error_message"], message)

def test_validate_reports_json_decode_error(self):
"""Full validation flow should surface parse errors before schema checks."""
with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
tmp.write('{"type": "FeatureCollection", "features": [1, }')
bad_path = tmp.name

try:
with patch(_PATCH_ZIP) as PZip, patch(_PATCH_EV) as PVal:
z = MagicMock()
z.extract_zip.return_value = "/tmp/extracted"
z.remove_extracted_files.return_value = None
PZip.return_value = z

PVal.return_value = self._fake_validator([bad_path])

result = OSWValidation(zipfile_path="dummy.zip").validate()

self.assertFalse(result.is_valid)
message = next((e for e in (result.errors or []) if "Failed to parse" in e), None)
self.assertIsNotNone(message, f"Expected parse error message. Errors: {result.errors}")
basename = os.path.basename(bad_path)
self.assertIn(basename, message)
self.assertIn("line", message)

issue = next((i for i in (result.issues or []) if i["filename"] == basename), None)
self.assertIsNotNone(issue, f"Issues were: {result.issues}")
self.assertEqual(issue["error_message"], message)
finally:
os.unlink(bad_path)

def test_duplicate_ids_detection(self):
"""Duplicates inside a single file are reported."""
fake_files = ["/tmp/nodes.geojson"]
@@ -190,7 +269,35 @@ def test_duplicate_ids_detection(self):

res = OSWValidation(zipfile_path="dummy.zip").validate()
self.assertFalse(res.is_valid)
self.assertTrue(any("Duplicate _id's found in nodes" in e for e in (res.errors or [])))
msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
self.assertEqual(msg, "Duplicate _id's found in nodes: 2")

def test_duplicate_ids_detection_is_limited_to_20(self):
"""Duplicate messages cap the number of displayed IDs."""
fake_files = ["/tmp/nodes.geojson"]
duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)] # 25 unique duplicates
nodes = self._gdf_nodes(duplicate_ids)

with patch(_PATCH_ZIP) as PZip, \
patch(_PATCH_EV) as PVal, \
patch(_PATCH_VALIDATE, return_value=True), \
patch(_PATCH_READ_FILE, return_value=nodes), \
patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
z = MagicMock()
z.extract_zip.return_value = "/tmp/extracted"
PZip.return_value = z
PVal.return_value = self._fake_validator(fake_files)

res = OSWValidation(zipfile_path="dummy.zip").validate()
self.assertFalse(res.is_valid)
msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
self.assertIsNotNone(msg, "Expected duplicate-id error not found")
self.assertIn("showing first 20 of 25 duplicates", msg)
displayed = msg.split("duplicates:")[-1].strip()
shown_ids = [part.strip() for part in displayed.split(",") if part.strip()]
self.assertLessEqual(len(shown_ids), 20)
expected_ids = [f"id{i}" for i in range(20)]
self.assertEqual(shown_ids, expected_ids)
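The fixture comprehension in this test interleaves each value twice; a quick illustration of what it produces:

```python
duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)]
assert len(duplicate_ids) == 50
assert duplicate_ids[:4] == ["id0", "id0", "id1", "id1"]  # 25 unique values, each twice
```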

def test_pick_schema_by_geometry_and_by_filename(self):
"""Point/LineString/Polygon ⇒ proper schema; filename fallback when features empty."""