Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Change log

### 0.2.6
- Add garbage collection to free up memory after validation


### 0.2.5
- Updated geopandas package
- Updated geopandas package


### 0.2.3
- Performance improvement if there are any errors
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
jsonschema~=4.19.1
zipfile36~=0.1.3
coverage~=7.5.1
jsonschema
zipfile36
coverage
geopandas
18 changes: 7 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@
import os
from setuptools import setup, find_packages, Extension
from version import version

project_path = os.path.dirname(os.path.realpath(__file__))
requirements_file = '{}/requirements.txt'.format(project_path)

with open(requirements_file) as f:
content = f.readlines()
install_requires = [x.strip() for x in content]
from setuptools import setup, find_packages

with open('README.md', 'r') as fh:
long_description = fh.read()
Expand All @@ -26,7 +18,11 @@
},
long_description_content_type='text/markdown',
url='https://github.com/TaskarCenterAtUW/TDEI-python-lib-osw-validation',
install_requires=install_requires,
install_requires=[
'jsonschema',
'zipfile36',
'geopandas'
],
packages=find_packages(where='src'),
classifiers=[
'Programming Language :: Python :: 3',
Expand All @@ -38,4 +34,4 @@
package_data={
'python_osw_validation': ['schema/*'],
},
)
)
59 changes: 30 additions & 29 deletions src/python_osw_validation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import gc
import json
import jsonschema
from typing import Dict, Any, Optional, List
import geopandas as gpd
from .zipfile_handler import ZipFileHandler
from .extracted_data_validator import ExtractedDataValidator, OSW_dataset_files
from .extracted_data_validator import ExtractedDataValidator, OSW_DATASET_FILES
from .version import __version__

SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
Expand All @@ -29,7 +30,7 @@ def __init__(self, zipfile_path: str, schema_file_path=None):
self.schema_file_path = schema_file_path

def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
'''Load OSW Schema'''
"""Load OSW Schema"""
try:
with open(schema_path, 'r') as file:
return json.load(file)
Expand All @@ -40,12 +41,12 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
def are_ids_unique(self, gdf):
    """Check the '_id' column of *gdf* for duplicates.

    Returns a tuple ``(is_unique, duplicate_ids)`` where ``is_unique`` is
    True when every ``_id`` occurs exactly once, and ``duplicate_ids`` is
    the list of values that appear more than once (empty when unique).
    """
    # keep=False marks every row that shares its _id with another row
    duplicate_mask = gdf.duplicated('_id', keep=False)
    duplicate_ids = gdf.loc[duplicate_mask, '_id'].unique()
    return len(duplicate_ids) == 0, list(duplicate_ids)

def validate(self, max_errors=20) -> ValidationResult:
zip_handler = None
OSW_DATASET = {}
try:
# Extract the zipfile
zip_handler = ZipFileHandler(self.zipfile_path)
Expand All @@ -60,43 +61,42 @@ def validate(self, max_errors=20) -> ValidationResult:
if not validator.is_valid():
self.errors.append(validator.error)
return ValidationResult(False, self.errors)

for file in validator.files:
file_path = os.path.join(file)
if not self.validate_osw_errors(file_path, max_errors):
if not self.validate_osw_errors(file_path=str(file_path), max_errors=max_errors):
break

if self.errors:
zip_handler.remove_extracted_files()
return ValidationResult(False, self.errors)

# Validate data integrity
OSW_dataset = {}
for file in validator.files:
file_path = os.path.join(file)
osw_file = next((osw_file_any for osw_file_any in OSW_dataset_files.keys() if osw_file_any in file_path), '')
OSW_dataset[osw_file] = gpd.read_file(file_path)
osw_file = next((osw_file_any for osw_file_any in OSW_DATASET_FILES.keys() if osw_file_any in file_path), '')
OSW_DATASET[osw_file] = gpd.read_file(file_path)

# Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
for osw_file in OSW_dataset:
is_valid, duplicates = self.are_ids_unique(OSW_dataset[osw_file])
for osw_file in OSW_DATASET:
is_valid, duplicates = self.are_ids_unique(OSW_DATASET[osw_file])
if not is_valid:
self.errors.append(f"Duplicate _id's found in {osw_file} : {duplicates}")

# Create sets of node id's and foreign keys to be used in validation
if "nodes" in OSW_dataset:
node_ids = set(OSW_dataset['nodes']['_id'])
if 'nodes' in OSW_DATASET:
node_ids = set(OSW_DATASET['nodes']['_id'])
else:
node_ids = set()

if "edges" in OSW_dataset:
node_ids_edges_u = set(OSW_dataset['edges']['_u_id'])
node_ids_edges_v = set(OSW_dataset['edges']['_v_id'])
if 'edges' in OSW_DATASET:
node_ids_edges_u = set(OSW_DATASET['edges']['_u_id'])
node_ids_edges_v = set(OSW_DATASET['edges']['_v_id'])
else:
node_ids_edges_u = set()
node_ids_edges_v = set()

if "zones" in OSW_dataset:
node_ids_zones_w = set([item for sublist in OSW_dataset['zones']['_w_id'] for item in sublist])
if 'zones' in OSW_DATASET:
node_ids_zones_w = set([item for sublist in OSW_DATASET['zones']['_w_id'] for item in sublist])
else:
node_ids_zones_w = set()

Expand All @@ -119,8 +119,8 @@ def validate(self, max_errors=20) -> ValidationResult:
self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")

# Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
for osw_file in OSW_dataset:
invalid_geojson = OSW_dataset[osw_file][(OSW_dataset[osw_file].geometry.type != OSW_dataset_files[osw_file]['geometry']) | (OSW_dataset[osw_file].is_valid == False)]
for osw_file in OSW_DATASET:
invalid_geojson = OSW_DATASET[osw_file][(OSW_DATASET[osw_file].geometry.type != OSW_DATASET_FILES[osw_file]['geometry']) | (OSW_DATASET[osw_file].is_valid == False)]
is_valid = len(invalid_geojson) == 0
if not is_valid:
self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
Expand All @@ -135,30 +135,31 @@ def validate(self, max_errors=20) -> ValidationResult:
self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")

if self.errors:
zip_handler.remove_extracted_files()
return ValidationResult(False, self.errors)
else:
return ValidationResult(True)
except Exception as e:
self.errors.append(f'Unable to validate: {e}')
return ValidationResult(False, self.errors)
finally:
del OSW_DATASET
if zip_handler:
zip_handler.remove_extracted_files()
gc.collect()

def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
    """Read the OSW GeoJSON file at *graph_geojson_path* and return its parsed contents."""
    with open(graph_geojson_path, 'r') as geojson_file:
        raw_text = geojson_file.read()
    return json.loads(raw_text)

def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
'''Validate OSW Data against the schema and process all errors'''
"""Validate OSW Data against the schema and process all errors"""
geojson_data = self.load_osw_file(file_path)
validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))

for error in validator.iter_errors(geojson_data):
self.errors.append(f'Validation error: {error.message}')
if len(self.errors) == max_errors:
break

if len(self.errors) >= max_errors:
return False
if len(self.errors) >= max_errors:
return False

return True
return len(self.errors) < max_errors
60 changes: 30 additions & 30 deletions src/python_osw_validation/extracted_data_validator.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import os
import glob


OSW_dataset_files = {"edges": {
"required": False,
"geometry": "LineString"
},
"nodes": {
"required": False,
"geometry": "Point"
},
"points": {
"required": False,
"geometry": "Point"
},
"lines": {
"required": False,
"geometry": "LineString"
},
"zones": {
"required": False,
"geometry": "Polygon"
},
"polygons": {
"required": False,
"geometry": "Polygon"
}
}
# Known OSW dataset file kinds mapped to their validation metadata:
# whether the file must be present, and the Simple Feature geometry
# type every feature in that file is expected to carry.
OSW_DATASET_FILES = {
    file_kind: {"required": False, "geometry": geometry_type}
    for file_kind, geometry_type in (
        ("edges", "LineString"),
        ("nodes", "Point"),
        ("points", "Point"),
        ("lines", "LineString"),
        ("zones", "Polygon"),
        ("polygons", "Polygon"),
    )
}


class ExtractedDataValidator:
Expand All @@ -53,8 +53,8 @@ def is_valid(self) -> bool:
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
return False

required_files = [key for key, value in OSW_dataset_files.items() if value['required']]
optional_files = [key for key, value in OSW_dataset_files.items() if not value['required']]
required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
missing_files = []
duplicate_files = []
for required_file in required_files:
Expand Down Expand Up @@ -89,11 +89,11 @@ def is_valid(self) -> bool:
if missing_files:
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
return False

if duplicate_files:
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
return False

# Add OSW external extensions, GeoJSON files we know nothing about
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])

Expand Down
2 changes: 1 addition & 1 deletion src/python_osw_validation/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.2.5'
__version__ = '0.2.6'