diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9725d08..85df33b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,12 @@
 # Change log
 
+### 0.2.6
+- Add garbage collection to free up memory after validation
+
+
 ### 0.2.5
-- Updated geopandas package
+- Updated geopandas package
+
 ### 0.2.3
 - Performance improvement if there are any errors
 
diff --git a/requirements.txt b/requirements.txt
index 0917296..11a52c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-jsonschema~=4.19.1
-zipfile36~=0.1.3
-coverage~=7.5.1
+jsonschema
+zipfile36
+coverage
 geopandas
\ No newline at end of file
diff --git a/setup.py b/setup.py
index baf3bad..d9b4936 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,5 @@
-import os
-from setuptools import setup, find_packages, Extension
 from version import version
-
-project_path = os.path.dirname(os.path.realpath(__file__))
-requirements_file = '{}/requirements.txt'.format(project_path)
-
-with open(requirements_file) as f:
-    content = f.readlines()
-install_requires = [x.strip() for x in content]
+from setuptools import setup, find_packages
 
 with open('README.md', 'r') as fh:
     long_description = fh.read()
@@ -26,7 +18,11 @@
     },
     long_description_content_type='text/markdown',
     url='https://github.com/TaskarCenterAtUW/TDEI-python-lib-osw-validation',
-    install_requires=install_requires,
+    install_requires=[
+        'jsonschema',
+        'zipfile36',
+        'geopandas'
+    ],
     packages=find_packages(where='src'),
     classifiers=[
         'Programming Language :: Python :: 3',
@@ -38,4 +34,4 @@
     package_data={
         'python_osw_validation': ['schema/*'],
     },
-)
\ No newline at end of file
+)
diff --git a/src/python_osw_validation/__init__.py b/src/python_osw_validation/__init__.py
index 1078a8f..05df85b 100644
--- a/src/python_osw_validation/__init__.py
+++ b/src/python_osw_validation/__init__.py
@@ -1,10 +1,11 @@
 import os
+import gc
 import json
 import jsonschema
 from typing import Dict, Any, Optional, List
 import geopandas as gpd
 from .zipfile_handler import ZipFileHandler
-from .extracted_data_validator import ExtractedDataValidator, OSW_dataset_files
+from .extracted_data_validator import ExtractedDataValidator, OSW_DATASET_FILES
 from .version import __version__
 
 SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
@@ -29,7 +30,7 @@ def __init__(self, zipfile_path: str, schema_file_path=None):
         self.schema_file_path = schema_file_path
 
     def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
-        '''Load OSW Schema'''
+        """Load OSW Schema"""
         try:
             with open(schema_path, 'r') as file:
                 return json.load(file)
@@ -40,12 +41,12 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
     def are_ids_unique(self, gdf):
         """Check for duplicate values in the _id field"""
         duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
-
         is_valid = len(duplicates) == 0
-
         return is_valid, list(duplicates)
 
     def validate(self, max_errors=20) -> ValidationResult:
+        zip_handler = None
+        OSW_DATASET = {}
         try:
             # Extract the zipfile
             zip_handler = ZipFileHandler(self.zipfile_path)
@@ -60,43 +61,42 @@ def validate(self, max_errors=20) -> ValidationResult:
             if not validator.is_valid():
                 self.errors.append(validator.error)
                 return ValidationResult(False, self.errors)
+
             for file in validator.files:
                 file_path = os.path.join(file)
-                if not self.validate_osw_errors(file_path, max_errors):
+                if not self.validate_osw_errors(file_path=str(file_path), max_errors=max_errors):
                     break
 
             if self.errors:
-                zip_handler.remove_extracted_files()
                 return ValidationResult(False, self.errors)
 
             # Validate data integrity
-            OSW_dataset = {}
             for file in validator.files:
                 file_path = os.path.join(file)
-                osw_file = next((osw_file_any for osw_file_any in OSW_dataset_files.keys() if osw_file_any in file_path), '')
-                OSW_dataset[osw_file] = gpd.read_file(file_path)
+                osw_file = next((osw_file_any for osw_file_any in OSW_DATASET_FILES.keys() if osw_file_any in file_path), '')
+                OSW_DATASET[osw_file] = gpd.read_file(file_path)
 
             # Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
-            for osw_file in OSW_dataset:
-                is_valid, duplicates = self.are_ids_unique(OSW_dataset[osw_file])
+            for osw_file in OSW_DATASET:
+                is_valid, duplicates = self.are_ids_unique(OSW_DATASET[osw_file])
                 if not is_valid:
                     self.errors.append(f"Duplicate _id's found in {osw_file} : {duplicates}")
 
             # Create sets of node id's and foreign keys to be used in validation
-            if "nodes" in OSW_dataset:
-                node_ids = set(OSW_dataset['nodes']['_id'])
+            if 'nodes' in OSW_DATASET:
+                node_ids = set(OSW_DATASET['nodes']['_id'])
             else:
                 node_ids = set()
 
-            if "edges" in OSW_dataset:
-                node_ids_edges_u = set(OSW_dataset['edges']['_u_id'])
-                node_ids_edges_v = set(OSW_dataset['edges']['_v_id'])
+            if 'edges' in OSW_DATASET:
+                node_ids_edges_u = set(OSW_DATASET['edges']['_u_id'])
+                node_ids_edges_v = set(OSW_DATASET['edges']['_v_id'])
             else:
                 node_ids_edges_u = set()
                 node_ids_edges_v = set()
 
-            if "zones" in OSW_dataset:
-                node_ids_zones_w = set([item for sublist in OSW_dataset['zones']['_w_id'] for item in sublist])
+            if 'zones' in OSW_DATASET:
+                node_ids_zones_w = set([item for sublist in OSW_DATASET['zones']['_w_id'] for item in sublist])
             else:
                 node_ids_zones_w = set()
 
@@ -119,8 +119,8 @@
                 self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
 
             # Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
-            for osw_file in OSW_dataset:
-                invalid_geojson = OSW_dataset[osw_file][(OSW_dataset[osw_file].geometry.type != OSW_dataset_files[osw_file]['geometry']) | (OSW_dataset[osw_file].is_valid == False)]
+            for osw_file in OSW_DATASET:
+                invalid_geojson = OSW_DATASET[osw_file][(OSW_DATASET[osw_file].geometry.type != OSW_DATASET_FILES[osw_file]['geometry']) | (OSW_DATASET[osw_file].is_valid == False)]
                 is_valid = len(invalid_geojson) == 0
                 if not is_valid:
                     self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
@@ -135,30 +135,31 @@
                     self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
 
             if self.errors:
-                zip_handler.remove_extracted_files()
                 return ValidationResult(False, self.errors)
             else:
                 return ValidationResult(True)
         except Exception as e:
             self.errors.append(f'Unable to validate: {e}')
             return ValidationResult(False, self.errors)
+        finally:
+            del OSW_DATASET
+            if zip_handler:
+                zip_handler.remove_extracted_files()
+            gc.collect()
 
     def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
-        '''Load OSW Data'''
+        """Load OSW Data"""
        with open(graph_geojson_path, 'r') as file:
             return json.load(file)
 
     def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
-        '''Validate OSW Data against the schema and process all errors'''
+        """Validate OSW Data against the schema and process all errors"""
         geojson_data = self.load_osw_file(file_path)
         validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))
 
         for error in validator.iter_errors(geojson_data):
             self.errors.append(f'Validation error: {error.message}')
-            if len(self.errors) == max_errors:
-                break
-
-        if len(self.errors) >= max_errors:
-            return False
+            if len(self.errors) >= max_errors:
+                return False
 
-        return True
+        return len(self.errors) < max_errors
diff --git a/src/python_osw_validation/extracted_data_validator.py b/src/python_osw_validation/extracted_data_validator.py
index 45d8769..cae10e8 100644
--- a/src/python_osw_validation/extracted_data_validator.py
+++ b/src/python_osw_validation/extracted_data_validator.py
@@ -1,32 +1,32 @@
 import os
 import glob
 
-
-OSW_dataset_files = {"edges": {
-    "required": False,
-    "geometry": "LineString"
-    },
-    "nodes": {
-        "required": False,
-        "geometry": "Point"
-    },
-    "points": {
-        "required": False,
-        "geometry": "Point"
-    },
-    "lines": {
-        "required": False,
-        "geometry": "LineString"
-    },
-    "zones": {
-        "required": False,
-        "geometry": "Polygon"
-    },
-    "polygons": {
-        "required": False,
-        "geometry": "Polygon"
-    }
-    }
+OSW_DATASET_FILES = {
+    "edges": {
+        "required": False,
+        "geometry": "LineString"
+    },
+    "nodes": {
+        "required": False,
+        "geometry": "Point"
+    },
+    "points": {
+        "required": False,
+        "geometry": "Point"
+    },
+    "lines": {
+        "required": False,
+        "geometry": "LineString"
+    },
+    "zones": {
+        "required": False,
+        "geometry": "Polygon"
+    },
+    "polygons": {
+        "required": False,
+        "geometry": "Polygon"
+    }
+}
 
 
 class ExtractedDataValidator:
@@ -53,8 +53,8 @@ def is_valid(self) -> bool:
             self.error = 'No .geojson files found in the specified directory or its subdirectories.'
             return False
 
-        required_files = [key for key, value in OSW_dataset_files.items() if value['required']]
-        optional_files = [key for key, value in OSW_dataset_files.items() if not value['required']]
+        required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
+        optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
         missing_files = []
         duplicate_files = []
         for required_file in required_files:
@@ -89,11 +89,11 @@ def is_valid(self) -> bool:
         if missing_files:
             self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
             return False
-
+
         if duplicate_files:
             self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
             return False
-
+
         # Add OSW external extensions, GeoJSON files we know nothing about
         self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
 
diff --git a/src/python_osw_validation/version.py b/src/python_osw_validation/version.py
index d0ba488..2da372e 100644
--- a/src/python_osw_validation/version.py
+++ b/src/python_osw_validation/version.py
@@ -1 +1 @@
-__version__ = '0.2.5'
\ No newline at end of file
+__version__ = '0.2.6'
\ No newline at end of file