Implement semantic validation of various types of metadata field
ref #5
mark-saeon committed Jun 29, 2018
1 parent 538df7c commit 594ac78
Showing 11 changed files with 700 additions and 17 deletions.
188 changes: 188 additions & 0 deletions ckanext/metadata/jsonschema_validation.py
@@ -0,0 +1,188 @@
# encoding: utf-8

# Permissible date/time values: http://www.w3.org/TR/NOTE-datetime
# Permissible date/time ranges: http://www.ukoln.ac.uk/metadata/dcmi/collection-RKMS-ISO8601

import logging
import jsonschema
import jsonschema.validators
from datetime import datetime
import re
import urlparse

import ckan.plugins.toolkit as tk
from ckan.common import _

log = logging.getLogger(__name__)

DOI_RE = re.compile(r'^10\.\d+(\.\d+)*/.+$')
TIME_RE = re.compile(r'^(?P<h>\d{2}):(?P<m>\d{2})(:(?P<s>\d{2})(\.\d+)?)?(Z|[+-](?P<tzh>\d{2}):(?P<tzm>\d{2}))$')
GEO_POINT_RE = re.compile(r'^(?P<lat>[+-]?\d+(\.\d+)?)\s+(?P<lon>[+-]?\d+(\.\d+)?)$')
GEO_BOX_RE = re.compile(r'^(?P<lat1>[+-]?\d+(\.\d+)?)\s+(?P<lon1>[+-]?\d+(\.\d+)?)\s+(?P<lat2>[+-]?\d+(\.\d+)?)\s+(?P<lon2>[+-]?\d+(\.\d+)?)$')
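# e.g. DOI_RE matches '10.1000/xyz123'; TIME_RE matches '12:30:45.1+02:00';
# GEO_POINT_RE matches '-34.36 18.47'; GEO_BOX_RE matches '-35 17 -33 19'.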

checks_format = jsonschema.FormatChecker.cls_checks


def create_validator(schema):
    cls = jsonschema.validators.validator_for(schema)
    cls.check_schema(schema)
    cls.VALIDATORS.update({
        'vocabulary': vocabulary_validator,
    })
    return cls(schema, format_checker=jsonschema.FormatChecker(
        formats=[
            'doi',
            'uri',
            'url',
            'year',
            'yearmonth',
            'date',
            'datetime',
            'year-range',
            'yearmonth-range',
            'date-range',
            'datetime-range',
            'geolocation-point',
            'geolocation-box',
        ]))


# Custom 'vocabulary' keyword: the schema value names a CKAN tag vocabulary,
# and a string instance must be one of that vocabulary's tags.
def vocabulary_validator(validator, vocabulary_name, instance, schema):
    if validator.is_type(instance, "string"):
        try:
            vocabulary = tk.get_action('vocabulary_show')(data_dict={'id': vocabulary_name})
            tags = [tag['name'] for tag in vocabulary['tags']]
            if instance not in tags:
                yield jsonschema.ValidationError(_('Tag not found in vocabulary'))
        except tk.ObjectNotFound:
            yield jsonschema.ValidationError('%s: %s' % (_('Not found'), _('Vocabulary')))


@checks_format('doi')
def is_doi(instance):
    if not isinstance(instance, basestring):
        return True
    return re.match(DOI_RE, instance) is not None


@checks_format('url')
def is_url(instance):
    if not isinstance(instance, basestring):
        return True
    try:
        urlparts = urlparse.urlparse(instance)
        if not urlparts.scheme or not urlparts.netloc:
            return False
        return True
    except ValueError:
        return False


@checks_format('year')
def is_year(instance):
    if not isinstance(instance, basestring):
        return True
    try:
        datetime.strptime(instance, '%Y')
        return True
    except ValueError:
        return False


@checks_format('yearmonth')
def is_yearmonth(instance):
    if not isinstance(instance, basestring):
        return True
    try:
        datetime.strptime(instance, '%Y-%m')
        return True
    except ValueError:
        return False


# TODO: check if this replaces or adds to the existing date format checkers
@checks_format('date')
def is_date(instance):
    if not isinstance(instance, basestring):
        return True
    try:
        datetime.strptime(instance, '%Y-%m-%d')
        return True
    except ValueError:
        return False


@checks_format('datetime')
def is_datetime(instance):
    if not isinstance(instance, basestring):
        return True
    try:
        datestr, timestr = instance.split('T')
        datetime.strptime(datestr, '%Y-%m-%d')
        time_match = re.match(TIME_RE, timestr)
        if not time_match:
            return False
        # seconds and the timezone minute offset are optional ('Z' leaves them unset)
        h, m, s, tzm = time_match.group('h', 'm', 's', 'tzm')
        if not (0 <= int(h) <= 23 and 0 <= int(m) <= 59):
            return False
        if s is not None and not (0 <= int(s) <= 59):
            return False
        if tzm is not None and not (0 <= int(tzm) <= 59):
            return False
        return True
    except ValueError:
        return False


def _is_range(instance, func):
    if not isinstance(instance, basestring):
        return True
    try:
        start, end = instance.split('/')
        if not start and not end:
            return False
        valid_start = not start or func(start)
        valid_end = not end or func(end)
        return valid_start and valid_end
    except ValueError:
        return False


# Ranges may be open-ended on either side (e.g. '2001/' or '/2010-12'),
# while '/' with both sides empty is rejected by _is_range.
@checks_format('year-range')
def is_year_range(instance):
    return _is_range(instance, is_year)


@checks_format('yearmonth-range')
def is_yearmonth_range(instance):
    return _is_range(instance, is_yearmonth)


@checks_format('date-range')
def is_date_range(instance):
    return _is_range(instance, is_date)


@checks_format('datetime-range')
def is_datetime_range(instance):
    return _is_range(instance, is_datetime)


@checks_format('geolocation-point')
def is_geolocation_point(instance):
    if not isinstance(instance, basestring):
        return True
    match = re.match(GEO_POINT_RE, instance)
    if match:
        lat, lon = match.group('lat', 'lon')
        if -90 <= float(lat) <= 90 and -180 <= float(lon) <= 180:
            return True
    return False


@checks_format('geolocation-box')
def is_geolocation_box(instance):
    if not isinstance(instance, basestring):
        return True
    match = re.match(GEO_BOX_RE, instance)
    if match:
        lat1, lon1, lat2, lon2 = match.group('lat1', 'lon1', 'lat2', 'lon2')
        if -90 <= float(lat1) <= 90 and -180 <= float(lon1) <= 180 and -90 <= float(lat2) <= 90 and -180 <= float(lon2) <= 180 and \
                float(lat1) <= float(lat2) and float(lon1) <= float(lon2):
            return True
    return False
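
For illustration only (not part of this commit), a minimal sketch of how the validator returned by create_validator reports format errors; the model and metadata values below are made-up examples:

from ckanext.metadata.jsonschema_validation import create_validator

model = {
    'type': 'object',
    'properties': {
        'publicationYear': {'type': 'string', 'format': 'year'},
        'collectedDate': {'type': 'string', 'format': 'date-range'},
        'geoLocationPoint': {'type': 'string', 'format': 'geolocation-point'},
    },
}
metadata = {
    'publicationYear': '2018',
    'collectedDate': '2018-01-01/2018-06-29',
    'geoLocationPoint': '95.0 200.0',  # latitude/longitude out of range
}

validator = create_validator(model)
errors = {tuple(e.path): e.message for e in validator.iter_errors(metadata)}
# errors now has a single entry keyed by ('geoLocationPoint',); the other two
# values satisfy the 'year' and 'date-range' format checkers defined above.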
24 changes: 16 additions & 8 deletions ckanext/metadata/logic/action/get.py
@@ -3,12 +3,13 @@
import logging
from paste.deploy.converters import asbool
from sqlalchemy import or_
-import jsonschema
+import json

import ckan.plugins.toolkit as tk
from ckan.common import _
from ckanext.metadata.logic import schema, METADATA_VALIDATION_ACTIVITY_TYPE, METADATA_WORKFLOW_ACTIVITY_TYPE
from ckanext.metadata.lib.dictization import model_dictize
+from ckanext.metadata.jsonschema_validation import create_validator
import ckanext.metadata.model as ckanext_model

log = logging.getLogger(__name__)
@@ -553,20 +554,27 @@ def metadata_validity_check(context, data_dict):
    :param model_json: JSON dictionary defining a metadata model
    :type model_json: string
-    :rtype: dictionary of errors; empty dict implies that the metadata is 100% valid
+    :rtype: dictionary of metadata errors; empty dict implies that the metadata is 100% valid
        against the given model
    """
    log.debug("Checking metadata validity")
    tk.check_access('metadata_validity_check', context, data_dict)

-    metadata_json, model_json = tk.get_or_bust(data_dict, ['metadata_json', 'model_json'])
+    session = context['session']
+    data, errors = tk.navl_validate(data_dict, schema.metadata_validity_check_schema(), context)
+    if errors:
+        session.rollback()
+        raise tk.ValidationError(errors)
+
+    metadata_json = json.loads(data['metadata_json'])
+    model_json = json.loads(data['model_json'])

-    errors = {}
-    validator = jsonschema.Draft4Validator(model_json)
-    for error in validator.iter_errors(metadata_json):
-        errors[tuple(error.path)] = error.message
+    metadata_errors = {}
+    validator = create_validator(model_json)
+    for metadata_error in validator.iter_errors(metadata_json):
+        metadata_errors[tuple(metadata_error.path)] = metadata_error.message

-    return errors
+    return metadata_errors


@tk.side_effect_free
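
As a usage sketch (illustrative; the user context and model content here are placeholders, not taken from this changeset), the revised action expects both arguments as JSON strings and returns a dict of errors keyed by element path:

import json
import ckan.plugins.toolkit as tk

context = {'user': 'some-sysadmin'}  # placeholder context
result = tk.get_action('metadata_validity_check')(context, {
    'metadata_json': json.dumps({'publicationYear': 'not-a-year'}),
    'model_json': json.dumps({
        'type': 'object',
        'properties': {'publicationYear': {'type': 'string', 'format': 'year'}},
    }),
})
# result maps element paths to messages; an empty dict means the metadata
# is valid against the given model.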
2 changes: 1 addition & 1 deletion ckanext/metadata/logic/action/update.py
@@ -521,7 +521,7 @@ def metadata_record_validate(context, data_dict):
    for metadata_model in validation_models:
        validation_errors = tk.get_action('metadata_validity_check')(context, {
            'metadata_json': metadata_record.extras['metadata_json'],
-            'model_json': metadata_model['model_json'],
+            'model_json': json.dumps(metadata_model['model_json']),
        })
        validation_result = {
            'metadata_model_id': metadata_model['id'],
10 changes: 9 additions & 1 deletion ckanext/metadata/logic/schema.py
@@ -133,6 +133,14 @@ def metadata_record_show_schema():
    return schema


+def metadata_validity_check_schema():
+    schema = {
+        'metadata_json': [v.not_missing, unicode, v.json_dict_validator],
+        'model_json': [v.not_missing, unicode, v.json_schema_validator],
+    }
+    return schema
+
+
def metadata_collection_create_schema():
    schema = {
        # from the default group schema
@@ -248,7 +256,7 @@ def metadata_model_create_schema():
        'metadata_schema_id': [v.not_empty, unicode, v.metadata_schema_exists],
        'organization_id': [v.not_missing, unicode, v.group_exists('organization')],
        'infrastructure_id': [v.not_missing, unicode, v.group_exists('infrastructure')],
-        'model_json': [v.not_missing, unicode, v.json_dict_validator, v.json_schema_validator],
+        'model_json': [v.not_missing, unicode, v.json_schema_validator],
        'state': [ignore_not_sysadmin, ignore_missing],

        # post-validation
2 changes: 1 addition & 1 deletion ckanext/metadata/logic/validators.py
@@ -145,7 +145,7 @@ def json_schema_validator(value):
    except ValueError, e:
        raise tk.Invalid(_("JSON decode error: %s") % e.message)
    except AttributeError, e:
-        raise tk.Invalid(_("Invalid JSON object type: %s") % e.message)
+        raise tk.Invalid(_("Expecting a JSON dictionary"))
    except jsonschema.SchemaError, e:
        raise tk.Invalid(_("Invalid JSON schema: %s") % e.message)

2 changes: 1 addition & 1 deletion ckanext/metadata/tests/factories.py
@@ -95,7 +95,7 @@ class MetadataModel(factory.Factory):

    title = factory.Sequence(lambda n: 'Test Metadata Model {0:02d}'.format(n))
    description = 'A test description for this test metadata model.'
-    model_json = '{}'
+    model_json = '{"type": "object"}'
    organization_id = ''
    infrastructure_id = ''

10 changes: 6 additions & 4 deletions ckanext/metadata/tests/test_metadata_model_actions.py
@@ -1,5 +1,7 @@
# encoding: utf-8

+import json
+
from ckan.plugins import toolkit as tk
from ckan.tests import factories as ckan_factories
from ckan.tests.helpers import call_action
@@ -545,7 +547,7 @@ def test_update_infrastructure_invalidate_records_1(self):
            metadata_schema_id=metadata_model['metadata_schema_id'],
            organization_id='',
            infrastructure_id=metadata_record_1['infrastructures'][0]['id'],
-            model_json='{}')
+            model_json=json.dumps(metadata_model['model_json']))

        assert_package_has_extra(metadata_record_1['id'], 'validated', True)
        assert_package_has_extra(metadata_record_2['id'], 'validated', False)
@@ -567,7 +569,7 @@ def test_update_infrastructure_invalidate_records_2(self):
            metadata_schema_id=metadata_model_1['metadata_schema_id'],
            organization_id='',
            infrastructure_id='',
-            model_json='{}')
+            model_json=json.dumps(metadata_model_1['model_json']))

        assert_package_has_extra(metadata_record_1['id'], 'validated', True)
        assert_package_has_extra(metadata_record_2['id'], 'validated', False)
@@ -589,7 +591,7 @@ def test_update_organization_invalidate_records_1(self):
            metadata_schema_id=metadata_model['metadata_schema_id'],
            organization_id=metadata_record_1['owner_org'],
            infrastructure_id='',
-            model_json='{}')
+            model_json=json.dumps(metadata_model['model_json']))

        assert_package_has_extra(metadata_record_1['id'], 'validated', True)
        assert_package_has_extra(metadata_record_2['id'], 'validated', False)
@@ -611,7 +613,7 @@ def test_update_organization_invalidate_records_2(self):
            metadata_schema_id=metadata_model_1['metadata_schema_id'],
            organization_id='',
            infrastructure_id='',
-            model_json='{}')
+            model_json=json.dumps(metadata_model_1['model_json']))

        assert_package_has_extra(metadata_record_1['id'], 'validated', True)
        assert_package_has_extra(metadata_record_2['id'], 'validated', False)
