Skip to content

Commit

Permalink
[#118] Add counts for additional fields.
Browse files Browse the repository at this point in the history
Count how many times the additional fields occur.

Only show fields that have a parent in the schema, so as not to display too many results.
  • Loading branch information
kindly committed Dec 15, 2015
1 parent 327a845 commit bebaec6
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 30 deletions.
20 changes: 20 additions & 0 deletions cove/templates/explore.html
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,26 @@ <h4 class="panel-title">
</div>
{% endif %}

{# Fields found in the data but not in the schema, with how often each one occurs. #}
{# Each entry of data_only is unpacked as (path to the field, field name, usage count). #}
{% if data_only %}
<div class="panel panel-warning">
  <div class="panel-heading">
    <h4 class="panel-title">
      {% trans 'Additional Fields (fields in data not in schema)' %}
    </h4>
  </div>
  <table class="table">
    <thead>
      <tr>
        <th>{% trans 'Field' %}</th> <th>{% trans 'Path to Field' %}</th> <th>{% trans 'Usage Count' %}</th>
      </tr>
    </thead>
    <tbody>
      {% for path, field, count in data_only %}
      <tr>
        <td>{{ field }}</td> <td>{{ path }}</td> <td>{{ count }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endif %}


{% block explore_content %}
{% endblock %}
Expand Down
18 changes: 10 additions & 8 deletions cove/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,16 @@ def test_get_releases_aggregates():


def test_fields_present():
    """get_fields_present maps each field path to the number of times it occurs."""
    assert v.get_fields_present({}) == {}
    assert v.get_fields_present({'a': 1, 'b': 2}) == {'/a': 1, '/b': 1}
    assert v.get_fields_present({'a': {}, 'b': 2}) == {'/a': 1, '/b': 1}
    assert v.get_fields_present({'a': {'c': 1}, 'b': 2}) == {'/a': 1, '/b': 1, '/a/c': 1}
    assert v.get_fields_present({'a': {'c': {'d': 1}}, 'b': 2}) == {'/a': 1, '/b': 1, '/a/c': 1, '/a/c/d': 1}
    # Array items share their parent's path: no "[]" segment is added.
    assert v.get_fields_present({'a': [{'c': 1}], 'b': 2}) == {'/a': 1, '/b': 1, '/a/c': 1}
    assert v.get_fields_present({'a': {'c': [{'d': 1}]}, 'b': 2}) == {'/a': 1, '/b': 1, '/a/c': 1, '/a/c/d': 1}
    # A scalar field whose key contains an underscore ('b_1') is not counted,
    # while a list-valued key with an underscore ('c_1') still is.
    assert v.get_fields_present({'a': {'c_1': [{'d': 1}]}, 'b_1': 2}) == {'/a': 1, '/a/c_1': 1, '/a/c_1/d': 1}
    # The same field in two array items is counted twice.
    assert v.get_fields_present({'a': {'c_1': [{'d': 1}, {'d': 1}]}, 'b_1': 2}) == {'/a': 1, '/a/c_1': 1, '/a/c_1/d': 2}


def test_get_file_type_xlsx():
Expand Down
71 changes: 50 additions & 21 deletions cove/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,39 +178,69 @@ def get_grants_aggregates(json_data):


def fields_present_generator(json_data, prefix=''):
    """Yield the path of every field present in ``json_data``.

    A path is ``prefix`` followed by the keys leading to the field, each
    preceded by ``/``. Dicts found inside a list are walked under the list's
    own path, so array items add no extra path segment. A path is yielded
    once per occurrence, which lets callers count how often a field is used.

    Anything that is not a ``dict`` yields nothing.
    """
    if not isinstance(json_data, dict):
        return
    for key, value in json_data.items():
        path = prefix + '/' + key
        if isinstance(value, list):
            # Only dict items can contain further fields; scalars are ignored.
            for item in value:
                if isinstance(item, dict):
                    yield from fields_present_generator(item, path)
            yield path
        elif isinstance(value, dict):
            yield from fields_present_generator(value, path)
            yield path
        elif '_' not in key:
            # If the key of a scalar field has an underscore in it, assume
            # it's a language property and do not count it as a present field.
            yield path


def get_fields_present(*args, **kwargs):
    """Return a dict mapping each present field path to its occurrence count.

    All arguments are forwarded unchanged to ``fields_present_generator``;
    every path it yields adds one to that path's count.
    """
    return dict(collections.Counter(fields_present_generator(*args, **kwargs)))


def schema_dict_fields_generator(schema_dict):
    """Yield the path of every field declared under ``schema_dict['properties']``.

    Paths are built from property names, each preceded by ``/``. Fields of
    array items are reported under the array's own path (no ``[]`` segment).
    When a property is declared with ``oneOf``, the fields of every
    alternative are yielded. A schema without ``properties`` yields nothing.
    """
    if 'properties' not in schema_dict:
        return
    for property_name, value in schema_dict['properties'].items():
        # Treat a plain property as a single-alternative ``oneOf``.
        if 'oneOf' in value:
            property_schema_dicts = value['oneOf']
        else:
            property_schema_dicts = [value]
        for property_schema_dict in property_schema_dicts:
            property_type_set = get_property_type_set(property_schema_dict)
            if 'object' in property_type_set:
                for field in schema_dict_fields_generator(property_schema_dict):
                    yield '/' + property_name + field
            elif 'array' in property_type_set:
                # An array schema may omit ``items``; it then has no sub-fields.
                items_schema = property_schema_dict.get('items', {})
                for field in schema_dict_fields_generator(items_schema):
                    yield '/' + property_name + field
        # The property itself is always a schema field, whatever its type.
        yield '/' + property_name


def get_schema_fields(schema_filename):
    """Return the set of field paths declared by the schema at ``schema_filename``.

    ``schema_filename`` is handed straight to ``requests.get``, so despite
    its name it has to be something requests can fetch. The response text is
    parsed with ``jsonref.loads`` and walked by
    ``schema_dict_fields_generator``.
    """
    response = requests.get(schema_filename)
    schema_dict = jsonref.loads(response.text, object_pairs_hook=OrderedDict)
    return set(schema_dict_fields_generator(schema_dict))


def get_counts_additional_fields(schema_url, json_data):
    """Count the fields that occur in ``json_data`` but not in the schema.

    Only top-level fields and fields whose parent path is in the schema are
    reported, to make the results less verbose.

    Returns a list of ``(parent_path, field_name, count)`` tuples in no
    particular order; ``parent_path`` is ``''`` for top-level fields.
    """
    fields_present = get_fields_present(json_data)
    schema_fields = get_schema_fields(schema_url)

    additional_fields = []
    for field, count in fields_present.items():
        if field in schema_fields:
            continue
        # '/a/b' splits into parent '/a' and name 'b'; '/a' has parent ''.
        parent_field, _, field_name = field.rpartition('/')
        if not parent_field or parent_field in schema_fields:
            additional_fields.append((parent_field, field_name, count))
    return additional_fields


def get_schema_validation_errors(json_data, schema_url):
schema = requests.get(schema_url).json()
validation_errors = collections.defaultdict(list)
Expand Down Expand Up @@ -423,8 +453,9 @@ def explore(request, pk):
schema_url = schema_url['record'] if 'records' in json_data else schema_url['release']

if schema_url:
fields_present = get_fields_present(json_data)
schema_fields = get_schema_fields(schema_url)
context.update({
'data_only': sorted(get_counts_additional_fields(schema_url, json_data))
})

validation_errors_path = os.path.join(data.upload_dir(), 'validation_errors.json')
if os.path.exists(validation_errors_path):
Expand All @@ -440,8 +471,6 @@ def explore(request, pk):
'schema_url': schema_url,
'validation_errors': validation_errors,
'json_data': json_data # Pass the JSON data to the template so we can display values that need little processing
'schema_only': schema_fields - fields_present,
'data_only': fields_present - schema_fields
})

view = 'explore.html'
Expand Down
4 changes: 3 additions & 1 deletion fts/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ def buttons():
'Unique Grant IDs: 2',
'Duplicate IDs: 2',
'Silent Signal',
'Showing 1 to 4 of 4 entries'], True),
'Showing 1 to 4 of 4 entries',
'Additional Fields',
'Data source'], True),
# Test a 360 spreadsheet with titles, rather than fields
('/360/', 'WellcomeTrust-grants_2_grants.xlsx', 'Download Files', True),
# Test a 360 csv in cp1252 encoding
Expand Down

0 comments on commit bebaec6

Please sign in to comment.