New --remove-empty-schema-columns flag to flatten

Original work done by @Bjwebb #241 OpenDataServices/cove#1019
OpenDataServices · Oct 23, 2018 · c7be44e · c7be44e
1 parent e0b12b5
commit c7be44e
Show file tree

Hide file tree

Showing 13 changed files with 127 additions and 13 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,8 @@ examples/flatten/simple/actual
 examples/flatten/simple/actual.*
 examples/flatten/sheet-prefix/actual
 examples/flatten/sheet-prefix/actual.*
+examples/flatten/remove-empty-schema-columns/actual
+examples/flatten/remove-empty-schema-columns/actual.*
 examples/flatten/root-is-list/actual
 examples/flatten/root-is-list/actual.*
 examples/receipt/source-map/actual

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 - Add --disable-local-refs to flatten, unflatten and create-template.
+- Add --remove-empty-schema-columns flag to flatten https://github.com/OpenDataServices/cove/issues/1019
 
 ## [0.3.0] - 2018-10-12
 

diff --git a/docs/flatten.rst b/docs/flatten.rst
@@ -123,6 +123,30 @@ No `dishes` sheet is produced, and the main sheet does not have a `coffee` colum
 
 The field specified must be a field directly on the data object - it's not possible to filter on fields like `pints/0/title` .
 
+Remove Empty Schema Columns
+---------------------------
+
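+By default, columns that are defined in the schema but are empty (have no data) will be kept in the output.
+
+You can pass the `--remove-empty-schema-columns` flag to have these removed; sub-sheets that end up with no rows of data are removed as well.
+
+The examples below show the output first without the flag, and then with it: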
+
+.. literalinclude:: ../examples/flatten/simple/cmd.txt
+   :language: bash
+
+.. csv-table:: sheet: cafe.csv
+   :file: ../examples/flatten/simple/expected/cafe.csv
+   :header-rows: 1
+
+.. literalinclude:: ../examples/flatten/remove-empty-schema-columns/cmd.txt
+   :language: bash
+
+.. csv-table:: sheet: cafe.csv
+   :file: ../examples/flatten/remove-empty-schema-columns/expected/cafe.csv
+   :header-rows: 1
+
+
 All flatten options
 -------------------
 

diff --git a/examples/flatten/remove-empty-schema-columns/cmd.txt b/examples/flatten/remove-empty-schema-columns/cmd.txt
@@ -0,0 +1 @@
+$ flatten-tool flatten --remove-empty-schema-columns --root-list-path=cafe --main-sheet-name=cafe --schema=examples/receipt/cafe.schema examples/flatten/remove-empty-schema-columns/input.json -o examples/flatten/remove-empty-schema-columns/actual
diff --git a/examples/flatten/remove-empty-schema-columns/expected/cafe.csv b/examples/flatten/remove-empty-schema-columns/expected/cafe.csv
@@ -0,0 +1,3 @@
+id,name
+CAFE-HEALTH,Healthy Cafe
+CAFE-VEG,Vegetarian Cafe
diff --git a/examples/flatten/remove-empty-schema-columns/expected/table.csv b/examples/flatten/remove-empty-schema-columns/expected/table.csv
@@ -0,0 +1,6 @@
+id,table/0/id,table/0/number
+CAFE-HEALTH,TABLE-1,1
+CAFE-HEALTH,TABLE-2,2
+CAFE-HEALTH,TABLE-3,3
+CAFE-VEG,TABLE-16,16
+CAFE-VEG,TABLE-17,17
diff --git a/examples/flatten/remove-empty-schema-columns/input.json b/examples/flatten/remove-empty-schema-columns/input.json
@@ -0,0 +1,36 @@
+{
+    "cafe": [
+        {
+            "id": "CAFE-HEALTH",
+            "name": "Healthy Cafe",
+            "table": [
+                {
+                    "id": "TABLE-1",
+                    "number": "1"
+                },
+                {
+                    "id": "TABLE-2",
+                    "number": "2"
+                },
+                {
+                    "id": "TABLE-3",
+                    "number": "3"
+                }
+            ]
+        },
+        {
+            "id": "CAFE-VEG",
+            "name": "Vegetarian Cafe",
+            "table": [
+                {
+                    "id": "TABLE-16",
+                    "number": "16"
+                },
+                {
+                    "id": "TABLE-17",
+                    "number": "17"
+                }
+            ]
+        }
+    ]
+}
diff --git a/examples/help/flatten/expected.txt b/examples/help/flatten/expected.txt
@@ -6,6 +6,7 @@ usage: flatten-tool flatten [-h] [-s SCHEMA] [-f {csv,xlsx,all}] [--xml]
                             [--filter-field FILTER_FIELD]
                             [--filter-value FILTER_VALUE]
                             [--disable-local-refs]
+                            [--remove-empty-schema-columns]
                             input_name
 
 positional arguments:
@@ -46,3 +47,5 @@ optional arguments:
                         Data Filter - only data with this will be processed.
                         Use with --filter-field
   --disable-local-refs  Disable local refs when parsing JSON Schema.
+  --remove-empty-schema-columns
+                        Remove columns from the schema that contain no data.
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
@@ -45,7 +45,8 @@ def spreadsheet_output(spreadsheet_output_class, name):
 
 def flatten(input_name, schema=None, output_name='flattened', output_format='all', main_sheet_name='main',
             root_list_path='main', root_is_list=False, sheet_prefix='', filter_field=None, filter_value=None,
-            rollup=False, root_id=None, use_titles=False, xml=False, id_name='id', disable_local_refs=False, **_):
+            rollup=False, root_id=None, use_titles=False, xml=False, id_name='id', disable_local_refs=False,
+            remove_empty_schema_columns=False, **_):
     """
     Flatten a nested structure (JSON) to a flat structure (spreadsheet - csv or xlsx).
 
@@ -73,7 +74,9 @@ def flatten(input_name, schema=None, output_name='flattened', output_format='all
         xml=xml,
         id_name=id_name,
         filter_field=filter_field,
-        filter_value=filter_value)
+        filter_value=filter_value,
+        remove_empty_schema_columns=remove_empty_schema_columns,
+        )
     parser.parse()
 
     def spreadsheet_output(spreadsheet_output_class, name):

diff --git a/flattentool/cli.py b/flattentool/cli.py
@@ -123,7 +123,10 @@ def create_parser():
         "--disable-local-refs",
         action='store_true',
         help="Disable local refs when parsing JSON Schema.")
-
+    parser_flatten.add_argument(
+        "--remove-empty-schema-columns",
+        action='store_true',
+        help="Remove columns from the schema that contain no data.")
 
     parser_unflatten = subparsers.add_parser(
         'unflatten',

diff --git a/flattentool/json_input.py b/flattentool/json_input.py
@@ -35,7 +35,10 @@ def sheet_key_title(sheet, key):
 
     """
     if key in sheet.titles:
-        return sheet.titles[key]
+        title = sheet.titles[key]
+        if title not in sheet:
+            sheet.append(title)
+        return title
     else:
         if key not in sheet:
             sheet.append(key)
@@ -47,7 +50,8 @@ class JSONParser(object):
     # Similarily with methods like parse_json_dict
 
     def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None, root_list_path=None,
-                 root_id='ocid', use_titles=False, xml=False, id_name='id', filter_field=None, filter_value=None):
+                 root_id='ocid', use_titles=False, xml=False, id_name='id', filter_field=None, filter_value=None,
+                 remove_empty_schema_columns=False):
         self.sub_sheets = {}
         self.main_sheet = Sheet()
         self.root_list_path = root_list_path
@@ -57,9 +61,16 @@ def __init__(self, json_filename=None, root_json_dict=None, schema_parser=None,
         self.xml = xml
         self.filter_field = filter_field
         self.filter_value = filter_value
+        self.remove_empty_schema_columns = remove_empty_schema_columns
         if schema_parser:
-            self.main_sheet = schema_parser.main_sheet
-            self.sub_sheets = schema_parser.sub_sheets
+            self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
+            self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
+            if remove_empty_schema_columns:
+                # Don't use columns from the schema parser
+                # (avoids empty columns)
+                self.main_sheet.columns = []
+                for sheet_name, sheet in list(self.sub_sheets.items()):
+                    sheet.columns = []
             # Rollup is pulled from the schema_parser, as rollup is only possible if a schema parser is specified
             self.rollup = schema_parser.rollup
             self.schema_parser = schema_parser
@@ -104,6 +115,12 @@ def parse(self):
                 # fallover on empty activity, e.g. <iati-activity/>
                 continue
             self.parse_json_dict(json_dict, sheet=self.main_sheet)
+
+        if self.remove_empty_schema_columns:
+            # Remove sheets with no lines of data
+            for sheet_name, sheet in list(self.sub_sheets.items()):
+                if not sheet.lines:
+                    del self.sub_sheets[sheet_name]
 
     def parse_json_dict(self, json_dict, sheet, json_key=None, parent_name='', flattened_dict=None, parent_id_fields=None, top_level_of_sub_sheet=False):
         """

diff --git a/flattentool/tests/test_docs.py b/flattentool/tests/test_docs.py
@@ -99,9 +99,9 @@ def test_examples_in_docs():
                 tests_passed += 1
     # Check that the number of tests were run that we expected
     if sys.version_info[:2] < (3,4):
-        assert tests_passed == 44
-    else:
         assert tests_passed == 45
+    else:
+        assert tests_passed == 46
 
 def _simplify_warnings(lines):
     return '\n'.join([_simplify_line(line) for line in lines.split('\n')])

diff --git a/flattentool/tests/test_json_input.py b/flattentool/tests/test_json_input.py
@@ -264,13 +264,23 @@ def test_parse_ids_nested(self):
 
 
 class TestParseUsingSchema(object):
-    def test_sub_sheet_names(self, tmpdir):
+    @pytest.mark.parametrize('remove_empty_schema_columns', [False, True])
+    def test_sub_sheets(self, tmpdir, remove_empty_schema_columns):
         test_schema = tmpdir.join('test.json')
         test_schema.write('''{
             "properties": {
                 "c": {
                     "type": "array",
                     "items": {"$ref": "#/testB"}
+                },
+                "g": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "h": { "type": "string"}
+                        }
+                    }
                 }
             },
             "testB": {
@@ -291,15 +301,20 @@ def test_sub_sheet_names(self, tmpdir):
                 ('a', 'b'),
                 ('c', [OrderedDict([('d', 'e')])]),
             ])],
-            schema_parser=schema_parser
+            schema_parser=schema_parser,
+            remove_empty_schema_columns=remove_empty_schema_columns,
         )
         parser.parse()
         assert list(parser.main_sheet) == [ 'a' ]
         assert parser.main_sheet.lines == [
             {'a': 'b'}
         ]
-        assert len(parser.sub_sheets) == 1
-        assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d', 'c/0/f'])
+        assert len(parser.sub_sheets) == 2 if not remove_empty_schema_columns else 1
+        if not remove_empty_schema_columns:
+            assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d', 'c/0/f'])
+            assert list(parser.sub_sheets['g']) == list(['ocid', 'g/0/h'])
+        else:
+            assert list(parser.sub_sheets['c']) == list(['ocid', 'c/0/d'])
         assert parser.sub_sheets['c'].lines == [{'c/0/d':'e'}]
 
     def test_column_matching(self, tmpdir):