add: library API for non-path input, accept markdown and dict input

- add: xls2xform.convert for library users to call pyxform without needing to use files (accepts bytes/file handles/strings)
  - accepts markdown input since this is widely used by pyxform
  - accepts dict to avoid needing to use internal funcs that may change
- chg: avoid writing to files unless validate=True (for ODK Validate)
  - also avoid assuming any files were written, e.g. missing_ok=True
- chg: move xls/x_sheet_to_csv, sheet_to_csv from utils.py to xls2json_backends.py because they are backends for csv input.
- chg: move md_to_dict from test directory into xls2json_backends.py
- chg: refactor pyxform_test_case.py to use xls2xform.convert only, instead of internal funcs associated with md_to_dict, so that the existing tests check API stability e.g. file types, dict input, etc.
XLSForm · Jun 25, 2024 · 1a1462a · 1a1462a
1 parent 59c37e0
commit 1a1462a
Show file tree

Hide file tree

Showing 51 changed files with 1,385 additions and 1,414 deletions.
diff --git a/clean_for_build.py b/clean_for_build.py
diff --git a/pyxform/constants.py b/pyxform/constants.py
@@ -34,6 +34,7 @@
 SUBMISSION_URL = "submission_url"
 AUTO_SEND = "auto_send"
 AUTO_DELETE = "auto_delete"
+DEFAULT_FORM_NAME = "data"
 DEFAULT_LANGUAGE_KEY = "default_language"
 DEFAULT_LANGUAGE_VALUE = "default"
 LABEL = "label"

diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py
@@ -72,7 +72,7 @@ def get_validated_dataset_name(entity):
 
     if not is_valid_xml_tag(dataset):
         if isinstance(dataset, bytes):
-            dataset = dataset.encode("utf-8")
+            dataset = dataset.decode("utf-8")
 
         raise PyXFormError(
             f"Invalid entity list name: '{dataset}'. Names must begin with a letter, colon, or underscore. Other characters can include numbers or dashes."
@@ -117,7 +117,7 @@ def validate_entity_saveto(
 
     if not is_valid_xml_tag(save_to):
         if isinstance(save_to, bytes):
-            save_to = save_to.encode("utf-8")
+            save_to = save_to.decode("utf-8")
 
         raise PyXFormError(
             f"{error_start} '{save_to}'. Entity property names {const.XML_IDENTIFIER_ERROR_MESSAGE}"

diff --git a/pyxform/errors.py b/pyxform/errors.py
@@ -9,3 +9,7 @@ class PyXFormError(Exception):
 
 class ValidationError(PyXFormError):
     """Common base class for pyxform validation exceptions."""
+
+
+class PyXFormReadError(PyXFormError):
+    """Common base class for pyxform exceptions occurring during reading XLSForm data."""
diff --git a/pyxform/instance.py b/pyxform/instance.py
@@ -2,6 +2,8 @@
 SurveyInstance class module.
 """
 
+import os.path
+
 from pyxform.errors import PyXFormError
 from pyxform.xform_instance_parser import parse_xform_instance
 
@@ -76,8 +78,6 @@ def answers(self):
         return self._answers
 
     def import_from_xml(self, xml_string_or_filename):
-        import os.path
-
         if os.path.isfile(xml_string_or_filename):
             xml_str = open(xml_string_or_filename, encoding="utf-8").read()
         else:

diff --git a/pyxform/survey.py b/pyxform/survey.py
@@ -10,6 +10,7 @@
 from collections.abc import Generator, Iterator
 from datetime import datetime
 from functools import lru_cache
+from pathlib import Path
 
 from pyxform import aliases, constants
 from pyxform.constants import EXTERNAL_INSTANCE_EXTENSIONS, NSMAP
@@ -970,10 +971,10 @@ def date_stamp(self):
         """Returns a date string with the format of %Y_%m_%d."""
         return self._created.strftime("%Y_%m_%d")
 
-    def _to_ugly_xml(self):
+    def _to_ugly_xml(self) -> str:
         return '<?xml version="1.0"?>' + self.xml().toxml()
 
-    def _to_pretty_xml(self):
+    def _to_pretty_xml(self) -> str:
         """Get the XForm with human readable formatting."""
         return '<?xml version="1.0"?>\n' + self.xml().toprettyxml(indent="  ")
 
@@ -1171,10 +1172,9 @@ def _var_repl_output_function(matchobj):
         else:
             return text, False
 
-    # pylint: disable=too-many-arguments
     def print_xform_to_file(
         self, path=None, validate=True, pretty_print=True, warnings=None, enketo=False
-    ):
+    ) -> str:
         """
         Print the xForm to a file and optionally validate it as well by
         throwing exceptions and adding warnings to the warnings array.
@@ -1183,12 +1183,13 @@ def print_xform_to_file(
             warnings = []
         if not path:
             path = self._print_name + ".xml"
+        if pretty_print:
+            xml = self._to_pretty_xml()
+        else:
+            xml = self._to_ugly_xml()
         try:
             with open(path, mode="w", encoding="utf-8") as file_obj:
-                if pretty_print:
-                    file_obj.write(self._to_pretty_xml())
-                else:
-                    file_obj.write(self._to_ugly_xml())
+                file_obj.write(xml)
         except Exception:
             if os.path.exists(path):
                 os.unlink(path)
@@ -1210,6 +1211,7 @@ def print_xform_to_file(
                     + ". "
                     + "Learn more: http://xlsform.org#multiple-language-support"
                 )
+        return xml
 
     def to_xml(self, validate=True, pretty_print=True, warnings=None, enketo=False):
         """
@@ -1227,20 +1229,16 @@ def to_xml(self, validate=True, pretty_print=True, warnings=None, enketo=False):
         tmp.close()
         try:
             # this will throw an exception if the xml is not valid
-            self.print_xform_to_file(
+            xml = self.print_xform_to_file(
                 path=tmp.name,
                 validate=validate,
                 pretty_print=pretty_print,
                 warnings=warnings,
                 enketo=enketo,
             )
         finally:
-            if os.path.exists(tmp.name):
-                os.remove(tmp.name)
-        if pretty_print:
-            return self._to_pretty_xml()
-
-        return self._to_ugly_xml()
+            Path(tmp.name).unlink(missing_ok=True)
+        return xml
 
     def instantiate(self):
         """

diff --git a/pyxform/utils.py b/pyxform/utils.py
@@ -7,17 +7,16 @@
 import json
 import os
 import re
+from io import StringIO
 from json.decoder import JSONDecodeError
-from typing import NamedTuple
+from typing import Any, NamedTuple
 from xml.dom import Node
 from xml.dom.minidom import Element, Text, _write_data
 
-import openpyxl
-import xlrd
 from defusedxml.minidom import parseString
 
+from pyxform import constants as const
 from pyxform.errors import PyXFormError
-from pyxform.xls2json_backends import is_empty, xls_value_to_unicode, xlsx_value_to_str
 
 SEP = "_"
 
@@ -167,66 +166,32 @@ def flatten(li):
         yield from subli
 
 
-def sheet_to_csv(workbook_path, csv_path, sheet_name):
-    if workbook_path.endswith(".xls"):
-        return xls_sheet_to_csv(workbook_path, csv_path, sheet_name)
-    else:
-        return xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name)
-
+def external_choices_to_csv(
+    workbook_dict: dict[str, Any], warnings: list | None = None
+) -> str | None:
+    """
+    Convert the 'external_choices' sheet data to CSV.
 
-def xls_sheet_to_csv(workbook_path, csv_path, sheet_name):
-    wb = xlrd.open_workbook(workbook_path)
-    try:
-        sheet = wb.sheet_by_name(sheet_name)
-    except xlrd.biffh.XLRDError:
-        return False
-    if not sheet or sheet.nrows < 2:
-        return False
-    with open(csv_path, mode="w", encoding="utf-8", newline="") as f:
-        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
-        mask = [v and len(v.strip()) > 0 for v in sheet.row_values(0)]
-        for row_idx in range(sheet.nrows):
-            csv_data = []
-            try:
-                for v, m in zip(sheet.row(row_idx), mask, strict=False):
-                    if m:
-                        value = v.value
-                        value_type = v.ctype
-                        data = xls_value_to_unicode(value, value_type, wb.datemode)
-                        # clean the values of leading and trailing whitespaces
-                        data = data.strip()
-                        csv_data.append(data)
-            except TypeError:
-                continue
-            writer.writerow(csv_data)
-
-    return True
-
-
-def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
-    wb = openpyxl.open(workbook_path, read_only=True, data_only=True)
+    :param workbook_dict: The result from xls2json.workbook_to_json.
+    :param warnings: The conversions warnings list.
+    """
+    warnings = coalesce(warnings, [])
+    if const.EXTERNAL_CHOICES not in workbook_dict:
+        warnings.append(
+            f"Could not export itemsets.csv, the '{const.EXTERNAL_CHOICES}' sheet is missing."
+        )
+        return None
+
+    itemsets = StringIO(newline="")
+    csv_writer = csv.writer(itemsets, quoting=csv.QUOTE_ALL)
     try:
-        sheet = wb[sheet_name]
-    except KeyError:
-        return False
-
-    with open(csv_path, mode="w", encoding="utf-8", newline="") as f:
-        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
-        mask = [not is_empty(cell.value) for cell in sheet[1]]
-        for row in sheet.rows:
-            csv_data = []
-            try:
-                for v, m in zip(row, mask, strict=False):
-                    if m:
-                        data = xlsx_value_to_str(v.value)
-                        # clean the values of leading and trailing whitespaces
-                        data = data.strip()
-                        csv_data.append(data)
-            except TypeError:
-                continue
-            writer.writerow(csv_data)
-    wb.close()
-    return True
+        header = workbook_dict["external_choices_header"][0]
+    except (IndexError, KeyError, TypeError):
+        header = {k for d in workbook_dict[const.EXTERNAL_CHOICES] for k in d}
+    csv_writer.writerow(header)
+    for row in workbook_dict[const.EXTERNAL_CHOICES]:
+        csv_writer.writerow(row.values())
+    return itemsets.getvalue()
 
 
 def has_external_choices(json_struct):
@@ -235,7 +200,11 @@ def has_external_choices(json_struct):
     """
     if isinstance(json_struct, dict):
         for k, v in json_struct.items():
-            if k == "type" and isinstance(v, str) and v.startswith("select one external"):
+            if (
+                k == const.TYPE
+                and isinstance(v, str)
+                and v.startswith(const.SELECT_ONE_EXTERNAL)
+            ):
                 return True
             elif has_external_choices(v):
                 return True

diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
@@ -22,7 +22,7 @@
 )
 from pyxform.errors import PyXFormError
 from pyxform.parsing.expression import is_single_token_expression
-from pyxform.utils import PYXFORM_REFERENCE_REGEX, default_is_dynamic
+from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic
 from pyxform.validators.pyxform import parameters_generic, select_from_file
 from pyxform.validators.pyxform.android_package_name import validate_android_package_name
 from pyxform.validators.pyxform.translations_checks import SheetTranslations
@@ -395,7 +395,7 @@ def workbook_to_json(
     workbook_dict,
     form_name: str | None = None,
     fallback_form_name: str | None = None,
-    default_language: str = constants.DEFAULT_LANGUAGE_VALUE,
+    default_language: str | None = None,
     warnings: list[str] | None = None,
 ) -> dict[str, Any]:
     """
@@ -416,8 +416,7 @@ def workbook_to_json(
     returns a nested dictionary equivalent to the format specified in the
     json form spec.
     """
-    if warnings is None:
-        warnings = []
+    warnings = coalesce(warnings, [])
     is_valid = False
     # Sheet names should be case-insensitive
     workbook_dict = {x.lower(): y for x, y in workbook_dict.items()}
@@ -441,8 +440,8 @@ def workbook_to_json(
         )
 
     # Make sure the passed in vars are unicode
-    form_name = str(form_name)
-    default_language = str(default_language)
+    form_name = str(coalesce(form_name, constants.DEFAULT_FORM_NAME))
+    default_language = str(coalesce(default_language, constants.DEFAULT_LANGUAGE_VALUE))
 
     # We check for double columns to determine whether to use them
     # or single colons to delimit grouped headers.
@@ -500,7 +499,9 @@ def workbook_to_json(
         )
 
     # Here we create our json dict root with default settings:
-    id_string = settings.get(constants.ID_STRING, fallback_form_name)
+    id_string = settings.get(
+        constants.ID_STRING, coalesce(fallback_form_name, constants.DEFAULT_FORM_NAME)
+    )
     sms_keyword = settings.get(constants.SMS_KEYWORD, id_string)
     json_dict = {
         constants.TYPE: constants.SURVEY,
@@ -970,7 +971,7 @@ def workbook_to_json(
         question_name = str(row[constants.NAME])
         if not is_valid_xml_tag(question_name):
             if isinstance(question_name, bytes):
-                question_name = question_name.encode("utf-8")
+                question_name = question_name.decode("utf-8")
 
             raise PyXFormError(
                 f"{ROW_FORMAT_STRING % row_number} Invalid question name '{question_name}'. Names {XML_IDENTIFIER_ERROR_MESSAGE}"
@@ -1591,7 +1592,7 @@ def get_filename(path):
 
 def parse_file_to_json(
     path: str,
-    default_name: str = "data",
+    default_name: str = constants.DEFAULT_FORM_NAME,
     default_language: str = constants.DEFAULT_LANGUAGE_VALUE,
     warnings: list[str] | None = None,
     file_object: IO | None = None,