Merge pull request #571 from lindsay-stevens/pyxform-157

Warn when some languages are missing translations
XLSForm · Feb 10, 2022 · f4cbc67 · f4cbc67
2 parents 501b821 + 8c12e4e
commit f4cbc67
Show file tree

Hide file tree

Showing 8 changed files with 1,651 additions and 76 deletions.
diff --git a/pyxform/aliases.py b/pyxform/aliases.py
@@ -106,6 +106,25 @@
     "body": "control",
     "parameters": "parameters",
 }
+# Translatable columns for each sheet. Each key is the name pyxform uses internally and
+# each value is the column name shown to users in error/warning messages.
+TRANSLATABLE_SURVEY_COLUMNS = {
+    constants.LABEL: constants.LABEL,
+    # "short" is translatable per the ODK Spec; add it once pyxform supports it.
+    constants.HINT: constants.HINT,
+    "guidance_hint": "guidance_hint",
+    "image": survey_header["image"],
+    # "big-image" is translatable per the ODK Spec; add it once pyxform supports it.
+    "audio": survey_header["audio"],
+    "video": survey_header["video"],
+    "jr:constraintMsg": "constraint_message",
+    "jr:requiredMsg": "required_message",
+}
+TRANSLATABLE_CHOICES_COLUMNS = {
+    constants.LABEL: constants.LABEL,
+    "image": "media::image",
+    "audio": "media::audio",
+    "video": "media::video",
+}
 list_header = {
     "caption": constants.LABEL,
     "list_name": constants.LIST_NAME,

diff --git a/pyxform/constants.py b/pyxform/constants.py
@@ -32,7 +32,8 @@
 SUBMISSION_URL = "submission_url"
 AUTO_SEND = "auto_send"
 AUTO_DELETE = "auto_delete"
-DEFAULT_LANGUAGE = "default_language"
+DEFAULT_LANGUAGE_KEY = "default_language"
+DEFAULT_LANGUAGE_VALUE = "default"
 LABEL = "label"
 HINT = "hint"
 STYLE = "style"

diff --git a/pyxform/section.py b/pyxform/section.py
@@ -18,15 +18,16 @@ def validate(self):
     # there's a stronger test of this when creating the xpath
     # dictionary for a survey.
     def _validate_uniqueness_of_element_names(self):
-        element_slugs = []
+        element_slugs = set()
         for element in self.children:
-            if any(element.name.lower() == s.lower() for s in element_slugs):
+            elem_lower = element.name.lower()
+            if elem_lower in element_slugs:
                 raise PyXFormError(
                     "There are more than one survey elements named '%s' "
                     "(case-insensitive) in the section named '%s'."
-                    % (element.name.lower(), self.name)
+                    % (elem_lower, self.name)
                 )
-            element_slugs.append(element.name)
+            element_slugs.add(elem_lower)
 
     def xml_instance(self, **kwargs):
         """

diff --git a/pyxform/validators/pyxform/__init__.py b/pyxform/validators/pyxform/__init__.py
diff --git a/pyxform/validators/pyxform/missing_translations_check.py b/pyxform/validators/pyxform/missing_translations_check.py
@@ -0,0 +1,138 @@
+from collections import defaultdict
+from typing import TYPE_CHECKING
+
+from pyxform import aliases, constants
+from pyxform.errors import PyXFormError
+
+if TYPE_CHECKING:
+    from typing import Dict, List, Optional, Sequence, Union
+
+    SheetData = List[Dict[str, Union[str, Dict]]]
+
+
+def format_missing_translations_msg(
+    _in: "Dict[str, Dict[str, Sequence]]",
+) -> "Optional[str]":
+    """
+    Format the missing translations data into a warning message.
+
+    :param _in: A dict structured as Dict[survey|choices: Dict[language: (columns)]].
+      In other words, for the survey or choices sheet, a dict of the language(s) and
+      column names for which there are missing translations.
+    :return: The warning message, or None if there were no missing columns.
+    :raises PyXFormError: If the columns for a language are a bare string instead of
+      a sequence of column names.
+    """
+
+    def get_sheet_msg(name, sheet):
+        # Builds one line per language, in sorted language order. Returns None when
+        # the sheet is absent/empty or no language has any missing columns.
+        if not sheet:
+            return None
+        lang_msgs = []
+        for lang in sorted(sheet):
+            cols = sheet[lang]
+            if isinstance(cols, str):
+                # A bare string would otherwise be split into single characters below.
+                msg = f"Expected a sequence of columns, got a string for {lang}."
+                raise PyXFormError(msg)
+            if 1 == len(cols):
+                msg = f"Language '{lang}' is missing the {name} {cols[0]} column."
+                lang_msgs.append(msg)
+            elif 1 < len(cols):
+                c = ", ".join(sorted(cols))
+                msg = f"Language '{lang}' is missing the {name} columns {c}."
+                lang_msgs.append(msg)
+        if not lang_msgs:
+            # Every language had an empty column list: nothing to report.
+            return None
+        return "\n".join(lang_msgs)
+
+    survey = get_sheet_msg(name=constants.SURVEY, sheet=_in.get(constants.SURVEY))
+    choices = get_sheet_msg(name=constants.CHOICES, sheet=_in.get(constants.CHOICES))
+
+    # Survey messages come first, then choices; sheets with nothing to report are skipped.
+    messages = tuple(i for i in (survey, choices) if i is not None)
+    if 0 == len(messages):
+        return None
+    return "\n".join(messages)
+
+
+def find_missing_translations(
+    sheet_data: "SheetData",
+    translatable_columns: "Dict[str, str]",
+) -> "Dict[str, List[str]]":
+    """
+    Find missing translation columns in the sheet data.
+
+    For each translatable column used in the sheet, there should be a translation for
+    each language (including the default / unspecified language) that is used for any
+    other translatable column.
+
+    This checks the first row only since it is concerned with the presence of columns, not
+    individual cells. It therefore assumes that each row object has the same structure.
+
+    :param sheet_data: The survey or choices sheet data.
+    :param translatable_columns: The translatable columns for a sheet. The structure
+      should be Dict[internal_name, external_name]. See the aliases module.
+    :return: Dict[language, List[column_name]]. For each language seen in the sheet, the
+      external names of the translatable columns that language lacks, in the order the
+      columns were first seen. Languages with nothing missing are omitted.
+    """
+    translations_seen = defaultdict(list)
+    # A dict is used as an insertion-ordered set so the result order is repeatable.
+    translation_columns_seen = {}
+
+    def process_cell(typ, cell):
+        # Record which language(s) provide a value for a translatable column.
+        if cell is None or typ not in translatable_columns:
+            return
+        name = translatable_columns[typ]
+        if isinstance(cell, str):
+            # A plain string has no language key, so it counts as the default language.
+            translations_seen[constants.DEFAULT_LANGUAGE_VALUE].append(name)
+            translation_columns_seen[name] = None
+        elif isinstance(cell, dict):
+            for lng in cell:
+                translations_seen[lng].append(name)
+                translation_columns_seen[name] = None
+
+    # Columns whose content is a nested dict of sub-columns rather than a cell, e.g.
+    # ("media", {"audio": {"eng": "my.mp3"}}) or ("bind", {"jr:constraintMsg": "Retry"}).
+    grouped_columns = (constants.MEDIA, constants.BIND)
+
+    if 0 < len(sheet_data):
+        # e.g. ("name", "q1"), ("label", {"en": "Hello", "fr": "Bonjour"})
+        for column_type, cell_content in sheet_data[0].items():
+            if column_type in grouped_columns:
+                # Only descend into a group when its content really is a dict; the
+                # group itself is never processed as a translatable column.
+                if isinstance(cell_content, dict):
+                    for sub_type, sub_cell in cell_content.items():
+                        process_cell(typ=sub_type, cell=sub_cell)
+            else:
+                process_cell(typ=column_type, cell=cell_content)
+
+    # A column is missing for a language if any language has it but this one does not.
+    missing = defaultdict(list)
+    for lang, lang_trans in translations_seen.items():
+        for seen_tran in translation_columns_seen:
+            if seen_tran not in lang_trans:
+                missing[lang].append(seen_tran)
+
+    return missing
+
+
+def missing_translations_check(
+    survey_sheet: "SheetData",
+    choices_sheet: "SheetData",
+    warnings: "Optional[List[str]]",
+):
+    """
+    Add a warning if there are missing translation columns in the survey or choices data.
+
+    :param survey_sheet: The survey sheet data.
+    :param choices_sheet: The choices sheet data.
+    :param warnings: The warnings list, which may be empty. If None, a new list is used.
+    :return: The warnings list, possibly with a new message, otherwise unchanged.
+    """
+    if warnings is None:
+        warnings = []
+    # Keyed by the same constants the message formatter reads, so the two cannot drift.
+    missing = {
+        constants.SURVEY: find_missing_translations(
+            sheet_data=survey_sheet,
+            translatable_columns=aliases.TRANSLATABLE_SURVEY_COLUMNS,
+        ),
+        constants.CHOICES: find_missing_translations(
+            sheet_data=choices_sheet,
+            translatable_columns=aliases.TRANSLATABLE_CHOICES_COLUMNS,
+        ),
+    }
+    # Survey and choices results are combined into a single warning message.
+    if any(0 < len(sheet_missing) for sheet_missing in missing.values()):
+        msg = format_missing_translations_msg(_in=missing)
+        if msg is not None:
+            warnings.append(msg)
+    return warnings
diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
@@ -13,10 +13,13 @@
 from pyxform import aliases, constants
 from pyxform.errors import PyXFormError
 from pyxform.utils import default_is_dynamic, is_valid_xml_tag, levenshtein_distance
+from pyxform.validators.pyxform.missing_translations_check import (
+    missing_translations_check,
+)
 from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
 
 if TYPE_CHECKING:
-    from typing import Any, Dict, KeysView, Optional
+    from typing import Any, Dict, KeysView, List, Optional
 
 
 SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
@@ -92,11 +95,11 @@ def replace_smart_quotes_in_dict(_d):
 
 
 def dealias_and_group_headers(
-    dict_array,
-    header_aliases,
-    use_double_colons,
-    default_language="default",
-    ignore_case=False,
+    dict_array: "List[Dict]",
+    header_aliases: "Dict",
+    use_double_colons: bool,
+    default_language: str = constants.DEFAULT_LANGUAGE_VALUE,
+    ignore_case: bool = False,
 ):
     """
     For each row in the worksheet, group all keys that contain a double colon.
@@ -220,7 +223,7 @@ def group_dictionaries_by_key(list_of_dicts, key, remove_key=True):
     return dict_of_lists
 
 
-def has_double_colon(workbook_dict):
+def has_double_colon(workbook_dict) -> bool:
     """
     Look for a column header with a doublecolon (::) and
     return true if one is found.
@@ -344,7 +347,7 @@ def workbook_to_json(
     workbook_dict,
     form_name=None,
     fallback_form_name=None,
-    default_language="default",
+    default_language=constants.DEFAULT_LANGUAGE_VALUE,
     warnings=None,
 ) -> "Dict[str, Any]":
     """
@@ -438,7 +441,7 @@ def workbook_to_json(
     settings = settings_sheet[0] if len(settings_sheet) > 0 else {}
     replace_smart_quotes_in_dict(settings)
 
-    default_language = settings.get(constants.DEFAULT_LANGUAGE, default_language)
+    default_language = settings.get(constants.DEFAULT_LANGUAGE_KEY, default_language)
 
     # add_none_option is a boolean that when true,
     # indicates a none option should automatically be added to selects.
@@ -457,7 +460,7 @@ def workbook_to_json(
         constants.TITLE: id_string,
         constants.ID_STRING: id_string,
         constants.SMS_KEYWORD: sms_keyword,
-        constants.DEFAULT_LANGUAGE: default_language,
+        constants.DEFAULT_LANGUAGE_KEY: default_language,
         # By default the version is based on the date and time yyyymmddhh
         # Leaving default version out for now since it might cause
         # problems for formhub.
@@ -487,6 +490,8 @@ def workbook_to_json(
         choices_sheet, aliases.list_header, use_double_colons, default_language
     )
     combined_lists = group_dictionaries_by_key(choices_sheet, constants.LIST_NAME)
+    # To combine the warnings into one message, the check for missing choices
+    # translation columns is run together with the survey sheet check below.
 
     choices = combined_lists
     # Make sure all the options have the required properties:
@@ -568,6 +573,14 @@ def workbook_to_json(
     )
     survey_sheet = dealias_types(survey_sheet)
 
+    # Check for missing translations. The choices sheet is checked here so that the
+    # warning can be combined into one message.
+    warnings = missing_translations_check(
+        survey_sheet=survey_sheet,
+        choices_sheet=choices_sheet,
+        warnings=warnings,
+    )
+
     # No spell check for OSM sheet (infrequently used, many spurious matches).
     osm_sheet = dealias_and_group_headers(
         workbook_dict.get(constants.OSM, []), aliases.list_header, True
@@ -1382,7 +1395,7 @@ def get_filename(path):
 def parse_file_to_json(
     path,
     default_name="data",
-    default_language="default",
+    default_language=constants.DEFAULT_LANGUAGE_VALUE,
     warnings=None,
     file_object=None,
 ):
@@ -1514,7 +1527,10 @@ def _setup_question_types_dictionary(self):
         types_sheet = "question types"
         self._dict = self._dict[types_sheet]
         self._dict = dealias_and_group_headers(
-            self._dict, {}, use_double_colons, "default"
+            dict_array=self._dict,
+            header_aliases={},
+            use_double_colons=use_double_colons,
+            default_language=constants.DEFAULT_LANGUAGE_VALUE,
         )
         self._dict = organize_by_values(self._dict, "name")
 

diff --git a/tests/pyxform_test_case.py b/tests/pyxform_test_case.py
@@ -161,8 +161,8 @@ def assertPyxformXform(self, **kwargs):
           * error__contains: a list of strings which should exist in the error
           * error__not_contains: a list of strings which should not exist in the error
           * odk_validate_error__contains: list of strings; run_odk_validate must be set
-          * warning__contains: a list of strings which should exist in the warnings
-          * warning__not_contains: a list of strings which should not exist in the warnings
+          * warnings__contains: a list of strings which should exist in the warnings
+          * warnings__not_contains: a list of strings which should not exist in the warnings
           * warnings_count: the number of expected warning messages
           * xml__excludes: an array of strings which should not exist in the resulting
                xml. [xml|model|instance|itext]_excludes are also supported.
@@ -489,7 +489,8 @@ def assert_xpath_count(
             content=content,
             xpath=xpath,
         )
-        self.assertEqual(expected, len(observed), matcher_context.content_str)
+        msg = f"XPath found no matches:\n{xpath}\n\nXForm content:\n{matcher_context.content_str}"
+        self.assertEqual(expected, len(observed), msg=msg)
 
 
 def reorder_attributes(root):