Skip to content

Commit

Permalink
fix: performance improvements for large forms
Browse files Browse the repository at this point in the history
- missing translations check can still work while only looking at first
  row, which is a little faster and still passes the tests.
- while profiling, found an inefficient unique check in section.py. The
  benefit is only apparent with forms >1000 questions, but it is
  significant: q=5000 28s vs. 11s, q=10000 103s vs. 21s.
  • Loading branch information
lindsay-stevens committed Jan 21, 2022
1 parent f1babeb commit eb6f0fe
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 33 deletions.
9 changes: 5 additions & 4 deletions pyxform/section.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@ def validate(self):
# there's a stronger test of this when creating the xpath
# dictionary for a survey.
def _validate_uniqueness_of_element_names(self):
element_slugs = []
element_slugs = set()
for element in self.children:
if any(element.name.lower() == s.lower() for s in element_slugs):
elem_lower = element.name.lower()
if elem_lower in element_slugs:
raise PyXFormError(
"There are more than one survey elements named '%s' "
"(case-insensitive) in the section named '%s'."
% (element.name.lower(), self.name)
% (elem_lower, self.name)
)
element_slugs.append(element.name)
element_slugs.add(elem_lower)

def xml_instance(self, **kwargs):
"""
Expand Down
13 changes: 7 additions & 6 deletions pyxform/validators/pyxform/missing_translations_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,8 @@ def find_missing_translations(
each language (including the default / unspecified language) that is used for any
other translatable column.
This could be more efficient by not looking at every row, but that's how the data is
arranged by the time it passes through workbook_to_json(). On the bright side it
means this function could be adapted to warn about specific items lacking
translations, even when there are no missing translation columns.
This checks the first row only since it is concerned with the presence of columns, not
individual cells. It therefore assumes that each row object has the same structure.
:param sheet_data: The survey or choices sheet data.
:param translatable_columns: The translatable columns for a sheet. The structure
Expand All @@ -80,12 +78,15 @@ def process_cell(typ, cell):
translations_seen[lng].append(name)
translation_columns_seen.add(name)

for row in sheet_data:
for column_type, cell_content in row.items():
if 0 < len(sheet_data):
# e.g. ("name", "q1"), ("label", {"en": "Hello", "fr": "Bonjour"})
for column_type, cell_content in sheet_data[0].items():
if column_type == constants.MEDIA:
# e.g. ("audio", {"eng": "my.mp3"})
for media_type, media_cell in cell_content.items():
process_cell(typ=media_type, cell=media_cell)
if column_type == constants.BIND:
# e.g. ("jr:constraintMsg", "Try again")
for bind_type, bind_cell in cell_content.items():
process_cell(typ=bind_type, cell=bind_cell)
else:
Expand Down
51 changes: 28 additions & 23 deletions tests/test_translations.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,12 +458,16 @@ def test_missing_translation__one_lang_all_cols(self):

@unittest.skip("Slow performance test. Un-skip to run as needed.")
def test_missing_translations_check_performance(self):
"""Should find the translations check costs a fraction of a second for a giant form.
"""
Should find the translations check costs a fraction of a second for large forms.
Results with Python 3.8.9 on VM with 4CPU 8GB RAM, 500 questions with 2 choices
each, average of 20 runs (diff = 0.059s):
- with check (seconds): 1.351347613078542
- without check (seconds): 1.292487204639474
Results with Python 3.8.9 on VM with 4CPU 8GB RAM, x questions with 2 choices
each, average of 10 runs (seconds), with and without the check:
| num | with | without |
| 500 | 1.0192 | 0.9950 |
| 1000 | 2.0054 | 2.1026 |
| 2000 | 4.0714 | 4.0926 |
| 3000 | 6.0266 | 6.2476 |
"""
survey_header = """
| survey | | | | |
Expand All @@ -480,24 +484,25 @@ def test_missing_translations_check_performance(self):
| | c{i} | na | la-d | la-e |
| | c{i} | nb | lb-d | lb-e |
"""
questions = "\n".join((question.format(i=i) for i in range(1, 500)))
choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, 500)))
md = "".join((survey_header, questions, choices_header, choice_lists))

def run(name):
runs = 0
results = []
while runs < 20:
start = perf_counter()
self.assertPyxformXform(md=md)
results.append(perf_counter() - start)
runs += 1
print(name, sum(results) / len(results))

run(name="with check (seconds):")

with patch("pyxform.xls2json.missing_translations_check", return_value=[]):
run(name="without check (seconds):")
for count in (500, 1000, 2000):
questions = "\n".join((question.format(i=i) for i in range(1, count)))
choice_lists = "\n".join((choice_list.format(i=i) for i in range(1, count)))
md = "".join((survey_header, questions, choices_header, choice_lists))

def run(name):
runs = 0
results = []
while runs < 10:
start = perf_counter()
self.assertPyxformXform(md=md)
results.append(perf_counter() - start)
runs += 1
print(name, sum(results) / len(results))

run(name=f"questions={count}, with check (seconds):")

with patch("pyxform.xls2json.missing_translations_check", return_value=[]):
run(name=f"questions={count}, without check (seconds):")


class TestTranslationsSurvey(PyxformTestCase):
Expand Down

0 comments on commit eb6f0fe

Please sign in to comment.