Skip to content

Commit

Permalink
Merge pull request #575 from sheppard/openpyxl
Browse files Browse the repository at this point in the history
use openpyxl for XLSX files; upgrade xlrd to v2
  • Loading branch information
lindsay-stevens committed Dec 2, 2021
2 parents 66c0d5b + 05d5ac7 commit bb2497d
Show file tree
Hide file tree
Showing 18 changed files with 225 additions and 43 deletions.
36 changes: 20 additions & 16 deletions dev_requirements.pip
Original file line number Diff line number Diff line change
@@ -1,37 +1,41 @@
#
# This file is autogenerated by pip-compile
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
#
# pip-compile --annotation-style line --output-file=dev_requirements.pip dev_requirements.in
# pip-compile --annotation-style=line --output-file=dev_requirements.pip dev_requirements.in
#
astroid==2.8.0 # via pylint
black==21.9b0 # via -r requirements.in
black==21.9b0 # via -r dev_requirements.in
click==8.0.1 # via black, pip-tools
flake8==3.9.2 # via -r requirements.in
formencode==2.0.0 # via -r requirements.in
importlib-metadata==4.8.1 # via click, flake8, pep517
isort==5.9.3 # via -r requirements.in, pylint
defusedxml==0.7.1 # via pyxform
et-xmlfile==1.1.0 # via openpyxl
flake8==3.9.2 # via -r dev_requirements.in
formencode==2.0.0 # via -r dev_requirements.in
isort==5.9.3 # via -r dev_requirements.in, pylint
lazy-object-proxy==1.6.0 # via astroid
lxml==4.6.3 # via -r dev_requirements.in
mccabe==0.6.1 # via flake8, pylint
mock==4.0.3 # via -r requirements.in
mock==4.0.3 # via -r dev_requirements.in
mypy-extensions==0.4.3 # via black
nose==1.3.7 # via -r requirements.in
nose==1.3.7 # via -r dev_requirements.in
openpyxl==3.0.9 # via pyxform
pathspec==0.9.0 # via black
pep517==0.11.0 # via pip-tools
pip-tools==6.3.0 # via -r requirements.in
pip-tools==6.3.0 # via -r dev_requirements.in
platformdirs==2.3.0 # via black, pylint
pycodestyle==2.7.0 # via flake8
pyflakes==2.3.1 # via flake8
pylint==2.11.1 # via -r requirements.in
pylint==2.11.1 # via -r dev_requirements.in
regex==2021.8.28 # via black
six==1.16.0 # via formencode
toml==0.10.2 # via pylint
tomli==1.2.1 # via black, pep517
typed-ast==1.4.3 # via astroid, black
typing-extensions==3.10.0.2 # via astroid, black, importlib-metadata, pylint
typing-extensions==3.10.0.2 # via astroid, black, pylint
wheel==0.37.0 # via pip-tools
wrapt==1.12.1 # via astroid
xlrd==1.2.0 # via pyxform
yapf==0.31.0 # via -r requirements.in
zipp==3.5.0 # via importlib-metadata, pep517
xlrd==2.0.1 # via pyxform
yapf==0.31.0 # via -r dev_requirements.in

# The following packages are considered to be unsafe in a requirements file:
# pip
# setuptools
4 changes: 3 additions & 1 deletion pyxform/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@
EXTERNAL_CHOICES,
OSM,
]
SUPPORTED_FILE_EXTENSIONS = [".xls", ".xlsx", ".xlsm"]
XLS_EXTENSIONS = [".xls"]
XLSX_EXTENSIONS = [".xlsx", ".xlsm"]
SUPPORTED_FILE_EXTENSIONS = XLS_EXTENSIONS + XLSX_EXTENSIONS

LOCATION_PRIORITY = "location-priority"
LOCATION_MIN_INTERVAL = "location-min-interval"
Expand Down
37 changes: 36 additions & 1 deletion pyxform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
from json.decoder import JSONDecodeError
from xml.dom.minidom import Element, Text, parseString

import openpyxl
import xlrd

from pyxform.xls2json_backends import is_empty, xls_value_to_unicode, xlsx_value_to_str

SEP = "_"

# http://www.w3.org/TR/REC-xml/
Expand Down Expand Up @@ -151,8 +154,13 @@ def flatten(li):


def sheet_to_csv(workbook_path, csv_path, sheet_name):
    """
    Export one worksheet of a workbook to a CSV file.

    Workbooks whose path ends in ".xls" are handled by ``xls_sheet_to_csv``;
    every other path is handled by ``xlsx_sheet_to_csv``.

    :param workbook_path: path of the workbook to read (a string).
    :param csv_path: path of the CSV file to write.
    :param sheet_name: name of the worksheet to export.
    :return: whatever the selected exporter returns.
    """
    # Compare case-insensitively so that e.g. "FORM.XLS" is still read with
    # the XLS reader rather than being handed to the XLSX reader.
    if workbook_path.lower().endswith(".xls"):
        return xls_sheet_to_csv(workbook_path, csv_path, sheet_name)
    return xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name)


def xls_sheet_to_csv(workbook_path, csv_path, sheet_name):
wb = xlrd.open_workbook(workbook_path)
try:
sheet = wb.sheet_by_name(sheet_name)
Expand Down Expand Up @@ -181,6 +189,33 @@ def sheet_to_csv(workbook_path, csv_path, sheet_name):
return True


def xlsx_sheet_to_csv(workbook_path, csv_path, sheet_name):
    """
    Write one worksheet of an XLSX workbook to a CSV file.

    Only columns whose header cell (row 1) is non-empty are exported. Each
    exported value is converted with ``xlsx_value_to_str`` and stripped of
    leading and trailing whitespace; empty cells become empty strings.
    Every field is quoted.

    :param workbook_path: path of the workbook to read.
    :param csv_path: path of the CSV file to write.
    :param sheet_name: name of the worksheet to export.
    :return: False if the sheet does not exist or has fewer than two rows
      (no file is written in that case), otherwise True.
    """
    # data_only=True yields the cached result of a formula instead of the
    # formula text, so the CSV contains values rather than "=A1+B1".
    wb = openpyxl.open(workbook_path, data_only=True)
    try:
        # Subscript access replaces the deprecated get_sheet_by_name() and
        # raises the same KeyError for an unknown sheet.
        sheet = wb[sheet_name]
    except KeyError:
        return False
    if sheet.max_row < 2:
        return False
    # Explicit UTF-8 so the output does not depend on the platform default
    # encoding (which cannot represent all characters on some systems).
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        # True for each column that has a header and should be exported.
        mask = [not is_empty(cell.value) for cell in sheet[1]]
        for row in sheet.rows:
            csv_data = []
            try:
                for cell, keep in zip(row, mask):
                    if not keep:
                        continue
                    if is_empty(cell.value):
                        # str(None) would otherwise leak the text "None".
                        csv_data.append("")
                    else:
                        # clean the values of leading and trailing whitespaces
                        csv_data.append(xlsx_value_to_str(cell.value).strip())
            except TypeError:
                # Best effort: skip a row whose values cannot be converted.
                continue
            writer.writerow(csv_data)

    return True


def has_external_choices(json_struct):
"""
Returns true if a select one external prompt is used in the survey.
Expand Down
6 changes: 4 additions & 2 deletions pyxform/xls2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pyxform import aliases, constants
from pyxform.errors import PyXFormError
from pyxform.utils import default_is_dynamic, is_valid_xml_tag, levenshtein_distance
from pyxform.xls2json_backends import csv_to_dict, xls_to_dict
from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict

if TYPE_CHECKING:
from typing import Any, Dict, KeysView, Optional
Expand Down Expand Up @@ -1362,8 +1362,10 @@ def parse_file_to_workbook_dict(path, file_object=None):
if not extension:
raise PyXFormError("No extension.")

if extension in constants.SUPPORTED_FILE_EXTENSIONS:
if extension in constants.XLS_EXTENSIONS:
return xls_to_dict(file_object if file_object is not None else path)
elif extension in constants.XLSX_EXTENSIONS:
return xlsx_to_dict(file_object if file_object is not None else path)
elif extension == ".csv":
return csv_to_dict(file_object if file_object is not None else path)
else:
Expand Down
105 changes: 105 additions & 0 deletions pyxform/xls2json_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from collections import OrderedDict
from functools import reduce
from io import StringIO
from zipfile import BadZipFile

import openpyxl
import xlrd
from xlrd import XLRDError
from xlrd.xldate import XLDateAmbiguous
Expand Down Expand Up @@ -163,6 +165,109 @@ def xls_value_to_unicode(value, value_type, datemode):
return str(value).replace(chr(160), " ")


def xlsx_to_dict(path_or_file):
    """
    Return a Python dictionary with a key for each worksheet
    name. For each sheet there is a list of dictionaries, each
    dictionary corresponds to a single row in the worksheet. A
    dictionary has keys taken from the column headers and values
    equal to the cell value for that row and column.
    All the keys and leaf elements are strings.

    For every processed sheet an additional "<sheetname>_header" key holds
    the sheet's column headers passed through ``_list_to_dict_list``.
    Sheets not listed in ``constants.SUPPORTED_SHEET_NAMES`` are recorded
    with an empty list, except that the only sheet of a single-sheet
    workbook is also parsed as the survey sheet.

    :param path_or_file: path or file object handed to ``openpyxl.open``.
    :raises PyXFormError: if the workbook cannot be opened, or a sheet has
      duplicate column headers.
    """
    try:
        workbook = openpyxl.open(filename=path_or_file, data_only=True)
    except (OSError, BadZipFile, KeyError) as error:
        # Chain the cause so the underlying failure stays in the traceback.
        raise PyXFormError("Error reading .xlsx file: %s" % error) from error

    def xlsx_to_dict_normal_sheet(sheet):
        """Return (row dicts, header dicts) for one worksheet."""
        column_header_list = []
        for cell in sheet[1]:
            column_header = cell.value
            # Sheets often report extra trailing columns that are blank;
            # skip them for the duplicate check.
            if is_empty(column_header):
                # Preserve column order (will filter later)
                column_header_list.append(None)
                continue
            if not isinstance(column_header, str):
                # Typed cells (numbers, booleans, dates) have no .strip().
                column_header = xlsx_value_to_str(column_header)
            # strip whitespaces from the header
            clean_header = re.sub(r"( )+", " ", column_header.strip())
            # Compare the cleaned header: headers that differ only in
            # whitespace would otherwise silently share one dict key.
            if clean_header in column_header_list:
                raise PyXFormError("Duplicate column header: %s" % column_header)
            column_header_list.append(clean_header)

        rows = []
        for row in sheet.iter_rows(min_row=2):
            row_dict = OrderedDict()
            for key, cell in zip(column_header_list, row):
                if key is None:
                    continue

                value = cell.value
                if isinstance(value, str):
                    value = value.strip()

                if not is_empty(value):
                    row_dict[key] = xlsx_value_to_str(value)

            # Rows without any value are kept as empty dicts.
            rows.append(row_dict)

        headers = [key for key in column_header_list if key is not None]
        return rows, _list_to_dict_list(headers)

    result = OrderedDict()
    for sheetname in workbook.sheetnames:
        sheet = workbook[sheetname]
        # Note that the sheet exists but do no further processing here.
        result[sheetname] = []
        if sheetname in constants.SUPPORTED_SHEET_NAMES:
            (
                result[sheetname],
                result[f"{sheetname}_header"],
            ) = xlsx_to_dict_normal_sheet(sheet)
        elif len(workbook.sheetnames) == 1:
            # A lone sheet with an unrecognised name is treated as the survey.
            (
                result[constants.SURVEY],
                result[f"{constants.SURVEY}_header"],
            ) = xlsx_to_dict_normal_sheet(sheet)
        # Otherwise: do not process sheets that have nothing to do with XLSForm.

    return result


def xlsx_value_to_str(value):
    """
    Build the string representation of a value read from an xlsx cell.

    Booleans become "TRUE"/"FALSE", floats holding a whole number are shown
    without a decimal part, and any other value is passed through ``str``.
    """
    if isinstance(value, bool):
        return "TRUE" if value else "FALSE"

    if isinstance(value, float) and value.is_integer():
        # Try to display as an int if possible.
        return str(int(value))

    text = str(value)
    if isinstance(value, (int, datetime.datetime, datetime.time)):
        return text

    # ensure unicode and replace nbsp spaces with normal ones
    # to avoid this issue:
    # https://github.com/modilabs/pyxform/issues/83
    return text.replace(chr(160), " ")


def is_empty(value):
    """Return True for None and for strings that are blank after stripping."""
    if isinstance(value, str):
        return value.strip() == ""
    return value is None


def get_cascading_json(sheet_list, prefix, level):
return_list = []
for row in sheet_list:
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
long_description=open("README.rst", "rt").read(),
python_requires=">=3.7",
install_requires=[
"xlrd==1.2.0",
"xlrd==2.0.1",
"openpyxl==3.0.9",
"defusedxml==0.7.1",
],
entry_points={
"console_scripts": [
Expand Down
Binary file added tests/example_xls/group.xlsx
Binary file not shown.
Binary file added tests/example_xls/include.xlsx
Binary file not shown.
Binary file added tests/example_xls/include_json.xlsx
Binary file not shown.
Binary file added tests/example_xls/loop.xlsx
Binary file not shown.
Binary file added tests/example_xls/specify_other.xlsx
Binary file not shown.
Binary file added tests/example_xls/text_and_integer.xlsx
Binary file not shown.
Binary file removed tests/example_xls/text_and_integer_xlsx.xlsx
Binary file not shown.
Binary file added tests/example_xls/yes_or_no_question.xlsx
Binary file not shown.
6 changes: 3 additions & 3 deletions tests/test_xls2json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from pyxform.xls2json_backends import xls_to_dict
from pyxform.xls2json_backends import xlsx_to_dict
from pyxform.xls2xform import xls2xform_convert
from tests import example_xls, test_output
from tests.pyxform_test_case import PyxformTestCase
Expand Down Expand Up @@ -617,9 +617,9 @@ def test_xls2xform_convert__e2e_with_settings_misspelling(self):
)
self.assertIn(expected, "\n".join(warnings))

def test_xls_to_dict__extra_sheet_names_are_returned_by_parser(self):
def test_xlsx_to_dict__extra_sheet_names_are_returned_by_parser(self):
"""Should return all sheet names so that later steps can do spellcheck."""
d = xls_to_dict(os.path.join(example_xls.PATH, "extra_sheet_names.xlsx"))
d = xlsx_to_dict(os.path.join(example_xls.PATH, "extra_sheet_names.xlsx"))
self.assertIn("survey", d)
self.assertIn("my_sheet", d)
self.assertIn("stettings", d)
Expand Down
42 changes: 41 additions & 1 deletion tests/test_xls2json_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
"""
from unittest import TestCase

import openpyxl
import xlrd

from pyxform.xls2json_backends import xls_value_to_unicode
from pyxform.xls2json_backends import (
xls_to_dict,
xls_value_to_unicode,
xlsx_to_dict,
xlsx_value_to_str,
)
from tests import utils


class TestXLS2JSONBackends(TestCase):
Expand All @@ -33,3 +40,36 @@ def test_xls_value_to_unicode(self):
csv_data = xls_value_to_unicode(value, value_type, datemode)
expected_output = "46.9"
self.assertEqual(csv_data, expected_output)

def test_xlsx_value_to_str(self):
    """Whole-number floats lose the decimal part; other floats are unchanged."""
    cases = (
        (32.0, "32"),
        # Test that the decimal value is not changed during conversion.
        (46.9, "46.9"),
    )
    for cell_value, expected_output in cases:
        self.assertEqual(xlsx_value_to_str(cell_value), expected_output)

def test_defusedxml_enabled(self):
    """Check that the openpyxl.DEFUSEDXML flag is truthy."""
    defused_flag = openpyxl.DEFUSEDXML
    self.assertTrue(defused_flag)

def test_equivalency(self):
    """Each fixture must parse to the same dict from its .xls and .xlsx file."""
    # Show the complete diff when a comparison fails.
    self.maxDiff = None
    fixture_names = (
        "group",
        "loop",
        "specify_other",
        "include",
        "text_and_integer",
        "include_json",
        "yes_or_no_question",
    )
    for name in fixture_names:
        xls_path = utils.path_to_text_fixture(f"{name}.xls")
        xlsx_path = utils.path_to_text_fixture(f"{name}.xlsx")
        from_xls = xls_to_dict(xls_path)
        from_xlsx = xlsx_to_dict(xlsx_path)
        self.assertEqual(from_xls, from_xlsx)
Loading

0 comments on commit bb2497d

Please sign in to comment.