Skip to content

Commit

Permalink
Add support for specifying response file encoding (emscripten-core#15426
Browse files Browse the repository at this point in the history
)

* Add support for specifying response file encoding using the suffix of the response file name, and autodetect the response file encoding using the suffix of the response file name if one is specified there.

* Update Changelog

* Adjust test to verify locale.getpreferredencoding()

* Update ChangeLog

* Improve comments

* Update comment

* Relocate comment

* Update Changelog

* Add test to windows config
  • Loading branch information
juj committed Nov 15, 2021
1 parent 4b9a0d0 commit 5db73a6
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ jobs:
# note we do *not* build all libraries and freeze the cache; as we run
# only limited tests here, it's more efficient to build on demand
- run-tests:
test_targets: "other.test_emcc_cflags other.test_stdin other.test_bad_triple wasm2.test_sse1 wasm2.test_ccall other.test_closure_externs other.test_binaryen_debug other.test_js_optimizer_parse_error other.test_output_to_nowhere other.test_emcc_dev_null other.test_cmake* other.test_system_include_paths other.test_emar_response_file wasm2.test_utf16 other.test_special_chars_in_arguments other.test_toolchain_profiler other.test_realpath_nodefs"
test_targets: "other.test_emcc_cflags other.test_stdin other.test_bad_triple wasm2.test_sse1 wasm2.test_ccall other.test_closure_externs other.test_binaryen_debug other.test_js_optimizer_parse_error other.test_output_to_nowhere other.test_emcc_dev_null other.test_cmake* other.test_system_include_paths other.test_emar_response_file wasm2.test_utf16 other.test_special_chars_in_arguments other.test_toolchain_profiler other.test_realpath_nodefs other.test_response_file_encoding"
test-mac:
executor: mac
environment:
Expand Down
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ See docs/process.md for more on how version tagging works.
2.0.33 - 11/01/2021
-------------------
- Bug fixes
- Added support for specifying the text encoding to be used in response filenames
by passing the encoding as a file suffix (e.g. "a.rsp.utf-8" or "a.rsp.cp1252").
If not specified, the encoding is autodetected as either UTF-8 or Python
default "locale.getpreferredencoding()". (#15406, #15292, #15426)

2.0.32 - 10/19/2021
-------------------
Expand Down
16 changes: 16 additions & 0 deletions tests/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -10728,6 +10728,22 @@ def create_o(name, i):
self.run_process(building.get_command_with_possible_response_file([EMCC, 'main.c'] + files))
self.assertContained(str(count * (count - 1) // 2), self.run_js('a.out.js'))

# Tests that the filename suffix of the response files can be used to detect which encoding the file is.
def test_response_file_encoding(self):
  """Check response file decoding with and without an encoding suffix.

  Three cases are exercised, each passing a response file that names a
  source file with non-ASCII characters ('äö.c') to EMCC:
    1. 'a.rsp' written as UTF-8, with no encoding suffix on the filename.
    2. 'a.rsp.cp437' written as CP-437, with the encoding given as suffix.
    3. 'a.rsp' written with Python's locale.getpreferredencoding(), with
       no encoding suffix on the filename.
  """
  import locale

  def write_file(filename, contents, encoding=None):
    # Use a context manager so the file is flushed and closed
    # deterministically before the compiler is asked to read it, instead of
    # relying on the garbage collector to close an anonymous file object.
    with open(filename, 'w', encoding=encoding) as f:
      f.write(contents)

  write_file('äö.c', 'int main(){}')

  # Write a response file with unicode contents ...
  write_file('a.rsp', 'äö.c', encoding='utf-8')
  # ... and test that in the absence of a file suffix, it is autodetected to utf-8.
  self.run_process([EMCC, '@a.rsp'])

  # Write a response file with Windows CP-437 encoding ...
  write_file('a.rsp.cp437', 'äö.c', encoding='cp437')
  # ... and test that with the explicit suffix present, it is properly decoded
  self.run_process([EMCC, '@a.rsp.cp437'])

  preferred_encoding = locale.getpreferredencoding(do_setlocale=False)
  print('Python locale preferredencoding: ' + preferred_encoding)
  # Write a response file using Python preferred encoding ...
  write_file('a.rsp', 'äö.c', encoding=preferred_encoding)
  # ... and test that it is properly autodetected.
  self.run_process([EMCC, '@a.rsp'])

def test_output_name_collision(self):
# Ensure that the secondary filenames never collide with the primary output filename
# In this case we explicitly ask for JS to be created in a file with the `.wasm` suffix.
Expand Down
49 changes: 35 additions & 14 deletions tools/response_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
DEBUG = int(os.environ.get('EMCC_DEBUG', '0'))


def create_response_file(args, directory):
def create_response_file(args, directory, suffix='.rsp.utf-8'):
"""Routes the given cmdline param list in args into a new response file and
returns the filename to it.
The returned filename has a suffix '.rsp'.
By default the returned filename has a suffix '.rsp.utf-8'. Pass a suffix parameter to override.
"""
response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix='.rsp', dir=directory, text=True)

assert suffix.startswith('.')

response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix=suffix, dir=directory, text=True)

# Backslashes and other special chars need to be escaped in the response file.
escape_chars = ['\\', '\"']
Expand All @@ -41,16 +44,12 @@ def escape(arg):
arg = '"%s"' % arg
contents += arg + '\n'

# When writing windows response files force the encoding to UTF8 which we know
# that llvm tools understand. Without this, we get whatever the default codepage
# might be.
# See: https://github.com/llvm/llvm-project/blob/3f3d1c901d7abcc5b91468335679b1b27d8a02dd/llvm/include/llvm/Support/Program.h#L168-L170
# And: https://github.com/llvm/llvm-project/blob/63d16d06f5b8f71382033b5ea4aa668f8150817a/clang/include/clang/Driver/Job.h#L58-L69
# TODO(sbc): Should we also force utf-8 on non-windows?
if WINDOWS:
encoding = 'utf-8'
# Decide the encoding of the generated file based on the requested file suffix
if suffix.count('.') == 2:
# Use the encoding specified in the suffix of the response file
encoding = suffix.split('.')[2]
else:
encoding = None
encoding = 'utf-8'

with os.fdopen(response_fd, 'w', encoding=encoding) as f:
f.write(contents)
Expand All @@ -70,15 +69,37 @@ def read_response_file(response_filename):
"""Reads a response file, and returns the list of cmdline params found in the
file.
The encoding that the response filename should be read with can be specified
as a suffix to the file, e.g. "foo.rsp.utf-8" or "foo.rsp.cp1252". If not
specified, first UTF-8 and then Python locale.getpreferredencoding() are
attempted.
The parameter response_filename may start with '@'."""
if response_filename.startswith('@'):
response_filename = response_filename[1:]

if not os.path.exists(response_filename):
raise IOError("response file not found: %s" % response_filename)

with open(response_filename) as f:
args = f.read()
# Guess encoding based on the file suffix
components = os.path.basename(response_filename).split('.')
encoding_suffix = components[-1].lower()
if len(components) > 1 and (encoding_suffix.startswith('utf') or encoding_suffix.startswith('cp') or encoding_suffix.startswith('iso') or encoding_suffix in ['ascii', 'latin-1']):
guessed_encoding = encoding_suffix
else:
guessed_encoding = 'utf-8'

try:
# First try with the guessed encoding
with open(response_filename, encoding=guessed_encoding) as f:
args = f.read()
except (ValueError, LookupError): # UnicodeDecodeError is a subclass of ValueError, and Python raises either a ValueError or a UnicodeDecodeError on decode errors. LookupError is raised if guessed encoding is not an encoding.
if DEBUG:
logging.warning(f'Failed to parse response file {response_filename} with guessed encoding "{guessed_encoding}". Trying default system encoding...')
# If that fails, try with the Python default locale.getpreferredencoding()
with open(response_filename) as f:
args = f.read()

args = shlex.split(args)

if DEBUG:
Expand Down

0 comments on commit 5db73a6

Please sign in to comment.