Add support for specifying response file encoding (emscripten-core#15426)

* Add support for specifying response file encoding using the suffix of the response file name, and autodetect the response file encoding using the suffix of the response file name if one is specified there.
* Update Changelog
* Adjust test to verify locale.getpreferredencoding()
* Update ChangeLog
* Improve comments
* Update comment
* Relocate comment
* Update Changelog
* Add test to windows config

(cherry picked from commit 5db73a6)

# Conflicts:
#   .circleci/config.yml
Unity-Technologies · Nov 30, 2021 · 6a37d06 · 6a37d06
1 parent 0ec89f1
commit 6a37d06
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 14 deletions.
diff --git a/tests/test_other.py b/tests/test_other.py
@@ -10392,6 +10392,22 @@ def create_o(name, i):
     self.run_process(building.get_command_with_possible_response_file([EMCC, 'main.c'] + files))
     self.assertContained(str(count * (count - 1) // 2), self.run_js('a.out.js'))
 
+  # Tests that the filename suffix of the response files can be used to detect which encoding the file is.
+  def test_response_file_encoding(self):
+    open('äö.c', 'w').write('int main(){}')
+
+    open('a.rsp', 'w', encoding='utf-8').write('äö.c') # Write a response file with unicode contents ...
+    self.run_process([EMCC, '@a.rsp']) # ... and test that in the absence of a file suffix, it is autodetected to utf-8.
+
+    open('a.rsp.cp437', 'w', encoding='cp437').write('äö.c') # Write a response file with Windows CP-437 encoding ...
+    self.run_process([EMCC, '@a.rsp.cp437']) # ... and test that with the explicit suffix present, it is properly decoded
+
+    import locale
+    preferred_encoding = locale.getpreferredencoding(do_setlocale=False)
+    print('Python locale preferredencoding: ' + preferred_encoding)
+    open('a.rsp', 'w', encoding=preferred_encoding).write('äö.c') # Write a response file using Python preferred encoding
+    self.run_process([EMCC, '@a.rsp']) # ... and test that it is properly autodetected.
+
   def test_output_name_collision(self):
     # Ensure that the secondary filenames never collide with the primary output filename
     # In this case we explicitly ask for JS to be created in a file with the `.wasm` suffix.

diff --git a/tools/response_file.py b/tools/response_file.py
@@ -13,13 +13,16 @@
 DEBUG = int(os.environ.get('EMCC_DEBUG', '0'))
 
 
-def create_response_file(args, directory):
+def create_response_file(args, directory, suffix='.rsp.utf-8'):
   """Routes the given cmdline param list in args into a new response file and
   returns the filename to it.
 
-  The returned filename has a suffix '.rsp'.
+  By default the returned filename has a suffix '.rsp.utf-8'. Pass a suffix parameter to override.
   """
-  response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix='.rsp', dir=directory, text=True)
+
+  assert suffix.startswith('.')
+
+  response_fd, response_filename = tempfile.mkstemp(prefix='emscripten_', suffix=suffix, dir=directory, text=True)
 
   # Backslashes and other special chars need to be escaped in the response file.
   escape_chars = ['\\', '\"']
@@ -41,16 +44,12 @@ def escape(arg):
       arg = '"%s"' % arg
     contents += arg + '\n'
 
-  # When writing windows repsonse files force the encoding to UTF8 which we know
-  # that llvm tools understand.  Without this, we get whatever the default codepage
-  # might be.
-  # See: https://github.com/llvm/llvm-project/blob/3f3d1c901d7abcc5b91468335679b1b27d8a02dd/llvm/include/llvm/Support/Program.h#L168-L170
-  # And: https://github.com/llvm/llvm-project/blob/63d16d06f5b8f71382033b5ea4aa668f8150817a/clang/include/clang/Driver/Job.h#L58-L69
-  # TODO(sbc): Should we also force utf-8 on non-windows?
-  if WINDOWS:
-    encoding = 'utf-8'
+  # Decide the encoding of the generated file based on the requested file suffix
+  if suffix.count('.') == 2:
+    # Use the encoding specified in the suffix of the response file
+    encoding = suffix.split('.')[2]
   else:
-    encoding = None
+    encoding = 'utf-8'
 
   with os.fdopen(response_fd, 'w', encoding=encoding) as f:
     f.write(contents)
@@ -70,15 +69,37 @@ def read_response_file(response_filename):
   """Reads a response file, and returns the list of cmdline params found in the
   file.
 
+  The encoding that the response filename should be read with can be specified
+  as a suffix to the file, e.g. "foo.rsp.utf-8" or "foo.rsp.cp1252". If not
+  specified, first UTF-8 and then Python locale.getpreferredencoding() are
+  attempted.
+
   The parameter response_filename may start with '@'."""
   if response_filename.startswith('@'):
     response_filename = response_filename[1:]
 
   if not os.path.exists(response_filename):
     raise IOError("response file not found: %s" % response_filename)
 
-  with open(response_filename) as f:
-    args = f.read()
+  # Guess encoding based on the file suffix
+  components = os.path.basename(response_filename).split('.')
+  encoding_suffix = components[-1].lower()
+  if len(components) > 1 and (encoding_suffix.startswith('utf') or encoding_suffix.startswith('cp') or encoding_suffix.startswith('iso') or encoding_suffix in ['ascii', 'latin-1']):
+    guessed_encoding = encoding_suffix
+  else:
+    guessed_encoding = 'utf-8'
+
+  try:
+    # First try with the guessed encoding
+    with open(response_filename, encoding=guessed_encoding) as f:
+      args = f.read()
+  except (ValueError, LookupError): # UnicodeDecodeError is a subclass of ValueError, and Python raises either a ValueError or a UnicodeDecodeError on decode errors. LookupError is raised if guessed encoding is not an encoding.
+    if DEBUG:
+      logging.warning(f'Failed to parse response file {response_filename} with guessed encoding "{guessed_encoding}". Trying default system encoding...')
+    # If that fails, try with the Python default locale.getpreferredencoding()
+    with open(response_filename) as f:
+      args = f.read()
+
   args = shlex.split(args)
 
   if DEBUG: