From 04fc0d47c3895f294401b588ed49cc4360f31be1 Mon Sep 17 00:00:00 2001
From: Joey Parrish <joeyparrish@users.noreply.github.com>
Date: Fri, 12 Aug 2022 10:48:14 -0700
Subject: [PATCH] fix: Fix TextDecoder fallback and browser support check
 (#4403)

In PR #4324, we lifted the requirement to have a native or polyfilled
TextDecoder implementation.  However, we forgot to remove the check
for it in isBrowserSupported().  This led to tests being skipped
entirely on Xbox, as Xbox was determined to be an unsupported platform
by Player.

To fix this, the check for TextDecoder/TextEncoder in
isBrowserSupported() has been removed.

When the TextDecoder polyfill was removed, we left a reference to it
in karma.conf.js.  This didn't hurt anything per se, but this has now
been cleaned up.

Finally, TextDecoder was originally introduced to give us a way to
recover from errors instead of throwing.  The fallback that was
reintroduced in #4324 was the original code that throws on error.
This led to a test failure on Xbox, which represents a complete
subtitle failure in real content with an encoding issue.

To fix this, we replace the utf-8 decoding fallback based on
decodeURIComponent with a plain JS implementation.  This adds only 477
bytes to Shaka Player, which is pretty good compared to the 2315 byte
polyfill we used to recommend for this.

To better verify these text decoding features, a test that checked two
things has been split into two, comments around the tests have been
improved, and an additional test case has been added.
---
 karma.conf.js                  |  4 --
 lib/player.js                  |  5 ---
 lib/util/string_utils.js       | 79 +++++++++++++++++++++++++++-------
 test/util/string_utils_unit.js | 29 ++++++++++---
 4 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/karma.conf.js b/karma.conf.js
index 529a6a0a7b..2c3b7902af 100644
--- a/karma.conf.js
+++ b/karma.conf.js
@@ -160,10 +160,6 @@ module.exports = (config) => {
       'node_modules/es6-promise-polyfill/promise.js',
       //   Babel polyfill, required for async/await
       'node_modules/@babel/polyfill/dist/polyfill.js',
-      //   TextDecoder polyfill, required for TextDecoder/TextEncoder on IE and
-      //   legacy Edge
-      //   eslint-disable-next-line max-len
-      'node_modules/fastestsmallesttextencoderdecoder/EncoderDecoderTogether.min.js',
 
       // muxjs module next
       'node_modules/mux.js/dist/mux.min.js',
diff --git a/lib/player.js b/lib/player.js
index cf54c0cde0..94918e62f1 100644
--- a/lib/player.js
+++ b/lib/player.js
@@ -857,14 +857,9 @@ shaka.Player = class extends shaka.util.FakeEventTarget {
     if (!window.Promise) {
       shaka.log.alwaysWarn('A Promise implementation or polyfill is required');
     }
-    if (!window.TextDecoder || !window.TextEncoder) {
-      shaka.log.alwaysWarn(
-          'A TextDecoder/TextEncoder implementation or polyfill is required');
-    }
 
     // Basic features needed for the library to be usable.
     const basicSupport = !!window.Promise && !!window.Uint8Array &&
-                         !!window.TextDecoder && !!window.TextEncoder &&
                          // eslint-disable-next-line no-restricted-syntax
                          !!Array.prototype.forEach;
     if (!basicSupport) {
diff --git a/lib/util/string_utils.js b/lib/util/string_utils.js
index 1fe17e6d56..40753123fc 100644
--- a/lib/util/string_utils.js
+++ b/lib/util/string_utils.js
@@ -37,6 +37,7 @@ shaka.util.StringUtils = class {
     if (uint8[0] == 0xef && uint8[1] == 0xbb && uint8[2] == 0xbf) {
       uint8 = uint8.subarray(3);
     }
+
     if (window.TextDecoder && !shaka.util.Platform.isPS4()) {
       // Use the TextDecoder interface to decode the text.  This has the
       // advantage compared to the previously-standard decodeUriComponent that
@@ -51,23 +52,69 @@ shaka.util.StringUtils = class {
       }
       return decoded;
     } else {
-      // http://stackoverflow.com/a/13691499
-      const utf8 = shaka.util.StringUtils.fromCharCode(uint8);
-      // This converts each character in the string to an escape sequence.  If
-      // the character is in the ASCII range, it is not converted; otherwise it
-      // is converted to a URI escape sequence.
-      // Example: '\x67\x35\xe3\x82\xac' -> 'g#%E3%82%AC'
-      const escaped = escape(utf8);
-      // Decode the escaped sequence.  This will interpret UTF-8 sequences into
-      // the correct character.
-      // Example: 'g#%E3%82%AC' -> 'g#€'
-      try {
-        return decodeURIComponent(escaped);
-      } catch (e) {
-        throw new shaka.util.Error(
-            shaka.util.Error.Severity.CRITICAL, shaka.util.Error.Category.TEXT,
-            shaka.util.Error.Code.BAD_ENCODING);
+      // Homebrewed UTF-8 decoder based on
+      // https://en.wikipedia.org/wiki/UTF-8#Encoding
+      // Unlike decodeURIComponent, won't throw on bad encoding.
+      // In this way, it is similar to TextDecoder.
+
+      let decoded = '';
+      for (let i = 0; i < uint8.length; ++i) {
+        // By default, the "replacement character" codepoint.
+        let codePoint = 0xFFFD;
+
+        // Top bit is 0, 1-byte encoding.
+        if ((uint8[i] & 0x80) == 0) {
+          codePoint = uint8[i];
+
+        // Top 3 bits of byte 0 are 110, top 2 bits of byte 1 are 10,
+        // 2-byte encoding.
+        } else if (uint8.length >= i + 2 &&
+            (uint8[i] & 0xe0) == 0xc0 &&
+            (uint8[i + 1] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x1f) << 6) |
+                      ((uint8[i + 1] & 0x3f));
+          i += 1;  // Consume one extra byte.
+
+        // Top 4 bits of byte 0 are 1110, top 2 bits of byte 1 and 2 are 10,
+        // 3-byte encoding.
+        } else if (uint8.length >= i + 3 &&
+            (uint8[i] & 0xf0) == 0xe0 &&
+            (uint8[i + 1] & 0xc0) == 0x80 &&
+            (uint8[i + 2] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x0f) << 12) |
+                      ((uint8[i + 1] & 0x3f) << 6) |
+                      ((uint8[i + 2] & 0x3f));
+          i += 2;  // Consume two extra bytes.
+
+        // Top 5 bits of byte 0 are 11110, top 2 bits of byte 1, 2 and 3 are 10,
+        // 4-byte encoding.
+        } else if (uint8.length >= i + 4 &&
+            (uint8[i] & 0xf1) == 0xf0 &&
+            (uint8[i + 1] & 0xc0) == 0x80 &&
+            (uint8[i + 2] & 0xc0) == 0x80 &&
+            (uint8[i + 3] & 0xc0) == 0x80) {
+          codePoint = ((uint8[i] & 0x07) << 18) |
+                      ((uint8[i + 1] & 0x3f) << 12) |
+                      ((uint8[i + 2] & 0x3f) << 6) |
+                      ((uint8[i + 3] & 0x3f));
+          i += 3;  // Consume three extra bytes.
+        }
+
+        // JavaScript strings are a series of UTF-16 characters.
+        if (codePoint <= 0xffff) {
+          decoded += String.fromCharCode(codePoint);
+        } else {
+          // UTF-16 surrogate-pair encoding, based on
+          // https://en.wikipedia.org/wiki/UTF-16#Description
+          const baseCodePoint = codePoint - 0x10000;
+          const highPart = baseCodePoint >> 10;
+          const lowPart = baseCodePoint & 0x3ff;
+          decoded += String.fromCharCode(0xd800 + highPart);
+          decoded += String.fromCharCode(0xdc00 + lowPart);
+        }
       }
+
+      return decoded;
     }
   }
 
diff --git a/test/util/string_utils_unit.js b/test/util/string_utils_unit.js
index 4a798fad17..03f13dd6b8 100644
--- a/test/util/string_utils_unit.js
+++ b/test/util/string_utils_unit.js
@@ -16,19 +16,34 @@ describe('StringUtils', () => {
   });
 
   it('won\'t break if given cut-off UTF8 character', () => {
-    // This array contains the first half of a 2-byte UTF8 character, stranded
-    // at the very end of the string.
-    const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0x81];
+    const arr1 = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3, 0xa9];
     expect(StringUtils.fromUTF8(new Uint8Array(arr1)))
+        .toBe('San Jos\u00E9');
+
+    // This array contains the first half of a 2-byte UTF8 character
+    // (0xc3 0xa9 = é).  The half-character is stranded at the very end of the
+    // string.
+    const arr = [0x53, 0x61, 0x6e, 0x20, 0x4a, 0x6f, 0x73, 0xc3];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
         .toBe('San Jos\uFFFD');
+  });
 
-    // For reasons I don't know, it seems like 0xE9 cannot be the start of a
-    // UTF8 character.  Perhaps it is a reserved number?
-    const arr2 = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
-    expect(StringUtils.fromUTF8(new Uint8Array(arr2)))
+  it('won\'t break if given an invalid UTF-8 sequence', () => {
+    // 0xe9 0x33 0x33 is an invalid UTF-8 sequence.
+    const arr = [0x4a, 0x6f, 0x73, 0xE9, 0x33, 0x33, 0x20, 0x53, 0x61, 0x6e];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
         .toBe('Jos\uFFFD33 San');
   });
 
+  it('can handle an 8-byte character', () => {
+    // This is the UTF-8 encoding of the US flag emoji.
+    // It decodes into two Unicode codepoints, which becomes 4 JavaScript
+    // UTF-16 characters.
+    const arr = [0xf0, 0x9f, 0x87, 0xba, 0xf0, 0x9f, 0x87, 0xb8];
+    expect(StringUtils.fromUTF8(new Uint8Array(arr)))
+        .toBe('\uD83C\uDDFA\uD83C\uDDF8');
+  });
+
   it('strips the BOM in fromUTF8', () => {
     // This is 4 Unicode characters, the last will be split into a surrogate
     // pair.