Skip to content

Commit

Permalink
Update GB18030-2022 implementation
Browse files Browse the repository at this point in the history
https://bugs.webkit.org/show_bug.cgi?id=259349
rdar://112507597

Reviewed by Myles C. Maxfield.

This implements the new code points exactly as specified in GB18030-2022
instead of Unicode's industry recommendations for migrating data between
GB18030-2005 and GB18030-2022.

* LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-decoder.any.js:
* LayoutTests/imported/w3c/web-platform-tests/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html:
* Source/WebCore/PAL/pal/text/TextCodecCJK.cpp:
(PAL::gb18030_2022Encode2Byte):
(PAL::gb18030_2022Encode4Byte):
(PAL::gb18030_2022Decode):
(PAL::TextCodecCJK::gb18030Decode):
(PAL::gbEncodeShared):
(PAL::gb18030AsymmetricEncode): Deleted.

Canonical link: https://commits.webkit.org/266173@main
  • Loading branch information
achristensen07 committed Jul 20, 2023
1 parent e281c3d commit 9fd5c67
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,24 +65,24 @@ decode([0xFE, 0x6D], "\u9FB8", "GB18030-2022 15");
decode([0xFE, 0x7E], "\u9FB9", "GB18030-2022 16");
decode([0xFE, 0x90], "\u9FBA", "GB18030-2022 17");
decode([0xFE, 0xA0], "\u9FBB", "GB18030-2022 18");
decode([0x82, 0x35, 0x90, 0x37], "\u9FB4", "GB18030-2022 19");
decode([0x82, 0x35, 0x90, 0x38], "\u9FB5", "GB18030-2022 20");
decode([0x82, 0x35, 0x90, 0x39], "\u9FB6", "GB18030-2022 21");
decode([0x82, 0x35, 0x91, 0x30], "\u9FB7", "GB18030-2022 22");
decode([0x82, 0x35, 0x91, 0x31], "\u9FB8", "GB18030-2022 23");
decode([0x82, 0x35, 0x91, 0x32], "\u9FB9", "GB18030-2022 24");
decode([0x82, 0x35, 0x91, 0x33], "\u9FBA", "GB18030-2022 25");
decode([0x82, 0x35, 0x91, 0x34], "\u9FBB", "GB18030-2022 26");
decode([0x84, 0x31, 0x82, 0x36], "\uFE10", "GB18030-2022 27");
decode([0x84, 0x31, 0x82, 0x37], "\uFE11", "GB18030-2022 28");
decode([0x84, 0x31, 0x82, 0x38], "\uFE12", "GB18030-2022 29");
decode([0x84, 0x31, 0x82, 0x39], "\uFE13", "GB18030-2022 30");
decode([0x84, 0x31, 0x83, 0x30], "\uFE14", "GB18030-2022 31");
decode([0x84, 0x31, 0x83, 0x31], "\uFE15", "GB18030-2022 32");
decode([0x84, 0x31, 0x83, 0x32], "\uFE16", "GB18030-2022 33");
decode([0x84, 0x31, 0x83, 0x33], "\uFE17", "GB18030-2022 34");
decode([0x84, 0x31, 0x83, 0x34], "\uFE18", "GB18030-2022 35");
decode([0x84, 0x31, 0x83, 0x35], "\uFE19", "GB18030-2022 36");
decode([0x82, 0x35, 0x90, 0x37], "\uE81E", "GB18030-2022 19");
decode([0x82, 0x35, 0x90, 0x38], "\uE826", "GB18030-2022 20");
decode([0x82, 0x35, 0x90, 0x39], "\uE82B", "GB18030-2022 21");
decode([0x82, 0x35, 0x91, 0x30], "\uE82C", "GB18030-2022 22");
decode([0x82, 0x35, 0x91, 0x31], "\uE832", "GB18030-2022 23");
decode([0x82, 0x35, 0x91, 0x32], "\uE843", "GB18030-2022 24");
decode([0x82, 0x35, 0x91, 0x33], "\uE854", "GB18030-2022 25");
decode([0x82, 0x35, 0x91, 0x34], "\uE864", "GB18030-2022 26");
decode([0x84, 0x31, 0x82, 0x36], "\uE78D", "GB18030-2022 27");
decode([0x84, 0x31, 0x82, 0x37], "\uE78F", "GB18030-2022 28");
decode([0x84, 0x31, 0x82, 0x38], "\uE78E", "GB18030-2022 29");
decode([0x84, 0x31, 0x82, 0x39], "\uE790", "GB18030-2022 30");
decode([0x84, 0x31, 0x83, 0x30], "\uE791", "GB18030-2022 31");
decode([0x84, 0x31, 0x83, 0x31], "\uE792", "GB18030-2022 32");
decode([0x84, 0x31, 0x83, 0x32], "\uE793", "GB18030-2022 33");
decode([0x84, 0x31, 0x83, 0x33], "\uE794", "GB18030-2022 34");
decode([0x84, 0x31, 0x83, 0x34], "\uE795", "GB18030-2022 35");
decode([0x84, 0x31, 0x83, 0x35], "\uE796", "GB18030-2022 36");

let i = 0;
for (const range of ranges) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,24 @@
encode("\u9FB9", "%FE~", "GB18030-2022 16");
encode("\u9FBA", "%FE%90", "GB18030-2022 17");
encode("\u9FBB", "%FE%A0", "GB18030-2022 18");
encode("\uE78D", "%A6%D9", "GB18030-2022 19");
encode("\uE78E", "%A6%DA", "GB18030-2022 20");
encode("\uE78F", "%A6%DB", "GB18030-2022 21");
encode("\uE790", "%A6%DC", "GB18030-2022 22");
encode("\uE791", "%A6%DD", "GB18030-2022 23");
encode("\uE792", "%A6%DE", "GB18030-2022 24");
encode("\uE793", "%A6%DF", "GB18030-2022 25");
encode("\uE794", "%A6%EC", "GB18030-2022 26");
encode("\uE795", "%A6%ED", "GB18030-2022 27");
encode("\uE796", "%A6%F3", "GB18030-2022 28");
encode("\uE81E", "%FEY", "GB18030-2022 29");
encode("\uE826", "%FEa", "GB18030-2022 30");
encode("\uE82B", "%FEf", "GB18030-2022 31");
encode("\uE82C", "%FEg", "GB18030-2022 32");
encode("\uE832", "%FEm", "GB18030-2022 33");
encode("\uE843", "%FE~", "GB18030-2022 34");
encode("\uE854", "%FE%90", "GB18030-2022 35");
encode("\uE864", "%FE%A0", "GB18030-2022 36");
encode("\uE78D", "%841%826", "GB18030-2022 19");
encode("\uE78E", "%841%828", "GB18030-2022 20");
encode("\uE78F", "%841%827", "GB18030-2022 21");
encode("\uE790", "%841%829", "GB18030-2022 22");
encode("\uE791", "%841%830", "GB18030-2022 23");
encode("\uE792", "%841%831", "GB18030-2022 24");
encode("\uE793", "%841%832", "GB18030-2022 25");
encode("\uE794", "%841%833", "GB18030-2022 26");
encode("\uE795", "%841%834", "GB18030-2022 27");
encode("\uE796", "%841%835", "GB18030-2022 28");
encode("\uE81E", "%825%907", "GB18030-2022 29");
encode("\uE826", "%825%908", "GB18030-2022 30");
encode("\uE82B", "%825%909", "GB18030-2022 31");
encode("\uE82C", "%825%910", "GB18030-2022 32");
encode("\uE832", "%825%911", "GB18030-2022 33");
encode("\uE843", "%825%912", "GB18030-2022 34");
encode("\uE854", "%825%913", "GB18030-2022 35");
encode("\uE864", "%825%914", "GB18030-2022 36");

const upperCaseNibble = x => {
return Math.floor(x).toString(16).toUpperCase();
Expand Down
106 changes: 59 additions & 47 deletions Source/WebCore/PAL/pal/text/TextCodecCJK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,58 @@ static const GB18030EncodeIndex& gb18030EncodeIndex()
return *table;
}

// GB18030-2022 four-byte exceptions; background in
// https://unicode-org.atlassian.net/browse/ICU-22357
// The 2-byte values are already produced correctly by the values from
// gb18030(), but the code points listed here have to be special-cased
// instead of going through gb18030Ranges().
//
// Returns the four-byte sequence packed into a uint32_t (first byte in the
// most significant position), or std::nullopt when codePoint is not one of
// the exceptions.
static std::optional<uint32_t> gb180302022Encode(UChar32 codePoint)
{
    struct Mapping {
        UChar32 from;
        uint32_t to;
    };
    static constexpr Mapping exceptions[] = {
        { 0xE81E, 0x82359037 },
        { 0xE826, 0x82359038 },
        { 0xE82B, 0x82359039 },
        { 0xE82C, 0x82359130 },
        { 0xE832, 0x82359131 },
        { 0xE843, 0x82359132 },
        { 0xE854, 0x82359133 },
        { 0xE864, 0x82359134 },
        { 0xE78D, 0x84318236 },
        { 0xE78F, 0x84318237 },
        { 0xE78E, 0x84318238 },
        { 0xE790, 0x84318239 },
        { 0xE791, 0x84318330 },
        { 0xE792, 0x84318331 },
        { 0xE793, 0x84318332 },
        { 0xE794, 0x84318333 },
        { 0xE795, 0x84318334 },
        { 0xE796, 0x84318335 },
    };

    for (const auto& entry : exceptions) {
        if (entry.from == codePoint)
            return entry.to;
    }
    return std::nullopt;
}
static std::optional<UChar32> gb180302022Decode(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth)
{
switch (static_cast<uint32_t>(first) << 24 | static_cast<uint32_t>(second) << 16 | static_cast<uint32_t>(third) << 8 | fourth) {
case 0x82359037: return 0xE81E;
case 0x82359038: return 0xE826;
case 0x82359039: return 0xE82B;
case 0x82359130: return 0xE82C;
case 0x82359131: return 0xE832;
case 0x82359132: return 0xE843;
case 0x82359133: return 0xE854;
case 0x82359134: return 0xE864;
case 0x84318236: return 0xE78D;
case 0x84318237: return 0xE78F;
case 0x84318238: return 0xE78E;
case 0x84318239: return 0xE790;
case 0x84318330: return 0xE791;
case 0x84318331: return 0xE792;
case 0x84318332: return 0xE793;
case 0x84318333: return 0xE794;
case 0x84318334: return 0xE795;
case 0x84318335: return 0xE796;
}
return std::nullopt;
}

// https://encoding.spec.whatwg.org/#gb18030-decoder
String TextCodecCJK::gb18030Decode(const uint8_t* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
Expand All @@ -917,6 +969,10 @@ String TextCodecCJK::gb18030Decode(const uint8_t* bytes, size_t length, bool flu
uint8_t first = std::exchange(m_gb18030First, 0x00);
uint8_t second = std::exchange(m_gb18030Second, 0x00);
uint8_t third = std::exchange(m_gb18030Third, 0x00);
if (auto codePoint = gb180302022Decode(first, second, third, byte)) {
result.appendCharacter(*codePoint);
return SawError::No;
}
if (auto codePoint = gb18030RangesCodePoint(((first - 0x81) * 10 * 126 * 10) + ((second - 0x30) * 10 * 126) + ((third - 0x81) * 10) + byte - 0x30)) {
result.appendCharacter(*codePoint);
return SawError::No;
Expand Down Expand Up @@ -981,52 +1037,6 @@ String TextCodecCJK::gb18030Decode(const uint8_t* bytes, size_t length, bool flu
return result;
}

// Encode-only two-byte mappings taken from
// https://www.unicode.org/L2/L2023/23003r-gb18030-recommendations.pdf
// Returns the two-byte value for the code points listed below and
// std::nullopt for everything else.
static std::optional<uint16_t> gb18030AsymmetricEncode(UChar32 codePoint)
{
    switch (codePoint) {
    case 0xE78D: return 0xA6D9;
    case 0xE78E: return 0xA6DA;
    case 0xE78F: return 0xA6DB;
    case 0xE790: return 0xA6DC;
    case 0xE791: return 0xA6DD;
    case 0xE792: return 0xA6DE;
    case 0xE793: return 0xA6DF;
    case 0xE794: return 0xA6EC;
    case 0xE795: return 0xA6ED;
    case 0xE796: return 0xA6F3;
    case 0xE81E: return 0xFE59;
    case 0xE826: return 0xFE61;
    case 0xE82B: return 0xFE66;
    case 0xE82C: return 0xFE67;
    case 0xE832: return 0xFE6D;
    case 0xE843: return 0xFE7E;
    case 0xE854: return 0xFE90;
    case 0xE864: return 0xFEA0;
    }
    return std::nullopt;
}

// https://encoding.spec.whatwg.org/#gb18030-encoder
enum class IsGBK : bool { No, Yes };
static Vector<uint8_t> gbEncodeShared(StringView string, Function<void(UChar32, Vector<uint8_t>&)>&& unencodableHandler, IsGBK isGBK)
Expand All @@ -1050,7 +1060,9 @@ static Vector<uint8_t> gbEncodeShared(StringView string, Function<void(UChar32,
result.append(0x80);
continue;
}
} else if (auto encoded = gb18030AsymmetricEncode(codePoint)) {
} else if (auto encoded = gb180302022Encode(codePoint)) {
result.append(*encoded >> 24);
result.append(*encoded >> 16);
result.append(*encoded >> 8);
result.append(*encoded);
continue;
Expand Down

0 comments on commit 9fd5c67

Please sign in to comment.