Use embedded Unicode (de)composition table.

RazrFalcon · Jul 24, 2020 · f0e5a76 · f0e5a76
1 parent 7bd78a8
commit f0e5a76
Show file tree

Hide file tree

Showing 5 changed files with 3,235 additions and 31 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,7 +15,6 @@ categories = ["text-processing"]
 bitflags = "1.2"
 smallvec = "1.4.1"
 ttf-parser = { git = "https://github.com/RazrFalcon/ttf-parser", rev = "fe1dc38" }
-unic-ucd-normal = { version = "0.9", default-features = false }
 unicode-bidi-mirroring = "0.1"
 unicode-ccc = "0.1"
 unicode-general-category = "0.2"

diff --git a/scripts/gen-unicode-norm-table.py b/scripts/gen-unicode-norm-table.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+import urllib.request
+import os
+
+URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
+FILE_NAME = 'UnicodeData.txt'
+
+
+def hex_to_char_rs(c):
+    return f"'\\u{{{c}}}'"
+
+
+if not os.path.exists(FILE_NAME):
+    urllib.request.urlretrieve(URL, FILE_NAME)
+
+
+print('// WARNING: this file was generated by ../scripts/gen-unicode-norm-table.py')
+print()
+print('//! This module provides Unicode tables for canonical (de)composition.')
+print('//!')
+print('//! The current implementation is not the fastest one. Just good enough.')
+print()
+print('#[allow(dead_code)]')
+print('pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);')
+print()
+print('// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.')
+print('pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[')
+
+compose_data = []
+with open(FILE_NAME) as f:
+    for line in f:
+        parts = line.split(';')
+        if len(parts[5]) == 0:
+            continue
+
+        # Skip codepoints with compatibility formatting tags
+        # since we care only about canonical mapping.
+        if parts[5][0] == '<':
+            continue
+
+        # Print the decomposition table as is, since `UnicodeData` is already sorted.
+
+        c = parts[0]
+        mapping = parts[5].split(' ')
+        if len(mapping) == 2:
+            print(f"    ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, Some({hex_to_char_rs(mapping[1])})),")
+
+            # Remember only codepoints that should be decomposed into two codepoints.
+            compose_data.append([mapping[0], mapping[1], c])
+        elif len(mapping) == 1:
+            print(f'    ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, None),')
+        else:
+            raise 'invalid unicode data'
+
+print('];')
+print()
+
+
+print('// The first value is `a << 32 | b`.')
+print('// Sorted by the first value.')
+print('pub const COMPOSITION_TABLE: &[(u64, char)] = &[')
+
+pairs = []
+for mapping in compose_data:
+    needle = int(mapping[0], 16) << 32 | int(mapping[1], 16)
+    pairs.append((needle, mapping[2]))
+
+pairs.sort(key=lambda x: x[0])
+
+# Make sure that needles are unique.
+needles = set()
+for pair in pairs:
+    needles.add(pair[0])
+
+assert len(pairs) == len(needles)
+
+for pair in pairs:
+    print(f'    ({pair[0]}, {hex_to_char_rs(pair[1])}),')
+
+print('];')
diff --git a/src/lib.rs b/src/lib.rs
@@ -13,6 +13,7 @@ mod tag;
 mod tag_table;
 mod text_parser;
 mod unicode;
+mod unicode_norm;
 mod complex;
 mod ot;
 

diff --git a/src/unicode.rs b/src/unicode.rs
@@ -749,14 +749,47 @@ pub extern "C" fn rb_ucd_is_variation_selector(u: rb_codepoint_t) -> ffi::rb_boo
     char::try_from(u).unwrap().is_variation_selector() as i32
 }
 
+const S_BASE: u32 = 0xAC00; // First precomposed Hangul syllable.
+const L_BASE: u32 = 0x1100; // First leading consonant jamo.
+const V_BASE: u32 = 0x1161; // First vowel jamo.
+const T_BASE: u32 = 0x11A7; // One below the first trailing consonant jamo.
+const L_COUNT: u32 = 19; // Number of leading consonants.
+const V_COUNT: u32 = 21; // Number of vowels.
+const T_COUNT: u32 = 28; // Number of trailing consonants, plus one for "no trailing consonant".
+const N_COUNT: u32 = V_COUNT * T_COUNT; // Syllables per leading consonant.
+const S_COUNT: u32 = L_COUNT * N_COUNT; // Total number of precomposed syllables.
+
+/// Composes a Hangul syllable from `a` and `b` arithmetically.
+///
+/// Two cases are handled:
+/// - leading consonant (L) + vowel (V) -> LV syllable;
+/// - LV syllable + trailing consonant (T) -> LVT syllable.
+///
+/// Returns `None` when `a` and `b` do not form such a pair.
+fn compose_hangul(a: char, b: char) -> Option<char> {
+    let l = a as u32;
+    let v = b as u32;
+    if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
+        let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
+        char::try_from(r).ok()
+    } else if S_BASE <= l && l <= (S_BASE + S_COUNT - T_COUNT)
+        // `T_BASE` itself is not a trailing consonant (index 0 means "no trailing
+        // consonant"), so the lower bound is exclusive. Otherwise an LV syllable
+        // followed by U+11A7 would "compose" into the very same LV syllable.
+        && T_BASE < v && v < (T_BASE + T_COUNT)
+        // Only LV syllables (no trailing consonant yet) can take a T.
+        && (l - S_BASE) % T_COUNT == 0
+    {
+        let r = l + (v - T_BASE);
+        char::try_from(r).ok()
+    } else {
+        None
+    }
+}
+
 pub fn compose(a: char, b: char) -> Option<char> {
-    unic_ucd_normal::compose(a, b)
+    if let Some(ab) = compose_hangul(a, b) {
+        return Some(ab);
+    }
+
+    let needle = (a as u64) << 32 | (b as u64);
+    crate::unicode_norm::COMPOSITION_TABLE.binary_search_by(|item| item.0.cmp(&needle)).ok()
+        .map(|idx| crate::unicode_norm::COMPOSITION_TABLE[idx].1)
 }
 
 #[no_mangle]
 pub extern "C" fn rb_ucd_compose(a: rb_codepoint_t, b: rb_codepoint_t, ab: *mut rb_codepoint_t) -> ffi::rb_bool_t {
     unsafe {
-        let new = unic_ucd_normal::compose(
+        let new = compose(
             char::try_from(a).unwrap(),
             char::try_from(b).unwrap(),
         );
@@ -771,16 +804,6 @@ pub extern "C" fn rb_ucd_compose(a: rb_codepoint_t, b: rb_codepoint_t, ab: *mut
 }
 
 fn rb_ucd_decompose_hangul(ab: rb_codepoint_t, a: *mut rb_codepoint_t, b: *mut rb_codepoint_t) -> bool {
-    const S_BASE: u32 = 0xAC00;
-    const L_BASE: u32 = 0x1100;
-    const V_BASE: u32 = 0x1161;
-    const T_BASE: u32 = 0x11A7;
-    const L_COUNT: u32 = 19;
-    const V_COUNT: u32 = 21;
-    const T_COUNT: u32 = 28;
-    const N_COUNT: u32 = V_COUNT * T_COUNT;
-    const S_COUNT: u32 = L_COUNT * N_COUNT;
-
     let si = ab.wrapping_sub(S_BASE);
     if si >= S_COUNT {
         return false;
@@ -817,25 +840,21 @@ pub extern "C" fn rb_ucd_decompose(
     }
 
     let ab = char::try_from(ab).unwrap();
-    let chars = match unic_ucd_normal::canonical_decomposition(ab) {
-        Some(chars) => chars,
-        None => return 0,
-    };
-
-    unsafe {
-        match chars.len() {
-            1 => {
-                *a = chars[0] as u32;
-                *b = 0;
-                1
-            }
-            2 => {
-                *a = chars[0] as u32;
-                *b = chars[1] as u32;
-                1
+    match crate::unicode_norm::DECOMPOSITION_TABLE.binary_search_by(|item| item.0.cmp(&ab)) {
+        Ok(idx) => {
+            let chars = &crate::unicode_norm::DECOMPOSITION_TABLE[idx];
+            unsafe {
+                if let Some(rb) = chars.2 {
+                    *a = chars.1 as u32;
+                    *b = rb as u32;
+                } else {
+                    *a = chars.1 as u32;
+                }
             }
-            _ => 0,
+
+            1
         }
+        Err(_) => 0,
     }
 }
 
@@ -847,6 +866,6 @@ mod tests {
         assert_eq!(unicode_ccc::UNICODE_VERSION,                (13, 0, 0));
         assert_eq!(unicode_general_category::UNICODE_VERSION,   (12, 1, 0)); // TODO: update
         assert_eq!(unicode_script::UNICODE_VERSION,             (13, 0, 0));
-        assert_eq!(unic_ucd_normal::UNICODE_VERSION.major,      10); // TODO: update
+        assert_eq!(crate::unicode_norm::UNICODE_VERSION,        (13, 0, 0));
     }
 }