Skip to content

Commit

Permalink
Use embedded Unicode (de)composition table.
Browse files Browse the repository at this point in the history
  • Loading branch information
RazrFalcon committed Jul 24, 2020
1 parent 7bd78a8 commit f0e5a76
Show file tree
Hide file tree
Showing 5 changed files with 3,235 additions and 31 deletions.
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ categories = ["text-processing"]
bitflags = "1.2"
smallvec = "1.4.1"
ttf-parser = { git = "https://github.com/RazrFalcon/ttf-parser", rev = "fe1dc38" }
unic-ucd-normal = { version = "0.9", default-features = false }
unicode-bidi-mirroring = "0.1"
unicode-ccc = "0.1"
unicode-general-category = "0.2"
Expand Down
81 changes: 81 additions & 0 deletions scripts/gen-unicode-norm-table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

import urllib.request
import os

# Location of the Unicode Character Database file parsed by this script.
# The version embedded in the URL (13.0.0) should agree with the
# `UNICODE_VERSION` constant that the script prints into the generated module.
URL = 'https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt'
# Local path the downloaded file is stored under; an existing file is reused
# instead of being downloaded again.
FILE_NAME = 'UnicodeData.txt'


def hex_to_char_rs(c: str) -> str:
    """Format a hexadecimal code point as a Rust `char` literal.

    `c` is the code point in hex without any prefix (e.g. ``'0041'``);
    the result is the Rust source text ``'\\u{0041}'``, quotes included.
    """
    return "'\\u{{{}}}'".format(c)


# Download the data file only when no local copy exists yet.
have_local_copy = os.path.exists(FILE_NAME)
if not have_local_copy:
    urllib.request.urlretrieve(URL, FILE_NAME)


# Header of the generated Rust module, one entry per output line
# (an empty entry produces a blank line). The last entry opens the
# decomposition table whose rows are printed afterwards.
header_lines = [
    '// WARNING: this file was generated by ../scripts/gen-unicode-norm-table.py',
    '',
    '//! This module provides Unicode tables for canonical (de)composition.',
    '//!',
    '//! The current implementation is not the fastest one. Just good enough.',
    '',
    '#[allow(dead_code)]',
    'pub const UNICODE_VERSION: (u8, u8, u8) = (13, 0, 0);',
    '',
    '// Rust support `Option<char>` layout optimization, so it will take only 4 bytes.',
    'pub const DECOMPOSITION_TABLE: &[(char, char, Option<char>)] = &[',
]
print('\n'.join(header_lines))

# Emits the rows of DECOMPOSITION_TABLE and, along the way, collects
# [first, second, composed] triples that are later turned into the
# composition table.
compose_data = []
with open(FILE_NAME, encoding='utf-8') as f:
    for line in f:
        parts = line.split(';')

        # Field 5 holds the decomposition mapping; empty means "no decomposition".
        if len(parts[5]) == 0:
            continue

        # Skip codepoints with compatibility formatting tags
        # since we care only about canonical mapping.
        if parts[5][0] == '<':
            continue

        # Print the decomposition table as is, since `UnicodeData` is already sorted.

        c = parts[0]
        mapping = parts[5].split(' ')
        if len(mapping) == 2:
            print(f" ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, Some({hex_to_char_rs(mapping[1])})),")

            # Remember only codepoints that should be decomposed into two codepoints.
            # NOTE(review): every two-codepoint canonical mapping is recorded here,
            # including codepoints listed in CompositionExclusions.txt; confirm
            # whether the composition table is expected to skip those.
            compose_data.append([mapping[0], mapping[1], c])
        elif len(mapping) == 1:
            print(f' ({hex_to_char_rs(c)}, {hex_to_char_rs(mapping[0])}, None),')
        else:
            # A canonical mapping is expected to have one or two codepoints.
            # Raise a real exception: raising a plain string is a TypeError in Python 3.
            raise ValueError(f'invalid unicode data: {line!r}')

print('];')
print()


# Emits COMPOSITION_TABLE: one `(key, composed)` row per two-codepoint
# canonical decomposition collected in `compose_data`.
print('// The first value is `a << 32 | b`.')
print('// Sorted by the first value.')
print('pub const COMPOSITION_TABLE: &[(u64, char)] = &[')

# Pack both source codepoints into a single integer key so that the Rust
# side can binary-search the table with one comparison per step.
pairs = sorted(
    (((int(first, 16) << 32) | int(second, 16), composed)
     for first, second, composed in compose_data),
    key=lambda entry: entry[0],
)

# Make sure that needles are unique.
needles = {key for key, _ in pairs}
assert len(pairs) == len(needles)

for key, composed in pairs:
    print(f' ({key}, {hex_to_char_rs(composed)}),')

print('];')
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ mod tag;
mod tag_table;
mod text_parser;
mod unicode;
mod unicode_norm;
mod complex;
mod ot;

Expand Down
79 changes: 49 additions & 30 deletions src/unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -749,14 +749,47 @@ pub extern "C" fn rb_ucd_is_variation_selector(u: rb_codepoint_t) -> ffi::rb_boo
char::try_from(u).unwrap().is_variation_selector() as i32
}

// Constants of the algorithmic Hangul syllable (de)composition.
// `T_BASE` is one *below* the first trailing consonant, so a trailing
// index of zero means "no trailing consonant".
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

/// Composes a pair of Hangul characters arithmetically.
///
/// Two kinds of pairs are recognized:
///
/// - `<L, V>`: a leading consonant followed by a vowel, producing an LV syllable;
/// - `<LV, T>`: an LV syllable followed by a trailing consonant, producing an LVT syllable.
///
/// Returns `None` for every other pair.
fn compose_hangul(a: char, b: char) -> Option<char> {
    let l = a as u32;
    let v = b as u32;
    if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
        // <L, V> -> LV
        let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
        std::char::from_u32(r)
    } else if S_BASE <= l && l <= (S_BASE + S_COUNT - T_COUNT)
        // Strictly greater than `T_BASE`: `T_BASE` itself (U+11A7) is not a trailing
        // consonant, and accepting it would "compose" the pair back into `a`,
        // silently dropping `b`.
        && T_BASE < v && v < (T_BASE + T_COUNT)
        // `a` must be an LV syllable, i.e. must not have a trailing consonant yet.
        && (l - S_BASE) % T_COUNT == 0
    {
        // <LV, T> -> LVT
        let r = l + (v - T_BASE);
        std::char::from_u32(r)
    } else {
        None
    }
}

/// Composes the pair `a`, `b` into a single character, if a composition is known.
///
/// The pair is first offered to `compose_hangul`; when that yields nothing,
/// `crate::unicode_norm::COMPOSITION_TABLE` is binary-searched using the key
/// `(a << 32) | b` and the matching entry's character is returned.
pub fn compose(a: char, b: char) -> Option<char> {
// NOTE(review): the next line looks like the pre-change body kept by the diff
// view (the lines after it are its replacement) - confirm before relying on it.
unic_ucd_normal::compose(a, b)
if let Some(ab) = compose_hangul(a, b) {
return Some(ab);
}

// Pack both code points into one `u64` so the table can be searched by a single key.
let needle = (a as u64) << 32 | (b as u64);
crate::unicode_norm::COMPOSITION_TABLE.binary_search_by(|item| item.0.cmp(&needle)).ok()
.map(|idx| crate::unicode_norm::COMPOSITION_TABLE[idx].1)
}

#[no_mangle]
pub extern "C" fn rb_ucd_compose(a: rb_codepoint_t, b: rb_codepoint_t, ab: *mut rb_codepoint_t) -> ffi::rb_bool_t {
unsafe {
let new = unic_ucd_normal::compose(
let new = compose(
char::try_from(a).unwrap(),
char::try_from(b).unwrap(),
);
Expand All @@ -771,16 +804,6 @@ pub extern "C" fn rb_ucd_compose(a: rb_codepoint_t, b: rb_codepoint_t, ab: *mut
}

fn rb_ucd_decompose_hangul(ab: rb_codepoint_t, a: *mut rb_codepoint_t, b: *mut rb_codepoint_t) -> bool {
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

let si = ab.wrapping_sub(S_BASE);
if si >= S_COUNT {
return false;
Expand Down Expand Up @@ -817,25 +840,21 @@ pub extern "C" fn rb_ucd_decompose(
}

let ab = char::try_from(ab).unwrap();
let chars = match unic_ucd_normal::canonical_decomposition(ab) {
Some(chars) => chars,
None => return 0,
};

unsafe {
match chars.len() {
1 => {
*a = chars[0] as u32;
*b = 0;
1
}
2 => {
*a = chars[0] as u32;
*b = chars[1] as u32;
1
match crate::unicode_norm::DECOMPOSITION_TABLE.binary_search_by(|item| item.0.cmp(&ab)) {
Ok(idx) => {
let chars = &crate::unicode_norm::DECOMPOSITION_TABLE[idx];
unsafe {
if let Some(rb) = chars.2 {
*a = chars.1 as u32;
*b = rb as u32;
} else {
*a = chars.1 as u32;
}
}
_ => 0,

1
}
Err(_) => 0,
}
}

Expand All @@ -847,6 +866,6 @@ mod tests {
assert_eq!(unicode_ccc::UNICODE_VERSION, (13, 0, 0));
assert_eq!(unicode_general_category::UNICODE_VERSION, (12, 1, 0)); // TODO: update
assert_eq!(unicode_script::UNICODE_VERSION, (13, 0, 0));
assert_eq!(unic_ucd_normal::UNICODE_VERSION.major, 10); // TODO: update
assert_eq!(crate::unicode_norm::UNICODE_VERSION, (13, 0, 0));
}
}
Loading

0 comments on commit f0e5a76

Please sign in to comment.