In [1]:
import unicodedata

In [2]:
def list_characters(s):
    """List characters of string s, as seen by Python.

    Prints one line per code point of ``s``: the character itself, a
    space, and its Unicode name.

    Code points that have no name in the Unicode database (for example
    control characters or private use characters) are listed as
    ``<unnamed U+XXXX>`` instead of aborting the listing with a
    ``ValueError``.
    """
    for c in s:
        print(c, end=' ')
        if unicodedata.combining(c):
            # A combining mark gets one extra space before its name
            # (presumably to keep the name visually separated from the mark).
            print(end=' ')
        # The second argument is the fallback returned for unnamed code points.
        print(unicodedata.name(c, '<unnamed U+{:04X}>'.format(ord(c))))

# Comparing two Unicode strings

In [3]:
# The same word in both canonical normalization forms:
# composed (NFC) first, decomposed (NFD) second.
words = [unicodedata.normalize(form, 'Schlyñ') for form in ('NFC', 'NFD')]

for s in words:
    list_characters(s)
    print()

S LATIN CAPITAL LETTER S
c LATIN SMALL LETTER C
h LATIN SMALL LETTER H
l LATIN SMALL LETTER L
y LATIN SMALL LETTER Y
ñ LATIN SMALL LETTER N WITH TILDE

S LATIN CAPITAL LETTER S
c LATIN SMALL LETTER C
h LATIN SMALL LETTER H
l LATIN SMALL LETTER L
y LATIN SMALL LETTER Y
n LATIN SMALL LETTER N
̃  COMBINING TILDE



These two strings are different:

In [4]:
words[0] == words[1]

False

And yet they are canonically equivalent:

In [5]:
unicodedata.normalize('NFC', words[0]) == unicodedata.normalize('NFC', words[1])

True

→ Normalize to NFC (Normalization Form C: canonical decomposition followed by canonical composition) before comparing. NFC yields composed characters, which is what we want. In the end it does not matter much, because we are not interested in the characters as Python sees them, but in grapheme clusters (see below).

# Grapheme clusters

For evaluation we're interested in what is perceived as "characters". But is "ñ" 1 character (LATIN SMALL LETTER N WITH TILDE) or 2 (LATIN SMALL LETTER N + COMBINING TILDE)?

What we probably want are [grapheme clusters](https://uniseg-python.readthedocs.io/en/latest/graphemecluster.html):

In [6]:
from uniseg.graphemecluster import grapheme_clusters

# Split each normalization form of the word into grapheme clusters and show
# the resulting list, so the NFC and NFD segmentations can be compared.
for w in words:
    print(list(grapheme_clusters(w)))

['S', 'c', 'h', 'l', 'y', 'ñ']
['S', 'c', 'h', 'l', 'y', 'ñ']


Just looking at the interesting character – the last one – from both words:

In [7]:
# Take only the final grapheme cluster of each word ([-1]) and list the
# code points it is made of.
for w in words:
    list_characters(list(grapheme_clusters(w))[-1])
    print()

ñ LATIN SMALL LETTER N WITH TILDE

n LATIN SMALL LETTER N
̃  COMBINING TILDE



→ Work with grapheme clusters, not "characters as Python sees them".

In [8]:
def unicode_name(c):
    """Return a printable name for the single character ``c``.

    Private use characters have no name in the Unicode database, so they
    are reported as ``'private use character 0x<HEX>'``. Every code point
    with general category ``Co`` is treated as private use; this covers the
    BMP Private Use Area (U+E000..U+F8FF) as well as the supplementary
    private use planes 15 and 16.

    All other characters are looked up with ``unicodedata.name``.

    Raises:
        ValueError: if ``c`` is not private use and has no Unicode name
            (e.g. control characters).
    """
    # 'Co' is the general category "Other, private use".
    if unicodedata.category(c) == 'Co':
        return 'private use character 0x{:04X}'.format(ord(c))
    return unicodedata.name(c)
 

def list_grapheme_clusters(s):
    """Print one line per grapheme cluster of string s.

    Each line holds the cluster itself, the marker ``(multiple)`` when the
    cluster consists of more than one code point, and the comma-separated
    names of its code points. If any of the names cannot be determined
    (``ValueError``), the word ``ValueError`` is printed in place of the
    names.
    """
    for cluster in grapheme_clusters(s):
        marker = ' (multiple) ' if len(cluster) > 1 else ' '
        print(cluster + marker, end='')
        try:
            names = ', '.join(map(unicode_name, cluster))
        except ValueError:
            names = 'ValueError'
        print(names)

In [9]:
# List the grapheme clusters of both normalization forms, with a blank line
# between the two listings.
for w in words:
    list_grapheme_clusters(w)
    print()

S LATIN CAPITAL LETTER S
c LATIN SMALL LETTER C
h LATIN SMALL LETTER H
l LATIN SMALL LETTER L
y LATIN SMALL LETTER Y
ñ LATIN SMALL LETTER N WITH TILDE

S LATIN CAPITAL LETTER S
c LATIN SMALL LETTER C
h LATIN SMALL LETTER H
l LATIN SMALL LETTER L
y LATIN SMALL LETTER Y
ñ (multiple) LATIN SMALL LETTER N, COMBINING TILDE



In [10]:
list_grapheme_clusters('私は彼女がお茶を好きな事が分かった。')

私 CJK UNIFIED IDEOGRAPH-79C1
は HIRAGANA LETTER HA
彼 CJK UNIFIED IDEOGRAPH-5F7C
女 CJK UNIFIED IDEOGRAPH-5973
が HIRAGANA LETTER GA
お HIRAGANA LETTER O
茶 CJK UNIFIED IDEOGRAPH-8336
を HIRAGANA LETTER WO
好 CJK UNIFIED IDEOGRAPH-597D
き HIRAGANA LETTER KI
な HIRAGANA LETTER NA
事 CJK UNIFIED IDEOGRAPH-4E8B
が HIRAGANA LETTER GA
分 CJK UNIFIED IDEOGRAPH-5206
か HIRAGANA LETTER KA
っ HIRAGANA LETTER SMALL TU
た HIRAGANA LETTER TA
。 IDEOGRAPHIC FULL STOP


In [11]:
list_grapheme_clusters('. اما چند تا حرف تو فارسی هست که تو عربی نیست')

. FULL STOP
  SPACE
ا ARABIC LETTER ALEF
م ARABIC LETTER MEEM
ا ARABIC LETTER ALEF
  SPACE
چ ARABIC LETTER TCHEH
ن ARABIC LETTER NOON
د ARABIC LETTER DAL
  SPACE
ت ARABIC LETTER TEH
ا ARABIC LETTER ALEF
  SPACE
ح ARABIC LETTER HAH
ر ARABIC LETTER REH
ف ARABIC LETTER FEH
  SPACE
ت ARABIC LETTER TEH
و ARABIC LETTER WAW
  SPACE
ف ARABIC LETTER FEH
ا ARABIC LETTER ALEF
ر ARABIC LETTER REH
س ARABIC LETTER SEEN
ی ARABIC LETTER FARSI YEH
  SPACE
ه ARABIC LETTER HEH
س ARABIC LETTER SEEN
ت ARABIC LETTER TEH
  SPACE
ک ARABIC LETTER KEHEH
ه ARABIC LETTER HEH
  SPACE
ت ARABIC LETTER TEH
و ARABIC LETTER WAW
  SPACE
ع ARABIC LETTER AIN
ر ARABIC LETTER REH
ب ARABIC LETTER BEH
ی ARABIC LETTER FARSI YEH
  SPACE
ن ARABIC LETTER NOON
ی ARABIC LETTER FARSI YEH
س ARABIC LETTER SEEN
ت ARABIC LETTER TEH


In [12]:
list_grapheme_clusters('. لكن كم عدد الكلمات بالفارسية هل أنت باللغة العربية؟')

. FULL STOP
  SPACE
ل ARABIC LETTER LAM
ك ARABIC LETTER KAF
ن ARABIC LETTER NOON
  SPACE
ك ARABIC LETTER KAF
م ARABIC LETTER MEEM
  SPACE
ع ARABIC LETTER AIN
د ARABIC LETTER DAL
د ARABIC LETTER DAL
  SPACE
ا ARABIC LETTER ALEF
ل ARABIC LETTER LAM
ك ARABIC LETTER KAF
ل ARABIC LETTER LAM
م ARABIC LETTER MEEM
ا ARABIC LETTER ALEF
ت ARABIC LETTER TEH
  SPACE
ب ARABIC LETTER BEH
ا ARABIC LETTER ALEF
ل ARABIC LETTER LAM
ف ARABIC LETTER FEH
ا ARABIC LETTER ALEF
ر ARABIC LETTER REH
س ARABIC LETTER SEEN
ي ARABIC LETTER YEH
ة ARABIC LETTER TEH MARBUTA
  SPACE
ه ARABIC LETTER HEH
ل ARABIC LETTER LAM
  SPACE
أ ARABIC LETTER ALEF WITH HAMZA ABOVE
ن ARABIC LETTER NOON
ت ARABIC LETTER TEH
  SPACE
ب ARABIC LETTER BEH
ا ARABIC LETTER ALEF
ل ARABIC LETTER LAM
ل ARABIC LETTER LAM
غ ARABIC LETTER GHAIN
ة ARABIC LETTER TEH MARBUTA
  SPACE
ا ARABIC LETTER ALEF
ل ARABIC LETTER LAM
ع ARABIC LETTER AIN
ر ARABIC LETTER REH
ب ARABIC LETTER BEH
ي ARABIC LETTER YEH
ة ARABIC LETTER TEH MARBUTA
؟ ARABIC QUESTION MARK

In [13]:
list_grapheme_clusters('Hell😀 W😀rld!')

H LATIN CAPITAL LETTER H
e LATIN SMALL LETTER E
l LATIN SMALL LETTER L
l LATIN SMALL LETTER L
😀 GRINNING FACE
  SPACE
W LATIN CAPITAL LETTER W
😀 GRINNING FACE
r LATIN SMALL LETTER R
l LATIN SMALL LETTER L
d LATIN SMALL LETTER D
! EXCLAMATION MARK


In [14]:
list_grapheme_clusters('u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅņ̷͔̤̜̗̘̠̦̦̖̟͉̹͕̬͎̙̲̲̎̅̈́ͮͣ̔̀̌͂̄͆͑̚i̴̢͖̳̣̙͕̍ͯͧ̀ͥͭ̆ͣ̉͐͆̊͋͛̈́͒͟c̰̟̫̲͇̺̹͖̼̦̾ͮ̍̐ͤͪ̓ͤ̐̈́̅ͯͤ̚̚͘o̴ͣ̑̐ͫ̈̄͊ͥ̓͟͏̫͔̠̤̜̤̥͘ḍ̛̥͖͓̪͈̹̯͖̱̘͙͖ͧ̿ͧ̓̓͊̈͑͘̕e̛̺͈̜̰̜̖͎͚͈͋̒̆̈́̏͊ͬ̎̑̇̾̆̓ͬ̔̐̾ͭ́͞')

u̶̜͓̬̞͚͙̪̰͓̯̲̝̬͔͎̳̼͇̓͊ͤ̋̃̀̄̓̿͊̀̚͟͜͟ͅ (multiple) LATIN SMALL LETTER U, COMBINING COMMA ABOVE, COMBINING NOT TILDE ABOVE, COMBINING LATIN SMALL LETTER E, COMBINING DOUBLE ACUTE ACCENT, COMBINING TILDE, COMBINING GRAVE ACCENT, COMBINING LEFT ANGLE ABOVE, COMBINING MACRON, COMBINING COMMA ABOVE, COMBINING DOUBLE OVERLINE, COMBINING NOT TILDE ABOVE, COMBINING DOUBLE MACRON BELOW, COMBINING GRAVE TONE MARK, COMBINING DOUBLE BREVE BELOW, COMBINING LONG STROKE OVERLAY, COMBINING DOUBLE MACRON BELOW, COMBINING LEFT HALF RING BELOW, COMBINING X BELOW, COMBINING CARON BELOW, COMBINING DOWN TACK BELOW, COMBINING DOUBLE RING BELOW, COMBINING ASTERISK BELOW, COMBINING BRIDGE BELOW, COMBINING TILDE BELOW, COMBINING X BELOW, COMBINING INVERTED BREVE BELOW, COMBINING LOW LINE, COMBINING UP TACK BELOW, COMBINING CARON BELOW, COMBINING LEFT ARROWHEAD BELOW, COMBINING UPWARDS ARROW BELOW, COMBINING DOUBLE LOW LINE, COMBINING SEAGULL BELOW, COMBINING EQUALS SIGN BELOW, COMBINING GREEK YPOGEGRAMMENI
n̎̅̈́ͮͣ̚

In [15]:
list_grapheme_clusters('Zeugnuͤß')

Z LATIN CAPITAL LETTER Z
e LATIN SMALL LETTER E
u LATIN SMALL LETTER U
g LATIN SMALL LETTER G
n LATIN SMALL LETTER N
uͤ (multiple) LATIN SMALL LETTER U, COMBINING LATIN SMALL LETTER E
ß LATIN SMALL LETTER SHARP S


In [16]:
list_grapheme_clusters('Zeugnß')

Z LATIN CAPITAL LETTER Z
e LATIN SMALL LETTER E
u LATIN SMALL LETTER U
g LATIN SMALL LETTER G
n LATIN SMALL LETTER N
 private use character 0xE72B
ß LATIN SMALL LETTER SHARP S
