# Python Unicode handling

In [9]:
# The u prefix is optional in Python 3; it is only needed to keep the source
# compatible with Python 2.
c = u'한'

## Getting a code point

In [10]:
# ord() maps a one-character string to its Unicode code point (an int):
# https://docs.python.org/3/library/functions.html#ord
# "ord" is short for "ordinal":
# https://stackoverflow.com/questions/50314440/what-does-the-name-of-the-ord-function-stand-for
code_point = ord(c)
code_point_hex = hex(code_point)  # hex() gives a str, not an int

# Print both values first, then the type of each value.
for item in (code_point, code_point_hex, type(code_point), type(code_point_hex)):
    print(item)

54620
0xd55c
<class 'int'>
<class 'str'>


In [12]:
# chr() is the inverse of ord(): code point -> character.
round_tripped = chr(ord(c))
print(round_tripped)
# The \u escape for the same code point compares equal to the round-tripped character.
print(round_tripped == '\ud55c')

한
True


In [27]:
# The 'unicode_escape' codec produces the ASCII-only escape sequence as bytes.
unicode_escape = c.encode(encoding='unicode_escape')
print(unicode_escape)
# Bare last expression: the notebook displays the type as the cell's result.
type(unicode_escape)

b'\\ud55c'


bytes

## Unicode representations 

In [35]:
# The same integer written as a hexadecimal literal and as a binary literal.
hex_form = 0xd55c
bin_form = 0b1101010101011100

# Format spec '#018b':
#   '#'  -> include the '0b' prefix
#   '0'  -> pad with leading zeros
#   '18' -> total width of 18 characters, counting the '0b' prefix
#   'b'  -> binary notation
# https://stackoverflow.com/questions/16926130/convert-to-binary-and-keep-leading-zeros-in-python
print('{:#018b}'.format(hex_form))

# Both literals are plain ints with the same value; only the notation differs.
for literal in (hex_form, bin_form):
    print(type(literal))
print(hex_form == bin_form)

0b1101010101011100
<class 'int'>
<class 'int'>
True


In [5]:
# Three escape spellings that produce the same one-character string.
two_bytes = '\u00e9'    # backslash-u followed by 4 hex digits
four_bytes = '\U000000e9'    # backslash-U followed by 8 hex digits
db = '\N{LATIN SMALL LETTER E WITH ACUTE}'    # backslash-N with the character's Unicode name

print(db)
# All three spellings must compare equal to one another.
print(two_bytes == four_bytes and four_bytes == db)

é
True


## Encoding and decoding

In [11]:
# Encode the character as UTF-8; str.encode() returns a bytes object.
code_bytes = c.encode('utf8')
print(type(code_bytes))

<class 'bytes'>


In [14]:
# The bytes repr shows every non-ASCII byte as a \x escape; this character
# occupies three bytes in UTF-8.
print(code_bytes)

b'\xed\x95\x9c'


In [16]:
# bytes.hex() renders the same bytes as a plain hexadecimal string.
print(code_bytes.hex())

ed959c


In [17]:
# Encode the same character as UTF-32 (fixed four bytes per code point).
code_bytes_utf32 = c.encode('utf32')

In [19]:
# The 'utf32' codec prepends a byte-order mark (BOM) and uses the platform's
# native byte order. In the output shown, 'fffe0000' is the BOM and '5cd50000'
# is code point 0xd55c stored little-endian.
code_bytes_utf32.hex()

'fffe00005cd50000'

In [21]:
# The 'utf16' codec also prepends a BOM and uses native byte order. In the
# output shown, 'fffe' is the BOM and '5cd5' is 0xd55c stored little-endian.
code_bytes_utf16 = c.encode('utf16')
code_bytes_utf16.hex()

'fffe5cd5'

In [23]:
# Decoding with the same codec restores the original str; the BOM is consumed
# by the decoder and does not appear in the result.
code_bytes_utf16.decode('utf16')

'한'

In [20]:
# Error handling: 'ascii' cannot represent the character, so the `errors`
# argument decides what is emitted in its place. Each handler is tried in turn.
for handler in ('ignore', 'replace', 'xmlcharrefreplace',
                'backslashreplace', 'namereplace'):
    print(handler + ': ', c.encode('ascii', errors=handler))

ignore:  b''
replace:  b'?'
xmlcharrefreplace:  b'&#54620;'
backslashreplace:  b'\\ud55c'
namereplace:  b'\\N{HANGUL SYLLABLE HAN}'


## Unicode Data Functions

In [30]:
import unicodedata as ud

# Look the name up once so it can be printed and fed back into lookup().
char_name = ud.name(c)

print(char_name)               # official Unicode character name
print(ud.category(c))          # general category code
print(ud.combining(c))         # canonical combining class
print(ud.lookup(char_name))    # name -> character, the inverse of name()
print(ud.normalize('NFC', c))  # NFC-normalized form of the character

HANGUL SYLLABLE HAN
Lo
0
한
한


## Emoji

In [4]:
# It’s Not Wrong that "🤦🏼‍♂️".length == 7 (in JS)
# https://hsivonen.fi/string-length/
facepalm = '🤦🏼‍♂️'
print(facepalm)
# len() on a Python str counts code points, not UTF-16 code units or
# user-perceived characters.
print(len(facepalm))
# List every code point of the sequence in hexadecimal.
print([hex(ord(ch)) for ch in facepalm])

🤦🏼‍♂️
5
['0x1f926', '0x1f3fc', '0x200d', '0x2642', '0xfe0f']


## Bytes

In [7]:
# A bytes literal may contain only ASCII characters; any other byte value has
# to be written as an escape sequence such as \xNN (two hexadecimal digits).
# Here \x31 is the byte for ASCII '1'.
abc = b'abc\x31'
print(abc)

# Putting a non-ASCII character directly in a bytes literal is rejected:
# SyntaxError: bytes can only contain ASCII literal characters
#
# b'한'

b'abc1'
