# 文本和字节序列

## 4.1字符问题

In [4]:
s = 'cafの'
len(s)

4

In [5]:
b = s.encode('utf-8')
b

b'caf\xe3\x81\xae'

In [6]:
b.decode('utf-8')

'cafの'

## 4.2字节概要

In [7]:
cafe = bytes('cafの', encoding='utf-8')
cafe

b'caf\xe3\x81\xae'

In [8]:
cafe[0]

99

In [9]:
cafe[:1]

b'c'

In [10]:
cafearr = bytearray(cafe)
cafearr

bytearray(b'caf\xe3\x81\xae')

In [11]:
cafearr[-1:]

bytearray(b'\xae')

In [17]:
c = bytes('のcaf', encoding='utf-8')
c

b'\xe3\x81\xaecaf'

In [18]:
c[0]

227

In [19]:
c[:1]

b'\xe3'

In [20]:
bytes.fromhex('31 38 99 62')

b'18\x99b'

In [21]:
import array
numbers = array.array('h', [-2,-1,0,1,2])
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

## 4.3 基本的编解码器
### 4.3.2 处理UnicodeDecodeError

In [22]:
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [23]:
octets.decode('iso8859_7')

'Montrιal'

In [24]:
octets.decode('koi8_r')

'MontrИal'

In [25]:
octets.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [27]:
octets.decode('utf-8', errors = 'replace')

'Montr�al'

## 4.6 为了正确比较而规范化Unicode

In [28]:
s1 = 'café'
s2 = 'cafe\u0301'
s2

'café'

In [29]:
len(s2)

5

In [30]:
s1 == s2

False

In [34]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
len(normalize('NFC',s1)),len(normalize('NFC',s2))

(4, 4)

In [35]:
len(normalize('NFD',s1)),len(normalize('NFD',s2))

(5, 5)

In [36]:
from unicodedata import name
ohm = '\u2126'
ohm

'Ω'

In [37]:
name(ohm)

'OHM SIGN'

In [38]:
ohm_c = normalize('NFC', ohm)
ohm_c

'Ω'

In [39]:
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

### 4.6.1 大小写折叠

In [40]:
s = 'StressASD'
s.casefold()

'stressasd'

### 4.6.2 规范化文本匹配实用函数

In [42]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2

False

In [46]:
def nfc_equal(s1, s2):
    """Return True if *s1* and *s2* are equal once both are NFC-normalized.

    Canonically equivalent spellings (e.g. a precomposed letter versus a base
    letter followed by a combining mark) compare equal; case is still
    significant.
    """
    composed_1 = normalize('NFC', s1)
    composed_2 = normalize('NFC', s2)
    return composed_1 == composed_2

def fold_equal(s1, s2):
    """Return True if *s1* and *s2* match after NFC normalization and case folding.

    Each argument is first composed with NFC and then passed through
    ``str.casefold()``, so the comparison ignores both the choice between
    composed/decomposed forms and differences in letter case.
    """
    folded_1 = normalize('NFC', s1).casefold()
    folded_2 = normalize('NFC', s2).casefold()
    return folded_1 == folded_2

nfc_equal(s1, s2)

True

In [47]:
nfc_equal('A', 'a')

False

In [48]:
fold_equal('A', 'a')

True

### 4.6.3 极端“规范化”：去掉变音符号

In [51]:
import unicodedata
import string

def shave_mark(s):
    """Return *s* with every combining mark removed.

    The text is decomposed with NFD so that marks become separate code points,
    characters whose canonical combining class is non-zero are dropped, and the
    remainder is recomposed with NFC.
    """
    decomposed = unicodedata.normalize('NFD', s)
    # combining() returns 0 for characters that are not combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

s1 = 'café'
shave_mark(s1)

'cafe'

## 4.7 Unicode文本排序

In [50]:
l = ['café', 'cake', 'caff']
sorted(l)

['caff', 'café', 'cake']

In [56]:
import pyuca
coll = pyuca.Collator()

In [57]:
sorted(l, key = coll.sort_key)

['café', 'caff', 'cake']

## 4.8 Unicode 数据库

In [59]:
import unicodedata
import re

# Pattern for a single character that the regex engine treats as a digit.
digit = re.compile(r'\d')

# Eight characters that all carry a numeric value in the Unicode database.
sample = '1\xbc\xb2\u136b\u216b\u2466\u2480\u3285'

# Print one tab-separated row per character: code point, the character centred
# in six columns, whether r'\d' / str.isdigit / str.isnumeric accept it, its
# numeric value and its official Unicode name.
for char in sample:
    print('\t'.join((
        'U+{:04x}'.format(ord(char)),
        char.center(6),
        'redit' if digit.match(char) else '-',
        'isdit' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        '{:5.2f}'.format(unicodedata.numeric(char)),
        unicodedata.name(char),
    )))

U+0031	  1   	redit	isdit	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdit	isnum	 2.00	SUPERSCRIPT TWO
U+136b	  ፫   	-	isdit	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdit	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


## 4.9支持字符串和字节序列的双模式API
### 4.9.1 正则表达式中的字符串和字节序列

In [68]:
import re

# str patterns: on text input \d and \w follow Unicode character classes, so
# non-ASCII digits and letters are matched as well.
re_num_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
# bytes patterns: on bytes input only ASCII digits / word characters match.
re_num_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# NOTE: the two adjacent string literals are concatenated with no space between
# them, so the Tamil digits (U+0BE7 U+0BED U+0BE8 U+0BEF) run straight into
# "as" and the str \w+ pattern reports them as one word.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef""as 1729 = 13 + 123 = 93 + 103")
# UTF-8 encoded copy of the same text, used with the bytes patterns.
text_bytes = text_str.encode('utf-8')

# Cell result: every run of digits found by the str pattern.
re_num_str.findall(text_str)

['௧௭௨௯', '1729', '13', '123', '93', '103']

In [69]:
re_words_str.findall(text_str)

['Ramanujan', 'saw', '௧௭௨௯as', '1729', '13', '123', '93', '103']

In [70]:
re_num_bytes.findall(text_bytes)

[b'1729', b'13', b'123', b'93', b'103']

In [71]:
re_words_bytes.findall(text_bytes)

[b'Ramanujan', b'saw', b'as', b'1729', b'13', b'123', b'93', b'103']

### 4.9.2 os模块中的字符串和字节序列

In [72]:
import os
os.listdir('.')

['.ipynb_checkpoints',
 'ch1.Python数据类型.ipynb',
 'ch2序列构成的数据.ipynb',
 'ch3字典与集合.ipynb',
 'ch4文本和字节序列.ipynb',
 'floats.bin']

In [75]:
os.listdir(b'.')

[b'.ipynb_checkpoints',
 b'ch1.Python\xe6\x95\xb0\xe6\x8d\xae\xe7\xb1\xbb\xe5\x9e\x8b.ipynb',
 b'ch2\xe5\xba\x8f\xe5\x88\x97\xe6\x9e\x84\xe6\x88\x90\xe7\x9a\x84\xe6\x95\xb0\xe6\x8d\xae.ipynb',
 b'ch3\xe5\xad\x97\xe5\x85\xb8\xe4\xb8\x8e\xe9\x9b\x86\xe5\x90\x88.ipynb',
 b'ch4\xe6\x96\x87\xe6\x9c\xac\xe5\x92\x8c\xe5\xad\x97\xe8\x8a\x82\xe5\xba\x8f\xe5\x88\x97.ipynb',
 b'floats.bin']

In [78]:
name = os.listdir(b'.')[2]
name = name.decode('ascii', 'surrogateescape')
name

'ch2\udce5\udcba\udc8f\udce5\udc88\udc97\udce6\udc9e\udc84\udce6\udc88\udc90\udce7\udc9a\udc84\udce6\udc95\udcb0\udce6\udc8d\udcae.ipynb'