# 文本和字节序列

## 4.1字符问题

In [4]:
s = 'cafの'
len(s)

4

In [5]:
b = s.encode('utf-8')
b

b'caf\xe3\x81\xae'

In [6]:
b.decode('utf-8')

'cafの'

## 4.2字节概要

In [7]:
cafe = bytes('cafの', encoding='utf-8')
cafe

b'caf\xe3\x81\xae'

In [8]:
cafe[0]

99

In [9]:
cafe[:1]

b'c'

In [10]:
cafearr = bytearray(cafe)
cafearr

bytearray(b'caf\xe3\x81\xae')

In [11]:
cafearr[-1:]

bytearray(b'\xae')

In [17]:
c = bytes('のcaf', encoding='utf-8')
c

b'\xe3\x81\xaecaf'

In [18]:
c[0]

227

In [19]:
c[:1]

b'\xe3'

In [20]:
bytes.fromhex('31 38 99 62')

b'18\x99b'

In [21]:
import array
# Typecode 'h' stores each item as a signed short; the recorded output below
# shows 2 bytes per item, low byte first (i.e. produced on a little-endian machine).
numbers = array.array('h', [-2,-1,0,1,2])
# bytes() on the array yields the raw bytes of its items, not their decimal text.
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

## 4.3 基本的编解码器
### 4.3.2 处理UnicodeDecodeError

In [22]:
# The single byte 0xe9 between 'Montr' and 'al' is the interesting part:
# each codec tried in the following cells maps it to a different character,
# and UTF-8 rejects it.
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [23]:
octets.decode('iso8859_7')

'Montrιal'

In [24]:
octets.decode('koi8_r')

'MontrИal'

In [25]:
octets.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [27]:
octets.decode('utf-8', errors = 'replace')

'Montr�al'

## 4.6为了正确比较而规范化Unicode

In [28]:
s1 = 'café'
s2 = 'cafe\u0301'
s2

'café'

In [29]:
len(s2)

5

In [30]:
s1 == s2

False

In [34]:
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\u0301'
len(normalize('NFC',s1)),len(normalize('NFC',s2))

(4, 4)

In [35]:
len(normalize('NFD',s1)),len(normalize('NFD',s2))

(5, 5)

In [36]:
from unicodedata import name
# U+2126 is the code point that name() reports as 'OHM SIGN' in the next cell.
ohm = '\u2126'
ohm

'Ω'

In [37]:
name(ohm)

'OHM SIGN'

In [38]:
ohm_c = normalize('NFC', ohm)
ohm_c

'Ω'

In [39]:
name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

### 4.6.1 大小写折叠

In [40]:
s = 'StressASD'
s.casefold()

'stressasd'

### 4.6.2 规范化文本匹配实用函数

In [42]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2

False

In [46]:
def nfc_equal(s1, s2):
    """Return True when s1 and s2 are identical after NFC normalization.

    Both arguments are composed to NFC before comparing, so a precomposed
    character and its base-letter-plus-combining-mark spelling are treated
    as equal. The comparison is still case-sensitive.
    """
    composed_left = normalize('NFC', s1)
    composed_right = normalize('NFC', s2)
    return composed_left == composed_right

def fold_equal(s1, s2):
    """Return True when s1 and s2 match regardless of case and composition.

    Each string is decomposed (NFD), case-folded, and then decomposed again
    before the comparison. The second normalization is required because
    ``str.casefold()`` can return a string that is no longer normalized:
    for example U+01F0 folds to 'j' + U+030C, which can leave a following
    combining mark out of canonical order. Without it, canonically
    equivalent strings that differ only in case may compare unequal.
    """
    def fold_key(text):
        # normalize -> fold -> normalize: bring the folded text back to a
        # canonical form so that equivalent spellings produce the same key.
        return normalize('NFD', normalize('NFD', text).casefold())

    return fold_key(s1) == fold_key(s2)

nfc_equal(s1, s2)

True

In [47]:
nfc_equal('A', 'a')

False

In [48]:
fold_equal('A', 'a')

True

### 4.6.3 极端‘规范化’：去掉变音符号

In [49]:
import unicodedata
import string

def shave_mark(s):
    """Return s with every combining mark removed.

    The text is decomposed to NFD so that accents become separate
    combining characters, those characters are dropped, and what is
    left is recomposed to NFC.
    """
    decomposed = unicodedata.normalize('NFD', s)
    # unicodedata.combining() returns 0 for characters that are not
    # combining marks; only those are kept.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

s1 = 'café'
shave_mark(s1)

'cafe'