## TEXT vs Bytes

In [4]:
s = 'café'
len(s)

4

In [5]:
# 'é' takes two bytes in UTF-8 (\xc3\xa9), so 4 characters become 5 bytes.
u = s.encode('utf-8')
print(u)
print(len(u))

b'caf\xc3\xa9'
5


In [6]:
u

b'caf\xc3\xa9'

### 要点

- py2:文本类型是 unicode,字节序列类型是 str
- py3:文本类型是 str,字节序列类型是 bytes
- Unicode 是一套统一的字符集标准(为每个字符分配一个码位);UTF-8、UTF-16 等是它的具体编码方式

### Byte

In [7]:
cafe = bytes('café', encoding='utf-8')
cafe

b'caf\xc3\xa9'

In [8]:
cafe[0]

99

In [9]:
cafe[1]

97

In [10]:
cafe[:1]

b'c'

In [12]:
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'caf\xc3\xa9')

In [13]:
cafe_arr[:1]

bytearray(b'c')

In [14]:
cafe_arr[-1:]

bytearray(b'\xa9')

In [15]:
# Indexing bytes yields an int, while slicing yields a bytes object, so they are not equal.
cafe[0] == cafe[:1]

False

In [16]:
# Indexing into the one-byte slice gives back the same int as indexing the original.
cafe[0] == cafe[:1][0]

True

### array


In [17]:
import array
# Typecode 'h' stores each item as a signed short (2 bytes in the output shown).
nums = array.array('h', [1, 2, 4, -5, 6])
# bytes() copies the array's raw machine representation: 5 items -> 10 bytes,
# in the machine's native byte order (little-endian in the recorded output).
out = bytes(nums)
out

b'\x01\x00\x02\x00\x04\x00\xfb\xff\x06\x00'

In [18]:
out[0]

1

In [19]:
out[:3]

b'\x01\x00\x02'

### Structs and Memory Views

In [24]:
import struct
# '<' = little-endian with standard sizes; two 3-byte strings followed by
# two unsigned 16-bit integers (10 bytes in total).
fmt = '<3s3sHH'
with open('forestg.jpg', 'rb')as f:
    # Wrap the file's bytes in a memoryview so that slices share the buffer instead of copying.
    img = memoryview(f.read())
    


In [26]:
header = img[:10]
bytes(header)

b'\xff\xd8\xff\xe0\x00\x10JFIF'

In [22]:
struct.unpack(fmt, header)

(b'\xff\xd8\xff', b'\xe0\x00\x10', 17994, 17993)

In [27]:
hash(header)

-6585862264318255475

#### Summary

- 由 bytes 这类只读对象创建的 memoryview 是 immutable(只读)的,因此可以被 hash;由 bytearray 等可写对象创建的 memoryview 则是可写的
- memoryview 的切片返回的也是 memoryview

### Basic encoders/decoders

In [29]:
# Show how the same text is serialized under three different codecs.
sample_text = 'El Niño'
for codec in ('utf-8', 'latin_1', 'utf-16'):
    encoded = sample_text.encode(codec)
    print(codec, encoded, sep='\t')
    


utf-8	b'El Ni\xc3\xb1o'
latin_1	b'El Ni\xf1o'
utf-16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [30]:
city = 'São Paulo'
city.encode('utf-8')

b'S\xc3\xa3o Paulo'

In [31]:
city.encode('utf-16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [32]:
city.encode('gbk')

UnicodeEncodeError: 'gbk' codec can't encode character '\xe3' in position 1: illegal multibyte sequence

In [33]:
city.encode('iso8859-1')

b'S\xe3o Paulo'

In [35]:
city.encode('gbk', errors='ignore')

b'So Paulo'

In [36]:
city.encode('gbk', errors='replace')

b'S?o Paulo'

In [37]:
city.encode('gbk', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [39]:
import chardet
chardet.detect(b'S&#227;o Paulo')

{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}

In [40]:
chardet.detect(b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00')

{'confidence': 1.0, 'encoding': 'UTF-16', 'language': ''}

### 区分 bytes 和 str

In [45]:
def dis(content):
    """Print whether ``content`` is a ``bytes`` or a ``str`` object.

    Args:
        content: The object to classify.

    Objects that are neither ``bytes`` nor ``str`` are reported with their
    actual type name instead of being mislabelled as ``str``.
    """
    if isinstance(content, bytes):
        print('{} is bytes'.format(content))
    elif isinstance(content, str):
        print('{} is str'.format(content))
    else:
        # Anything else (int, bytearray, None, ...) is not text; say so explicitly.
        print('{!r} is neither bytes nor str (got {})'.format(
            content, type(content).__name__))
        

In [46]:
dis(b'So Paulo')

b'So Paulo' is bytes


In [47]:
dis('周鑫')

周鑫 is str


### Normalizing Unicode for saner comparisons

In [48]:
s1 = 'café'
# Same visible text, but built from a plain 'e' followed by U+0301 (combining acute accent).
s2 = 'cafe\u0301'
s1, s2

('café', 'café')

In [49]:
len(s1), len(s2)

(4, 5)

In [50]:
from unicodedata import normalize


In [52]:
# NFC composes a base character plus combining mark into a single code point where possible.
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [53]:
# NFD decomposes composed characters into a base character plus combining marks.
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)