# py_文本和字节

### 案例：编码和解码

In [4]:
# Round-trip a mixed ASCII/Chinese string through UTF-8.
s = 'cafe-咖啡'
print(len(s))  # length counted in characters

# str -> bytes: in the UTF-8 result each Chinese character takes 3 bytes.
b = bytes(s, 'utf-8')
print(b, len(b), sep='\n')  # the encoded bytes, then their length in bytes

# bytes -> str ('utf8' is an alias of 'utf-8').
c = str(b, 'utf8')
print(c)

7
b'cafe-\xe5\x92\x96\xe5\x95\xa1'
11
cafe-咖啡


### 案例：`bytes` 与 `bytearray`

In [10]:
# An immutable bytes object built from text.
cafe = 'cafe'.encode('utf-8')
# Whole object, then a single item (an int: the ASCII code of 'c'),
# then a one-byte slice (still a bytes object).
print(cafe, cafe[0], cafe[:1], sep='\n')

print("====================")
# The mutable counterpart, constructed from the bytes above.
cafe_arr = bytearray(cafe)
# Same pattern: an item is an int, a slice is again a bytearray.
print(cafe_arr, cafe_arr[0], cafe_arr[-1:], sep='\n')

b'cafe'
99
b'c'
====================
bytearray(b'cafe')
99
bytearray(b'e')


### 案例：结构体与内存视图

In [6]:
import struct

# Struct format: little-endian, two 3-byte strings followed by two
# unsigned 16-bit integers (10 bytes in total).
fmt = '<3s3sHH'
with open('./file/icon.ico', 'rb') as fp:
    # Wrap the file contents in a memoryview so slices share the buffer
    # instead of copying bytes.
    img = memoryview(fp.read())

# View of the leading bytes that fmt describes.
header = img[:struct.calcsize(fmt)]
print(header.tobytes())

# Unpack into a 4-tuple; the original notes label the fields as
# type, version, width, height.
# NOTE(review): those labels look like a GIF header layout, but the file
# opened here is an .ico, whose header is laid out differently (16-bit
# reserved/type/count fields, then 1-byte width and height) -- confirm
# whether fmt should be adapted to the ICO layout.
print(struct.unpack(fmt, header))

# Drop both references to the memoryviews.
del header, img

b'\x00\x00\x01\x00\x01\x00@@\x00\x00'
(b'\x00\x00\x01', b'\x00\x01\x00', 16448, 0)


### 案例：编解码器

1. `utf_8` 的别名：`utf8`、`utf-8`、`U8`

In [10]:
# Encode the same text with three codecs to compare the resulting bytes.
for codec in ('latin1', 'utf_8', 'utf_16'):
    print(f"{codec}\t{'café'.encode(codec)}")

latin1	b'caf\xe9'
utf_8	b'caf\xc3\xa9'
utf_16	b'\xff\xfec\x00a\x00f\x00\xe9\x00'


### 案例：处理 `UnicodeEncodeError`

In [19]:
# Encode a string containing 'ã' with codecs that can and cannot represent it.
city = 'são Paulo'
print('utf_8', city.encode('utf_8'), sep='\t')
print('utf_16', city.encode('utf_16'), sep='\t')

# Strict encoding with cp437 fails on 'ã'. Catch only UnicodeEncodeError,
# the error this cell demonstrates, so unrelated failures are not
# silently reported as encoding errors.
try:
    print('cp437', city.encode('cp437'), sep='\t')
except UnicodeEncodeError as e:
    print('cp437', '<error>：'+str(e), sep='\t')

# The same encoding with the alternative error handlers:
print('cp437', city.encode('cp437', errors='ignore'), sep='\t')     # skip the unencodable character
print('cp437', city.encode('cp437', errors='replace'), sep='\t')    # substitute '?'
print('cp437', city.encode('cp437', errors='xmlcharrefreplace'), sep='\t')   # substitute an XML character reference

utf_8	b's\xc3\xa3o Paulo'
utf_16	b'\xff\xfes\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
cp437	<error>：'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>
cp437	b'so Paulo'
cp437	b's?o Paulo'
cp437	b's&#227;o Paulo'


### 案例：处理 `UnicodeDecodeError`

In [5]:
# UTF-8 bytes for 'são Paulo', decoded with several codecs to show how a
# wrong codec produces garbled text rather than an error.
octets = b's\xc3\xa3o Paulo'
print(
    *(str(octets, name) for name in ('cp1252', 'iso8859_7', 'koi8_r', 'utf_8')),
    sep='\n',
)
# Same wrong codec again, this time with the 'replace' error handler.
print(str(octets, 'iso8859_7', errors='replace'))

sÃ£o Paulo
sΓ£o Paulo
sцёo Paulo
são Paulo
sΓ£o Paulo


In [10]:
# chardet: the "Universal Character Encoding Detector" package
# (third-party; it is not part of the standard library).
import chardet

# Guess the encoding of a byte string; the result reports the detected
# encoding together with a confidence score.
chardet.detect(b's\xc3\xa3o Paulo')

{'confidence': 0.505, 'encoding': 'utf-8', 'language': ''}

### 案例：编码默认值

In [12]:
import sys, locale

# Expressions whose values reveal the default encodings in this environment.
# Each whitespace-separated token is evaluated as one expression.
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
"""

# Open the file without an explicit encoding on purpose, so that
# my_file.encoding shows the default. The with-block guarantees the handle
# is closed once the report has been printed.
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() only sees the fixed literals listed above, never external input.
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

 locale.getpreferredencoding() -> 'cp936'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp936'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'cp936'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'
