# 2 字符串和文本

## 2.1 使用多个界定符分割字符串

In [1]:
import re
line = 'asdf fjdk; afed, fjek,asdf, foo'
re.split(r'[;,\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

## 2.2 字符串开头或结尾匹配

In [2]:
# str 方法
filename = 'spam.txt'
print(filename.endswith('.txt'))

url = 'http://www.python.org'
print(url.startswith('http:'))

True
True


In [3]:
# os 方法
import os
filenames = os.listdir('.')  # 列出当前路径下所有文件名
print(filenames)

lst = [name for name in filenames if name.endswith(('.c', '.h')) ]
print(lst)

b = any(name.endswith('.py') for name in filenames)
print(b)

['.ipynb_checkpoints', '01_数据结构和算法.ipynb', '02_字符串和文本.ipynb', 'filename']
[]
False


In [4]:
# urllib  域名解析方法
from urllib.request import urlopen

def read_data(name):
    """Return the contents of a URL or of a local file.

    Args:
        name: Either a URL beginning with ``http:``, ``https:`` or ``ftp:``,
            or a path to a local file.

    Returns:
        ``bytes`` when ``name`` is a URL (the raw response body), ``str`` when
        ``name`` is a local path (the file is opened in text mode).
    """
    if name.startswith(('http:', 'https:', 'ftp:')):
        # Use the response as a context manager so the underlying connection
        # is closed even if read() raises.
        with urlopen(name) as response:
            return response.read()
    with open(name) as f:
        return f.read()
        
choices = ['http:', 'ftp:']
url = 'http://www.python.org'
url.startswith(tuple(choices))

True

## 2.3 用Shell通配符匹配字符串

- `fnmatch` 库：
    * `fnmatch()`:使用底层操作系统的大小写敏感规则(不同的系统是不一样的)来匹配模式。
    * `fnmatchcase()`: 对于大小写统一敏感。
    

In [5]:
from fnmatch import fnmatch, fnmatchcase

print(fnmatch('foo.txt', '*.txt'))

print(fnmatch('foo.txt', '?oo.txt'))

print(fnmatch('Dat45.csv', 'Dat[0-9]*'))

names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
res = [name for name in names if fnmatch(name, 'Dat*.csv')]
res

print(fnmatchcase('foo.txt', '*.TXT'))

True
True
True
False


In [6]:
from fnmatch import fnmatchcase
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]

res = [addr for addr in addresses if fnmatchcase(addr, '* ST')]
res1 = [addr for addr in addresses 
        if fnmatchcase(addr, '54[0-9][0-9] *CLARK*')]

display(res, res1)

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']

['5412 N CLARK ST']

## 2.4 字符串匹配和搜索

- `str`: find
- `re`: findall, match, search, 等

In [7]:
# Plain str methods are enough for exact, prefix, suffix and substring checks.
text = 'yeah, but no, but yeah, but no, but yeah'

print(text == 'yeah')
print(text.startswith('yeah'))
print(text.endswith('no'))
print(text.find('no'))   # index of the first occurrence (-1 if absent), not a count

False
True
False
10


## 2.5 字符串搜索和替换
- `str`: 
    * `str.replace()`
- `re`:
    * `sub()`
    * `subn()`: 替换，并返回替换次数

In [8]:
text = 'yeah, but no, but yeah, but no, but yeah'
new = text.replace('yeah', 'yep')
new

'yep, but no, but yep, but no, but yep'

In [9]:
import re

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
new = re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

new

'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [10]:
# 复杂的替换，传递一个替换回调函数
import re
from calendar import month_abbr

def change_date(m):
    """Substitution callback: turn a ``month/day/year`` match into ``day Mon year``.

    Args:
        m: Match object whose groups 1, 2 and 3 are month, day and year.

    Returns:
        The reformatted date, with the numeric month replaced by its
        abbreviated name from ``calendar.month_abbr``.
    """
    month_number = int(m.group(1))
    abbreviation = month_abbr[month_number]  # integer month => abbreviated name
    day, year = m.group(2), m.group(3)
    return '{} {} {}'.format(day, abbreviation, year)

# Compile once so the same pattern serves both substitutions below.
# NOTE: `text` is the string assigned in the previous cell.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
new1 = datepat.sub(r'\3-\1-\2', text)
new2 = datepat.sub(change_date, text)  # the callback receives a match object for each match

display(new1, new2)

'Today is 2012-11-27. PyCon starts 2013-3-13.'

'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

## 2.6 字符串忽略大小写的搜索替换

- `re`: flags=re.IGNORECASE

In [11]:
text = 'UPPER PYTHON, lower python, Mixed Python'
res = re.findall('python', text, flags=re.IGNORECASE)
# 替换但是大小写不能保持一致
new = re.sub('python', 'snake', text, flags=re.IGNORECASE)  

display(res, new)

['PYTHON', 'python', 'Python']

'UPPER snake, lower snake, Mixed snake'

In [12]:
# 闭包
def matchcase(word):
    """Build a ``re.sub`` callback that inserts ``word`` in the case of the match.

    Args:
        word: Replacement text.

    Returns:
        A function taking a match object and returning ``word`` upper-cased,
        lower-cased or capitalized to mirror the matched text; ``word`` is
        returned unchanged when none of those cases applies.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Mixed case: only the first character decides.
        return word.capitalize() if matched[0].isupper() else word
    return replace

# Call the closure directly on a match object. The pattern is a raw string so
# that ``\w`` reaches the regex engine instead of being read as an (invalid)
# string escape sequence.
f = matchcase('word')
f(re.match(r'\w+', 'ANE'))

'WORD'

In [13]:
# 闭包替换
re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)

'UPPER SNAKE, lower snake, Mixed Snake'

## 2.7 最短匹配模式

In [14]:
str_pat = re.compile(r'"(.*?)"')

text1 = 'Computer says "no."'
res1 = str_pat.findall(text1)

text2 = 'Computer says "no." Phone says "yes."'
res2 = str_pat.findall(text2)

display(res1, res2)

['no.']

['no.', 'yes.']

## 2.8 多行匹配模式

- `(?:.|\n)`：非捕获组(也就是它定义了一个仅仅用来做匹配，而不能通过单独捕获或者编号的组)
- `re.DOTALL`：可匹配包含换行符的字符

In [15]:
comment = re.compile(r'/\*(.*?)\*/')
text1 = '/* this is a comment */'
text2 = '''
    /* this is a
    multiline comment */
    '''
res1 = comment.findall(text1)
res2 = comment.findall(text2)

display(res1, res2)

[' this is a comment ']

[]

In [16]:
comment = re.compile(r'/\*((?:.|\n)*?)\*/')
text2 = '''
    /* this is a
    multiline comment */
    '''
res2 = comment.findall(text2)
res2

[' this is a\n    multiline comment ']

In [17]:
comment = re.compile(r'/\*(.*?)\*/', re.DOTALL)
comment.findall(text2)

[' this is a\n    multiline comment ']

## 2.9 将Unicode文本标准化

`Unicode` 某些字符能够用多个合法的编码表示。

- `unicodedata`:
    * `normalize(form, unistr, /)`: 按照一种标准化模式，标准化字符；
    * `combining(chr, /)`: 返回字符的规范组合类（整数）；如果该字符不是组合字符，则返回 0。


In [18]:
s1 = 'Spicy Jalape\u00f1o'
s2 = 'Spicy Jalapen\u0303o'

print(f"s1 \t\t {s1}")
print(f"s2 \t\t {s2}")
print(f"s1 == s2 \t {s1 == s2}")
print(f"len(s1) \t {len(s1)}")
print(f"len(s2) \t {len(s2)}")

s1 		 Spicy Jalapeño
s2 		 Spicy Jalapeño
s1 == s2 	 False
len(s1) 	 14
len(s2) 	 15


In [19]:
import unicodedata

# NFC   字符应该是整体组成(比如可能的话就使用单一编码)
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
print(f"t1 \t\t {t1}")
print(f"t2 \t\t {t2}")
print(f"ascii(t1) \t {ascii(t1)}")
print(f"ascii(t2) \t {ascii(t2)}")
print(f"t1 == t2 \t {t1 == t2}")

# NFD   字符应该分解为多个组合字符表示。
t3 = unicodedata.normalize('NFD', s1)
t4 = unicodedata.normalize('NFD', s2)
print(f"t3 \t\t {t3}")
print(f"t4 \t\t {t4}")
print(f"ascii(t3) \t {ascii(t3)}")
print(f"ascii(t4) \t {ascii(t4)}")
print(f"t3 == t4 \t {t3 == t4}")


t1 		 Spicy Jalapeño
t2 		 Spicy Jalapeño
ascii(t1) 	 'Spicy Jalape\xf1o'
ascii(t2) 	 'Spicy Jalape\xf1o'
t1 == t2 	 True
t3 		 Spicy Jalapeño
t4 		 Spicy Jalapeño
ascii(t3) 	 'Spicy Jalapen\u0303o'
ascii(t4) 	 'Spicy Jalapen\u0303o'
t3 == t4 	 True


In [20]:
# 其他模式
s = '\ufb01'

t1 = unicodedata.normalize('NFD', s)
t2 = unicodedata.normalize('NFKD', s)
t3 = unicodedata.normalize('NFKC', s)

display(t1, t2, t3)

'ﬁ'

'fi'

'fi'

- `unicodedata.combining`: 测试 unicode 字符是否为和音字符（组合字符）：如果是，则返回非零的规范组合类；如果不是，则返回整数值 0。

In [21]:
print(f"s1 \t {s1}")
t1 = unicodedata.normalize('NFD', s1)
print(f"t1 \t {t1}")
t2 = ''.join(c for c in t1 if not unicodedata.combining(c))
print(f"t2 \t {t2}")

s1 	 Spicy Jalapeño
t1 	 Spicy Jalapeño
t2 	 Spicy Jalapeno


## 2.10 在正则式中使用Unicode

- 一般情况下，不混合使用 Unicode 和 正则。
- `\d` 可以匹配 unicode 的数字字符。
- 使用 Unicode 对应字符进行匹配。

In [22]:
import re
import unicodedata

# Raw string: ``\d`` must reach the regex engine untouched; in a normal string
# literal it is an invalid escape sequence.
num = re.compile(r'\d+')
res1 = num.match('123')

# ARABIC-INDIC digits one, two, three.
u_str = '\u0661\u0662\u0663'
print(unicodedata.normalize('NFKC', u_str))

utf_str = u_str.encode('utf-8')  # encode the str as UTF-8 bytes
print(utf_str)

# For str patterns, \d also matches non-ASCII decimal digits.
res2 = num.match('\u0661\u0662\u0663')

display(res1.group(), res2.group())

١٢٣
b'\xd9\xa1\xd9\xa2\xd9\xa3'


'123'

'١٢٣'

In [23]:
# 忽略大小写 和 大小写转换
pat = re.compile('stra\u00dfe', re.IGNORECASE)
s = 'straße'
res1 = pat.match(s)
res2 = pat.match(s.upper())
s_u = s.upper()

display(res1, res2, s_u)

<_sre.SRE_Match object; span=(0, 6), match='straße'>

None

'STRASSE'

## 2.11 删除字符串中不需要的字符

- `str`:
    - `strip()`, `lstrip()`, `rstrip()`
    - `replace`
- `re`:  
    - `sub`, `subn`

## 2.12 审查清理文本字符串

- `str`:
    - `translate(table)`: 传入替换的 map_dict

In [24]:
s = 'pýtĥöñ\fis\tawesome\r\n'

re_map = {
    ord('\t') : ' ',
    ord('\f') : ' ',
    ord('\r') : None      # Deleted
}
a = s.translate(re_map)
print(a)

pýtĥöñ is awesome



- 删除所有的 **和音字符**

In [25]:
import unicodedata
import sys

print(f"sys.maxunicode \t {sys.maxunicode}")   # largest Unicode code point
# Translation table mapping every combining character to None (i.e. delete it).
# range() excludes its stop value, so +1 is required to cover the code point
# sys.maxunicode itself.
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode + 1)
                         if unicodedata.combining(chr(c)))
# NOTE: `a` is the translated string produced by the previous cell.
# Decompose first so that accents become separate combining characters.
b = unicodedata.normalize('NFD', a)
print(f"b \t {b}")

b = b.translate(cmb_chrs)
print(f"b \t {b}")

sys.maxunicode 	 1114111
b 	 pýtĥöñ is awesome

b 	 python is awesome



- 映射 Unicode 数字字符 => 对应 ASCII 字符

In [26]:
# Map every Unicode decimal digit (category 'Nd') to the code point of the
# matching ASCII digit. range() excludes its stop value, so +1 is required to
# cover the code point sys.maxunicode itself.
digitmap = { c: ord('0') + unicodedata.digit(chr(c))
            for c in range(sys.maxunicode + 1)
            if unicodedata.category(chr(c)) == 'Nd' }
print(f"len(digitmap) \t {len(digitmap)}")

# ARABIC-INDIC digits one, two, three.
x = '\u0661\u0662\u0663'
res = x.translate(digitmap)
print(f"res \t\t {res}")

len(digitmap) 	 580
res 		 123


- 结合 `encode()` 和 `decode()`

In [27]:
b = unicodedata.normalize('NFD', a)
b_n = b.encode('ascii', 'ignore').decode('ascii')

print(f"b_n \t {b_n}")

b_n 	 python is awesome



##  2.13 字符串对齐

- `str`: `ljust()`, `rjust()`, `center()`
- `format()`: `<`, `>`, `^`

In [28]:
text = 'Hello World'
print(text.ljust(20))
print(text.rjust(20))
print(text.center(20))

Hello World         
         Hello World
    Hello World     


In [29]:
# 空白填充指定字符
print(text.rjust(20,'='))
print(text.center(20,'*'))

=========Hello World
****Hello World*****


In [30]:
# format()
print(format(text, '>20'))
print(format(text, '<20'))
print(format(text, '^20'))

         Hello World
Hello World         
    Hello World     


In [31]:
# format()  带字符填充
print(format(text, '=>20s'))
print(format(text, '*^20s'))

=========Hello World
****Hello World*****


In [32]:
# 填充模式
print(f"{'hello':=>10s}{'world':=^10s}")

=====hello==world===


## 2.14 合并拼接字符串

- `join`

- 高性能方案

In [33]:
def sample():
    """Yield four short string fragments, one at a time."""
    yield from ('Is', 'Chicago', 'Not', 'Chicago?')
    
# 结合 I/O 操作
# for part in sample():
#     f.write(part)
    
text = ''.join(sample())
text

'IsChicagoNotChicago?'

In [34]:
# 大文件 I/O 方案
def combine(source, maxsize):
    """Merge small string fragments into larger chunks.

    Args:
        source: Iterable of strings.
        maxsize: Buffered length that must be exceeded before a chunk is
            emitted.

    Yields:
        The joined buffer each time its total length exceeds ``maxsize``,
        then one final chunk with whatever is left (possibly an empty string).
    """
    buffered = []
    buffered_len = 0
    for fragment in source:
        buffered.append(fragment)
        buffered_len += len(fragment)
        if buffered_len <= maxsize:
            continue
        yield ''.join(buffered)
        buffered, buffered_len = [], 0
    # Flush the remainder.
    yield ''.join(buffered)

# 结合文件操作
with open('filename', 'w') as f:
    for part in combine(sample(), 32768):
        f.write(part)

## 2.15 字符串中插入变量

- `format()`
- `format_map()` + `vars()`

In [35]:
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)

'Guido has 37 messages.'

In [36]:
name = 'Guido'
n = 37
s.format_map(vars())  # 找到变量域中的变量

'Guido has 37 messages.'

In [37]:
class Info:
    """Small record whose instance attributes feed ``str.format_map(vars(obj))``."""

    def __init__(self, name, n):
        """Store ``name`` and ``n`` as instance attributes."""
        self.name, self.n = name, n

a = Info('Guido',37)
s.format_map(vars(a))   # 找到对象的变量

'Guido has 37 messages.'

- `format()` 和 `format_map()` 的缺陷：不能很好地处理变量缺失的情况（缺失时会抛出 `KeyError`）。

In [38]:
try:
    s.format(name='Guido')
except Exception as e:
    print(f"Error: {e}")

Error: 'n'


In [39]:
class safesub(dict):
    """Dict that renders a missing key as its own ``{key}`` placeholder."""

    def __missing__(self, key):
        # Called by dict lookup when ``key`` is absent; hand the placeholder
        # back so the formatted text keeps it instead of raising KeyError.
        return ''.join(('{', key, '}'))

- 需要频繁执行替换步骤

In [40]:
import sys

def sub(text):
    """Fill ``{name}`` fields in ``text`` from the caller's local variables.

    Fields with no matching local are left as they are, courtesy of
    ``safesub``.
    """
    # Frame 1 is the frame of whoever called sub().
    caller_frame = sys._getframe(1)
    namespace = safesub(caller_frame.f_locals)
    return text.format_map(namespace)

# sys._getframe(1) 返回调用者的栈帧。
# 可以从访问属性 f_locals 来获得局部变量。

In [41]:
name = 'Guido'
n = 37

print(sub('Hello {name}'))
print(sub('You have {n} messages.'))
print(sub('Your favorite color is {color}'))

Hello Guido
You have 37 messages.
Your favorite color is {color}


## 2.16 以指定列宽格式化长字符串

- `textwrap`

In [42]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

In [43]:
import textwrap

# 指定行长度
print(textwrap.fill(s, 70))
print("="*100)
print(textwrap.fill(s, 40))
print("="*100)
# 首行缩进
print(textwrap.fill(s, 40, initial_indent='    '))
print("="*100)
# 子行缩进
print(textwrap.fill(s, 40, subsequent_indent='    '))

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes,
not around the eyes, don't look around the eyes, look into my eyes,
you're under.
Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.
    Look into my eyes, look into my
eyes, the eyes, the eyes, the eyes, not
around the eyes, don't look around the
eyes, look into my eyes, you're under.
Look into my eyes, look into my eyes,
    the eyes, the eyes, the eyes, not
    around the eyes, don't look around
    the eyes, look into my eyes, you're
    under.


## 2.17 在字符串中处理html和xml
需要转换文本中特定的字符(比如<, >, 或 &)

- `html.escape()`

In [44]:
import html

s = 'Elements are written as "<tag>text</tag>".'
print(s)
print(html.escape(s))
print(html.escape(s, quote=False))

Elements are written as "<tag>text</tag>".
Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".


- 处理的是ASCII文本，并且想将非ASCII文本对应的编码实体嵌入进去

In [45]:
s = 'Spicy Jalapeño'
s.encode('ascii', errors='xmlcharrefreplace')

b'Spicy Jalape&#241;o'

- 解析 HTML 和 XML

In [46]:
# 
import html

s = 'Spicy &quot;Jalape&#241;o&quot.'
html.unescape(s)

'Spicy "Jalapeño".'

In [47]:
from xml.sax.saxutils import unescape

t = 'The prompt is &gt;&gt;&gt;'
unescape(t)

'The prompt is >>>'

## 2.18 字符串令牌解析

In [48]:
text = 'foo = 23 + 42 * 10'
tokens = [('NAME', 'foo'), ('EQ','='), ('NUM', '23'), ('PLUS','+'),
          ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]

In [49]:
import re

NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))

In [50]:
scanner = master_pat.scanner('foo = 42')
scanner.match()

<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [51]:
_.lastgroup, _.group()

('NAME', 'foo')

In [52]:
scanner.match()

<_sre.SRE_Match object; span=(3, 4), match=' '>

In [53]:
_.lastgroup, _.group()

('WS', ' ')

In [54]:
scanner.match()

<_sre.SRE_Match object; span=(4, 5), match='='>

In [55]:
_.lastgroup, _.group()

('EQ', '=')

In [56]:
scanner.match()

<_sre.SRE_Match object; span=(5, 6), match=' '>

In [57]:
_.lastgroup, _.group()

('WS', ' ')

In [58]:
scanner.match()

<_sre.SRE_Match object; span=(6, 8), match='42'>

In [59]:
_.lastgroup, _.group()

('NUM', '42')

In [60]:
scanner.match()

- 打包成一个生成器

In [61]:
from collections import namedtuple

def generate_tokens(pat, text):
    """Tokenize ``text`` with the compiled master pattern ``pat``.

    Args:
        pat: Compiled regex made of alternated named groups; the name of the
            group that matched becomes the token type.
        text: String to tokenize.

    Yields:
        ``Token(type, value)`` named tuples, in input order.

    Raises:
        SyntaxError: If some position in ``text`` is matched by no
            alternative. Without this check the scanner would just stop and
            the rest of the input would be dropped silently.
    """
    Token = namedtuple('Token', ['type', 'value'])
    scanner = pat.scanner(text)
    pos = 0  # end offset of the last successful match
    # scanner.match() returns None at the first position nothing matches,
    # which ends the iter() loop.
    for m in iter(scanner.match, None):
        pos = m.end()
        yield Token(m.lastgroup, m.group())
    if pos != len(text):
        raise SyntaxError(f'Unexpected character {text[pos]!r} at position {pos}')
        
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [62]:
# 过滤掉空白令牌
tokens = (tok for tok in generate_tokens(master_pat, text)
          if tok.type != 'WS')
for tok in tokens:
    print(tok)

Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='23')
Token(type='PLUS', value='+')
Token(type='NUM', value='42')
Token(type='TIMES', value='*')
Token(type='NUM', value='10')


In [63]:
# 令牌的顺序是有影响的
LT = r'(?P<LT><)'
LE = r'(?P<LE><=)'
EQ = r'(?P<EQ>=)'

master_pat = re.compile('|'.join([LE, LT, EQ])) # Correct
# master_pat = re.compile('|'.join([LT, LE, EQ])) # Incorrect

In [64]:
PRINT = r'(?P<PRINT>print)'
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'

master_pat = re.compile('|'.join([PRINT, NAME]))

for tok in generate_tokens(master_pat, 'printer'):
    print(tok)

Token(type='PRINT', value='print')
Token(type='NAME', value='er')


## 2.19 实现一个简单的递归下降分析器

In [65]:
"""
Topic: 下降解析器
Desc :
"""
import re
import collections

# Token specification: one named group per token type; the group name
# becomes the token's ``type``.
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type', 'value'])


def generate_tokens(text):
    """Yield the non-whitespace tokens of ``text``.

    Args:
        text: Arithmetic expression to tokenize.

    Yields:
        ``Token(type, value)`` for every match of ``master_pat`` except
        whitespace.

    Raises:
        SyntaxError: If ``text`` contains a character that no token pattern
            matches. Without this check the scanner would just stop and the
            rest of the input would be dropped silently.
    """
    scanner = master_pat.scanner(text)
    pos = 0  # end offset of the last successful match
    # scanner.match() returns None at the first position nothing matches,
    # which ends the iter() loop.
    for m in iter(scanner.match, None):
        pos = m.end()
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok
    if pos != len(text):
        raise SyntaxError(f'Unexpected character {text[pos]!r} at position {pos}')


# Parser
class ExpressionEvaluator:
    '''
    A recursive descent parser. Each method implements a single grammar rule.
    Use the ._accept() method to test and accept the current lookahead token.
    Use the ._expect() method to exactly match and discard the next token on
    the input (or raise SyntaxError if it does not match).
    '''

    def parse(self, text):
        """Evaluate the arithmetic expression in ``text`` and return its value.

        Raises:
            SyntaxError: If ``text`` is not a complete, well-formed
                expression, including when tokens are left over after a valid
                expression (e.g. ``'2 3'`` or ``'2 + 3 )'``).
        """
        self.tokens = generate_tokens(text)
        self.tok = None  # Last symbol consumed
        self.nexttok = None  # Next symbol tokenized
        self._advance()  # Load first lookahead token
        exprval = self.expr()
        # expr() stops at the first token it cannot use; anything still
        # pending means the input was not one complete expression.
        if self.nexttok is not None:
            raise SyntaxError('Unexpected token ' + repr(self.nexttok.value))
        return exprval

    def _advance(self):
        'Advance one token ahead'
        # nexttok becomes None once the token stream is exhausted.
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self, toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow
    def expr(self):
        "expression ::= term { ('+'|'-') term }*"
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type  # the operator that was just consumed
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        "term ::= factor { ('*'|'/') factor }*"
        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type  # the operator that was just consumed
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right  # true division: the result is a float
        return termval

    def factor(self):
        "factor ::= NUM | ( expr )"
        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')


def descent_parser():
    """Evaluate a handful of sample expressions and print each result."""
    evaluator = ExpressionEvaluator()
    samples = ('2', '2 + 3', '2 + 3 * 4', '2 + (3 + 4) * 5')
    for expression in samples:
        print(evaluator.parse(expression))
    # The malformed input '2 + (3 + * 4)' is deliberately left out: parsing
    # it raises SyntaxError.

descent_parser()

2
5
14
37


## 2.20 字节字符串上的字符串操作

In [66]:
# 字节字符
data = b'Hello World'
display(
    data[0:5],
    data.startswith(b'Hello'),
    data.split(), 
    data.replace(b'Hello', b'Hello Cruel'),
)

b'Hello'

True

[b'Hello', b'World']

b'Hello Cruel World'

In [67]:
# 字节数组
data = bytearray(b'Hello World')
display(
    data[0:5],
    data.startswith(b'Hello'),
    data.split(),
    data.replace(b'Hello', b'Hello Cruel')
)

bytearray(b'Hello')

True

[bytearray(b'Hello'), bytearray(b'World')]

bytearray(b'Hello Cruel World')

In [68]:
# 正则表达式情况
import re

data = b'FOO:BAR,SPAM'
try:
    re.split('[:,]',data)
except Exception as e:
    print(f"Error: {e}")

re.split(b'[:,]',data)   # Notice: pattern as bytes

Error: cannot use a string pattern on a bytes-like object


[b'FOO', b'BAR', b'SPAM']

- 切片差异性

In [69]:
# Indexing a str yields one-character strings ...
a = 'Hello World' # Text string
display(a[0], a[1])

# ... while indexing a bytes object yields integers (the byte values).
b = b'Hello World' # Byte string
display(b[0], b[1])

'H'

'e'

72

101

- 打印差异性

In [70]:
s = b'Hello World'
print(s)
print(s.decode('ascii'))

b'Hello World'
Hello World


- 字节字符串不支持 `format()` / f-string 格式化操作（Python 3.5+ 仍可使用 `%` 操作符进行格式化）

In [71]:
try:
    b'{} {} {}'.format(b'ACME', 100, 490.1)
except Exception as e:
    print(f"Error: {e}")   

Error: 'bytes' object has no attribute 'format'


In [72]:
try:
    f"{b'ACME':10s}"
except Exception as e:
    print(f"Error: {e}") 

Error: unsupported format string passed to bytes.__format__
