In [None]:
# https://pymotw.com/3/re/index.html

In [28]:
"""
Finding Patterns in Text
""" 
import re

pattern = 'this'
text = 'Does this text match the pattern?'

# re.search() looks for the pattern anywhere in the text and hands back
# a match object describing the first occurrence.
match = re.search(pattern, text)
s, e = match.span()  # start / end offsets of the matched substring

print(match)
# The match object remembers both the pattern and the searched string.
print(
    'Found "{}" in "{}"\nfrom {} to {} ("{}")'.format(
        match.re.pattern, match.string, s, e, text[s:e]
    )
)

# A pattern that does not occur in the text produces None instead of a
# match object.
pattern_not_exist = 'that'
match = re.search(pattern_not_exist, text)
print('Pattern not exist in text, match = {}'.format(match))

<_sre.SRE_Match object; span=(5, 9), match='this'>
Found <_sre.SRE_Match object; span=(5, 9), match='this'>
Found "this" in "Does this text match the pattern?"
from 5 to 9 ("this")
Pattern not exist in text, match = None


In [24]:
"""
Compiling Expressions
"""
import re

p = 'this'
text = 'Does this text match the pattern?'

# Compiled expressions run more efficiently.
# When a pattern is used without compiling it first, Python still caches
# the compiled form, but that cache has a limited size and each use pays
# for a cache lookup.
# Expressions can be precompiled when the module is loaded, so that the
# compilation does not happen on first use.
regex = re.compile(p)
print(type(regex))

if regex.search(text) is not None:
    print('Found "{}"'.format(p))

<class '_sre.SRE_Pattern'>
Found "this"


In [37]:
"""
Multiple Matches
"""
import re

text = 'abbaaabbbbaaaaa'
pattern = 'abb'

# findall() returns only the substrings of text that matched.
for match in re.findall(pattern, text):
    print('Found {!r}'.format(match))

# finditer() returns an iterator of match objects, which also carry the
# position of each match.
for match in re.finditer(pattern, text):
    s, e = match.span()
    print(match)
    print('Found {!r} at {:d}:{:d}'.format(match.group(), s, e))

Found 'abb'
Found 'abb'
<_sre.SRE_Match object; span=(0, 3), match='abb'>
Found 'abb' at 0:3
<_sre.SRE_Match object; span=(5, 8), match='abb'>
Found 'abb' at 5:8


In [40]:
"""
Pattern Syntax
"""
def test_patterns(text, patterns):
    """Given source text and a list of patterns, look for
    matches for each pattern within the text and print
    them to stdout.
    """
    # Look for each pattern in the text and print the results
    for pattern, desc in patterns:
        print("'{}' ({})\n".format(pattern, desc))
        print("  {}".format(text))
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            substr = text[s:e]
            n_backslashes = text[:s].count('\\')
            prefix = '.' * (s + n_backslashes)
            print("  {}{}".format(prefix, substr))
        print()

# Smallest possible demonstration: a literal two-character pattern.
test_patterns(
    text='abbaaabbbbaaaaa',
    patterns=[('ab', "'a' followed by 'b'")],
)

'ab' ('a' followed by 'b')

  abbaaabbbbaaaaa
  ab
  .....ab



In [44]:
"""
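Repetition meta-characters:
*     : repeat zero or more times
+     : appear at least once
?     : repeat zero or one time
{m}   : appear exactly m times
{m,n} : appear between m and n times
{m,}  : appear at least m times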
"""
# One entry per repetition qualifier, all run against the same text.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('ab*', 'a followed by zero or more b'),
        ('ab+', 'a followed by one or more b'),
        ('ab?', 'a followed by zero or one b'),
        ('ab{3}', 'a followed by three b'),
        ('ab{2,3}', 'a followed by two to three b'),
        ('ab{0,3}', 'a followed by zero to three b'),
    ],
)

'ab*' (a followed by zero or more b)

  abbaabbba
  abb
  ...a
  ....abbb
  ........a

'ab+' (a followed by one or more b)

  abbaabbba
  abb
  ....abbb

'ab?' (a followed by zero or one b)

  abbaabbba
  ab
  ...a
  ....ab
  ........a

'ab{3}' (a followed by three b)

  abbaabbba
  ....abbb

'ab{2,3}' (a followed by two to three b)

  abbaabbba
  abb
  ....abbb

'ab{0,3}' (a followed by zero to three b)

  abbaabbba
  abb
  ...a
  ....abbb
  ........a



In [53]:
"""
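The re module matches greedily by default.
For example, 'ab*' allows 'a' followed by zero or more 'b'; for the string 'abb',
greedy matching finds 'abb', while non-greedy matching finds only 'a'.
Greedy matching is turned off by adding '?' right after the repetition qualifier.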
"""
# Same qualifiers as the greedy demonstration, each followed by '?'.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('ab*?', 'a followed by zero or more b'),
        ('ab+?', 'a followed by one or more b'),
        ('ab??', 'a followed by zero or one b'),
        ('ab{3}?', 'a followed by three b'),
        ('ab{2,3}?', 'a followed by two to three b'),
    ],
)

'ab*?' (a followed by zero or more b)

  abbaabbba
  a
  ...a
  ....a
  ........a

'ab+?' (a followed by one or more b)

  abbaabbba
  ab
  ....ab

'ab??' (a followed by zero or one b)

  abbaabbba
  a
  ...a
  ....a
  ........a

'ab{3}?' (a followed by three b)

  abbaabbba
  ....abbb

'ab{2,3}?' (a followed by two to three b)

  abbaabbba
  abb
  ....abb



In [55]:
"""
Character Sets: match any single one of the characters listed inside '[]'
"""
# A bare character set, then the set combined with greedy and non-greedy
# repetition.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('[ab]', 'either a or b'),
        ('a[ab]+', 'a followed by 1 or more a or b'),
        ('a[ab]+?', 'a followed by 1 or more a or b, not greedy'),
    ],
)

'[ab]' (either a or b)

  abbaabbba
  a
  .b
  ..b
  ...a
  ....a
  .....b
  ......b
  .......b
  ........a

'a[ab]+' (a followed by 1 or more a or b)

  abbaabbba
  abbaabbba

'a[ab]+?' (a followed by 1 or more a or b, not greedy)

  abbaabbba
  ab
  ...aa



In [58]:
"""
Find all substrings that contain no '-', '.', or space
"""
# A leading '^' inside the brackets negates the character set.
test_patterns(
    text='This is some text -- with punctuation.',
    patterns=[
        ('[^-. ]+', 'sequences without -, ., or space'),
    ],
)

'[^-. ]+' (sequences without -, ., or space)

  This is some text -- with punctuation.
  This
  .....is
  ........some
  .............text
  .....................with
  ..........................punctuation



In [63]:
"""
Use '-' inside a character set to denote a range of candidate characters
"""
# Ranges for lowercase, uppercase, both, and a capitalised word.
test_patterns(
    text='This is some text -- with punctuation.',
    patterns=[
        ('[a-z]+', 'sequences of lowercase letters'),
        ('[A-Z]+', 'sequences of uppercase letters'),
        ('[a-zA-Z]+', 'sequences of letters of either case'),
        ('[A-Z][a-z]+', 'one uppercase followed by lowercase'),
    ],
)

'[a-z]+' (sequences of lowercase letters)

  This is some text -- with punctuation.
  .his
  .....is
  ........some
  .............text
  .....................with
  ..........................punctuation

'[A-Z]+' (sequences of uppercase letters)

  This is some text -- with punctuation.
  T

'[a-zA-Z]+' (sequences of letters of either case)

  This is some text -- with punctuation.
  This
  .....is
  ........some
  .............text
  .....................with
  ..........................punctuation

'[A-Z][a-z]+' (one uppercase followed by lowercase)

  This is some text -- with punctuation.
  This



In [66]:
"""
'.' can stand for any single character
"""
# The last two entries differ only in greediness: '.*' versus '.*?'.
test_patterns(
    text='abbaabbba',
    patterns=[
        ('a.', 'a followed by any one character'),
        ('b.', 'b followed by any one character'),
        ('a.*b', 'a followed by anything, ending in b'),
        ('a.*?b', 'a followed by anything, ending in b'),
    ],
)

'a.' (a followed by any one character)

  abbaabbba
  ab
  ...aa

'b.' (b followed by any one character)

  abbaabbba
  .bb
  .....bb
  .......ba

'a.*b' (a followed by anything, ending in b)

  abbaabbba
  abbaabbb

'a.*?b' (a followed by anything, ending in b)

  abbaabbba
  ab
  ...aab



In [70]:
"""
Escape codes: digits, non-digits, whitespace, and so on
"""
# Raw strings keep the backslashes in the escape codes intact.
test_patterns(
    text='A prime #1 345 example!',
    patterns=[
        (r'\d+', 'sequence of digits'),
        (r'\D+', 'sequence of non-digits'),
        (r'\s+', 'sequence of whitespace'),
        (r'\S+', 'sequence of non-whitespace'),
        (r'\w+', 'alphanumeric characters'),
        (r'\W+', 'non-alphanumeric'),
    ],
)

'\d+' (sequence of digits)

  A prime #1 345 example!
  .........1
  ...........345

'\D+' (sequence of non-digits)

  A prime #1 345 example!
  A prime #
  .......... 
  .............. example!

'\s+' (sequence of whitespace)

  A prime #1 345 example!
  . 
  ....... 
  .......... 
  .............. 

'\S+' (sequence of non-whitespace)

  A prime #1 345 example!
  A
  ..prime
  ........#1
  ...........345
  ...............example!

'\w+' (alphanumeric characters)

  A prime #1 345 example!
  A
  ..prime
  .........1
  ...........345
  ...............example

'\W+' (non-alphanumeric)

  A prime #1 345 example!
  . 
  ....... #
  .......... 
  .............. 
  ......................!



In [95]:
"""
Anchoring
"""
# Anchors constrain where a match may occur without consuming any text.
test_patterns(
    text='Thi3s is some text text1 2text -- with punctuation.',
    patterns=[
        (r'^\w+', 'word at start of string'),
        (r'\A\w+', 'word at start of string'),
        # \w+ does not match the '.' that ends the string, so a '\S' is
        # appended to consume it.
        (r'\w+\S$', 'word near end of string'),
        (r'\w+\S*\Z', 'word near end of string'),
        (r'\w*t\w*', 'word containing t'),
        (r'\bt\w+', 't at start of word'),
        (r'\w+t\b', 't at end of word'),
        (r'\Bt\B', 't, not start or end of word'),
        (r'\w*\Bt\B\w*', 't, not start or end of word，包含不处于开头或结尾的 t'),
        (r'\S+t\S+', 't, 不包含处于开头或结尾的 t'),
    ],
)

'^\w+' (word at start of string)

  Thi3s is some text text1 2text -- with punctuation.
  Thi3s

'\A\w+' (word at start of string)

  Thi3s is some text text1 2text -- with punctuation.
  Thi3s

'\w+\S$' (word near end of string)

  Thi3s is some text text1 2text -- with punctuation.
  .......................................punctuation.

'\w+\S*\Z' (word near end of string)

  Thi3s is some text text1 2text -- with punctuation.
  .......................................punctuation.

'\w*t\w*' (word containing t)

  Thi3s is some text text1 2text -- with punctuation.
  ..............text
  ...................text1
  .........................2text
  ..................................with
  .......................................punctuation

'\bt\w+' (t at start of word)

  Thi3s is some text text1 2text -- with punctuation.
  ..............text
  ...................text1

'\w+t\b' (t at end of word)

  Thi3s is some text text1 2text -- with punctuation.
  ..............text
  ............