# Finding Patterns of Text Without Regular Expressions

In [4]:
# 電話格式: 012-345-6489，所以字串長度是10

def isPhoneNumber(text):
    if len(text) != 12:  #先看字串長度是否正確
        return False
    for i in range(0, 3):  #按照順序看是否數字符號
        if not text[i].isdecimal():
            return False
    if text[3] != '-': #
            return False
    for i in range(4, 7):
        if not text[i].isdecimal():
            return False
    if text[7] != '-':
            return False
    for i in range(8, 12):
        if not text[i].isdecimal():
            return False
    return True

print('415-555-4242 is a phone number:')
print(isPhoneNumber('415-555-4242'))
print('Moshi moshi is a phone number:')
print(isPhoneNumber('Moshi moshi'))

415-555-4242 is a phone number:
True
Moshi moshi is a phone number:
False


In [6]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'

for i in range(len(message)):
    chunk = message[i:i+12]
    if isPhoneNumber(chunk):
        print('Phone number found: ' + chunk)

print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


# Finding Patterns of Text with Regular Expressions

In [7]:
import re

# Creating Regex Objects
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# Matching Regex Objects
# 注意: search只返還第一個值!!!
mo = phoneNumRegex.search('My number is 415-555-4242.')
print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


# Grouping with Parentheses

In [12]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
mog = mo.group()

print(mo.group(0))
print(mo.group(1))
print(mo.group(2))
print(mo.groups()) #屬於tuple

415-555-4242
415
555-4242
('415', '555-4242')


# Summary 1

# Matching Multiple Groups with the Pipe | 匹配多組字

In [15]:
#Will match either 'Batman' or 'Tina Fey'.
heroRegex = re.compile (r'Batman|Tina Fey') 

mo1 = heroRegex.search('Batman and Tina Fey.')
mo1.group() #返還第一個match值

mo2 = heroRegex.search('Tina Fey and Batman.')
mo2.group() #返還第一個match值

'Tina Fey'

In [17]:
# use the pipe to match one of several patterns as part of your regex.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

# Optional Matching with the Question Mark ? 選擇性匹配

In [20]:
# Optional Matching with the Question Mark ???

batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo1.group()

mo2 = batRegex.search('The Adventures of Batwoman')
mo2.group()

'Batwoman'

# Matching Zero or More with the Star * 匹配0或多個

In [21]:
batRegex = re.compile(r'Bat(wo)*man')
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

# Matching One or More with the Plus + 匹配1或多個

In [22]:
batRegex = re.compile(r'Bat(wo)+man')
mo3 = batRegex.search('The Adventures of Batwowowowoman')
mo3.group()

'Batwowowowoman'

# Matching Specific Repetitions with Curly Brackets {} 比對前一個字元n次

In [24]:
# (Ha){3} = (Ha)(Ha)(Ha)

haRegex = re.compile(r'(Ha){3}')
mo = haRegex.search('HaHaHaHa')
mo.group()

'HaHaHa'

# Greedy(最大數量) and Nongreedy Matching(敘述句加上?) 

In [27]:
#greedy 會找最大數量，這是python的default!!
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [28]:
#Nongreedy 會找最小數量
nongreedyHaRegex = re.compile(r'(Ha){3,5}?') #加上?
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'

# The findall() Method 使用list返還全部match的值

# .search()只返還第一個match的值

In [38]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # no groups
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [39]:
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000') # no groups
#返還list

['415-555-9999', '212-555-0000']

In [41]:
phoneNumRegex2 = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex2.findall('Cell: 415-555-9999 Work: 212-555-0000') 
#返還tuple

[('415', '555', '9999'), ('212', '555', '0000')]

# Character Classes

In [47]:
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 \
                swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

# Making Your Own Character Classes 使用[]

In [48]:
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [49]:
vowelRegex = re.compile(r'[^aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

# The Caret and Dollar Sign Characters ^開頭是 結尾是$

In [52]:
# ^開頭是
beginsWithHello = re.compile(r'^Hello') #開頭是Hello
beginsWithHello.search('Hello world!')

<_sre.SRE_Match object; span=(0, 5), match='Hello'>

In [53]:
beginsWithHello.search('He said hello.') == None

True

In [54]:
# $結尾是
endsWithNumber = re.compile(r'\d$') #結尾是數字
endsWithNumber.search('Your number is 42')

<_sre.SRE_Match object; span=(16, 17), match='2'>

In [55]:
endsWithNumber.search('Your number is forty two.') == None

True

# The Wildcard Character . or dot = match any character 除了  newline

In [56]:
#. match any character except for a newline
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

# Matching Everything with Dot-Star .*

In [59]:
#.* match所有東西
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.groups()

('Al', 'Sweigart')

In [60]:
# 加上?變成nongreedy!!
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [63]:
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

# Matching Newlines with the Dot Character: the dot character match all characters, including the newline character.

In [64]:
noNewlineRegex = re.compile('.*') #跳行就不讀了
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [65]:
newlineRegex = re.compile('.*', re.DOTALL) #連新一行都讀進去
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

# Summary 2

# Case-Insensitive Matching = re.IGNORECASE 忽略大小寫

In [66]:
# re.IGNORECASE or re.I
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

# Substituting Strings with the sub() Method 替代字串

In [67]:
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')
#__.sub('a','b') 把b裡面的匹配字用a取代掉

'CENSORED gave the secret documents to CENSORED.'

In [68]:
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

# Managing Complex Regexes '''''',re.VERBOSE 跨行註釋書寫

In [69]:
# re.VERBOSE
# ignore whitespace and comments inside the regular expression string

phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))? # area code
    (\s|-|\.)? # separator
    \d{3} # first 3 digits
    (\s|-|\.) # separator
    \d{4} # last 4 digits
    (\s*(ext|x|ext.)\s*\d{2,5})? # extension
    )''', re.VERBOSE)

同時符合 re.IGNORECASE | re.DOTALL | re.VERBOSE

In [70]:
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)

# Project: Phone Number and Email Address Extractor

In [74]:
# phoneAndEmail.py - Finds phone numbers and email addresses on the clipboard.

import pyperclip, re

# TODO: Create a Regex for Phone Numbers
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))? # area code
    (\s|-|\.)? # separator
    (\d{3}) # first 3 digits
    (\s|-|\.) # separator
    (\d{4}) # last 4 digits
    (\s*(ext|x|ext.)\s*(\d{2,5}))? # extension
    )''', re.VERBOSE)

# TODO: Create email regex.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+ # username
    @ # @ symbol
    [a-zA-Z0-9.-]+ # domain name
    (\.[a-zA-Z]{2,4}) # dot-something
    )''', re.VERBOSE)

# TODO: Find matches in clipboard text.
text = str(pyperclip.paste())
matches = []

for groups in phoneRegex.findall(text):
    phoneNum = '-'.join([groups[1], groups[3], groups[5]])
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)

for groups in emailRegex.findall(text):
    matches.append(groups[0])

# TODO: Copy results to the clipboard.
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

Copied to clipboard:
800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
info@nostarch.com
