In [1]:
# Chapter 7 Pattern matching with regular expressions

In [2]:
## finding patterns of text without regular expressions

In [2]:
def isPhoneNumber(text):
    """Return True if ``text`` has the exact form ``DDD-DDD-DDDD``.

    The value must be 12 characters long, with a hyphen at indexes 3 and 7
    and a character for which ``str.isdecimal()`` is true everywhere else.
    """
    if len(text) != 12:
        return False
    # Walk the characters left to right so the checks happen in positional order.
    for position, char in enumerate(text):
        if position in (3, 7):
            if char != '-':
                return False
        elif not char.isdecimal():
            return False
    return True

In [3]:
# Positive case: a correctly formatted number is accepted.
print('415-555-4242 is a phone number: ')
print(isPhoneNumber('415-555-4242'))

415-555-4242 is a phone number: 
True


In [4]:
# Negative case: 'Moshi moshi' is 11 characters long, so the length check rejects it.
print('Moshi moshi is a phone number:')
print(isPhoneNumber('Moshi moshi'))

Moshi moshi is a phone number:
False


In [5]:
# Slide a 12-character window across the message and report every window
# that isPhoneNumber() accepts.
message = 'call me at 415-555-1011 tomorrow. 415-555-999 is my office.'
for start in range(len(message)):
    candidate = message[start:start + 12]
    if not isPhoneNumber(candidate):
        continue
    print('phone number found: ' + candidate)
print('done')

phone number found: 415-555-1011
done


In [6]:
## Finding Patterns of Text with Regular Expressions

In [7]:
### get regex objects
import re

In [8]:
# The r'' prefix makes this a raw string, so the backslashes in \d reach the
# regex engine unchanged instead of being read as Python string escapes.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # raw string

In [9]:
### matching regex objects
# search() scans the string and returns a match object for the first hit;
# group() with no argument returns the full matched text.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('my number is 415-555-4242.')
print('phone number found: ' + mo.group())

phone number found: 415-555-4242


In [10]:
## More Pattern Matching with Regular Expressions

In [11]:
### Grouping with Parentheses

In [12]:
# Group 0 is the entire match, available even when the pattern has no parentheses.
mo.group(0)

'415-555-4242'

In [13]:
# The pattern that produced `mo` has no parentheses, so group 1 does not exist.
# The expected IndexError is caught and printed so that "Restart & Run All"
# does not stop at this demonstration cell.
try:
    mo.group(1)
except IndexError as err:
    print('IndexError: ' + str(err))

IndexError: no such group

In [14]:
# Each pair of parentheses defines a numbered group (1, 2, 3 from left to right);
# group(0) still returns the whole match.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
mo = phoneNumRegex.search('my number if 415-555-4242.')
mo.group(0)

'415-555-4242'

In [15]:
mo.group(1)

'415'

In [16]:
# groups() returns every captured group at once, as a tuple of strings.
mo.groups()

('415', '555', '4242')

In [17]:
# The pattern has three groups, so unpacking into two names fails.
# The expected ValueError is caught and printed so that "Restart & Run All"
# does not stop at this demonstration cell.
try:
    areaCode, mainNumber = mo.groups()
except ValueError as err:
    print('ValueError: ' + str(err))

ValueError: too many values to unpack (expected 2)

In [18]:
# Unpack all three groups into named variables. `_` is not used as a
# throwaway target because IPython/Jupyter reserves `_` for the last output.
areaCode, prefix, lineNumber = mo.groups()

In [19]:
print(areaCode)

415


In [20]:
### Matching Multiple Groups with the Pipe

In [21]:
# `|` matches either alternative. When both appear in the text, search()
# returns the one that occurs first in the string.
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
mo1.group()

'Batman'

In [22]:
# Same pattern, names in the opposite order: 'Tina Fey' now comes first in the text.
mo2 = heroRegex.search('Tina Fey and Batman')
mo2.group()

'Tina Fey'

In [23]:
# A shared prefix followed by a group of alternatives: 'Bat' plus one of the suffixes.
# group() returns the full match; the chosen suffix is captured as group 1.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

In [24]:
mo.group(1)

'mobile'

In [25]:
### Optional Matching with the Question Mark

In [26]:
# `?` makes the preceding group optional: '(wo)' may appear zero times or once.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('the adventures of Batman')
mo1.group()

'Batman'

In [27]:
batRegex = re.compile(r'Bat(wo)?man')
mo2 = batRegex.search('the adventures of Batwoman')
mo2.group()

'Batwoman'

In [29]:
# The leading group '(\d\d\d-)?' makes the area code and its hyphen optional.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('my number is 415-555-4242')
mo1.group()

'415-555-4242'

In [30]:
mo2 = phoneRegex.search('my number is 555-4242')
mo2.group()

'555-4242'

In [31]:
### Matching Zero or More with the Star

In [32]:
# `*` lets the group repeat any number of times, including none at all,
# so plain 'Batman' still matches.
batRegex = re.compile(r'Bat(wo)*man') # zero or more
mo1 = batRegex.search('the adventures of Batman')
mo1.group()

'Batman'

In [33]:
mo2 = batRegex.search('the adventures of Batwoman')
mo2.group()

'Batwoman'

In [34]:
mo3 = batRegex.search('the adventures of Batwowowoman')
mo3.group()

'Batwowowoman'

In [None]:
### Matching One or More with the Plus

In [35]:
# `+` requires the group to appear at least once.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('the adventures of Batwoman')
mo1.group()

'Batwoman'

In [36]:
mo3 = batRegex.search('the adventures of Batwowowoman')
mo3.group()

'Batwowowoman'

In [37]:
# `+` means one or more 'wo', so plain 'Batman' does not match and
# search() returns None.
mo2 = batRegex.search('the adventures of Batman')
mo2 is None  # identity comparison is the idiomatic None test

True

In [38]:
### Matching Specific Repetitions with Curly Brackets

In [39]:
# '{3}' requires the group to repeat exactly three times.
haRegex = re.compile(r'(ha){3}')
mo1 = haRegex.search('hahaha')
mo1.group()

'hahaha'

In [40]:
# 'haha' contains only two repetitions, so the {3} pattern finds nothing.
mo2 = haRegex.search('haha')
mo2 is None  # identity comparison is the idiomatic None test

True

In [41]:
## Greedy and Nongreedy Matching

In [43]:
# '{3,5}' is greedy by default: it takes as many repetitions as it can (here all five).
greedyHaRegex = re.compile(r'(ha){3,5}')
mo1 = greedyHaRegex.search('hahahahaha')
mo1.group()

'hahahahaha'

In [45]:
# A '?' directly after the quantifier makes it non-greedy: it stops at the
# minimum number of repetitions that still matches (three).
nongreedyHaRegex = re.compile(r'(ha){3,5}?')
mo2 = nongreedyHaRegex.search('hahahahaha')
mo2.group()

'hahaha'

In [46]:
## The findall() Method

In [47]:
# search() reports only the first occurrence, even though the text holds two numbers.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # no group inside
mo = phoneNumRegex.search('cell: 415-555-9999 work: 212-555-0000')
mo.group()

'415-555-9999'

In [48]:
phoneNumRegex.findall('cell: 415-555-9999 work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [49]:
# When the pattern contains groups, findall() returns a list of tuples with
# one string per group instead of a list of whole-match strings.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # groups inside
phoneNumRegex.findall('cell: 415-555-9999 work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

In [50]:
## Character Classes

In [53]:
# \d+ = one or more digits, \s = one whitespace character, \w+ = one or more
# word characters: matches items such as '12 drummers'.
xmasRegex = re.compile(r'\d+\s\w+')
# Adjacent string literals are joined by the parser. A backslash continuation
# inside the literal would embed the next line's indentation in the text.
xmasRegex.findall(
    '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, '
    '7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge'
)

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [54]:
### Making Your Own Character Classes

In [55]:
# Square brackets define a custom character class: any single listed character matches.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [56]:
# A leading ^ inside the brackets negates the class: every character that is
# NOT a listed vowel matches, which includes spaces and punctuation.
consonantRegex = re.compile(r'[^aeiouAEIOU]') # excluding
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

In [57]:
## The Caret and Dollar Sign Characters

In [59]:
# Outside of brackets, ^ anchors the pattern to the start of the searched string.
beginWithHello = re.compile(r'^Hello') # begin with 'Hello'
beginWithHello.search('Hello world !')

<_sre.SRE_Match object; span=(0, 5), match='Hello'>

In [60]:
# 'Hello' is not at the start of this string, so the ^ anchor prevents a match.
beginWithHello.search('he said Hello') is None

True

In [61]:
# $ anchors the pattern to the end of the string; only the final digit is matched.
endWithNumber = re.compile(r'\d$') # end with digit
endWithNumber.search('your number is 42')

<_sre.SRE_Match object; span=(16, 17), match='2'>

In [62]:
# The string ends with '.', not a digit, so there is no match.
endWithNumber.search('your number is forty two.') is None

True

In [63]:
# Anchored at both ends: everything between the start and the end must be digits.
# (In Python, $ also matches just before a single trailing newline.)
wholeStringIsNum = re.compile(r'^\d+$') # ^...$: digits from start to end
wholeStringIsNum.search('1234567890')

<_sre.SRE_Match object; span=(0, 10), match='1234567890'>

In [64]:
# The letter 's' in the middle breaks the all-digits requirement.
wholeStringIsNum.search('1234s67890') is None

True

In [65]:
# The space in the middle breaks the all-digits requirement.
wholeStringIsNum.search('1234 67890') is None

True

In [66]:
## The Wildcard Character

In [68]:
# '.' stands for exactly one character, which is why 'flat' contributes only 'lat'.
atRegex = re.compile(r'.at')
atRegex.findall('the cat in the hat sat o the flat mat')

['cat', 'hat', 'sat', 'lat', 'mat']

In [69]:
### Matching Everything with Dot-Star

In [71]:
# '.*' matches any run of characters (except newlines by default); each
# parenthesised '.*' captures one of the two name fields.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(1)

'Al'

In [72]:
mo.group(2)

'Sweigart'

In [75]:
# '.*?' is the non-greedy form: it stops at the first '>' it can reach.
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [76]:
# Greedy '.*' extends as far as possible, so the match runs to the last '>' in the text.
greedyRegex = re.compile(r'<.*>') # longest possible string
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

In [77]:
### Matching Newlines with the Dot Character

In [78]:
# Without re.DOTALL the dot does not match '\n', so '.*' stops at the end of the first line.
noNewlineRegex = re.compile('.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [79]:
# With re.DOTALL the dot also matches '\n', so '.*' spans every line.
# Named newlineRegex because this pattern DOES match newlines; reusing the
# name noNewlineRegex would be misleading and would overwrite the non-DOTALL
# regex compiled in the previous cell.
newlineRegex = re.compile('.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

In [80]:
## Case-Insensitive Matching

In [81]:
# re.I (short for re.IGNORECASE) makes the match case-insensitive; group()
# still returns the text exactly as it is spelled in the searched string.
robocop = re.compile(r'robocop', re.I) # ignorecase
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [82]:
robocop.search('ROBOCOP is part man, part machine, all cop.').group()

'ROBOCOP'

In [83]:
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

In [84]:
### Substituting Strings with the sub() Method

In [85]:
# sub(replacement, text) returns a new string with every match replaced.
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret document to Agent Bob.')

'CENSORED gave the secret document to CENSORED.'

In [86]:
# Group 1 captures only the first letter of the name; \1 in the replacement
# re-inserts that captured letter, followed by literal asterisks.
namesRegex = re.compile(r'Agent (\w)\w*')
namesRegex.sub(r'\1****', 'Agent Alice gave the secret document to Agent Bob.')

'A**** gave the secret document to B****.'

In [87]:
## Managing Complex Regexes

In [89]:
# re.VERBOSE ignores whitespace and '#' comments inside the pattern, so each
# part of the phone-number regex can be laid out and explained on its own line.
# The dot in 'ext.' is escaped: unescaped it is a wildcard, which let text such
# as 'extA12' be accepted as an extension. The group count is unchanged.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, optionally in parentheses
    (\s|-|\.)?                      # separator after the area code
    \d{3}                           # first three digits
    (\s|-|\.)                       # separator
    \d{4}                           # last four digits
    (\s*(ext\.?|x)\s*\d{2,5})?      # optional extension: ext, ext. or x, then 2-5 digits
    )''', re.VERBOSE)

In [90]:
### Combining re.IGNORECASE, re.DOTALL, and re.VERBOSE

In [91]:
# Flags are combined with the bitwise OR operator and passed as the second argument.
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL)

In [92]:
# The same mechanism extends to three flags: re.VERBOSE is OR-ed in as well.
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)

In [93]:
# The accompanying project is implemented in a separate file.