In [1]:
import re

In [3]:
# Pattern: three digits, a hyphen, three digits, a hyphen, then four digits.
phoneNumRegex=re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')  # \d matches a single digit character
mo = phoneNumRegex.search('My number is 415-555-4242')
mo.group()  # the full text that the pattern matched

'415-555-4242'

In [4]:
phoneNumRegex=re.compile(r'\d{3}-\d{3}-\d{4}')
mo = phoneNumRegex.search('My number is 415-555-4242')
mo.group()

'415-555-4242'

In [10]:
phoneNumRegex=re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242')

In [11]:
mo.group(1)

'415'

In [12]:
mo.group(2)

'555-4242'

In [14]:
mo.group(0)

'415-555-4242'

In [15]:
mo.group()

'415-555-4242'

In [16]:
mo.groups()

('415', '555-4242')

In [17]:
areaCode, mainNumber = mo.groups()

In [18]:
areaCode

'415'

In [20]:
mainNumber

'555-4242'

In [28]:
# \( and \) match literal parentheses, so group 1 captures the area code together with its parentheses.
phoneNumRegex=re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is (415) 555-4242')

In [30]:
mo.group(1)

'(415)'

In [31]:
mo.group(2)

'555-4242'

### Matching Multiple Groups with the Pipe

##### When both Batman and He-man occur in the searched string, the first occurrence of matching text will be returned as the Match object

In [32]:
heroRegex = re.compile(r'Batman|He-man')
mo1 = heroRegex.search('I like Batman and He-man')

In [33]:
mo1.group()

'Batman'

In [75]:
heroRegex = re.compile(r'Batman|He-man')
mo2 = heroRegex.search('I like He-man and Batman')

In [78]:
mo2.group()

'He-man'

In [None]:
# The strings 'Batman', 'Batmobile', 'Batcopter', and 'Batbat' can all be matched by the single pattern r'Bat(man|mobile|copter|bat)'.

In [46]:
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost one wheel')

In [47]:
mo.group()

'Batmobile'

In [50]:
mo.group(1)

'mobile'

### Optional Matching with the Question Mark

In [79]:
# (wo)? makes the "wo" group optional: it may appear zero or one time.
# The pattern is a raw string for consistency with the other regexes in this file.
optionalRegex = re.compile(r'Bat(wo)?man')
mo1 = optionalRegex.search('Adventures of Batman')

In [80]:
mo1.group()

'Batman'

In [62]:
mo2 = optionalRegex.search('Adventures of Batwoman')

In [63]:
mo2.group()

'Batwoman'

In [None]:
# The (wo)? part of the regular expression means that the pattern wo is an optional group.

### Matching Zero or More with the Star

In [81]:
# (wo)* lets the "wo" group repeat zero or more times.
# The pattern is a raw string for consistency with the other regexes in this file.
optionalRegex = re.compile(r'Bat(wo)*man')
mo1 = optionalRegex.search('Adventures of Batman')

In [82]:
mo1.group()

'Batman'

In [83]:
mo2 = optionalRegex.search('Adventures of Batwoman')

In [84]:
mo2.group()

'Batwoman'

In [85]:
mo3 = optionalRegex.search('Adventures of Batwowowowowoman')

In [86]:
mo3.group()

'Batwowowowowoman'

### Matching One or More with the Plus (at least one)

In [88]:
# (wo)+ requires the "wo" group to appear one or more times, so plain 'Batman' does not match.
# The pattern is a raw string for consistency with the other regexes in this file.
optionalRegex = re.compile(r'Bat(wo)+man')
mo1 = optionalRegex.search('Adventures of Batman')

In [90]:
mo1 is None  # at least one occurrence of "wo" must be present in the string for a match

True

In [91]:
mo2 = optionalRegex.search('Adventures of Batwoman')
mo2.group()

'Batwoman'

### Matching Specific Repetitions with Curly Brackets

In [None]:
# (Ha){3} and (Ha)(Ha)(Ha) match the same text.

# (Ha){3,5} and ((Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha)(Ha)) both describe three, four, or five repetitions of Ha.

# (Ha){3,} will match three or more instances of the (Ha) group.

# (Ha){,5} will match zero to five instances of the (Ha) group.

In [92]:
haRegex = re.compile(r'(ha){3}')
mo = haRegex.search('hahaha')
mo.group()

'hahaha'

In [93]:
mo1 = haRegex.search('haha')  # only two "ha" here, so the search finds no match
mo1 is None

True

### Greedy and Nongreedy Matching

In [None]:
# Regular expressions are greedy by default: in ambiguous situations they match the longest string possible.

In [None]:
# A nongreedy quantifier matches the shortest string possible. It is written by putting a question mark after the quantifier, e.g. {3,5}?

In [98]:
greedyHaRegex = re.compile(r'(ha){3,5}')
mo1 = greedyHaRegex.search('hahahahaha')  # 5 times ha
mo1.group()  

'hahahahaha'

In [99]:
# The ? after the curly brackets makes the repetition nongreedy, so the match stops at the minimum of three "ha".
nongreedyHaRegex = re.compile(r'(ha){3,5}?')
mo2 = nongreedyHaRegex.search('hahahahaha')  # 5 times ha
mo2.group()

'hahaha'

### findall() Method

In [101]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo1 = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo1.group()

'415-555-9999'

In [104]:
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo2 = phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')
mo2

['415-555-9999', '212-555-0000']

In [None]:
# findall() does not return a Match object. With no groups in the regular expression it returns a list of the
# matched strings; with exactly one group, a list of that group's strings; with two or more groups, a list of tuples.

In [105]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

In [None]:
Table: Shorthand Codes for Common Character Classes

\d      Any numeric digit from 0 to 9.
\D      Any character that is not a numeric digit from 0 to 9.
\w      Any letter, numeric digit, or the underscore character.
        (Think of this as matching “word” characters.)
\W      Any character that is not a letter, numeric digit, or the
        underscore character.
\s      Any space, tab, or newline character. (Think of this as
        matching “space” characters.)
\S      Any character that is not a space, tab, or newline.

In [107]:
xmasRegex = re.compile(r'\d+\s\w+')  # one or more digits, followed by a whitespace character, followed by one or more word characters
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

### Making Your Own Character Classes

In [108]:
vowelRegex = re.compile(r'[aeiouAEIOU]')  # use square bracket
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [109]:
# The character class [a-zA-Z0-9] matches any lowercase letter, uppercase letter, or digit.
# A negative character class matches every character that is NOT in the class; make one by placing a caret (^) right after the opening bracket.

In [110]:
# The leading ^ inside the brackets negates the class: this matches every character that is not a vowel,
# which includes spaces and punctuation as well as consonants.
consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

### The Caret and Dollar Sign Characters

##### Use the caret symbol (^) at the start of a regex to indicate that a match must occur at the beginning of the searched text. Likewise, you can put a dollar sign at the end of the regex to indicate that the string must end with this regex pattern. And you can use the caret and the dollar sign together to indicate that the entire string must match the regex.

In [None]:
# For example, the r'^Hello' regular expression string matches strings that begin with 'Hello'.

In [111]:
beginsWithHello = re.compile(r'^Hello')
beginsWithHello.search('Hello world!')

<_sre.SRE_Match at 0x97e9a58>

In [112]:
beginsWithHello.search('He said hello.') is None  # no match: the text does not start with 'Hello'

True

In [113]:
# The r'\d$' regular expression string matches strings that end with a numeric character from 0 to 9

In [114]:
endsWithNumber = re.compile(r'\d$')
endsWithNumber.search('Your number is 42')

<_sre.SRE_Match at 0x97e9d30>

In [115]:
endsWithNumber.search('Your number is forty two.') is None  # no match: the text does not end with a digit

True

In [None]:
# The r'^\d+$' regular expression string matches strings that consist entirely of one or more numeric characters, from beginning to end.

In [116]:
wholeStringIsNum = re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890')

<_sre.SRE_Match at 0x97e9f38>

### Wildcard Character

In [None]:
# The dot (.) is a wildcard that matches any single character except a newline.
# Because the dot matches exactly one character, the example below matches only 'lat' within 'flat'.

In [117]:
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

### Matching Everything with Dot-Star

In [None]:
# Use dot-star (.*) to match anything. The example below matches the string 'First Name:', followed by any and all
# text, followed by 'Last Name:', and then followed by anything again.

In [None]:
# dot character means “any single character except the newline,” and the star character means “zero or more of the 
# preceding character.”

In [118]:
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')

In [119]:
mo.group(1)

'Al'

In [120]:
mo.group(2)

'Sweigart'

In [121]:
mo.group()

'First Name: Al Last Name: Sweigart'

In [122]:
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [123]:
# Greedy version: without a trailing ?, the .* extends to the LAST '>' in the text.
# It is named greedyRegex because this pattern is the greedy one; reusing the name
# nongreedyRegex would mislabel it.
greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

### Matching Newlines with the Dot Character

In [124]:
# Without re.DOTALL the dot does not match a newline, so the match stops at the end of the first line.
# The pattern is a raw string for consistency with the other regexes in this file.
noNewlineRegex = re.compile(r'.*')
noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.'

In [125]:
# With re.DOTALL the dot also matches newline characters, so .* spans every line of the text.
# The pattern is a raw string for consistency with the other regexes in this file.
newlineRegex = re.compile(r'.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent. \nUphold the law.').group()

'Serve the public trust.\nProtect the innocent. \nUphold the law.'

In [None]:
• The ? matches zero or one of the preceding group.
• The * matches zero or more of the preceding group.
• The + matches one or more of the preceding group.
• The {n} matches exactly n of the preceding group.
• The {n,} matches n or more of the preceding group.
• The {,m} matches 0 to m of the preceding group.
• The {n,m} matches at least n and at most m of the preceding group.
• {n,m}? or *? or +? performs a nongreedy match of the preceding group.
• ^spam means the string must begin with spam.
• spam$ means the string must end with spam.
• The . matches any character, except newline characters.
• \d, \w, and \s match a digit, word, or space character, respectively.
• \D, \W, and \S match anything except a digit, word, or space character,
respectively.
• [abc] matches any character between the brackets (such as a, b, or c).
• [^abc] matches any character that isn’t between the brackets.

### Case-Insensitive Matching

In [126]:
# To make your regex case-insensitive, you can pass re.IGNORECASE or re.I as a second argument to re.compile().

In [127]:
robocop = re.compile(r'robocop', re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [128]:
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [129]:
robocop.search('Al, why does your programming book talk about robocop so much?').group()

'robocop'

### Substituting Strings with the sub() Method

In [130]:
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [131]:
# Group 1 captures the first letter of the agent's name; \1 in the replacement text inserts that letter back.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

In [133]:
# Phone number with an optional area code, optional separator, and optional extension.
# The dot in "ext." is escaped and made optional so that it matches only a literal
# period; unescaped, it matched any character (e.g. "extZ1234" counted as an extension).
# The number and order of capturing groups is unchanged.
phoneRegex = re.compile(r'((\d{3}|\(\d{3}\))?(\s|-|\.)?\d{3}(\s|-|\.)\d{4}(\s*(ext\.?|x)\s*\d{2,5})?)')
phoneRegex.search('My number is 415-555-4242')

<_sre.SRE_Match at 0x7338be8>

In [None]:
# If you want a regular expression that’s case-insensitive and also lets the dot character match newlines, combine re.IGNORECASE and re.DOTALL with the bitwise OR operator (|).

In [None]:
# Two flags are combined into the single second argument with the bitwise OR operator (|).
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL)


In [None]:
# Any number of flags can be chained the same way; here re.VERBOSE is OR-ed in as a third flag.
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)
