In [11]:
# Regular expressions are a powerful and (mostly) standardized way of
# searching, replacing, and parsing text with complex patterns of characters.
import re

s = '100 NORTH MAIN ROAD'
print(s.replace('ROAD', 'RD.'))

s = '100 NORTH BROAD ROAD'
# A plain replace rewrites every 'ROAD', including the one inside 'BROAD'.
print(s.replace('ROAD', 'RD.'))        # --> Not expected output

# Workaround: only touch the last four characters. Negative slicing keeps this
# independent of the address length (a fixed offset only fits one exact string).
print(s[:-4] + s[-4:].replace('ROAD', 'RD.'))

# '$' anchors the pattern to the end of the string, so only a trailing 'ROAD'
# is replaced.
print(re.sub('ROAD$', 'RD.', s))

100 NORTH MAIN RD.
100 NORTH BRD. RD.
100 NORTH BROAD RD.
100 NORTH BROAD RD.


In [19]:
# Pattern matching cheat sheet
# '^'  -> anchors the match at the start of the string
# '$'  -> anchors the match at the end of the string
# 'x?' -> makes the preceding character optional (zero or one occurrence);
#         e.g. M? matches whether or not an 'M' is present at that position.

#### Checking for thousands 'M' in Roman Numerals

import re

# Three optional M's between the anchors: accepts strings made of 0 to 3 M's only.
pattern = "^M?M?M?$"
for numeral in ('M', 'MM', 'MMM', 'MMMM'):
    # The first three match; 'MMMM' has one M too many, so search returns None.
    print(re.search(pattern, numeral))

<re.Match object; span=(0, 1), match='M'>
<re.Match object; span=(0, 2), match='MM'>
<re.Match object; span=(0, 3), match='MMM'>
None


In [24]:
# 'a' must open the string and up to two M's may follow it. There is no '$'
# anchor, so whatever comes after the matched prefix is ignored.
pattern = "^aM?M?"
for text in ('Mabc', 'aM', 'abMcd'):
    print(re.search(pattern, text))

None
<re.Match object; span=(0, 2), match='aM'>
<re.Match object; span=(0, 1), match='a'>


In [29]:
#### Checking for Hundreds in Roman Numerals

import re

# Thousands (up to three M's) followed by the hundreds place: CM, CD, or an
# optional D with up to three C's after it.
pattern = '^M?M?M?(CM|CD|D?C?C?C?)$'
samples = ['MCM', 'MD', 'MMMCCC', 'MCMC', '']
for numeral in samples:
    # 'MCMC' fails (a C cannot follow CM); the empty string matches because
    # every part of the pattern is optional.
    print(re.search(pattern, numeral))

<re.Match object; span=(0, 3), match='MCM'>
<re.Match object; span=(0, 2), match='MD'>
<re.Match object; span=(0, 6), match='MMMCCC'>
None
<re.Match object; span=(0, 0), match=''>


In [28]:
import re

# M{0,3} is the counted form of M?M?M?: between zero and three M's.
# (To require at least one M but no more than three, write M{1,3}.)
pattern = '^M{0,3}$'
for numeral in ('M', 'MM', 'MMM', 'MMMM'):
    # Only the four-M string falls outside the allowed count.
    print(re.search(pattern, numeral))

<re.Match object; span=(0, 1), match='M'>
<re.Match object; span=(0, 2), match='MM'>
<re.Match object; span=(0, 3), match='MMM'>
None


In [30]:
#### Checking for Tens in Roman Numerals

# Thousands, hundreds, then the tens place: XC, XL, or an optional L followed
# by up to three X's.
pattern = '^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)$'
# The same pattern written with {n,m} counts. Every count needs its closing
# brace: an unterminated 'M{0,3' is taken as literal text and never matches.
pattern_2 = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})$'
for numeral in ('MCMXL', 'MCML', 'MCMLX', 'MCMLXXX', 'MCMLXXXX'):
    match = re.search(pattern, numeral)
    # Guard the "can also be written as" claim: both spellings must agree.
    assert (match is None) == (re.search(pattern_2, numeral) is None)
    print(match)

<re.Match object; span=(0, 5), match='MCMXL'>
<re.Match object; span=(0, 4), match='MCML'>
<re.Match object; span=(0, 5), match='MCMLX'>
<re.Match object; span=(0, 7), match='MCMLXXX'>
None


In [32]:
##### Parsing phone numbers

# \d     : any numeric digit (0-9)
# \D     : any character that is not a digit
# \d{3}  : exactly three numeric digits in a row

# Three captured digit groups separated by hyphens, anchored at both ends.
phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$')

plain_number = phonePattern.search('800-555-1212')
print(plain_number.groups())

# A fourth hyphen-separated part is not covered by the pattern, so search
# returns None here.
with_extra_part = phonePattern.search('800-555-1212-1234')
print(with_extra_part)

('800', '555', '1212')
None


In [33]:
# (\d+) : a remembered group of one or more digits, added here as a fourth
#         hyphen-separated part after the four-digit group
phonePattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})-(\d+)$')
four_part_match = phonePattern.search('800-555-1212-1234567')
four_part_match.groups()

('800', '555', '1212', '1234567')

In [34]:
# \D+ between the groups accepts any run of one or more non-digit characters,
# so spaces work as separators just as well as hyphens.
phonePattern = re.compile(r'^(\d{3})\D+(\d{3})\D+(\d{4})\D+(\d+)$')
for number in ('800 555 1212 1234', '800-555-1212-1234'):
    print(phonePattern.search(number).groups())

('800', '555', '1212', '1234')
('800', '555', '1212', '1234')


In [35]:
# \D+  ==> one or more non-digits
# \D*  ==> zero or more non-digits, which makes the separators optional

phonePattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})\D*(\d*)$')
unseparated = phonePattern.search('80055512121234')
print(unseparated.groups())

('800', '555', '1212', '1234')


In [None]:
# ^ matches the beginning of a string.
# $ matches the end of a string.
# \b matches a word boundary.
# \d matches any numeric digit.
# \D matches any non-numeric character.
# x? matches an optional x character (in other words, it matches an x zero or one times).
# x* matches x zero or more times.
# x+ matches x one or more times.
# x{n,m} matches an x character at least n times, but not more than m times.
# (a|b|c) matches exactly one of a, b or c.
# (x) in general is a remembered group. 
# You can get the value of what matched by using the groups() method of the object returned by re.search.