## Regular Expressions

In [8]:
import re

In [11]:
# Three runs of digits (3-3-4) separated by hyphens. Each parenthesised part
# is a capture group; ^ and $ pin the pattern to the start and end of the text.
string = '111-456-7890'
pattern = r'^(\d{3})-(\d{3})-(\d{4})$'
match = re.compile(pattern).match(string)
# Display the captured groups as a tuple of strings.
match.groups()

('111', '456', '7890')

In [13]:
# Now try several numbers, each written in a different format.
numbers = """
111-456-7890
2228901234
333.456.3847
(444) 456-7890
"""

# Parentheses create capture groups, \d matches a decimal digit, and {x}
# repeats the preceding element exactly x times.
# Compile once up front, since the same pattern is applied to every line.
pattern = re.compile(r'^(\d{3})-(\d{3})-(\d{4})$')
# ? makes the preceding element optional (0 or 1 occurrences):
# pattern = re.compile(r'^(\d{3})-?(\d{3})-?(\d{4})$')
# \D matches any non-digit character; * allows 0 or more occurrences:
# pattern = re.compile(r'^(\d{3})\D*(\d{3})\D*(\d{4})$')
# Putting it all together, with optional parentheses around the first group:
# pattern = re.compile(r'^\(?(\d{3})\)?\D*(\d{3})\D*(\d{4})$')

for number in numbers.strip().splitlines():
    match = pattern.match(number)
    # Show the captured groups on success, or a placeholder when nothing matched.
    print(number, '=>', match.groups() if match else 'No match')

111-456-7890 => ('111', '456', '7890')
2228901234 => No match
333.456.3847 => No match
(444) 456-7890 => No match


In [18]:
# Reference documentation: https://docs.python.org/3/library/re.html

# Square brackets define a character class; ranges such as A-Z, a-z and 0-9
# may be combined inside one class. (1-)? is an optional leading group.
string = '1-800-kid-CARS'
pattern = r'^(1-)?(\d{3})-([A-Za-z0-9]{3})-([A-Za-z0-9]{4})$'
match = re.compile(pattern).match(string)
# Display every captured group, including the optional '1-' prefix.
match.groups()

('1-', '800', 'kid', 'CARS')

In [30]:
# One or more letters or hyphens; a hyphen placed last inside the brackets
# is taken literally instead of denoting a range.
string = 'Mary-Jo'
pattern = r'^([A-Za-z-]+)$'
match = re.compile(pattern).match(string)
# A single capture group yields a one-element tuple.
match.groups()

('Mary-Jo',)

In [37]:
# Named groups: (?P<name>...) labels a group so it can be fetched by name.
string = 'Mary-Jo Lue-ellen'
pattern = r'^(?P<first_name>[A-Za-z-]+) (?P<last_name>[A-Za-z-]+)$'
match = re.compile(pattern).match(string)
print(match.groups())
# group() accepts several names at once and returns their values as a tuple;
# unpacking with sep='\n' prints each name on its own line.
print(*match.group('first_name', 'last_name'), sep='\n')

('Mary-Jo', 'Lue-ellen')
Mary-Jo
Lue-ellen


In [42]:
# Flags: re.VERBOSE ignores whitespace in the pattern and allows # comments
# inside it; re.IGNORECASE lets [A-Z] match lowercase letters as well.
string = 'Mary-Jo Lue-ellen'

pattern = r'''
    ^                        # match beginning of string
    (?P<first_name>[A-Z-]+)  # match first name
    \s+                      # match space between names
    (?P<last_name>[A-Z-]+)   # match last name
    $                        # match end of string
'''

# Combine both flags with bitwise OR when building the pattern object.
match = re.compile(pattern, flags=re.IGNORECASE | re.VERBOSE).match(string)

print(match.groups())
# Fetch both named groups in one call and print each on its own line.
print(*match.group('first_name', 'last_name'), sep='\n')

('Mary-Jo', 'Lue-ellen')
Mary-Jo
Lue-ellen
