In [1]:
import re

## Meta characters
All meta characters: . ^ $ * + ? { } [ ] \ | ( )
- . Any character (except newline character) "he..o"

- ^ Starts with "^hello"

- \$ Ends with "world$"

- \* Zero or more occurrences "aix*"

- \+ One or more occurrences "aix+"

- { } Exactly the specified number of occurrences "al{2}"

- [] A set of characters \[a-m]

- \[^] Matches any character NOT in the set, e.g. \[^a-m]

- \ Signals a special sequence (can also be used to escape special characters) "\d"

- | Either or "falls|stays"

- ( ) Capture and group

<br><br>

## More Metacharacters / Special Sequences
- \d :Matches any decimal digit; this is equivalent to the class \[0-9].

- \\D : Matches any non-digit character; this is equivalent to the class \[^0-9].

- \s : Matches any whitespace character;

- \S : Matches any non-whitespace character;

- \w : Matches any alphanumeric (word) character; this is equivalent to the class \[a-zA-Z0-9_].

- \W : Matches any non-alphanumeric character; this is equivalent to the class \[^a-zA-Z0-9_]

- \b Returns a match where the specified characters are at the beginning or at the end of a word r"\bain" r"ain\b"

- \B Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word r"\Bain" r"ain\B"

- \A Returns a match if the specified characters are at the beginning of the string "\AThe"

- \Z Returns a match if the specified characters are at the end of the string "Spain\Z"

<br><br>

## Methods on match:
- match(): Determine if the RE matches at the beginning of the string

- search(): Scan through the string, looking for any location where the RE matches

- findall(): Finds all substrings where the RE matches, and returns them as a list

- finditer(): Finds all substrings where the RE matches, and returns them as an iterator



- Sets
- Quantifier
- Conditions
- Grouping
- Modification
- Compilation flags

Return the numbers of the string and combine them into one string

In [18]:
# Pull out every run of digits with a pre-compiled pattern, then glue the runs together.
string = "123abc456789abcd123ABC"
pattern = re.compile(r"\d+")  # \d+ -> one or more consecutive digits
matches = [found.group() for found in pattern.finditer(string)]
combined_matches = "".join(matches)
combined_matches

'123456789123'

In [22]:
# Alternative spelling: skip re.compile() and hand the pattern straight to the
# module-level helper.
string = "123abc456789abcd123ABC"
matches = re.findall(pattern=r"\d+", string=string)
combined_matches = "".join(matches)
combined_matches

'123456789123'

Return the letters and combine them into one string

In [21]:
# Keep only the alphabetic runs (either case) and concatenate them.
string = "123abc456789abcd12abc3ABC"
pattern = re.compile(r"[a-zA-Z]+")  # a set of ASCII letters, repeated one or more times
matches = pattern.findall(string)
combined_matches = "".join(matches)
combined_matches

'abcabcdabcABC'

In [102]:
# \D matches any single NON-digit character. It yields the letters here only
# because this sample string contains nothing but letters and digits.
string = "123abc456789abcd12abc3ABC"
pattern = re.compile(r"\D")
matches = pattern.findall(string)  # one list entry per character
combined_matches = "".join(matches)
combined_matches

'abcabcdabcABC'

Find the strings that start with "prev"

In [43]:
string = """
prevenir
prevent
treprev
"""

# With MULTILINE, '^' anchors at the start of every line rather than only at
# the start of the whole string.
pattern = re.compile(r"^prev[a-z]+", flags=re.MULTILINE)
matches = pattern.findall(string)
matches

['prevenir', 'prevent']

Find the string that ends with "prev"

In [48]:
string = """
prevenir
prevent
treprev
"""
# With MULTILINE, '$' anchors at the end of every line rather than only at
# the end of the whole string.
pattern = re.compile(r"[a-zA-Z]+prev$", flags=re.MULTILINE)
matches = pattern.findall(string)
matches

['treprev']

Find all words that start with "**`b`**"

In [100]:
string = """
bank
abba
saab
bunker
bank-box
little-britain
"""
# '^' (per line, thanks to MULTILINE) pins the "b" to the first column, so
# 'britain' inside 'little-britain' is not picked up.
pattern = re.compile(r"^\bb[a-z-]+", flags=re.MULTILINE)
matches = pattern.findall(string)
matches

['bank', 'bunker', 'bank-box']

Find all words that start with "**`b`**", either before or after the "-"

In [98]:
string = """
bank
abba
saab
bunker
bank-box
little-britain
"""
# Only a word boundary (\b) is required before the "b", so a "b" that follows
# a hyphen counts as a word start as well.
pattern = re.compile(r"\bb[a-z-]+", flags=re.MULTILINE)
matches = pattern.findall(string)
matches

['bank', 'bunker', 'bank-box', 'britain']

Find the words that contain a double consonant and end with "w"

In [195]:
string = """       
chello 
hello 
solow
Sottaw 
billy  
hollow  
halo 
shallow 
"""

# A "double consonant" is the same consonant twice in a row, so the pattern
# captures one consonant and repeats it immediately with the backreference \1:
#   \b[a-z]*                 start of a word, then any leading letters
#   ([b-df-hj-np-tv-z])\1    one consonant followed by that very same consonant
#   [a-z]*w\b                any further letters, then a final "w" that ends the word
# Gotcha: a quantifier such as {2} written INSIDE [...] is just the literal
# characters '{', '2' and '}', so it cannot be used to demand a repetition.
pattern = re.compile(r'\b[a-z]*([b-df-hj-np-tv-z])\1[a-z]*w\b', re.IGNORECASE)
# Flags are given to re.compile(); the second positional argument of
# Pattern.findall()/finditer() is the start position, not a flags value.
# finditer + group(0) yields whole words, whereas findall would return only
# the captured consonant because the pattern contains a group.
matches = [match.group(0) for match in pattern.finditer(string)]
matches

['Sottaw', 'hollow', 'shallow']

### Sets

In [200]:
string = "hello 123-56"
# Square brackets define a set: here any digit or a literal hyphen, one character per match.
pattern = re.compile(r"[0-9-]")
matches = pattern.findall(string)
"".join(matches)

'123-56'

### Quantifiers
- *: 0 or more

- +: 1 or more

- ?: 0 or 1 --> optional character

- {4}: exact number

- {4,6}: range number

In [205]:
string = "hello123"
# {3} is the quantifier: exactly three digits in a row.
pattern = re.compile(r"\d{3}")
matches = [found.group() for found in pattern.finditer(string)]
matches

['123']

In [207]:
# Sample dates, one per line, written with different field orders and
# separators (dot, hyphen, slash, underscore); used by the date-matching cell below.
dates = '''
01.04.2020

2020.04.01

2020-04-01
2020-05-23
2020-06-11
2020-07-11
2020-08-11

2020/04/02

2020_04_04
2020_04_04
'''

Find dates that are some variation of YYYY-MM-DD

In [209]:
# Four digits, one separator, two digits, one separator, two digits.
# The unescaped '.' stands for any single character except a newline, so every
# separator style in `dates` is accepted.
pattern = re.compile(r"\d{4}.\d{2}.\d{2}")
matches = [found.group() for found in pattern.finditer(dates)]
matches

['2020.04.01',
 '2020-04-01',
 '2020-05-23',
 '2020-06-11',
 '2020-07-11',
 '2020-08-11',
 '2020/04/02',
 '2020_04_04',
 '2020_04_04']

In [214]:
# Sample dates again; compared with the earlier `dates` cell the May entry is
# written with dots (2020.05.23) instead of hyphens.
dates = '''
01.04.2020

2020.04.01

2020-04-01
2020.05.23
2020-06-11
2020-07-11
2020-08-11

2020/04/02

2020_04_04
2020_04_04
'''

Find the dates of months May and June in 2020

In [224]:
# Same YYYY?MM?DD shape, but the month is pinned to "0" followed by 5 or 6
# (May or June); the year part still accepts any four digits.
pattern = re.compile(r"\d{4}.0[56].\d{2}")
matches = [found.group() for found in pattern.finditer(dates)]
matches

['2020.05.23', '2020-06-11']

#### Conditions:
Match everyone with the title Mr/Ms/Mrs

In [330]:
# Sample lines for the title-matching cell: some start with Mr/Ms/Mrs (with or
# without a trailing dot), others ("hello world", "Mo Town") do not.
string = """ 
Mr Simpson 
hello world 
Mrs Simpson 
Mo Town 
Mr. Brown 
Ms Smith 
Mrs. White 
Mr. T 
"""

In [272]:
# One of the three titles, an optional literal dot, a single whitespace
# character, then a name made of letters.
pattern = re.compile(r"(Mr|Ms|Mrs)\.?\s[a-zA-Z]+")
matches = pattern.finditer(string)  # lazy iterator of Match objects
[*matches]

[<re.Match object; span=(2, 12), match='Mr Simpson'>,
 <re.Match object; span=(27, 38), match='Mrs Simpson'>,
 <re.Match object; span=(49, 58), match='Mr. Brown'>,
 <re.Match object; span=(60, 68), match='Ms Smith'>,
 <re.Match object; span=(70, 80), match='Mrs. White'>,
 <re.Match object; span=(82, 87), match='Mr. T'>]

Validate email addresses

In [279]:
emails = """
pythonengineer@gmail.com
python-engineer@gmx.de
python.engineer@gmx.de
python-engineer123@my-domain.org
555python@hotmail.com
"""
# local part, '@', domain, a literal dot, then a top-level domain of at least
# two letters. The character sets are a simplification: more special
# characters may be allowed in real addresses.
pattern = re.compile(r"[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}")
matches = [found.group() for found in pattern.finditer(emails)]
matches

['pythonengineer@gmail.com',
 'python-engineer@gmx.de',
 'python.engineer@gmx.de',
 'python-engineer123@my-domain.org',
 '555python@hotmail.com']

Grouping - adding parentheses

In [280]:
# Same e-mail pattern, now with three capture groups (local part, domain name,
# top-level domain). With groups present, each match is reported as a tuple of
# the groups. The character sets are still a simplification.
pattern = re.compile(r"([a-z0-9._%+-]+)@([a-z0-9.-]+)\.([a-z]{2,})")
matches = [found.groups() for found in pattern.finditer(emails)]
matches

[('pythonengineer', 'gmail', 'com'),
 ('python-engineer', 'gmx', 'de'),
 ('python.engineer', 'gmx', 'de'),
 ('python-engineer123', 'my-domain', 'org'),
 ('555python', 'hotmail', 'com')]

Modification <br>
- split
- sub

In [282]:
string = "abc-def-ghj-klm-mno"
pattern = re.compile(r"-")  # the delimiter to cut on
# split() returns the pieces between the delimiter matches.
pattern.split(string=string)

['abc', 'def', 'ghj', 'klm', 'mno']

replace the dots with hyphens

In [285]:
string = "abc.def-ghj.klm-mno.pqr.stu"
pattern = re.compile(r"\.")  # escaped, so it matches a literal dot only
# sub() replaces every match with the replacement text.
pattern.sub(repl="-", string=string)

'abc-def-ghj-klm-mno-pqr-stu'

Change the first occurrence of planet into the word "world"

In [286]:
string = "hello planet! You are the best planet"
pattern = re.compile(r"planet")
# count=1 limits the substitution to the first match; later ones are left alone.
pattern.sub(repl="world", string=string, count=1)

'hello world! You are the best planet'

In [307]:
# Sample URLs, one per line: some with an http/https scheme, some with a
# "www." prefix, some with neither.
urls = """
http://python-engineer.com
https://www.python-engineer.org
http://www.pyeng.net
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
www.zerohedge.com
cnn.com
"""

In [339]:
# Four groups: optional scheme, optional "www.", the domain name, and the
# top-level domain (including its leading dot). Optional groups that did not
# take part in the match show up as None in groups().
pattern = re.compile(r"(https://|http://)?(www\.)?([a-z-]+)(\.[a-z]{2,})")
matches = pattern.finditer(urls)
for match in matches:
    # Whole match on the left, tuple of captured groups on the right.
    print(f"{match.group(0)}   --   {match.groups()}")

http://python-engineer.com   --   ('http://', None, 'python-engineer', '.com')
https://www.python-engineer.org   --   ('https://', 'www.', 'python-engineer', '.org')
http://www.pyeng.net   --   ('http://', 'www.', 'pyeng', '.net')
https://www.google.com   --   ('https://', 'www.', 'google', '.com')
http://coreyms.com   --   ('http://', None, 'coreyms', '.com')
https://youtube.com   --   ('https://', None, 'youtube', '.com')
https://www.nasa.gov   --   ('https://', 'www.', 'nasa', '.gov')
www.zerohedge.com   --   (None, 'www.', 'zerohedge', '.com')
cnn.com   --   (None, None, 'cnn', '.com')


In [15]:
# Sample contact records separated by blank lines; each record has four lines:
# a name, a phone number, a street address, and an e-mail address.
info = """
Dave Martin
615-555-7164
173 Main St., Springfield RI 55924
davemartin@bogusemail.com

Charles Harris
800-555-5669
969 High St., Atlantis VA 34075
charlesharris@bogusemail.com

Eric Williams
560-555-5153
806 1st St., Faketown AK 86847
laurawilliams@bogusemail.com

Corey Jefferson
900-555-9340
826 Elm St., Epicburg NE 10671
coreyjefferson@bogusemail.com

Jennifer Martin-White
714-555-7405
212 Cedar St., Sunnydale CT 74983
jenniferwhite@bogusemail.com

Erick Davis
800-555-6771
519 Washington St., Olympus TN 32425
tomdavis@bogusemail.com

Neil Patterson
783-555-4799
625 Oak St., Dawnstar IL 61914
neilpatterson@bogusemail.com

Laura Jefferson
516-555-4615
890 Main St., Pythonville LA 29947
laurajefferson@bogusemail.com

John Miller
372-555-9809
117 Cedar St., Thundera NM 75205
johnmiller@bogusemail.com

Corey Jackson
890-555-5618
115 Oak St., Gotham UT 36433
coreyjackson@bogusemail.com

Sam Thomas
670-555-3005
743 Lake St., Springfield MS 25473
samthomas@bogusemail.com

Patricia Thomas
509-555-5997
381 Hill St., Blackwater CT 30958
patriciathomas@bogusemail.com

Jennifer Davis
721-555-5632
125 Main St., Smalltown MT 62155
jenniferdavis@bogusemail.com
"""