## General Rules

In [None]:
.       - Any Character Except New Line
\d      - Digit (0-9)
\D      - Not a Digit (0-9)
\w      - Word Character (a-z, A-Z, 0-9, _)
\W      - Not a Word Character
\s      - Whitespace (space, tab, newline)
\S      - Not Whitespace (space, tab, newline)

\b      - Word Boundary
\B      - Not a Word Boundary
^       - Beginning of a String
$       - End of a String

[]      - Matches Characters in brackets
[^ ]    - Matches Characters NOT in brackets
|       - Either Or
( )     - Group

Quantifiers:
*       - 0 or More
+       - 1 or More
?       - 0 or One
{3}     - Exact Number
{3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexs ####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

## Examples

In [39]:
# https://www.youtube.com/watch?v=K8L6KVGG-7o&t=392s
import re

text_to_search = '''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
pat
mat
bat
'''

sentence = 'Start a sentence and then bring it to an end'

### create a search pattern

In [9]:
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)
print(text_to_search[1:4])

<re.Match object; span=(1, 4), match='abc'>
abc


### .       - Any Character Except New Line
* search almost all character
* with a backslash search all periods (/.)

In [10]:
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(111, 112), match='.'>
<re.Match object; span=(146, 147), match='.'>
<re.Match object; span=(167, 168), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(218, 219), match='.'>
<re.Match object; span=(249, 250), match='.'>
<re.Match object; span=(262, 263), match='.'>


In [11]:
pattern = re.compile(r'coreyms\.com')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(139, 150), match='coreyms.com'>


### \d      - Digit (0-9)

In [14]:
pattern = re.compile(r'\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(60, 61), match='6'>
<re.Match object; span=(61, 62), match='7'>
<re.Match object; span=(62, 63), match='8'>
<re.Match object; span=(63, 64), match='9'>
<re.Match object; span=(64, 65), match='0'>
<re.Match object; span=(151, 152), match='3'>
<re.Match object; span=(152, 153), match='2'>
<re.Match object; span=(153, 154), match='1'>
<re.Match object; span=(155, 156), match='5'>
<re.Match object; span=(156, 157), match='5'>
<re.Match object; span=(157, 158), match='5'>
<re.Match object; span=(159, 160), match='4'>
<re.Match object; span=(160, 161), match='3'>
<re.Match object; span=(161, 162), match='2'>
<re.Match object; span=(162, 163), match='1'>
<re.Match object; span=(164, 165), match='1'>
<re.Match object; span=(165, 166), match='2'>
<re.Matc

### \w      - Word Character (a-z, A-Z, 0-9, _)

In [15]:
pattern = re.compile(r'\w')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

### \s      - Whitespace (space, tab, newline)

In [16]:
pattern = re.compile(r'\s')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(65, 66), match='\n'>
<re.Match object; span=(68, 69), match=' '>
<re.Match object; span=(73, 74), match='\n'>
<re.Match object; span=(88, 89), match=' '>
<re.Match object; span=(94, 95), match=' '>
<re.Match object; span=(97, 98), match=' '>
<re.Match object; span=(100, 101), match=' '>
<re.Match object; span=(110, 111), match='\n'>
<re.Match object; span=(112, 113), match=' '>
<re.Match object; span=(114, 115), match=' '>
<re.Match object; span=(116, 117), match=' '>
<re.Match object; span=(118, 119), match=' '>
<re.Match object; span=(120, 121), match=' '>
<re.Match object; span=(122, 123), match=' '>
<re.Match object; span=(124, 125), match=' '>
<re.Match object; span=(126, 127), match=' '>
<re.Match object; span=(128, 129), match=' '>
<re.Match object; span=(130, 131), match=' '>
<re.Match object; span=(132, 133), match=' '>
<r

### \b and \B      - Word Boundary

In [18]:
pattern = re.compile(r'\bHa')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(66, 68), match='Ha'>
<re.Match object; span=(69, 71), match='Ha'>


In [19]:
pattern = re.compile(r'\BHa')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(71, 73), match='Ha'>


### ^       - Beginning of a String

In [13]:
pattern = re.compile(r'^Start')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='Start'>


### $       - End of a String

In [22]:
pattern = re.compile(r'end$')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(41, 44), match='end'>


### - (dash)

In [34]:
pattern = re.compile(r'[1-5]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(151, 152), match='3'>
<re.Match object; span=(152, 153), match='2'>
<re.Match object; span=(153, 154), match='1'>
<re.Match object; span=(155, 156), match='5'>
<re.Match object; span=(156, 157), match='5'>
<re.Match object; span=(157, 158), match='5'>
<re.Match object; span=(159, 160), match='4'>
<re.Match object; span=(160, 161), match='3'>
<re.Match object; span=(161, 162), match='2'>
<re.Match object; span=(162, 163), match='1'>
<re.Match object; span=(164, 165), match='1'>
<re.Match object; span=(165, 166), match='2'>
<re.Match object; span=(166, 167), match='3'>
<re.Match object; span=(168, 169), match='5'>
<re.Match object; span=(169, 170), match='5'>
<re.Match object; span=(170, 171), match='5'>
<re.Match object; span=(172, 173), match='1'

In [38]:
pattern = re.compile(r'[a-h]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(67, 68), match='a'>
<re.Match object; span=(70, 71), match='a'>
<re.Match object; span=(72, 73), match='a'>
<re.Match object; span=(75, 76), match='e'>
<re.Match object; span=(77, 78), match='a'>
<re.Match object; span=(79, 80), match='h'>
<re.Match object; span=(80, 81), match='a'>
<re.Match object; span=(82, 83), match='a'>
<re.Match object; span=(83, 84), match='c'>
<re.Match object; span=(85, 86), match='e'>
<re.Match object; span=(91, 92), match='e'>
<re.Match object; span=(92, 93), match='e'>
<re.Match object; span=(93, 94), match='d'>
<re.Match object; span=(98, 99), match='b'>
<re.Match object; span=(99, 100), match='e'>
<re

In [36]:
pattern = re.compile(r'[a-hA-H]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(28, 29), match='A'>
<re.Match object; span=(29, 30), match='B'>
<re.Match object; span=(30, 31), match='C'>
<re.Match object; span=(31, 32), match='D'>
<re.Match object; span=(32, 33), match='E'>
<re.Match object; span=(33, 34), match='F'>
<re.Match object; span=(34, 35), match='G'>
<re.Match object; span=(35, 36), match='H'>
<re.Match object; span=(66, 67), match='H'>
<re.Match object; span=(67, 68), match='a'>
<re.Match object; span=(69, 70), match='H'>
<re.Match object; span=(70, 71), match='a'>
<re.Match object; span=(71, 72), match='H'>
<re.Match object; span=(72, 73), match='a'>
<re.Match object; span=(75, 76), match='e'>
<re.

### - (dash) and ^ 

In [40]:
pattern = re.compile(r'[^b]at')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(267, 270), match='cat'>
<re.Match object; span=(271, 274), match='pat'>
<re.Match object; span=(275, 278), match='mat'>


### *       - 0 or More

In [41]:
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 163), match='321-555-4321'>
<re.Match object; span=(164, 176), match='123.555.1234'>
<re.Match object; span=(177, 189), match='123*555*1234'>
<re.Match object; span=(190, 202), match='800-555-1234'>
<re.Match object; span=(203, 215), match='900-555-1234'>


### ?       - 0 or One

In [46]:
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(260, 265), match='Mr. T'>


### |       - Either Or

In [50]:
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


### findall

In [80]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = pattern.findall(text_to_search)
for match in matches:
    print(match)

Mr
Mr
Ms
Mrs
Mr


### search

In [84]:
pattern = re.compile(r'sentence', re.IGNORECASE)   # re.I works as well (flags)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(8, 16), match='sentence'>


## Matching phone numbers

In [24]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 162), match='321-555-432'>
<re.Match object; span=(164, 175), match='123.555.123'>
<re.Match object; span=(177, 188), match='123*555*123'>
<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


In [29]:
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(151, 162), match='321-555-432'>
<re.Match object; span=(164, 175), match='123.555.123'>
<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


In [32]:
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(190, 201), match='800-555-123'>
<re.Match object; span=(203, 214), match='900-555-123'>


In [27]:
with open ('data.txt', 'r', encoding='utf-8') as f:
    contents = f.read()

In [28]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d')
matches = pattern.finditer(contents)
for match in matches:
    print(match)

<re.Match object; span=(12, 23), match='615-555-716'>
<re.Match object; span=(102, 113), match='800-555-566'>
<re.Match object; span=(191, 202), match='560-555-515'>
<re.Match object; span=(281, 292), match='900-555-934'>
<re.Match object; span=(378, 389), match='714-555-740'>
<re.Match object; span=(467, 478), match='800-555-677'>
<re.Match object; span=(557, 568), match='783-555-479'>
<re.Match object; span=(647, 658), match='516-555-461'>
<re.Match object; span=(740, 751), match='127-555-186'>
<re.Match object; span=(829, 840), match='608-555-493'>
<re.Match object; span=(915, 926), match='568-555-605'>
<re.Match object; span=(1003, 1014), match='292-555-187'>
<re.Match object; span=(1091, 1102), match='900-555-320'>
<re.Match object; span=(1180, 1191), match='614-555-116'>
<re.Match object; span=(1269, 1280), match='530-555-267'>
<re.Match object; span=(1355, 1366), match='470-555-275'>
<re.Match object; span=(1439, 1450), match='800-555-608'>
<re.Match object; span=(1526, 1537), m

In [42]:
pattern = re.compile(r'\d{3}.\d{3}.\d{3}')
matches = pattern.finditer(contents)
for match in matches:
    print(match)

<re.Match object; span=(12, 23), match='615-555-716'>
<re.Match object; span=(102, 113), match='800-555-566'>
<re.Match object; span=(191, 202), match='560-555-515'>
<re.Match object; span=(281, 292), match='900-555-934'>
<re.Match object; span=(378, 389), match='714-555-740'>
<re.Match object; span=(467, 478), match='800-555-677'>
<re.Match object; span=(557, 568), match='783-555-479'>
<re.Match object; span=(647, 658), match='516-555-461'>
<re.Match object; span=(740, 751), match='127-555-186'>
<re.Match object; span=(829, 840), match='608-555-493'>
<re.Match object; span=(915, 926), match='568-555-605'>
<re.Match object; span=(1003, 1014), match='292-555-187'>
<re.Match object; span=(1091, 1102), match='900-555-320'>
<re.Match object; span=(1180, 1191), match='614-555-116'>
<re.Match object; span=(1269, 1280), match='530-555-267'>
<re.Match object; span=(1355, 1366), match='470-555-275'>
<re.Match object; span=(1439, 1450), match='800-555-608'>
<re.Match object; span=(1526, 1537), m

## Matching emails

In [54]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

In [62]:
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
matches = pattern.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>


In [61]:
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.(com|edu)')
matches = pattern.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(31, 53), match='schafer@university.edu'>


In [63]:
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.(com|edu|net)')
matches = pattern.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [64]:
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+')
matches = pattern.finditer(emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


## Matching URLs

In [68]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [70]:
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
matches = pattern.finditer(urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [71]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [72]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [73]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(1))

www.
None
None
www.


In [74]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(2))

google
coreyms
youtube
nasa


In [75]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(3))

.com
.com
.com
.gov


In [78]:
subbed_url = pattern.sub(r'\2\3', urls)
print(subbed_url)


google.com
coreyms.com
youtube.com
nasa.gov

