In [140]:
import re

def printIter(itr):
    """Print every item produced by ``itr``, each on its own line.

    Args:
        itr: Any iterable. It is consumed in order, so an iterator passed
            in is exhausted once this function returns.
    """
    for item in itr:
        print(item)

In [141]:
#  .       - Any Character Except New Line
#  \d      - Digit (0-9)
#  \D      - Not a Digit (0-9)
#  \w      - Word Character (a-z, A-Z, 0-9, _)
#  \W      - Not a Word Character
#  \s      - Whitespace (space, tab, newline)
#  \S      - Not Whitespace (space, tab, newline)

#  \b      - Word Boundary (position between a word character and a non-word character, or the string edge)
#  \B      - Not a Word Boundary
#  ^       - Beginning of a String
#  $       - End of a String

#  []      - Matches Characters in brackets
#  [^ ]    - Matches Characters NOT in brackets
#  |       - Either Or
#  ( )     - Group

#  Quantifiers:
#  *       - 0 or More
#  +       - 1 or More
#  ?       - 0 or One
#  {3}     - Exact Number
#  {3,4}   - Range of Numbers (Minimum, Maximum)


#  #### Sample Regexes ####

#  [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

In [142]:
# Sample text searched by the regex demos in this notebook.
# Raw string (r'''...'''): the lone backslash in the MetaCharacters line would
# otherwise form the invalid escape sequence '\ ', which newer Python versions
# warn about. The resulting string is identical, so match spans are unchanged.
text_to_search = r'''

abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ

1234567890

Ha HaHa

MetaCharacters (Need to be escaped):

. ^ $ * + ? { } [ ] \ | ( )

coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

mat
pat
bat

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

'''

In [143]:
# Raw string, so the regex engine receives \b as "word boundary" rather than
# Python turning it into a backspace character.
# \bHa matches 'Ha' only where it begins a word, so the second 'Ha' of 'HaHa' is skipped.
pattern = re.compile(r'\bHa')

# finditer() returns an iterator that yields one Match object
# for every non-overlapping match found in the text.
matches = pattern.finditer(text_to_search)
printIter(matches)

# Show the slice of the text that ends with the first match (span 69-71).
print(text_to_search[66:71])

<re.Match object; span=(69, 71), match='Ha'>
<re.Match object; span=(72, 74), match='Ha'>
0

Ha


In [144]:
pattern = re.compile(r'^Start')  # ^ anchors the match to the beginning of the string: 'Start' must come first

sentence = 'Start a sentence and then bring it to an end'

matches = pattern.finditer(sentence)

printIter(matches)

pattern = re.compile(r'end$')  # $ anchors the match to the end of the string: 'end' must come last

matches = pattern.finditer(sentence)

printIter(matches)

<re.Match object; span=(0, 5), match='Start'>
<re.Match object; span=(41, 44), match='end'>


In [145]:
# 3 digits, any one character, 3 digits, any one character, 4 digits.
# The unescaped '.' matches any character except a newline, so '-', '.' and '*'
# are all accepted as separators.
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')

matches = pattern.finditer(text_to_search)

printIter(matches)

<re.Match object; span=(157, 169), match='321-555-4321'>
<re.Match object; span=(170, 182), match='123.555.1234'>
<re.Match object; span=(183, 195), match='123*555*1234'>
<re.Match object; span=(196, 208), match='800-555-1234'>
<re.Match object; span=(209, 221), match='900-555-1234'>


In [146]:
# Numbers starting with 800 or 900: [89]00, a '.' or '-' separator, 3 digits,
# a '.' or '-' separator, 4 digits.
# Inside [ ] the dot is a literal '.', not the match-anything wildcard.
pattern = re.compile(r'[89]00[.-]\d\d\d[.-]\d\d\d\d')

# Alternative (disabled): run the same pattern over the contents of a file.
# with open("./Files/data_for_ReGex.txt", "r") as f:
#     contents = f.read()

#     matches = pattern.finditer(contents)

#     printIter(matches)

matches = pattern.finditer(text_to_search)

printIter(matches)

<re.Match object; span=(196, 208), match='800-555-1234'>
<re.Match object; span=(209, 221), match='900-555-1234'>


In [147]:
# [^b] is a negated character set: any single character except 'b', followed by 'at'.
# 'mat' and 'pat' match; 'bat' does not.
pattern = re.compile(r'[^b]at')

matches = pattern.finditer(text_to_search)

printIter(matches)

<re.Match object; span=(223, 226), match='mat'>
<re.Match object; span=(227, 230), match='pat'>


In [148]:
# 'M' followed by 'r', 's' or 'rs'; an optional literal '.'; one whitespace
# character; a capital letter; then zero or more word characters.
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')

matches = pattern.finditer(text_to_search)

printIter(matches)

<re.Match object; span=(236, 247), match='Mr. Schafer'>
<re.Match object; span=(248, 256), match='Mr Smith'>
<re.Match object; span=(257, 265), match='Ms Davis'>
<re.Match object; span=(266, 279), match='Mrs. Robinson'>
<re.Match object; span=(280, 285), match='Mr. T'>


In [149]:
# Sample addresses to run the email pattern against.
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

# Local part: one or more letters, digits, '_', '.', '+' or '-'
# then '@'
# Domain: one or more letters, digits or '-'
# then a literal '.'
# Suffix: one or more letters, digits, '-' or '.'
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

matches = pattern.finditer(emails)

printIter(matches)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [150]:
# Sample URLs with http/https schemes and an optional 'www.' prefix.
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Group 1: optional 'www.'   Group 2: domain name   Group 3: '.' plus the suffix.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# sub() replaces every match with the text of groups 2 and 3, which drops the
# scheme and the optional 'www.' prefix.
subbed_urls = pattern.sub(r'\2\3', urls)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



In [151]:
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')

# findall() returns a list with one entry per match. Because the pattern has
# exactly one capturing group, each entry is that group's text (the title),
# not the whole matched string.
matches = pattern.findall(text_to_search)

print(matches)

['Mr', 'Mr', 'Ms', 'Mrs', 'Mr']


In [152]:
sentence = 'Start a sentence ANd then bring it to an end AND the begining'

pattern = re.compile(r'S\w*')  # 'S' followed by zero or more word characters

# match() only tries to match at the beginning of the string; it returns a
# Match object, or None when the string does not start with the pattern.
matches = pattern.match(sentence)

print(matches)

<re.Match object; span=(0, 5), match='Start'>


In [153]:
pattern = re.compile(r'and', re.IGNORECASE)  # re.I is the short alias for re.IGNORECASE

# search() scans the whole string and returns the first match (or None).
# `sentence` is the string defined in the previous cell.
matches = pattern.search(sentence)

print(matches)

<re.Match object; span=(17, 20), match='ANd'>
