Click <a href='https://www.dataquest.io/blog/web-scraping-tutorial-python/'>here</a> to learn about Regular Expressions (RegEx) using Python.

In [None]:
########################
# DO NOT RUN THIS CELL #
########################

a, X, 9, < -- ordinary characters just match themselves exactly.
. (a period) -- matches any single character except newline '\n'
\w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
\W -- matches any non-word character.
\b -- matches word boundary (in between a word character and a non word character)
\s -- matches a single whitespace character -- space, newline, return, tab
\S -- matches any non-whitespace character.
\t, \n, \r -- tab, newline, return
\d -- matches any numeric digit [0-9]
\D matches any non-numeric character.
^ -- matches the beginning of the string, or specify omition of certain characters
$ -- matches the end of the string
\ -- escapes special character.
(x|y|z) matches exactly one of x, y or z.
(x) in general is a remembered group. We can get the value of what matched by using the groups() method of the object returned by re.search.
x? matches an optional x character (in other words, it matches an x zero or one times).
x* matches x zero or more times.
x+ matches x one or more times.
x{m,n} matches an x character at least m times, but not more than n times.
?: matches an expression but do not capture it. Non capturing group.
?= matches a suffix but exclude it from capture. Positive lookahead.
a(?=b) will match the "a" in "ab", but not the "a" in "ac"
In other words, a(?=b) matches the "a" which is followed by the string 'b', without consuming what follows the a.
?! matches if suffix is absent. Negative look ahead.
a(?!b) will match the "a" in "ac", but not the "a" in "ab"
?<= positive look behind
[] matches for groupings of consecutive characters
?<! negative look behind

########################
# DO NOT RUN THIS CELL #
########################

What are word boundaries?
--------------------------------------------------
Before the first character in the string, if the first character is a word character.<br>
After the last character in the string, if the last character is a word character.<br>
Between two characters in the string, where one is a word character and the other is not a word character<br>

In [1]:
import re

# open a new data stream into a file
file = open('./names.txt', encoding='utf-8')

# read the text of the file and store it as Python data
data = file.read()

# always close the data stream
file.close()

### Search for names

In [3]:
re.match(r'Hawkins', data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [4]:
re.match(r'Vader', data)

In [5]:
re.search(r'Hawkins'), data)

SyntaxError: unmatched ')' (Temp/ipykernel_13888/1764157413.py, line 1)

In [7]:
re.search(r'Hawkins', data)

<re.Match object; span=(0, 7), match='Hawkins'>

In [8]:
re.search(r'Vader', data)

<re.Match object; span=(754, 759), match='Vader'>

In [9]:
re.search(r'\w', data)

<re.Match object; span=(0, 1), match='H'>

In [10]:
re.search(r'\w, \w', data)

<re.Match object; span=(6, 10), match='s, D'>

In [11]:
re.search(r'\w\w\w\w\w\w\w, \w\w\w\w\w', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [12]:
re.search(r'\w{7}, \w{5}, data')

TypeError: search() missing 1 required positional argument: 'string'

In [13]:
re.search(r'\w{7}, \w{5}', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

In [14]:
re.search(r'\w+, \w+', data)

<re.Match object; span=(0, 14), match='Hawkins, Derek'>

### Search for phone numbers

In [15]:
re.search(r'(\d\d\d) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(613, 625), match='555 555-5551'>

In [16]:
re.search(r'\(\d\d\d) \d\d\d-\d\d\d\d', data)

error: unbalanced parenthesis at position 8

In [17]:
re.search(r'\(\d\d\d\) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [18]:
re.search(r'\(\d{3}\) \d\d\d-\d\d\d\d', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [19]:
re.search(r'\(\d{3}\) \d{3}-\d{4}', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

<strong>Exercise 1</strong>:<br>
Write a function that checks for n number of consecutive digits and returns the match

In [22]:
re.search(r'\(\d{3}\) \d{3}-\d{4}', data)

<re.Match object; span=(38, 52), match='(555) 555-5555'>

In [25]:
def consec_digits(n):
    return re.search(r'\d{n}')

In [29]:
def find_digits(n, searchable_text):
    return re.search(r'\d'*n, searchable_text)

In [30]:
find_digits(3, data)

<re.Match object; span=(39, 42), match='555'>

In [44]:
phone_numbers = "(555) 555-5555 (555) 555-5554 555-555-5553 555-555-5552 555-555-5551 555 555-5551 555-555-5550 555555-5549"

In [33]:
re.findall(r'\(\d{3}\)\s\d{3}-\d{4}', phone_numbers)

['(555) 555-5555', '(555) 555-5554']

In [36]:
re.findall(r'\d{3}\s\d{3}-\d{4}', phone_numbers)

['555 555-5551']

In [41]:
re.findall(r'\(?\d{3}\)?\s?-?\d{3}-\d{4}', phone_numbers)

['(555) 555-5555',
 '(555) 555-5554',
 '555-555-5553',
 '555-555-5552',
 '555-555-5551',
 '555 555-5551',
 '555-555-5550',
 '555555-5549']

In [46]:
re.findall(r'\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4}', phone_numbers)

['(555) 555-5555',
 ' (555) 555-5554',
 ' 555-555-5553',
 ' 555-555-5552',
 ' 555-555-5551',
 ' 555 555-5551',
 ' 555-555-5550',
 ' 555555-5549']

In [47]:
#search for names
re.findall(r'\w+,\s\w+', data)

['Hawkins, Derek',
 'Teacher, Coding',
 'Milliken, Connor',
 'Teacher, Coding',
 'Johnson, Joe',
 'Carter, Joel',
 'Österberg, Sven',
 'Governor, Norrbotten',
 'Enchanter, Killer',
 'Butz, Ryan',
 'CEO, Coding',
 'Doctor, The',
 'Lord, Gallifrey',
 'Exampleson, Example',
 'Example, Example',
 'Obama, Barack',
 'President, United',
 'Patel, Ripal',
 'Teacher, Coding',
 'Vader, Darth',
 'Lord, Galactic',
 'Sanz, María',
 'Minister, Spanish']

In [None]:
re.findall(r'\w+?\s?,\s\w+?)

In [50]:
re.findall(r'[\d\w\'-+.]@[-.\d\w]+', data)

['k@codingtemple.com',
 'r@codingtemple.com',
 'n@codingtemple.com',
 'r@norrbotten.co.se',
 'm@killerrabbit.com',
 'b@codingtemple.com',
 'n@tardis.co.uk',
 'e@example.com',
 '4@us.gov',
 'p@codingtemple.com',
 'r@empire.gov',
 's@spain.gov']

<strong>Exercise 2</strong>:<br>
Use Regular Expressions to pull the last phone number with the country code in the list using .findall()

##### Exercise 2:

Write a function using regular expressions to find the domain name in the given email addresses (and return None for the invalid email addresses)<br><b>HINT: Use '|' for either or</b>

##### Exercise 3: 

Use a regular expression to find every number in the given string

### Homework Exercise <br>
<p>Print each persons name and twitter handle etc., using groups, should look like:</p>

In [107]:
# [
#     ([first name] [last name],
#      email, 
#      phone,
#      title,
#      Twitter handle)
# ]

In [54]:
re.findall(r'''''?P<name>[\w]+\s[w]+)\s:\s(?P<email>[\d\w\'-+.]+@[.-/\w\d]+)\s:\s(?<phone>\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})\s:\s(?<title>[\w]+\s,\s[w]+)\s:\s(?<twitter>@[w]+)''''', re.M)

error: unbalanced parenthesis at position 21

In [60]:
re findall(r'\w+', data)

SyntaxError: invalid syntax (Temp/ipykernel_13888/3223479389.py, line 1)

In [65]:
 re.compile(r'''
    (?P<name>[\w]+,\s[\w]+)       
    \s:\s
    (?P<email>[\d\w\'-+.]+@[.-/\w\d]+)    
    \s:\s
    (?P<phone>\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})
    ''', re.X)




re.compile(r"\n   (?P<name>[\w]+,\s[\w]+)       \n   \s:\s\n   (?P<email>[\d\w\'-+.]+@[.-/\w\d]+)    \n   \s:\s\n   (?P<phone>\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})\n   ",
re.UNICODE|re.VERBOSE)

In [63]:
re.findall(r'@\w+', data)

['@codingtemple',
 '@derekhawkins',
 '@codingtemple',
 '@codingtemple',
 '@norrbotten',
 '@sverik',
 '@killerrabbit',
 '@codingtemple',
 '@ryanbutz',
 '@tardis',
 '@example',
 '@example',
 '@us',
 '@potus44',
 '@codingtemple',
 '@ripalp',
 '@empire',
 '@darthvader',
 '@spain']

In [69]:
info = re.findall(r'''
    ([\w]+,\s[\w]+)                             # last name, first name
    \s:\s
    ([\d\w\'-+.]+@[.-/\w\d]+)                   # email
    \s:\s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      # phone number
''', data, re.X)

people = []

for person in info:
    person_dict = {
        'name': person[0],
        'email': person[1],
        'phone': person[2],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print()

In [70]:
re.findall(r'''
    ([\w]+,\s[\w]+)                             
    \s:\s
    ([\d\w\'-+.]+@[.-/\w\d]+)                  
    \s:\s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      
''', data, re.X)

[]

In [71]:
for person in info:
    person_dict = {
        'name': person[0],
        'email': person[1],
        'phone': person[2],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print()

In [72]:
re.findall(r'''
    ([\w]+,\s[\w]+)                             
    \s:\s
    ([\d\w\'-+.]+@[.-/\w\d]+)                  
    \s:\s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      
''', data)

[]

In [77]:
re.findall(r'''
    ([\w]+,\s[\w]+)                             # last name, first name
    \s:\s
    ([\d\w\'-+.]+@[.-/\w\d]+)                   # email
    \s:\s
    (\+?\d?\s?\(?\d{3}\)?\s?-?\d{3}-\d{4})      # phone number
''', data, re.X)

people = []

for person in data:
    person_dict = {
        'name': person[1],
        'email': person[2],
        'phone': person[3],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print()

IndexError: string index out of range

In [4]:
re.findall(r'([\w]+,\s[\w]+)[\d\w\'-+.]+@[.-/\w\d]+', data)

[]

In [3]:
re.findall(r'[\s\w]+?,[\s\w]+', data)

['Hawkins, Derek\tderek',
 '5555\tTeacher, Coding Temple\t',
 'derekhawkins\nMilliken, Connor\tconnor',
 '5554\tTeacher, Coding Temple\nJohnson',
 'com\t\tCarter, Joel\nÖsterberg',
 'se\t\tGovernor, Norrbotten\t',
 'sverik\n, Tim\ttim',
 'com\t\tEnchanter, Killer Rabbit Cave\nButz',
 '5543\tCEO, Coding Temple\t',
 'ryanbutz\nDoctor, The\tdoctor',
 'uk\t\tTime Lord, Gallifrey\nExampleson',
 '5552\tExample, Example Co',
 'example\nObama, Barack\tpresident',
 '5551\tPresident, United States of America\t',
 'potus44\nPatel, Ripal\tripalp',
 '5553\tTeacher, Coding Temple\t',
 'ripalp\nVader, Darth\tdarth',
 '4444\tSith Lord, Galactic Empire\t',
 'darthvader\nFernández de la Vega Sanz, María Teresa\tmtfvs',
 'gov\t\tFirst Deputy Prime Minister, Spanish Govt']

In [5]:
re.findall(r'([\w]+,\s[\w]+)[\d\w\'-+.]+@[.-/\w\d]+', data)

[]

In [6]:
re.findall(r'\s[\w]+', data)

[' Derek',
 '\tderek',
 ' 555',
 '\tTeacher',
 ' Coding',
 ' Temple',
 '\nMilliken',
 ' Connor',
 '\tconnor',
 ' 555',
 '\tTeacher',
 ' Coding',
 ' Temple',
 '\nJohnson',
 ' Joe',
 '\tjoejohnson',
 '\tCarter',
 ' Joel',
 '\nÖsterberg',
 ' Sven',
 '\tgovernor',
 '\tGovernor',
 ' Norrbotten',
 ' Tim',
 '\ttim',
 '\tEnchanter',
 ' Killer',
 ' Rabbit',
 ' Cave',
 '\nButz',
 ' Ryan',
 '\tryanb',
 ' 555',
 '\tCEO',
 ' Coding',
 ' Temple',
 '\nDoctor',
 ' The',
 '\tdoctor',
 '\tTime',
 ' Lord',
 ' Gallifrey',
 '\nExampleson',
 ' Example',
 '\tme',
 '\t555',
 '\tExample',
 ' Example',
 ' Co',
 '\nObama',
 ' Barack',
 '\tpresident',
 '\t555',
 ' 555',
 '\tPresident',
 ' United',
 ' States',
 ' of',
 ' America',
 '\nPatel',
 ' Ripal',
 '\tripalp',
 ' 555',
 '\tTeacher',
 ' Coding',
 ' Temple',
 '\nVader',
 ' Darth',
 '\tdarth',
 ' 555',
 '\tSith',
 ' Lord',
 ' Galactic',
 ' Empire',
 '\nFernández',
 ' de',
 ' la',
 ' Vega',
 ' Sanz',
 ' María',
 ' Teresa',
 '\tmtfvs',
 '\tFirst',
 ' Deputy',
 ' 

In [13]:
re.findall(r'@[\w]+', data)

error: unterminated character set at position 1

In [18]:
re.findall(r'@\w\d\'', data)

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (Temp/ipykernel_20308/2383119764.py, line 1)

In [22]:
re.findall(r'@[-.\d\w]+', data)
people = []

for person in data:
    person_dict = {
        'name': person[0],
        'email': person[1],
        'phone': person[2],
    }
    people.append(person_dict)
    
for p in people:
    print(f"Name: {p['name']}")
    print(f"Email: {p['email']}")
    print(f"Phone: {p['phone']}")
    print()

IndexError: string index out of range

In [23]:
(r'@[-.\d\w]+', data)


('@[-.\\d\\w]+',
 'Hawkins, Derek\tderek@codingtemple.com\t(555) 555-5555\tTeacher, Coding Temple\t@derekhawkins\nMilliken, Connor\tconnor@codingtemple.com\t(555) 555-5554\tTeacher, Coding Temple\nJohnson, Joe\tjoejohnson@codingtemple.com\t\tCarter, Joel\nÖsterberg, Sven-Erik\tgovernor@norrbotten.co.se\t\tGovernor, Norrbotten\t@sverik\n, Tim\ttim@killerrabbit.com\t\tEnchanter, Killer Rabbit Cave\nButz, Ryan\tryanb@codingtemple.com\t(555) 555-5543\tCEO, Coding Temple\t@ryanbutz\nDoctor, The\tdoctor+companion@tardis.co.uk\t\tTime Lord, Gallifrey\nExampleson, Example\tme@example.com\t555-555-5552\tExample, Example Co.\t@example\nObama, Barack\tpresident.44@us.gov\t555 555-5551\tPresident, United States of America\t@potus44\nPatel, Ripal\tripalp@codingtemple.com\t(555) 555-5553\tTeacher, Coding Temple\t@ripalp\nVader, Darth\tdarth-vader@empire.gov\t(555) 555-4444\tSith Lord, Galactic Empire\t@darthvader\nFernández de la Vega Sanz, María Teresa\tmtfvs@spain.gov\t\tFirst Deputy Prime Ministe