# Regular Expressions in Python - Exercises

In [None]:
# Before doing anything else, run this code block.
import re 

# This makes available the functions in the re module - re.search, re.match, re.findall, etc

# Don't worry about understanding the code below just now, it's just for testing purposes
def test_regex(regex_name, regex, successful_cases, unsuccessful_cases):
    """Check a regex against strings it should and should not match.

    regex_name          label used in the printed messages
    regex               pattern to compile and test
    successful_cases    iterable of strings the pattern must be found in
    unsuccessful_cases  iterable of strings the pattern must NOT be found in

    One line is printed per failing case. If nothing failed, a single
    "No errors" line is printed instead. Nothing is returned.
    """
    def complain(*words):
        # All failure messages are the given words joined by single spaces.
        print(" ".join(words))

    pattern = re.compile(regex)
    failure_count = 0

    # search() is used, so the pattern may be found anywhere in the text.
    for text in successful_cases:
        if pattern.search(text) is None:
            complain("Your", regex_name, "didn't match", text)
            failure_count += 1

    for text in unsuccessful_cases:
        if pattern.search(text) is not None:
            complain("Your", regex_name, "matched", text,
                     "when it wasn't supposed to")
            failure_count += 1

    if not failure_count:
        print("No errors in your " + regex_name)
        
# Last statement of the setup cell: seeing this message means everything above ran without an error.
print("Yay you successfully ran this code block!")

In [None]:
# Exercise 1 - Using metacharacters

# Match sequences that look like this: A1B C2D - pairs of letter-digit-letter sequences.
# Your regular expression should consist of a string of metacharacters
# Remember that \w matches a word character (letter, digit or underscore), \d a digit and \s whitespace.
# NOTE: the placeholder '...' is itself a valid regex (three "any character" dots), so until you
# replace it, re.match will report a match on just the first three characters of each string.
regex  = r'...' # replace the ellipsis with a suitable regular expression
string = 'A1B C2D' 
string2 = 'X4Y W8T'
# re.match only tries to match at the start of the string; each line prints a match object, or None on failure
print(re.match(regex, string))
print(re.match(regex, string2))

In [None]:
# Exercise 1 (continued) - what happens if we try to match a non-matching string?

# This cell reuses `regex` from the previous cell, so run that one first.
string = 'A1B_C2D' # play around with this and see what happens!
# re.match returns a match object on success and None on failure
re.match(regex, string)

In [None]:
# Exercise 2 - Specifying custom sets of characters

# We saw some metacharacters in the previous slide: \w, \d, \s
# Define custom character ranges that are equivalent to these, 
# WITHOUT using metacharacters (things like \n are fine)

# Exercise 2a
digit_regex     = r'[...]' # match definition of \d

# Exercise 2b
wordlike_regex  = r'[...]' # match definition of \w (at least, for English)

# Exercise 2c
spacelike_regex = r'[...]' # match definition of \s

# No need to edit below this point. Run this code block to see if your answers were correct.
# Each call passes two plain strings, which test_regex walks through one character at a time:
# every character of the first string must match, and no character of the second string may match.

test_regex('Digit regex', digit_regex, '0123456789', 'aZ \t\n$_')
test_regex('Wordlike regex', wordlike_regex, 'ampBLY_058', ' \t\n-$')
test_regex('Spacelike regex', spacelike_regex, ' \t\r\n', 'ahlB_-$9')

In [None]:
# Exercise 3 - examples with ^, $, \b

# Go through each line and predict whether there will be a match or not. 
# Then run this code block and see whether you were right.
# Each line prints either a match object (a match was found) or None (no match).
# re.search looks for the pattern anywhere in the string, not just at the start.

print("Match 1: ", re.search(r'^Hallo$', 'Hallo'))        # Matches: yes/no
print("Match 2: ", re.search(r'^Hallo$', 'Hallo World'))  # Matches: yes/no
print("Match 3: ", re.search(r'^Hallo$', 'Well, Hallo!')) # Matches: yes/no

print("Match 4: ", re.search(r'\bHallo\b', 'Well, Hallo!')) # Matches: yes/no
print("Match 5: ", re.search(r'\bHallo\b', 'Harry Potter and the Deathly Hallows')) # Matches: yes/no

In [None]:
# Exercise 4a - match prices $5.99 and less

# Run this code block without changing anything
# (the pattern below is deliberately left broken - working out why it fails is the exercise)

price_regex = r'$[0-5].\d\d' # intended to match strings $0.00 to $5.99

# re.match returns None when the pattern does not match at the start of the string
if not re.match(price_regex, '$3.99'):
    print("Uh oh, it didn't match!")
    
# Why not?

In [None]:
# Exercise 4b - fixing price regex

# Fix the price_regex below and try running this code block
# NOTE(review): the pattern below already passes both checks in this cell, so it looks like the
# solution was left in place of the broken pattern from 4a - confirm which one was intended here.
price_regex = r'[$][0-5][.]\d\d' # Gonna take more than one change!

# A correct pattern must accept a price in the $0.00 - $5.99 range...
if not re.match(price_regex, '$3.99'):
    print("Uh oh, it didn't match!")
else :
    print("Nice, you got it.")

# ...and must reject '$3999', which has no decimal point
if re.match(price_regex, '$3999'):
    print("Uh oh, it matched $3999!")

In [None]:
# Exercise 4d - checking that metacharacters are treated as literal characters within []

# Inside the set, $ ^ and . stand for themselves, so any string containing one of them should match
# and strings containing none of them ('a1', 'di') should not.
metacharacter_regex = r'[$^.]'
test_regex('Metacharacter regex', metacharacter_regex, ['^$.','$money','wh^^at?'], ['a1','di'])

In [None]:
# Exercise 5a - Set complements

# Let's rearrange the metacharacter_regex above. Why doesn't it work as we wanted?
metacharacter_regex = r'[^$.]'
test_regex('Metacharacter regex', metacharacter_regex, '^$.', 'a1')

# What can we change to make it work, besides rearranging the characters?
# (this starts out identical to the pattern above - edit it)
edited_metacharacter_regex = r'[^$.]'
test_regex('Edited Metacharacter regex', edited_metacharacter_regex, '^$.', 'a1')

# So, saying "within character sets [], metacharacters are treated as literal characters (except backslashes)"
# is true EXCEPT that ^ means complement if it's the first character [^...]. Elsewhere in the set, it means the literal caret character.
# This is confusing...

In [None]:
# Exercise 5b - Set complements

# You're processing some DNA sequences and you notice that some of them have been corrupted - 
# they contain letters other than A,T,C and G! Write a regex that will let you remove the corrupted sequences.

not_dna_regex = r'...' # match sequences that contain anything other than A, T, C and G
# First list: corrupted sequences the regex must match. Second list: clean DNA it must not match.
test_regex('Not DNA regex', not_dna_regex, ['ATCHGA', '#*@!'], ['AGGGGGCTAA', 'ACGAT'])

# Notice that your regex matches non-alphabetic characters as well!

In [None]:
# Exercise 5c - Set complements

# You're looking at a list of purchases again. 
# Acme company uses product codes with A,B,C followed by three digits.
# Axeme company uses product codes X,Y,Z followed by three digits.
# Write a regex to extract lines with just Acme and Axeme's product codes.
# Also, make sure that the product code occurs at the end of the line

acme_axeme_regex = r'...'

# No need to edit below this line
# matches: one valid Acme code and one valid Axeme code, each at the end of the line
matches = ['This line contains Acme product code C180', 
           'This line contains Axeme product X007']
# nonmatches: a code letter belonging to neither company, a code whose last character is not a digit,
# and a valid code that is not at the end of the line
nonmatches = ['This line contains Hugo\'s product code J982',
              'This line contains an incorrect Axeme product code Z48A',
              'This line contains Acme product code C180 but it\'s not at the end.']
test_regex('Acme/Axeme regex', acme_axeme_regex, matches, nonmatches)


In [None]:
# Exercise 6a

# Write a regex that matches the words bad, baad, baaad, ... with any number of a's (at least one)

animal_farm_bleat_regex = r'...' # replace the ellipsis with your regex
# Must match: the bleat on its own, or as a separate word inside a sentence.
# Must not match: no a's at all, another letter between the a's and the d, a missing final d,
# or the bleat glued onto the end of a longer word.
test_regex('Animal farm bleat', animal_farm_bleat_regex, ['baaaaaaad', 'This is baaaaaaaaaaad!'], ['bd', 'bacd', 'baaaaa', 'sinbaaaad'])

In [None]:
# Exercise 6b

# Write a regex that matches the word baaad (3 a's) up to baaaaaaaaaad (10 a's).

animal_farm_short_bleat_regex = r'...'
# no need to edit below this point
# matches: bleats whose number of a's is within the allowed range
matches = ['baaaaaaad', 'This is baaaaaaaaaad!']
# nonmatches: the same cases as in 6a, plus bleats with too few and too many a's
nonmatches = ['bd', 'bacd', 'baaaaa', 'sinbaaaad', 'baad', 'baaaaaaaaaaaaad']
test_regex('Animal farm short bleat', animal_farm_short_bleat_regex, matches, nonmatches)

In [None]:
# Exercise 6c

# Write a regex that matches both the strings 'homebrew' and 'home-brew'

homebrew_regex = r'...'
# The version with a space instead of a hyphen ('home brew') must NOT match.
test_regex('Homebrew regex', homebrew_regex, ['homebrew', 'home-brew'], ['home brew'])

In [None]:
# Exercise 6d

# Following in the footsteps of Ernest Vincent Wright, you're writing a book with no E's.
# That's right, you can't even use the word 'the'. You decide to filter your wordlist so
# it contains only words with no E's. Write a regular expression to filter the dictionary.

no_e_regex = r'^...$' # replace the ellipsis with a regex that matches only strings without E's. Spaces are acceptable.
# Must match: words without any E. Must not match: anything containing an e, including
# multi-word strings and a word starting with a capital E.
test_regex('No E regex', no_e_regex, ['Brobdingnag', 'ginormous'], ['never', 'what never?', 'no never', 'Elephant'])

# Discuss: why can't we use a regex without the +/*?

In [None]:
# Exercise 6e

# Look at regex 1 and regex 2 defined below. How much of ab_string do you think they will match?
regex1 = r'(ab)*'
regex2 = r'ab*'
ab_string = 'ababababa'

# my guess for regex 1:
# my guess for regex 2: 

# group(0) is the full text matched by the pattern, starting from the beginning of ab_string
print(re.match(regex1, ab_string).group(0))
print(re.match(regex2, ab_string).group(0))

# You can also match repetitions of multiple characters by grouping them in parentheses ( )

In [None]:
# Exercise 7

# Write a regex that matches filenames ending in .doc or .odt

word_file_regex = r'...'

# Must not match: a different extension ('other.ddt') or the bare letters without a dot ('odt').
test_regex('Word file regex', word_file_regex, ['data_science.doc', 'textmining.odt'], ['other.ddt', 'odt'])

In [None]:
# Exercise 8a

# Going back to the E-less book, write a function that takes a list of words
# and returns only those words that contain no E's.
# An outline of the function has been supplied for you.

import re
def remove_e_words(wordlist) :
    """Return the entries of wordlist that contain no E's (exercise template).

    Intended behaviour once completed: keep each word that contains no E,
    preserving the input order, and return the kept words as a new list.

    As written, the placeholder condition `...` (Ellipsis) is always true,
    so every word is kept and the assertion below fails until you fill it in.
    """
    non_e_words = []
    
    pattern = re.compile(r"...") # replace ... with a pattern
    
    for word in wordlist :
        if ...: # replace the ellipsis with your code
            non_e_words.append(word)
    return(non_e_words)

# Self-check: only the two entries without an E should survive, in their original order.
assert remove_e_words(['Acorn', 'Bread', 'Cornflakes', 'Dairy', 'Elephant ears']) == ['Acorn', 'Dairy']

# Discuss: does it make sense to use re.match or re.search in this instance?

In [None]:
# Exercise 8b

# You want to filter a list of strings for those containing valid email addresses.
# First write a regular expression that checks for a string containing:
# (1) some alphanumeric stuff
# (2) @ sign
# (3) domain name (alphanumeric)
# (4) .com, .org, .edu or .net (Okay, this isn't all valid email addresses)

email_regex = r'...' # replace this with your regex

# Now complete the function below so that it uses your regex to filter the lines

import re
def filter_lines_with_email(list_of_lines):
    """Return the lines that contain an email address (exercise template).

    Intended behaviour once completed: keep each line in which the compiled
    pattern finds an email address, preserving the input order.

    As written, the placeholder condition `...` (Ellipsis) is always true,
    so every line is kept and the assertion below fails until you fill it in.
    """
    filtered_lines = list()
    pattern = re.compile(r"...") # replace ... with your pattern (for example, the email_regex you wrote above)

    for line in list_of_lines :
        if ...: # replace ellipsis with your code
            filtered_lines.append(line)
            
    return(filtered_lines)

# Test data: line0 and line2 contain an address. line1 has nothing before the @, and line3 has
# punctuation rather than alphanumerics directly before @gmail.com, so both must be dropped.
line0 = 'This line contains a valid email address amy@gmail.com'
line1 = 'This isn\'t a valid email: @gmail.com'
line2 = 'You can contact me at maf@mit.edu, anytime'
line3 = 'If you tried to send an email to @@(@$@gmail.com I\'m sure it would be rejected'

assert filter_lines_with_email([line0, line1, line2, line3]) == [line0, line2]

In [None]:
# Exercise 10

# Write a regular expression for finding adverbs (ending in -ly) 
# and use it in the find_adverbs() function to find all the adverbs in a sentence.

adverb_regex = r'...' # replace the ellipsis with your regular expression
def find_adverbs(sentence):
    """Returns a list of adverbs in the supplied sentence"""
    ...
    return (...) # return a list of words

extremely_bad_writing = """Mary swooned breathlessly when her paramour entered the room speedily, 
relying greatly on his eagle eyes to find his lady-love lying on the bed."""

# Self-check: call find_adverbs once and compare the result with the expected list.
# A plain comparison is used instead of assert inside a bare except, so that:
#   - the check still runs when Python is started with -O (which strips asserts), and
#   - a result that is not a list of strings (such as the unfilled template's placeholder)
#     is reported with a message instead of crashing while the message is being built.
expected_adverbs = ['breathlessly', 'speedily', 'greatly']
found_adverbs = find_adverbs(extremely_bad_writing)
if found_adverbs != expected_adverbs:
    if isinstance(found_adverbs, (list, tuple)):
        shown = [str(word) for word in found_adverbs]
    else:
        # Not a sequence of words at all - show it as-is so the problem is visible.
        shown = [repr(found_adverbs)]
    print(" ".join(["Got the wrong set:"] + shown))