# Regex expression in python

### Wildcard

In [5]:
# "." wildcard
import re

def check_aei (text):
    result = re.search(r"a.e.i", text)
    return result != None

print(check_aei("academia")) # True
print(check_aei("aerial")) # False
print(check_aei("paramedic")) # True

True
False
True


### Character classes

In [6]:
print(re.search(r"[Pp]ython", "Python"))

<re.Match object; span=(0, 6), match='Python'>


In [7]:
print(re.search(r"[a-z]way", "The end of the highway"))

<re.Match object; span=(18, 22), match='hway'>


In [8]:
print(re.search(r"[a-z]way", "What a way to go"))

None


In [10]:
#[A-Z], [0-9],[.!?\\-]

import re
def check_punctuation (text):
    result = re.search(r"[.!?\\-]", text)
    return result != None

print(check_punctuation("This is a sentence that ends with a period.")) # True
print(check_punctuation("This is a sentence fragment without a period")) # False
print(check_punctuation("Aren't regular expressions awesome?")) # True
print(check_punctuation("Wow! We're really picking up some steam now!")) # True
print(check_punctuation("End of the line")) # False

True
False
True
True
False


In [11]:
# matching not a letter (matches the first space)
print(re.search(r"[^a-zA-Z]", "This is a sentence with spaces."))

<re.Match object; span=(4, 5), match=' '>


In [13]:
# matching not a letter (matches the first point because we added space)
print(re.search(r"[^a-zA-Z ]", "This is a sentence with spaces."))

<re.Match object; span=(30, 31), match='.'>


In [17]:
# matching cat
print(re.search(r"cat|dog", "I like cats."))

# matching dog
print(re.search(r"cat|dog", "I like dogs."))

# matching cat and dog
print(re.findall(r"cat|dog", "I like cats and dogs."))

<re.Match object; span=(7, 10), match='cat'>
<re.Match object; span=(7, 10), match='dog'>
['cat', 'dog']


### Repetition Qualifiers

In [19]:
#Repeated matches (.*)

print(re.search(r"Py.*n", "Pygmalion"))
print(re.search(r"Py.*n", "Python Programmin"))

<re.Match object; span=(0, 9), match='Pygmalion'>
<re.Match object; span=(0, 17), match='Python Programmin'>


In [20]:
print(re.search(r"Py[a-z]*n", "Python Programmin"))

<re.Match object; span=(0, 6), match='Python'>


In [21]:
print(re.search(r"Py[a-z]*n", "Pyn"))

<re.Match object; span=(0, 3), match='Pyn'>


In [23]:
# (+)
print(re.search(r"o+l+", "goldfish"))

<re.Match object; span=(1, 3), match='ol'>


In [24]:
print(re.search(r"o+l+", "woolly"))

<re.Match object; span=(1, 5), match='ooll'>


In [27]:
print(re.search(r"o+l+", "boil"))
print(re.search(r"o+.l+", "boil"))

None
<re.Match object; span=(1, 4), match='oil'>


In [28]:
# Finding 2a's

import re
def repeating_letter_a(text):
    result = re.search(r"(a.*){2}", text, re.IGNORECASE)
    return result != None

print(repeating_letter_a("banana")) # True
print(repeating_letter_a("pineapple")) # False
print(repeating_letter_a("Animal Kingdom")) # True
print(repeating_letter_a("A is for apple")) # True

True
False
True
True


In [31]:
# (?)
print(re.search(r"p?each", "To each"))
print(re.search(r"p?each", "To peach"))

<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(3, 8), match='peach'>


### Escaping characters

In [35]:
# (\)

print(re.search(r".com", "welcome"))
print(re.search(r"\.com", "welcome"))
print(re.search(r"\.com", "welcome.com"))

<re.Match object; span=(2, 6), match='lcom'>
None
<re.Match object; span=(7, 11), match='.com'>


In [36]:
# \w alphanumeric AND \d \s \b

print(re.search(r"\w*", "This is an example"))
print(re.search(r"\w*", "This_is_an_example"))

<re.Match object; span=(0, 4), match='This'>


In [37]:
# At least two alphanumeric separated by one or more whitespace characters

import re
def check_character_groups(text):
    result = re.search(r"\w{2}\s", text)
    return result != None

print(check_character_groups("One")) # False
print(check_character_groups("123  Ready Set GO")) # True
print(check_character_groups("username user_01")) # True
print(check_character_groups("shopping_list: milk, bread, eggs.")) # False

False
True
True
False


### Advanced regular expressions

In [42]:
# A.*a

print(re.search(r"A.*a", "Argentina"))
print(re.search(r"A.*a", "Azerbaijan"))

# restricting the word
print(re.search(r"^A.*a$", "Azerbaijan"))
print(re.search(r"^A.*a$", "Australia"))

<re.Match object; span=(0, 9), match='Argentina'>
<re.Match object; span=(0, 9), match='Azerbaija'>
None
<re.Match object; span=(0, 9), match='Australia'>


In [47]:
pattern = r"^[a-zA-Z_][a-zA-Z0-9_]*$"
print(re.search(pattern, "_this_is_a_valid_variable_name"))
print(re.search(pattern, "this isn't a valid varaible"))
print(re.search(pattern, "my_variable_name1"))

# number at beginning is not allowed by "^[a-zA-z]"
print(re.search(pattern, "1my_variable_name"))

<re.Match object; span=(0, 30), match='_this_is_a_valid_variable_name'>
None
<re.Match object; span=(0, 17), match='my_variable_name1'>
None


In [50]:
# Start with Upper case followed by some lowercase or space and finish with period,
# question mark, or exclamation
import re
def check_sentence(text):
    result = re.search(r"^[A-Z][a-z ]*[.?!]$", text)
    return result != None

print(check_sentence("Is this is a sentence?")) # True
print(check_sentence("is this is a sentence?")) # False
print(check_sentence("Hello")) # False
print(check_sentence("1-2-3-GO!")) # False
print(check_sentence("A star is born.")) # True

True
False
False
False
True


### Questions

In [52]:
import re
def check_time(text):
    pattern = r"[1-9]*[:][0-5]{1}[0-9]{1}[PpMm|AaMm| PpMm| AaMm]"
    result = re.search(pattern, text)
    return result != None

print(check_time("12:45pm")) # True
print(check_time("9:59 AM")) # True
print(check_time("6:60am")) # False
print(check_time("five o'clock")) # False

True
True
False
False


In [53]:
import re
def contains_acronym(text):
    pattern = r"\([0-9A-Za-z]*\)" 
    result = re.search(pattern, text)
    return result != None

print(contains_acronym("Instant messaging (IM) is a set of communication technologies used for text-based communication")) # True
print(contains_acronym("American Standard Code for Information Interchange (ASCII) is a character encoding standard for electronic communication")) # True
print(contains_acronym("Please do NOT enter without permission!")) # False
print(contains_acronym("PostScript is a fourth-generation programming language (4GL)")) # True
print(contains_acronym("Have fun using a self-contained underwater breathing apparatus (Scuba)!")) # True

True
True
False
True
True


In [54]:
import re
def check_zip_code (text):
    result = re.search(r"[0-8]{5}|[0-9]{5}[-][0-9]{4}", text)
    return result != None

print(check_zip_code("The zip codes for New York are 10001 thru 11104.")) # True
print(check_zip_code("90210 is a TV show")) # False
print(check_zip_code("Their address is: 123 Main Street, Anytown, AZ 85258-0001.")) # True
print(check_zip_code("The Parliament of Canada is at 111 Wellington St, Ottawa, ON K1A0A9.")) # False

True
False
True
False


### Group match

In [56]:
# \w letters, numbers and underscores
result = re.search(r"^(\w*), (\w*)$", "Lovelace, Ada")
print(result)

<re.Match object; span=(0, 13), match='Lovelace, Ada'>


In [57]:
print(result.groups())

('Lovelace', 'Ada')


In [59]:
result[1]

'Lovelace'

In [60]:
result[2]

'Ada'

In [61]:
"{} {}".format(result[2], result[1])

'Ada Lovelace'

In [78]:
def rearrage_name(name):
        result = re.search(r"^([\w \.-]*), ([\w \.-]*)$", name)
        if result is None:
            return name                            
        return "{} {}".format(result[2], result[1])

In [79]:
rearrage_name("Renato B., Rogerio B.")

'Rogerio B. Renato B.'

### Repetition Qualifiers

In [83]:
# numerical repetition qualifiers
result = re.search(r"[a-zA-Z]{5}","a ghost")
print(result)

# numerical repetition qualifiers
result = re.findall(r"[a-zA-Z]{5}","a scary ghost appeared")
print(result)

<re.Match object; span=(2, 7), match='ghost'>
['scary', 'ghost', 'appea']


In [85]:
# numerical repetition qualifiers riight size
result = re.findall(r"\b[a-zA-Z]{5}\b","a scary ghost appeared")
print(result)

['scary', 'ghost']


In [87]:
# numerical repetition qualifiers riight size
result = re.findall(r"\w{5,10}","I really like strawberries")
print(result)

['really', 'strawberri']


In [88]:
# numerical repetition qualifiers riight size
result = re.findall(r"\b[a-zA-Z]{5,}\b","a scary ghost appeared")
print(result)

['scary', 'ghost', 'appeared']


In [90]:
# numerical repetition qualifiers riight size
result = re.search(r"s\w{,20}","I really like strawberries")
print(result)

<re.Match object; span=(14, 26), match='strawberries'>


In [91]:
# Long_words at least 7 characters
import re
def long_words(text):
    pattern = r"\w{7,}"
    result = re.findall(pattern, text)
    return result

print(long_words("I like to drink coffee in the morning.")) # ['morning']
print(long_words("I also have a taste for hot chocolate in the afternoon.")) # ['chocolate', 'afternoon']
print(long_words("I never drink tea late at night.")) # []

['morning']
['chocolate', 'afternoon']
[]


### Extracting PID

In [96]:
regex = r"\[(\d+)\]"
result = re.search(regex, "A completely different string that also has numbers [34567]")
print(result[1])

34567


In [104]:
def extratec_pid(log_line):
        regex = r"\[(\d+)\]"
        result = re.search(regex, log_line)
        if result is None:
            return "Not a number"
        return result[1]

In [105]:
print(extratec_pid("A completely different string that also has numbers [not_number]"))

Not a number


### Groups REGEX

In [154]:
import re
def extract_pid(log_line):
    regex = r"\[(\d+)\]\:\ ([A-Z]*)"
    result = re.search(regex, log_line)
    if result is None:
        return None
    return "{} ({})".format(result[1], result[2])

print(extract_pid("July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade")) # 12345 (ERROR)
print(extract_pid("99 elephants in a [cage]")) # None
print(extract_pid("A string that also has numbers [34567] but no uppercase message")) # None
print(extract_pid("July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup")) # 67890 (RUNNING)

12345 (ERROR)
None
None
67890 (RUNNING)


### Splitting and replacing

In [156]:
# splitting and replacing
re.split(r"[.?!]", "One sentence. Another one> And the last one!")

['One sentence', ' Another one> And the last one', '']

In [157]:
# splitting and replacing
re.split(r"([.?!])", "One sentence. Another one> And the last one!")

['One sentence', '.', ' Another one> And the last one', '!', '']

### sub

In [158]:
re.sub("[\w.%+-]+@[\w.-]+", "[REDACTED]", "Received an email for go_nuts95@my.example.com")

'Received an email for [REDACTED]'

In [160]:
# backreferences
re.sub(r"^([\w .-]*), ([\w .-]*)$", r"\2 \1", "Lovelace, Ada")

'Ada Lovelace'

### split

In [161]:
re.split(r"the|a", "One sentence. Another one? And the last one!")

['One sentence. Ano', 'r one? And ', ' l', 'st one!']

### Practice

In [162]:
import re
def multi_vowel_words(text):
    pattern = r"\w*[aeiou]{3}\w*"
    result = re.findall(pattern, text)
    return result

print(multi_vowel_words("Life is beautiful")) 
# ['beautiful']

print(multi_vowel_words("Obviously, the queen is courageous and gracious.")) 
# ['Obviously', 'queen', 'courageous', 'gracious']

print(multi_vowel_words("The rambunctious children had to sit quietly and await their delicious dinner.")) 
# ['rambunctious', 'quietly', 'delicious']

print(multi_vowel_words("The order of a data queue is First In First Out (FIFO)")) 
# ['queue']

print(multi_vowel_words("Hello world!")) 
# []

['beautiful']
['Obviously', 'queen', 'courageous', 'gracious']
['rambunctious', 'quietly', 'delicious']
['queue']
[]


In [163]:
import re
def transform_comments(line_of_code):
    result = re.sub(r"#+", r"//", line_of_code)
    return result

print(transform_comments("### Start of program")) 
# Should be "// Start of program"
print(transform_comments("  number = 0   ## Initialize the variable")) 
# Should be "  number = 0   // Initialize the variable"
print(transform_comments("  number += 1   # Increment the variable")) 
# Should be "  number += 1   // Increment the variable"
print(transform_comments("  return(number)")) 
# Should be "  return(number)"

// Start of program
  number = 0   // Initialize the variable
  number += 1   // Increment the variable
  return(number)


In [165]:
import re
def convert_phone_number(phone):
    result = re.sub(r"(\d{3})-(\d{3})-(\d{4}|\d{5})", r"(\1) \2-\3", phone)
    return result

print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300

My number is (212) 345-9999.
Please call (888) 555-1234
(123) 123-12345
Phone number of Buckingham Palace is +44 303 123 7300


In [166]:
import re
def convert_phone_number(phone):
    result = re.sub(r"(\d{3})-(\d{3})-((\d{4})$|(\d{4})\.$)", r"(\1) \2-\3", phone)
    return result

print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300

My number is (212) 345-9999.
Please call (888) 555-1234
123-123-12345
Phone number of Buckingham Palace is +44 303 123 7300


### Substituting e-mail domain from/to a .csv file

In [None]:
#!/usr/bin/env python3

import re
import csv


def contains_domain(address, domain):
    """Returns True if the email address contains the given,domain,in the domain position, false if not."""
    domain = r'[\w\.-]+@'+domain+'$'
    if re.match(domain,address):
        return True
    return False


def replace_domain(address, old_domain, new_domain):
    """Replaces the old domain with the new domain in the received address."""
    old_domain_pattern = r'' + old_domain + '$'
    address = re.sub(old_domain_pattern, new_domain, address)
    return address

def main():
    """Processes the list of emails, replacing any instances of the old domain with the new domain."""
    old_domain, new_domain = 'abc.edu', 'xyz.edu'
    csv_file_location = '<csv_file_location>'
    report_file = '<path_to_home_directory>' + '/updated_user_emails.csv'
    user_email_list = []
    old_domain_email_list = []
    new_domain_email_list = []

    with open(csv_file_location, 'r') as f:
        user_data_list = list(csv.reader(f))
        user_email_list = [data[1].strip() for data in user_data_list[1:]]

        for email_address in user_email_list:
            if contains_domain(email_address, old_domain):
                old_domain_email_list.append(email_address)
                replaced_email = replace_domain(email_address,old_domain,new_domain)
                new_domain_email_list.append(replaced_email)

        email_key = ' ' + 'Email Address'
        email_index = user_data_list[0].index(email_key)

        for user in user_data_list[1:]:
            for old_domain, new_domain in zip(old_domain_email_list, new_domain_email_list):
                if user[email_index] == ' ' + old_domain:
                    user[email_index] = ' ' + new_domain
    f.close()

    with open(report_file, 'w+') as output_file:
        writer = csv.writer(output_file)
        writer.writerows(user_data_list)
        output_file.close()

main()

### Great !