# Working with Regular Expressions : 
### Regular Expressions are used to find specific text from the data like we use strings but they work faster and use less space to work thus a perfect tool for text extraction
Like if the string was very long then using some space to iterate over the data can take a lot of space and we require very little in case of regular expressions 
Basic command tools used for the purpose are (for using in command line) : 
i)Grep
ii)Sed
iii)Awk
##### Note : in Windows the findstr command is used in place of the above three
##### In Python re module does the same for us 

In [14]:
import re # to import the re module used in python for regular expressions 
# Basic substring matching with re.search:
# the r"" prefix makes a raw string, so Python leaves backslashes untouched and
# the pattern reaches the re module exactly as written.
# re.search scans the string and returns a Match object for the first place
# where the pattern matches, or None when it matches nowhere.
result = re.search(r"aza","plaza")
print(result)
print(re.search(r"aza","bazaar"))
print(re.search(r"aza","maze"))

# '^' anchors the pattern to the start of the string, so only the beginning is compared:
print(re.search(r"^x","xenon"))
print(re.search(r"^x","mexico xylo xenon")) # None: the string itself starts with 'm'

# '.' matches any single character (except a newline):
print(re.search(r"p.ng","penguin"))
print(re.search(r"p.ng","ponging"))
# pass re.IGNORECASE to ignore the difference between lower and upper case:
print(re.search(r"p.ng","Penguin",re.IGNORECASE))

# A character class [...] matches any ONE of the characters listed inside it;
# a range such as a-z can be used as shorthand:
print(re.search(r"[Pp]ython","Python python")) # only the first match found is returned
print(re.search(r"[a-z]way","the end of the highway")) # any one lowercase letter followed by "way"
print(re.search(r"[a-z]way","the end of the way")) # None: "way" is preceded by a space, not by a letter in a-z
print(re.search(r"cloud[a-zA-Z0-9]","cloudy")) # several ranges can be combined inside one class

<re.Match object; span=(2, 5), match='aza'>
<re.Match object; span=(1, 4), match='aza'>
None
<re.Match object; span=(0, 1), match='x'>
None
<re.Match object; span=(0, 4), match='peng'>
<re.Match object; span=(0, 4), match='pong'>
<re.Match object; span=(0, 4), match='Peng'>
<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(18, 22), match='hway'>
None
<re.Match object; span=(0, 6), match='cloudy'>


In [15]:
# Check whether the given text contains any punctuation character.
def check_punctuation(text):
    """Return True if *text* contains at least one of , . : ! ? ; else False."""
    # Inside [] the '.' and '?' are ordinary characters, so no escaping is needed.
    # re.search returns None when nothing matches; compare with `is not None`.
    return re.search(r"[,.:!?;]", text) is not None


print(check_punctuation("this sentence ends with ."))

True


In [22]:
# A '^' placed first inside [] negates the class: [^...] matches any character NOT listed.
print(re.search(r"[^a-zA-Z]","this ")) # the space is the first character outside a-z / A-Z
print(re.search(r"[^a-zA-Z ]","this .")) # space is excluded here as well, so '.' is the first match

# '|' matches either the expression on its left or the one on its right:
print(re.search(r"cat|dog","I like dogs"))
print(re.search(r"cat|dog","I like doggies")) # only the "dog" part of "doggies" is matched
print(re.search(r"cat|dog","I like cats and dogs")) # search returns only the first match found

# re.findall returns every non-overlapping match of the pattern as a list of strings:
print(re.findall(r"cat|dog","I like cats and dogs")) # the strings only, without their positions

<re.Match object; span=(4, 5), match=' '>
<re.Match object; span=(5, 6), match='.'>
<re.Match object; span=(7, 10), match='dog'>
<re.Match object; span=(7, 10), match='dog'>
<re.Match object; span=(7, 10), match='cat'>
['cat', 'dog']


### Repetition Qualifiers : 
To match a part of the pattern repeated any number of times (zero or more), we put * right after that part of the pattern 

In [31]:
print(re.search(r"Py.*n","Pygmalion")) # "Py", then any run of characters ('.*'), then 'n'
# but '*' is greedy ...
print(re.search(r"Py.*n","Python Programming")) # '.*' takes as much as it can, so the match runs up to the LAST 'n'
# restricting the repeated part to lowercase letters stops the match at the space:
print(re.search(r"Py[a-z]*n","Python Programming"))
# '*' also allows zero repetitions, so a string containing just "Pyn" matches as well
print(re.search(r"Py[a-z]*n","Pyn")) # whereas..
print(re.search(r"Py[a-z]n","Pyn")) # None: without '*' exactly one letter is required between "Py" and "n"

# '+' works like '*' but requires at least one repetition
# "o+" matches a run of one or more consecutive 'o' characters
print(re.search(r"o+l+","goldfish"))
print(re.search(r"o+","ooolaoooooola")) # search returns only the first run of 'o'
print(re.search(r"o+l+","woolly"))

<re.Match object; span=(0, 9), match='Pygmalion'>
<re.Match object; span=(0, 17), match='Python Programmin'>
<re.Match object; span=(0, 6), match='Python'>
<re.Match object; span=(0, 3), match='Pyn'>
None
<re.Match object; span=(1, 3), match='ol'>
<re.Match object; span=(0, 3), match='ooo'>
<re.Match object; span=(1, 5), match='ooll'>


## To make a character optional (zero or one occurrence) we use '?' as : 

In [34]:
# '?' makes the preceding character optional (zero or one occurrence):
print(re.search(r"p?each","To each")) # there is no 'p' before "each", and it still matches
print(re.search(r"p?each","peach")) # 'p' is present here, so it is included in the match

# to match a special character such as ? ^ . * [ ( { } ) ] literally, put a backslash before it:
print(re.search(r"\.com","helloworld.com"))

<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(0, 5), match='peach'>
<re.Match object; span=(10, 14), match='.com'>


## Special character classes for matching: 
### 1) \w - matches letters, digits and '_' (the "word" characters) 
### 2) \d - matches digits only 
### 3) \s - matches whitespace characters such as space, tab and newline

In [37]:
print(re.search(r"\w*","this is")) # matches the run of word characters at the start and stops at the space
print(re.search(r"\w*","this_is_a_number_9")) # letters, '_' and digits are all word characters, so the whole string matches

<re.Match object; span=(0, 4), match='this'>
<re.Match object; span=(0, 18), match='this_is_a_number_9'>


In [39]:
# Check whether a string has at least 2 groups of alphanumeric characters
# separated by one or more whitespace characters.

def check_character_groups(test):
    """Return True if *test* contains two runs of word characters separated by whitespace.

    "Word characters" are those matched by \\w (letters, digits and '_').
    """
    # \w+ : first group, \s+ : the separating whitespace, \w+ : second group.
    # re.search returns None when nothing matches; compare with `is not None`.
    return re.search(r"\w+\s+\w+", test) is not None


print(check_character_groups("one"))
print(check_character_groups("123 Ready Set Go"))


#### For checking your knowledge in Regular Expression use this site : regex101.com

False
True


In [44]:
# Now suppose we want to check that a word starts with 'A' and ends with 'a':
# [b-z] leaves 'a' out of the class, so the match stops at the first 'a' after the 'A'
print(re.search(r"A[b-z]*a","Argentina")) # this works but ...
print(re.search(r"A[b-z]*a","Azerbaijan")) # matches "Azerba" even though the word does not END with 'a'

# '$' anchors the pattern to the end of the string, so the match must finish where the string ends:
print(re.search(r"A[b-z]*a$","Azerbaijan")) # None: the string ends with 'n', not 'a'
print(re.search(r"A[b-z]*a$","Argentina"))

<re.Match object; span=(0, 9), match='Argentina'>
<re.Match object; span=(0, 6), match='Azerba'>
None
<re.Match object; span=(0, 9), match='Argentina'>


In [45]:
# Check whether a variable name is valid (ASCII identifier rules shared by Python, C++ and Java).
def check_valid_name(text):
    """Return True if *text* is a valid ASCII identifier, False otherwise.

    A valid name starts with a letter or '_' and continues with any number of
    letters, digits or '_'. The empty string is not valid.
    """
    # First character: a letter or '_'; the remaining characters may also be digits.
    pattern = r"[a-zA-Z_][a-zA-Z0-9_]*"
    # re.fullmatch requires the WHOLE string to match the pattern. Anchoring
    # with '^...$' under re.search would also accept a name followed by a
    # newline, because '$' matches just before a trailing newline.
    return re.fullmatch(pattern, text) is not None


print(check_valid_name("This is valid"))
print(check_valid_name("This_is_valid"))


False
True


## Capturing Groups : 
### A capturing group is formed by enclosing part of the pattern in parentheses ( ); the text matched by each group can then be retrieved separately
#### Ques- given names with format "lastname, firstname" return the string with format "firstname lastname" Example - Lovelace, Ada --> Ada Lovelace

In [51]:
# Match.groups() returns a tuple holding the text captured by each parenthesised group of the pattern.
def name_format(text):
    """Convert a name from "lastname, firstname" to "firstname lastname".

    The captured groups are printed before returning, to show what
    Match.groups() contains. When *text* does not follow the
    "lastname, firstname" format it is returned unchanged.
    """
    # Each (\w*) forms one capturing group: group 1 = last name, group 2 = first name.
    pattern = r"^(\w*), (\w*)$"
    result = re.search(pattern, text)
    if result is None:
        # re.search returns None when the pattern does not match; calling
        # .groups() or indexing on None would raise, so hand the input back.
        return text
    print(result.groups())  # the tuple of captured groups, here (lastname, firstname)
    return result[2] + " " + result[1]  # result[0] would be the whole matched string


print(name_format("Lovelace, Ada"))
# name_format("Hpper, GraceM.") does not match the pattern above because '.' is not a
# word character, so that string comes back unchanged. To accept an optional middle
# initial the pattern can be changed to: pattern = r"^(\w*), (\w* ?\w?\.?)$"

('Lovelace', 'Ada')
Ada Lovelace


## If we want a specific number of values to be present we use {} :
{5} means the preceding pattern must repeat exactly 5 times \
{5,10} means the preceding pattern must repeat between 5 and 10 times \
{5,} means the preceding pattern must repeat at least 5 times \
{,5} means the preceding pattern may repeat at most 5 times (zero repetitions included)

In [59]:
print(re.search(r"[a-zA-Z]{5}","a ghost")) # {5} is itself a repetition count, so no '*' or '+' is needed after [a-zA-Z]
print(re.findall(r"[a-zA-Z]{5}","a scary ghost appeared")) # 'appea' is only the first five letters of "appeared"; to avoid such partial words :

# \b matches a word boundary: the edge between a word character and a non-word character (or the start/end of the string).
# Writing the pattern as \bpattern\b makes it match whole words only.
print(re.findall(r"\b[a-zA-Z]{5}\b","a scary ghost appeared")) # only words of exactly 5 letters are returned

print(re.findall(r"\w{5,10}","I really love Strawberries")) # at most 10 characters per match, so "Strawberries" is cut to 'Strawberri'
print(re.findall(r"\w{5,}","I really love Strawberries"))
print(re.findall(r"\w{,5}","I really love Strawberries")) # {,5} also allows zero characters, so empty matches '' appear wherever no word character follows


<re.Match object; span=(2, 7), match='ghost'>
['scary', 'ghost', 'appea']
['scary', 'ghost']
['really', 'Strawberri']
['really', 'Strawberries']
['I', '', 'reall', 'y', '', 'love', '', 'Straw', 'berri', 'es', '']


In [62]:
# a simple example combining the above: pull out the number written inside square brackets in a log line
log = 'July 31 07:51:48 mycomputer bad_process[12345]: error performing package upgrade'
pattern = r"\[(\d+)\]" # \[ and \] match the literal brackets; (\d+) captures the digits between them as group 1
result = re.search(pattern,log)
print(result)
print(result[0] +" "+ result[1]) # result[0] is the whole match, result[1] is the text captured by group 1

<re.Match object; span=(39, 46), match='[12345]'>
[12345] 12345


## Splitting and Replacing in Regular Expressions : 
we can split the raw string pattern using the function **re.split()**\
we can replace the raw string pattern using the function __re.sub()__

In [66]:
# splitting data : re.split cuts the string at every match of the pattern
print(re.split(r"[.?!]","One Sentence. Another One! And the last One?"))
print(re.split(r"([.?!])","One Sentence. Another One! And the last One?")) # putting the separator pattern in a group keeps the separators in the result

#replacing data : re.sub(pattern, replacement, string) replaces every match of the pattern
print(re.sub(r"[\w.%+-]+@[\w.-]+","[NOT AVAILABLE]","Recieved an email for go_nuts95@gmail.com")) # the email address is replaced by the given text
# the replacement can also refer to the groups captured by the pattern :
re.sub(r"^([\w.-]*), ([\w.-]*)$",r"\2 \1","Lovelace, Ada") # \2 is group 2 and \1 is group 1, so the two names swap places


['One Sentence', ' Another One', ' And the last One', '']
['One Sentence', '.', ' Another One', '!', ' And the last One', '?', '']
Recieved an email for [NOT AVAILABLE]


'Ada Lovelace'

## a basic function to change the domain names of all the emails 
The code below converts all the emails with domain **abc.edu** to __xyz.edu__ 

In [80]:
# Build the sample CSV file that the following cells read and rewrite.
import csv

# First row is the header; every other row is [full name, email address].
x = [
    ["Full Name", "Email Address"],
    ['Blossom Gill', 'blossom@abc.edu'],
    ['Hayes Delgado', 'nonummy@utnisia.com'],
    ['Petra Jone', 'ac@abc.edu'],
    ['Oleg Noel', 'noel@liberomauris.ca'],
    ['Ahmed Miller', 'ahmed.miller@nequeno.co.uk'],
    ['Macaulay Douglas', 'mdouglas@xyz.edu'],
    ['Aurora Grant', 'enim.non@abc.edu'],
    ['Madison Mcintosh', 'mcintosh@nissan.net'],
    ['Montana Powell', 'Montanap@abc.edu'],
    ['rogan robinson', 'rr.robinson@abc.edu'],
    ['simon rivera', 'sri@abc.edu'],
    ['Benedict Pacheo', 'bpachecha@abc.edu'],
    ['Masie Hendricks', 'mai@abc.edu'],
    ['Xaviera Gould', 'xlg@utnisia.com'],
    ['Orin Rollins', 'orin@semmagna.com'],
    ['Flavia Santiago', 'flavia@utnisia.net'],
    ['Jackson Owens', 'jackOwens@abc.edu'],
    ['Brittani Humphrey', 'brittani@ut.net'],
    ['Kirk Nixxon', 'kirk@abc.edu'],
    ['Bree Campbell', 'breee@utnisia.net'],
]

# Write all rows in one call.
with open('user_emails.csv', 'w', newline="") as out_file:
    csv.writer(out_file).writerows(x)

# Read the file back and echo each row to confirm what was written.
with open('user_emails.csv') as in_file:
    for row in csv.reader(in_file):
        print(row)

['Full Name', 'Email Address']
['Blossom Gill', 'blossom@abc.edu']
['Hayes Delgado', 'nonummy@utnisia.com']
['Petra Jone', 'ac@abc.edu']
['Oleg Noel', 'noel@liberomauris.ca']
['Ahmed Miller', 'ahmed.miller@nequeno.co.uk']
['Macaulay Douglas', 'mdouglas@xyz.edu']
['Aurora Grant', 'enim.non@abc.edu']
['Madison Mcintosh', 'mcintosh@nissan.net']
['Montana Powell', 'Montanap@abc.edu']
['rogan robinson', 'rr.robinson@abc.edu']
['simon rivera', 'sri@abc.edu']
['Benedict Pacheo', 'bpachecha@abc.edu']
['Masie Hendricks', 'mai@abc.edu']
['Xaviera Gould', 'xlg@utnisia.com']
['Orin Rollins', 'orin@semmagna.com']
['Flavia Santiago', 'flavia@utnisia.net']
['Jackson Owens', 'jackOwens@abc.edu']
['Brittani Humphrey', 'brittani@ut.net']
['Kirk Nixxon', 'kirk@abc.edu']
['Bree Campbell', 'breee@utnisia.net']


In [90]:
#!/usr/bin/env python3  (this line is for linux users !! in case using editor like vim or nano)

import re
import csv


def contains_domain(address, domain):
    """Return True if *address* is an email address whose domain is exactly *domain*.

    The part before '@' may contain word characters, '.' and '-'. The domain
    is compared literally, so 'abc.edu' does not match 'abcxedu'.
    """
    # re.escape turns regex metacharacters in the domain (such as '.') into
    # literals; unescaped, the '.' would match any character.
    pattern = r'[\w\.-]+@' + re.escape(domain)
    # re.fullmatch returns a Match object only when the WHOLE address fits the
    # pattern, and None otherwise; convert that to a plain bool.
    return re.fullmatch(pattern, address) is not None

def replace_domain(address, old_domain, new_domain):
    """Return *address* with its domain *old_domain* replaced by *new_domain*.

    Only a domain that directly follows '@' and ends the address is replaced;
    any other address is returned unchanged.
    """
    # (?<=@) requires the '@' immediately before the domain without replacing it,
    # so 'x@myabc.edu' is not mistaken for the domain 'abc.edu'.
    # re.escape makes the '.' in the domain match a literal dot only.
    old_domain_pattern = r'(?<=@)' + re.escape(old_domain) + r'$'
    # A function replacement inserts new_domain verbatim; a plain string would
    # have its backslashes interpreted as group references or escapes.
    return re.sub(old_domain_pattern, lambda _match: new_domain, address)

def main():
    """Rewrite the email column of the user CSV, moving old-domain addresses to the new domain.

    Reads 'user_emails.csv' from the current directory, replaces the domain of
    every address in the 'Email Address' column that belongs to the old domain,
    and writes all rows (header included) to 'updated_user_emails.csv'.

    Raises ValueError if the header row has no 'Email Address' column.
    """
    old_domain, new_domain = 'abc.edu', 'xyz.edu'  # emails @abc.edu become @xyz.edu
    csv_file_location = 'user_emails.csv'  # file that is read
    report_file = 'updated_user_emails.csv'  # file that is written

    # newline='' lets the csv module handle line endings itself.
    with open(csv_file_location, 'r', newline='') as f:
        user_data_list = list(csv.reader(f))  # every row of the file as a list of lists

    if user_data_list:  # an empty file has no header row to inspect
        # Locate the email column from the header instead of assuming its position.
        email_index = user_data_list[0].index('Email Address')
        for user in user_data_list[1:]:
            if len(user) <= email_index:
                continue  # blank or short row without an email cell: keep as it is
            email_address = user[email_index].strip()  # drop stray spaces around the address
            if contains_domain(email_address, old_domain):
                user[email_index] = replace_domain(email_address, old_domain, new_domain)

    # The with-statements close both files, so no explicit close() is needed.
    with open(report_file, 'w', newline='') as output_file:
        csv.writer(output_file).writerows(user_data_list)
main()

In [91]:
# Print the original and the updated CSV files to check that the replacement worked.
for label, path in (("old data : ", 'user_emails.csv'),
                    ("new data : ", 'updated_user_emails.csv')):
    # The with-statement closes the file, so no explicit close() is needed;
    # newline='' lets the csv module handle line endings itself.
    with open(path, newline='') as file:
        print(label)
        for row in csv.reader(file):
            print(row)

old data : 
['Full Name', 'Email Address']
['Blossom Gill', 'blossom@abc.edu']
['Hayes Delgado', 'nonummy@utnisia.com']
['Petra Jone', 'ac@abc.edu']
['Oleg Noel', 'noel@liberomauris.ca']
['Ahmed Miller', 'ahmed.miller@nequeno.co.uk']
['Macaulay Douglas', 'mdouglas@xyz.edu']
['Aurora Grant', 'enim.non@abc.edu']
['Madison Mcintosh', 'mcintosh@nissan.net']
['Montana Powell', 'Montanap@abc.edu']
['rogan robinson', 'rr.robinson@abc.edu']
['simon rivera', 'sri@abc.edu']
['Benedict Pacheo', 'bpachecha@abc.edu']
['Masie Hendricks', 'mai@abc.edu']
['Xaviera Gould', 'xlg@utnisia.com']
['Orin Rollins', 'orin@semmagna.com']
['Flavia Santiago', 'flavia@utnisia.net']
['Jackson Owens', 'jackOwens@abc.edu']
['Brittani Humphrey', 'brittani@ut.net']
['Kirk Nixxon', 'kirk@abc.edu']
['Bree Campbell', 'breee@utnisia.net']
new data : 
['Full Name', 'Email Address']
['Blossom Gill', 'blossom@xyz.edu']
['Hayes Delgado', 'nonummy@utnisia.com']
['Petra Jone', 'ac@xyz.edu']
['Oleg Noel', 'noel@liberomauris.ca']
