### Regex - Regular Expressions

In [None]:
# Many times, we need to extract required information from given text data. For example, we want to know the number of
# people who contacted us in the last month through Gmail or we want to know the phone numbers of employees in a company 
# whose names start with 'A' or we want to retrieve the date of births of the patients in a hospital who joined for
# treatment for hypertension, etc. To get such information, we have to conduct a searching operation on the text data. Once
# the required information is found, we may have to perform further operations on such data. Regular expressions are useful
# to perform such operations on data.

In [None]:
# Regular Expressions
# A regular expression is a string that contains special symbols and characters to find and extract the information needed
# by us from the given data. 

# Where a string method in Python to search for a substring in a string would look like this:

input_str = 'Betty bought some butter but the butter was bitter so Betty bought some better butter to make\
the bitter butter better'

sub_str = 'b'

input_str.find(sub_str)

In [None]:
import re

sub_re = 'b'

result = re.findall(sub_re, input_str)
print(result)

In [None]:
print(len(result))

# We can use the findall method of the re module to look for all the occurrences of 'b'. 

In [None]:
# However, if we wanted to find out all the occurrences of b - whether small or capital, we would have to do some
# manipulations to get the desired result. Regex gives us tools to handle these queries and operations in a much simpler
# manner. 

input_str = 'Betty bought some butter but the butter was bitter so Betty bought some better butter to make\
the bitter butter better'

sub_re = '[bB]e\w+'
result = re.findall(sub_re, input_str)
print(result)
print(len(result))


In [None]:
# Note here how the capital B was also returned in the result. We shall see the other available methods in regex module 
# shortly.

In [None]:
# A regular expression helps us to search match, find and split based on specified patterns as per
# our requirements. A regular expression is also called simply regex. Regular expressions are available in many languages
# besides Python. 


# Python provides re module that stands for regular expressions. This module contains methods
# like compile(), search(), match(), findall(), split(), etc. which are used in finding the information in
# the available data. So, when we write a regular expression, we should import re module as:

import re

#### The re module has several methods to help us write regex. 

search - returns a match object if the substring is matched in the string to be searched. It returns only the first
occurrence of the match.

findall - returns a list containing all matches

split - returns a list where string has been split at each pattern match. 

sub - replaces one or many pattern matches with a specified string. 

As well as other methods which we shall see in a bit.

In [None]:
# While going forward - it is important to remember that the RegEx module works character by character from left to right 
# i.e. continues matching the pattern and keeps going on as long as the conditions for matching are continuing to be
# satisfied (or not satisfied depending on how the regex pattern is written). You shall see examples of this later in the
# class.

In [None]:
# List of special sequences. A special sequence is a \ followed by one of the characters from list below and each special
# sequence has a special meaning.

# Special Sequence             Description
# \A                           Matches if the string begins with the given pattern

# \b                           Matches if the word begins or ends with the given character.(\b before pattern to check if it
#                              begins with the pattern and \b after pattern to see if it ends with the specified pattern).
# \B                           It is the opposite of the \b i.e. the word should not start or end with the given regex.
# \d                           Matches any decimal digit, this is equivalent to the set class [0-9]
# \D                           Matches any non-digit character, this is equivalent to the set class [^0-9]
# \s                           Matches any whitespace character.
# \S                           Matches any non-whitespace character
# \w                           Matches any alphanumeric character, this is equivalent to the class [a-zA-Z0-9_].
# \W                           Matches any non-alphanumeric character.
# \Z                           Matches if the string ends with the given regex

In [52]:
import re

input_str = 'Betty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me Bett3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

In [57]:
sub_str = r't3r'

result = re.sub(sub_str, 'ter', input_str, count=3)

print(result)

Betty b0u6ht some butter but the butter w@s b!tter s0 Betty b0u6ht s0me Bett3r butt3r to m@k3 the b!tt3rbutt3r b3tt3r.


In [None]:
print(input_str)

In [None]:
sub_str = r'\bb\w+'

result = re.findall(sub_str, input_str)
print(result)

# The findall function takes the two parameters, substring and the string to be searched. It returns the matches in a list 
# in the order they are found. If no matches are found, it returns an empty list.

In [None]:
sub_str = r'b\w+'

result = re.findall(sub_str, input_str)
print(result)

In [None]:
#\A Returns a match if the specified characters are at the beginning of the string(NOT words but the whole string)
sub_str = r'\w+'

print(re.findall(sub_str, input_str))

In [None]:
sub_str = r'\bb\w+'

print(type(sub_str))

result = re.findall(sub_str, input_str)
      
print(result)

In [None]:
print(input_str)
print(sub_str)

In [None]:
result = re.search(sub_str, input_str)

print(result)

print(dir(result))

In [None]:
for i in result:
    print(i.span())

In [None]:
# Note here how we started using r denoting (raw-string) before the regex? This is because in Regex \ is used in front of
# many shorthand notations while \ is also an escape character in Python. To avoid conflict, we always put regular
# expressions to be searched in r format. 

In [None]:
input_str = 'betty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me bett3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'



sub_str = '\bbu\w+'

result = re.search(sub_str, input_str)

print(result)
#print(result.group())

#Note the result when not putting the r rawstring

In [None]:
#Workaround

sub_str = '\\bbu\\w+'

result = re.search(sub_str, input_str)

print(result)
print(result.group())

In [None]:
#Easiest way

sub_str = r'\bbu\w+'

result = re.search(sub_str, input_str)

print(result)
print(result.group())

In [None]:
print(input_str)

In [None]:
result = re.finditer(sub_str, input_str)

print(result)

In [None]:
for x in result:
    print(x.group(), x.span())

In [39]:
#\b Returns a match if the specificed characters are at the beginning or end of a word. 


input_str = 'B3tty 3b0u6ht 5ome butt3r but the butt3r w@s bitt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the bitt3r \
butt3r b3tt3r'

sub_str = r'\B\d\w+'

result = re.findall(sub_str, input_str)
print(result)

['3tty', '0u6ht', '3r', '3r', '3r', '0u6ht', '0me', '3tt3r', '3r', '3r', '3r', '3tt3r']


In [None]:
result = re.findall(sub_str, input_str)
print(result)

In [23]:
import re

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

print(input_str)

sub_str = r'\b\w+t\w*'

result = re.findall(sub_str, input_str)
print(result)

B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3rbutt3r b3tt3r.
['B3tty', 'b0u6ht', 'butt3r', 'but', 'butt3r', 'tt3r', 'Betty', 'b0u6ht', 'b3tt3r', 'butt3r', 'tt3rbutt3r', 'b3tt3r']


In [16]:
sub_str = r'\bt\w+'

result = re.findall(sub_str, input_str)
print(result)

['the', 'tt3r', 'to', 'the', 'tt3rbutt3r']


In [None]:
print(input_str.index('the'))

In [24]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\bb0u6\w*'

result = re.search(sub_str, input_str)
print(result)

<re.Match object; span=(6, 12), match='b0u6ht'>


In [None]:
print(input_str.index('b0u6'))

In [None]:
print(result.group())

In [None]:
print(result.span())

In [None]:
print(result.start())

In [None]:
print(result.end())

In [25]:
print(result.string)

B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3rbutt3r b3tt3r.


In [None]:
result = re.finditer(sub_str, input_str)

print(result)

In [None]:
for x in result:
    print(x.span(), x.group())

In [None]:
print(f'Found match {result.group()} beginning at {result.start()} and ending at {result.end()} and span is {result.span()}.')

In [None]:
# The match object returned from the search function has the following methods to retrieve the information:

# .span() - returns the beginning and end index numbers of the matched string in a tuple. 
# .string - returns the string passed into the function to be searched. 
# .group() - returns the part of the string where there was a match. 
# .start() - returns the start index
# .end() - returns the end index

In [None]:
print(result.span())

In [None]:
print(result.start())

In [None]:
print(result.end())

In [None]:
print(result.string)

In [None]:
print(result.group())

In [26]:
#Finding all match objects for a pattern using finditer

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = 'but\w+'

result = re.finditer(sub_str, input_str)

print(result)

<callable_iterator object at 0x00000222EACC0640>


In [27]:
for x in result:
    print('-'*100)
    print(x)
    print(f'Found match {x.group()} beginning at {x.start()} and ending at {x.end()}.')

----------------------------------------------------------------------------------------------------
<re.Match object; span=(18, 24), match='butt3r'>
Found match butt3r beginning at 18 and ending at 24.
----------------------------------------------------------------------------------------------------
<re.Match object; span=(33, 39), match='butt3r'>
Found match butt3r beginning at 33 and ending at 39.
----------------------------------------------------------------------------------------------------
<re.Match object; span=(79, 85), match='butt3r'>
Found match butt3r beginning at 79 and ending at 85.
----------------------------------------------------------------------------------------------------
<re.Match object; span=(104, 110), match='butt3r'>
Found match butt3r beginning at 104 and ending at 110.


In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\w+e\b'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
#\w returns a match where the strings contains any word characters - upper, lower case alphabets, digits - 0 to 9 and _
# underscore. 
# + (outside square brackets) is a metacharacter specifying 1 or more occurrences. 

# So in the above substring - r'\bbu\w+' - we specified:

# r - this is a raw string - do not consider \ escape characters. 
# '' - quotes denoting strings.
# \b - pattern begins with
# bu - characters to search - so, pattern we are looking for - begins with 'bu'
# \w - After 'bu' search for any word character
# + - One or more occurrences of word character. 

# So, in summary: 

# Search for a pattern in the string which begins with 'bu' and has one or more word characters after bu. Note here that - 
# it wont catch 'bu' if it occurred in the input string in this case.

In [None]:
input_str = 'B3tty b0u6ht some bu but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\bbu\w+'
result = re.search(sub_str, input_str)

print(result.group())

In [None]:
input_str = 'B3tty b0u6ht some the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\bbu\w+'
result = re.search(sub_str, input_str)

print(result.group())
print(result.span())

In [None]:
# Changing the + to * will return bu. 

sub_str = r'\bbu\w*'

result = re.search(sub_str, input_str)

print(result.group())

In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\bbu\w*'

result = re.search(sub_str, input_str)

print(result.group())

In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\w+r\b'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# \B Returns a match where the specified pattern is NOT at beginning or end of string.


input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

print(input_str)

In [None]:
#print(input_str.index('b!tt3rbutt3r'))

In [None]:
sub_str = r'\B'

result = re.findall(sub_str, input_str)

print(result)



In [None]:
input_str = 'B3tty b0u6ht some butt3r but hello therefore butt3r w@s b!tt3r s0 hermit Betty b0u6ht helium s0me b3tt3r butt3r \
to m@k3 tehe b!tt3r butt3r b3tt3r.'

sub_str = r't\w*he\B'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
import re

input_str = 'B3tty b0u6ht  some butt3r but the butt3r w@s b!tt3r s0 Bettyb0u6ht s0me b3tt3rbutt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'

sub_str = r'\Bb\w+'

result = re.finditer(sub_str, input_str)

for x in result:
    print(x.span(), x.group())
    print(input_str[x.start()-5:x.end()+5])


In [None]:
sub_str = r'\w+ht\w*'

result = re.findall(sub_str, input_str)

print(result)

In [40]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'

In [41]:
print(input_str)

B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r butt3r b3tt3r.


In [50]:
name1 = 'My name is Anthony Gonsalvez. Roop nagar, Prem Galli kholi no. 420. Excuse me please'

sub_str = r'[a-zA-Z]*'

result = re.findall(sub_str, name1)

print(result)

['My name is Anthony Gonsalvez. Roop nagar, Prem Galli kholi no. ', '420. Excuse me please']


In [51]:
print(input_str)

B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r butt3r b3tt3r.


In [54]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s 3b!tt3r 50 betty b0u6ht 5ome b3tt3r butt3r to m@k3 the b!tt3r butt3r b3tt3r.'

In [59]:
#\d Returns a match where the string contains digits.

sub_str = r'\D[a-z]*'

result = re.findall(sub_str, input_str)

print(result)


['B', 'tty', ' b', 'u', 'ht', ' some', ' butt', 'r', ' but', ' the', ' butt', 'r', ' w', '@s', ' ', 'b', '!tt', 'r', ' ', ' betty', ' b', 'u', 'ht', ' ', 'ome', ' b', 'tt', 'r', ' butt', 'r', ' to', ' m', '@k', ' the', ' b', '!tt', 'r', ' butt', 'r', ' b', 'tt', 'r', '.']


In [None]:
# Note here how '6ht' and '3r' are not separate outputs from '0u6ht' and '3tt3r'. This is because the regex takes a match
# till the pattern continues to match and starts searching for the next match from the next index number. 

In [None]:
sub_str = r'\w+\d\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'

In [None]:
#\s Returns a match where the string contains a space character. 

In [None]:
sub_str = r'\w+\sb\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'\w+\St\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
input_str = 'B3ttyb0u6ht some butt3rbut the butt3r w@s b!tt3r s0 bettyb0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'

sub_str = r'\w+\Sb\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
#\w Returns a match where the pattern match contains any word characters - a to z, A to Z and 0 to 9

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'

sub_str = r'\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'\w+\W\w+'

result = re.findall(sub_str, input_str)

print(result)

In [71]:
#\Z - returns a match if the pattern is found at the end of the string(not each word)

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r'

sub_str = r'\w\S\w3r'

result = re.findall(sub_str,input_str)

print(result)
#print(result.start())

#print(input_str[:116])

['utt3r', 'utt3r', '3tt3r', 'utt3r', 'utt3r', '3tt3r']


In [None]:
# Regex used for extracting matches of a pattern from text. 

# Greedy match - Tries to match as much of the text as part of its pattern. It wont stop as soon as a match is found but 
# continue to add characters to the pattern as long as the conditions for the pattern are being met. 

# \A
# \b
# \B
# \d
# \D
# \s
# \S
# \w
# \W
# \Z


In [None]:
# There are also Metacharacters.

# MetaCharacters               Description
# \                            Used to drop the special meaning of character following it
# []                           Represent a character class
# ^                            Matches the beginning of string = \A
# $                            Matches the end of string = \Z
# .                            Matches any character except newline
# |                            Means OR (Matches with any of the characters separated by it).

# And Quantifiers

# ?                            Matches zero or one occurrence - It signifies optional character.
# *                            Any number of occurrences (including 0 occurrences)
# +                            One or more occurrences
# {}                           Indicate the number of occurrences of a preceding regex to match.
# ()                           Enclose a group of Regex


In [None]:
# Metacharacters - Characters with special meaning in RegEx

In [None]:
input_str = 'B3tty b0u6ht some bu\tt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

In [None]:
#[] Any 'set' of characters inside the braces. 

In [8]:
import re

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht some b3tt3r butt3r to make the b!tt3r \
butt3r b3tt3r.'

sub_str = r'\w+t|\w+h'

result = re.findall(sub_str, input_str)

print(result)

# Matches any word with EITHER t or h in it. Gives a match including any alphanumeric characters before and after the t or h
# is found. 

['B3tt', 'b0u6ht', 'butt', 'but', 'th', 'butt', 'tt', 'bett', 'b0u6ht', 'b3tt', 'butt', 'th', 'tt', 'butt', 'b3tt']


In [None]:
input_str = 'B3tty b0u6ht some buttre but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

print(input_str)

In [None]:
sub_str = r'\w+[et]'

result = re.findall(sub_str, input_str)

print(result)

# Returns match of any words that have either e or t in them. Matches any number of characters before the e or t is found.

In [None]:
# Note here how - 'B3tt', 'butt', 'tt', 'bett' - the match wasnt stopped as soon as the first t was found. That is because
# regex patterns perform 'greedy' matches as much as they can match - they will try to match. In the example above - when 
# the regex program reaches the first t of b3tt - it satisfies BOTH conditions that it is an alphanumeric character \w AND
# it is part of the set [et], so it moves on to the next character which also satisfies both conditions - BUT we have not
# specified that after it finds e or t - can there be any text after that? Since we have not specified that - the match
# stops. However, if we were to add more t's after the first 2 - they would continue to get matched till the last t. 

input_str = 'B3tty b0u6ht some bu\tt3r but the buttttt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\w+[et]'

result = re.findall(sub_str, input_str)


print(result)


In [None]:
# \ Usually signifies a special sequence but put before a special character can be used to signify escaping. 

input_str = '''B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'''

print(input_str)

In [None]:
sub_str = r'\\'

result = re.search(sub_str, input_str)

print(result)

In [None]:
input_str = '''B3tty b0u6ht some butt3r but the butt3r w@s 'btt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r \
butt3r b3tt3r.'''

print(input_str)

In [None]:
sub_str = r'\'\w+'

result = re.search(sub_str, input_str)

print(result)
#print(input_str.index('\\'))

In [None]:
input_str = r"B3tty b0u6ht some butt3r but the bu\tt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r."

print(input_str)

In [None]:
sub_str = r'\\t\w+'
result = re.search(sub_str, input_str)

print(result.group())

In [None]:
input_str = r'B3tty b0u6ht some butt3r but the bu\tt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

print(input_str)

In [None]:
sub_str = r'\\'

result = re.search(sub_str, input_str)

print(result)

In [None]:
# . Signifies any character except newline characters, \r, \r\n

In [6]:
import re

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s \n b!tt3r s0 Betty b0u6ht \r s0me b3tt3r butt3r \r\n to m@k3 \
the b!tt3r \n butt3r b3tt3r.'

print(input_str)

B3tty b0u6ht some butt3r but the butt3r w@s \n b!tt3r s0 Betty b0u6ht \r s0me b3tt3r butt3r \r\n to m@k3 \
the b!tt3r \n butt3r b3tt3r.


In [7]:
sub_str = r'.+'

result = re.findall(sub_str,input_str)

print(result)

['B3tty b0u6ht some butt3r but the butt3r w@s \\n b!tt3r s0 Betty b0u6ht \\r s0me b3tt3r butt3r \\r\\n to m@k3 \\', 'the b!tt3r \\n butt3r b3tt3r.']


In [None]:
sub_str = '\w+.'

result = re.findall(sub_str, input_str)
print(result)

In [None]:
# ^ Starts with specified character - same as \A

In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'^B\w+'


result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'\AB\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'\bB\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'^b\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
input_str = 'b3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'^b.+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'\Ab\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# $ - Checks if whole string ends with specified characters. Same as \Z

In [None]:
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'\w+t3r.$'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
#Note no result since the string does not end with 't3r' but with 't3r.'

sub_str = r't3r.$'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r't3r\.'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r't3r\.'

result = re.search(sub_str, input_str)

print(result)

In [None]:
# * - 0 or more occurrences of specified characters(placed on the right of the characters we wish to specify)

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!773r\
butt3r b3tt3r.'

sub_str = r'\w+ht*'

result = re.findall(sub_str, input_str)

print(result)


In [18]:
# + - 1 or more occurrences of specified characters(placed on the right side of the characters we wish to specify)
input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!773r\
butt3r b3tt3r.'

sub_str = r'\w+ht+'

result = re.findall(sub_str, input_str)

print(result)

['b0u6ht', 'b0u6ht']


In [None]:
# {} - Exactly the specified number of occurrences. 
print(input_str)

In [19]:
sub_str = r'\w+t{2}\w*'

result = re.findall(sub_str, input_str)

print(result)

['B3tty', 'butt3r', 'butt3r', 'Betty', 'b3tt3r', 'butt3r', '773rbutt3r', 'b3tt3r']


In [None]:
? = 0 or 1
* = 0 or more
+ = 1 or more
{2} = ONLY 2 - not one, not zero, not more than two

In [None]:
# {} - Exactly the specified number of occurrences. 

sub_str = r'\w+t{2}'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# {} - Exactly the specified number of occurrences. 

sub_str = r'\w+t{2}\w+'

result = re.findall(sub_str, input_str)

print(result)


In [22]:
# {x,y} - Between the specified number of occurrences. 

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!t3r s0 Betty b0u6ht s0me b3tttttt3r butt3r to m@k3 the b!t3r\
butt3r b3tttt3r.'

sub_str = r'\w+[3ue]t{2,4}'

result = re.findall(sub_str, input_str)

print(result)

['B3tt', 'butt', 'butt', 'Bett', 'b3tttt', 'butt', 't3rbutt', 'b3tttt']


In [None]:
# | - Either / or any of the specified characcters in pattern. 

input_str = 'B3tty b0u6ht some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'


sub_str = r'\w+3|@\w+ |s\w+'

result = re.findall(sub_str, input_str)

print(result)

In [11]:
# ? - Makes the character preceding the ? mark optional.

text = 'The colonel colouRs the car in a Red coloR'

sub_str = r'colou?R'

result = re.findall(sub_str, text)

print(result)

['coloR']


In [None]:
# As you probably noticed, the regex query matched both 'colour' and 'color' since it was optional to match the u, if
# present it was matched, even if not present the pattern was matched.

In [None]:
# () Capture and group the specified characters pattern. Allows you to match (or capture) a specific group of characters
# collectively.

In [None]:
input_str = 'B3tty b0u6ht some btt3r but the btt3r w@s btt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
butt3r b3tt3r.'

sub_str = r'[bB]3?t\w+'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
sub_str = r'(bar)'

result = re.search(sub_str, input_str)

print(result)

In [None]:
# The difference though - between regular regex without the parenthesis and with is that now the characters defined in the
# parentheses are treated as one group. e.g.

import re

input_str = 'foo barbar baz'

sub_str = r'\sbar'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# As we can see it took the r as an optional character in this case. 

In [None]:
# input_str = 'B3tty b0u6ht boon goon opt option soon moon some butt3r but the butt3r w@s b!tt3r s0 Betty b0u6ht s0me b3tt3r butt3r to m@k3 the b!tt3r\
# butt3r b3tt3r.'

# sub_str = r'(opt(ion)?)'

# result = re.findall(sub_str, input_str)

# print(result)

In [None]:
butt?e?r?

butt
bute
butr

In [28]:
#We can use nested grouping to capture specific characters.

input_str = 'B3tty b0u6ht s0me butter but the butt3r was bitt3r so B3tty b0u6ht some butter butt3r to mak3 the bitt3r \
butt3r b3tt3r.'

sub_str = r'((b[uie3]t)(t[e3]r)?)'

result = re.findall(sub_str, input_str)

print(result)

[('butter', 'but', 'ter'), ('but', 'but', ''), ('butt3r', 'but', 't3r'), ('bitt3r', 'bit', 't3r'), ('butter', 'but', 'ter'), ('butt3r', 'but', 't3r'), ('bitt3r', 'bit', 't3r'), ('butt3r', 'but', 't3r'), ('b3tt3r', 'b3t', 't3r')]


In [32]:
sub_str2 = r'(b[ue3]t(t3r)?)'
result = re.search(sub_str, input_str)
result2 = re.search(sub_str2, input_str)

In [33]:
print(result)
print(result2)
#print(result.group())

<re.Match object; span=(18, 24), match='butter'>
<re.Match object; span=(18, 21), match='but'>


In [35]:
print(result.group())

butter


In [None]:
# Note how the groups() method gave out a tuple of the matches. We have seen 3 captures in our regex - outer capture 1, 
# inner capture 1 and inner capture2. 

# ((innercap1)(innercap2)) - Not all captures may have participated in the group. To get the breakdown of the groups
# captured by the regex in a match object, we can use the group or groups methods. 

In [40]:
input_str = 'I love basketball. But I am not very good at it.'
input_str2 = 'I also like badminton which I am actually pretty good at'

sub_str = r'I (also )?(love|like) (basketball|badminton)'

result = re.search(sub_str, input_str)

print(result)
print(result.group())
print(result.groups())

<re.Match object; span=(0, 17), match='I love basketball'>
I love basketball
(None, 'love', 'basketball')


In [37]:
result = re.search(sub_str, input_str2)

print(result)
print(result.group())

<re.Match object; span=(0, 21), match='I also like badminton'>
I also like badminton


In [None]:
# Sets - A set in Regex is a set of characters inside a pair of square brackets [] with a special meaning. 

# Set        Description
# [apz]      Returns a match where any one of the specified characters (a, p, or z) are present
# [a-e]      Returns a match for any lower case character, alphabetically between a and e
# [^apz]     Returns a match for any character EXCEPT a, p, and z
# [0123]     Returns a match where any of the specified digits (0, 1, 2, or 3) are present
# [0-9]      Returns a match for any digit between 0 and 9
# [0-5][0-9] Returns a match for any two-digit numbers from 00 and 59	
# [a-zA-Z]   Returns a match for any character alphabetically between a and z, lower case OR upper case	
# [+]        In sets, +, *, ., |, (), $,{} has no special meaning, so [+] means: return a match for any + character
#            in the string, [*] for any * character and so on.



In [48]:
inputstr = 'abpzxyz'

substr = r'[^apz]\w+'

result = re.findall(substr, inputstr)

print(result)

['bpzxyz']


In [None]:
# [apz] - Square brackets around specified characters - Returns a match where any one of the specified characters
# (a, p, or z) are present


input_str = 'Betty bought some butter but the butter was bitter so Betty bought some better butter to make the bitter \
butter better'

sub_str = r'\w+[mh]'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# [a-e] - Returns a match for any lower case character, alphabetically between a and e

input_str = 'Betty bought some bubtter but the butter was bitter so Betty bocught some bedtter butter to make the bitter \
butter better'

sub_str = r'\w+[a-d]'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# [^apz] - Returns a match for any character EXCEPT a, p, and z

input_str = 'betty b0u6ht s0me butter but the but^3r was bitter so Betty bou6ht s0m3 better butt3r to mak3 th3 bitt3r \
butt3r b3tt3r'

sub_str = r'b\w+t[\^3]\w+'

result = re.findall(sub_str, input_str)

print(result)

In [51]:
# [0123]     Returns a match where any of the specified digits (0, 1, 2, or 3) are present

input_str = 'B3tty b0u6ht s0me butter but the butt3r was bitter so Betty bou6ht s0m3 better butt3r to mak3 th3 bitt3r \
butt3r b3tt3r'

sub_str = r'\w+[03]'

result = re.findall(sub_str, input_str)

print(result)

['B3', 'b0', 's0', 'butt3', 's0m3', 'butt3', 'mak3', 'th3', 'bitt3', 'butt3', 'b3tt3']


In [None]:
# [0-9]      Returns a match for any digit between 0 and 9

input_str = 'B3tty b0u64t s0me butter but 43 t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r'

sub_str = r'\w+[0-9]'

result = re.findall(sub_str, input_str)
print(result)

In [None]:
sub_str = r'(s0me)|(t4e)'

result = re.finditer(sub_str,input_str)

print(result)

for x in result:
    print(x.group(), x.span())

In [None]:
# [0-5][0-9] Returns a match for any two-digit numbers from 00 and 59

input_str = 'B3tty b0u64t s0me butter but 43 t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r'

sub_str = r'\w+[2-6][3-5]'

result = re.findall(sub_str, input_str)

print( result)

In [None]:
sub_str = r'\w+[2-6][4-5]'

result = re.findall(sub_str, input_str)

print( result)

In [None]:
# [a-zA-Z]   Returns a match for any character alphabetically between a and z, lower case OR upper case

input_str = 'B3tty b0ught s0m3 b~tt3r'

sub_str = r'[a-zA-Z]'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
# [+]        In sets, +, *, ., |, (), $,{} has no special meaning, so [+] means: return a match for any + character
#            in the string, [*] for any * character and so on.

In [None]:
import re

input_str = 'B3++y b*ugh+ s*m3 b~tt3r'

sub_str = r'\w+[*~+]'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
result 

In [None]:
# Flags - Most regex methods allow a third parameter called flags. The most common flags used are: 

In [None]:
# Short Name          Long Name         Effect
# re.I                re.IGNORECASE     Makes matching of alphabetic characters case-insensitive
# re.M                re.MULTILINE      Causes start-of-string and end-of-string anchors to match embedded newlines
# re.S                re.DOTALL         Causes the dot metacharacter to match a newline

In [58]:
# Ignore case

input_str = 'B3tty b0u64t s0me buBBer bUt tb4e buTT3r was bITt3r sbo 3BBy oBB64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \n\
butt3r b3tt3r'

sub_str = r'b\w+'

result = re.findall(sub_str, input_str)

print(result)

['b0u64t', 'buBBer', 'bUt', 'b4e', 'buTT3r', 'bITt3r', 'bo', 'b3tt3r', 'butt3r', 'bitt3r', 'butt3r', 'b3tt3r']


In [60]:
result = re.findall(sub_str, input_str, flags = re.I)

print(result)

['B3tty', 'b0u64t', 'buBBer', 'bUt', 'b4e', 'buTT3r', 'bITt3r', 'bo', 'BBy', 'BB64t', 'b3tt3r', 'butt3r', 'bitt3r', 'butt3r', 'b3tt3r']


In [None]:
# Multiline

In [61]:
input_str = 'B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r\nB3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r\nB3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r'


print(input_str)

B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r butt3r b3tt3r
B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r butt3r b3tt3r
B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r butt3r b3tt3r


In [62]:
sub_str = r'^B\w+y'

result = re.findall(sub_str, input_str)

print(result)

['B3tty']


In [63]:
result = re.findall(sub_str, input_str, flags = re.M)

print(result)

['B3tty', 'B3tty', 'B3tty']


In [None]:
#Dotall includes the \n characters in the . search character set. 

input_str = 'B3tty b0u64t s0me butter but t4e butt3r was bitt3r 3rxB3 so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r\nB3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r\nB3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \
butt3r b3tt3r'

print(input_str)

In [66]:
sub_str = r'3r.B3'

result = re.findall(sub_str, input_str, flags = re.S)

print(result)

['3r\nB3', '3r\nB3']


In [67]:
result = re.findall(sub_str, input_str, flags=re.S | re.I)

print(result)

['3r b3', '3r\nB3', '3r b3', '3r\nB3', '3r b3']


In [None]:

input_str = 'B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \n\
butt3r b3tt3r'

sub_str = r'b.*r'

result = re.findall(sub_str, input_str)

print(result)

In [None]:
input_str = 'B3tty b0u64t s0me butter but t4e butt3r was bitt3r so B3tty bou64t s0m3 b3tt3r butt3r to mak3 t43 bitt3r \n\
butt3r b3tt3r'

sub_str = r'b\w+'

result = re.finditer(sub_str, input_str)

print(result)


for x in result:
    print(x)

In [None]:
# Other methods in re module

# We have already seen search (along with span, start, end, string, group and groups methods on match objects) method.
# We have also seen findall method. 

# Some other methods in regex are:

In [69]:
# compile method - compiles the stated regex into a regex object that can be reused and methods can be applied to it
# directly. 

input_str = 'B3++y b*ugh+ s*m3 b~tt3r'

sub_str = r'\w+[+*~]'

result = re.findall(sub_str, input_str)

print(result)

['B3+', 'b*', 'ugh+', 's*', 'b~']


In [None]:
ss = r'\w+[+*~]'

result = re.findall(ss, input_str)

In [70]:
ss = re.compile(r'\w+[+*~]')

result = ss.findall(input_str)

print(result)
print(type(ss))

['B3+', 'b*', 'ugh+', 's*', 'b~']
<class 're.Pattern'>


In [68]:
# Note above when we used the compile method on the regex we converted it to a re.Pattern object. And now we can apply the
# methods directly on the ss object. If we have a lot of operations to perform with the same regex pattern, or the pattern
# may be used frequently, it may be better to save it as a re.Pattern object.

# Intenally compilation of substring happens when we call the sub_str we want to search as the parameter to the different
# method as follows: 

sub_str = re.compile(r'\w+[+*~]')

result = re.findall(re.compile(sub_str), input_str)

print(result)

[]


In [None]:
# For our current module, since we were changing the regex frequently, we did not compile the regex patterns. However, 
# complex patterns that are frequently used, more often than not are compiled and stashed. 

In [None]:
#match method - only returns the match if it is at the beginning of the string

In [73]:
input_str = 'Xyz abc xyz abc'

sub_str = r'xyz'

result = re.match(sub_str, input_str, flags = re.I)

print(result)

<re.Match object; span=(0, 3), match='Xyz'>


In [None]:
sub_str = r'xyz'

result = re.match(sub_str, input_str)

print(result)

In [None]:
sub_str = r'xyz'

result = re.match(sub_str, input_str, flags = re.I)

print(result)

In [None]:
#split method in re takes 4 parameters

#re.split(pattern, string, maxsplit=0, flags=0)

#1. The regex pattern - mandatory
#2. The string to be checked - mandatory
#3. Maxsplit - count of how many maximum splits we want
#4. flags - as discussed above. 

input_str = 'B3tty b0u64t s0me butter but t4e butter was bitter'
sub_str = r'b[uei]tter'

result = re.split(sub_str, input_str)

print(result)

In [None]:
result = re.split(sub_str, input_str, maxsplit = 2)

print(result)

In [None]:
input_str = 'B3tty b0u64t S0me butter but t4e butter was bitter'
sub_str = r'b[uei]tter'
result = re.split(sub_str, input_str, flags = re.I)

print(result)

In [None]:
#sub method takes in 5 parameters

#1. The regex expression to be matched - mandatory
#2. The replacement string - mandatory
#3. The string to be checked - mandatory
#4. Count - the max number of times the replacement is to be performed - optional
#5. Flag - optional

input_str = 'Xyz Abc xyz abc xyz aBc'

sub_str = r'abc'

repl = r'pqr'

print(input_str)

In [None]:
result = re.sub(sub_str, repl, input_str)

print(result)

In [None]:
result = re.sub(sub_str, repl, input_str, flags = re.I)

print(result)

In [75]:
input_str = 'Xyz Abc xyz abc xyz aBc'

sub_str = r'abc'

repl = r'pqr'

print(input_str)

Xyz Abc xyz abc xyz aBc


In [76]:
result = re.sub(sub_str, repl, input_str, flags = re.I)

print(result)

Xyz pqr xyz pqr xyz pqr


In [77]:
#Subn method is the same as the sub method except it provides the replacement count along with the replaced string as a 
# tuple

result = re.subn(sub_str, repl, input_str, flags = re.I)

print(result)

('Xyz pqr xyz pqr xyz pqr', 3)


In [None]:
result = re.subn(sub_str, repl, input_str, flags = re.I)

print(result)

In [None]:
result = re.subn(sub_str, repl, input_str, flags = re.I, count = 2)

print(result)

In [None]:
str_email = '''boleh di kirim ke email saya ekoprasetyo.crb@outlook.com tks...
boleh minta kirim ke db.maulana@gmail.com. 
dee.wien@yahoo.com. .
deninainggolan@yahoo.co.id Senior Quantity Surveyor
Fajar.rohita@hotmail.com, terimakasih bu Cindy Hartanto
firmansyah1404@gmail.com saya mau dong bu cindy
fransiscajw@gmail.com 
Hi Cindy ...pls share the Salary guide to donny_tri_wardono@yahoo.co.id thank a'''

In [None]:
#Ansh

sub_string=r'\[a-zA-Z0-9]\S*@\S*[a-zA-Z]'
result = re.findall(sub_string, str_email)

print(result)


In [None]:
#Chandrabose

substr=r'\w+[.]?\w+@\w+.co(.)?\w+'
result=re.finditer(substr,str_email)

for x in result:
    print(x.group())



In [None]:
#Grishma

sub_str= r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
result= re.findall(sub_str , str_email)
print(result)


In [None]:
#mithila

#sub_str = r'\b\w+[.]?\w+@\w+.\w+\b'
sub_str = r'\b\w+[.]?\w+@'

result= re.findall(sub_str , str_email)
print(result)



In [None]:
#Sachin

sub_str = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
result = re.findall(sub_str, str_email)
print(result)

In [79]:
str_email = '''boleh di kirim ke email saya ekoprasetyo.crb@outlook.com tks...
boleh minta kirim ke db.maulana@gmail.com. 
dee.wien@yahoo.com. .
deninainggolan@yahoo.co.id Senior Quantity Surveyor
Fajar.rohita@hotmail.com, terimakasih bu Cindy Hartanto
firmansyah1404@gmail.com saya mau dong bu cindy
fransiscajw@gmail.com 
Hi Cindy ...pls share the Salary guide to donny_tri_wardono@yahoo.co.id thank a'''

In [80]:
str_phone = '''<p><strong>Kuala Lumpur</strong><strong>:</strong> +60 (0)3 2723 7900</p>
        <p><strong>Mutiara Damansara:</strong> +60 (0)3 2723 7900</p>
        <p><strong>Penang:</strong> + 60 (0)4 255 9000</p>
        <h2>Where we are </h2>
        <strong>&nbsp;Call us on:</strong>&nbsp;+6 (03) 8924 8686
        </p></div><div class="sys_two">
    <h3 class="parentSchool">General enquiries</h3><p style="FONT-SIZE: 11px">
     <strong>&nbsp;Call us on:</strong>&nbsp;+6 (03) 8924 8000
+ 60 (7) 268-6200 <br />
 Fax:<br /> 
 +60 (7) 228-6202<br /> 
Phone:</strong><strong style="color: #f00"> +601-4228-8055</strong>'''

In [87]:
#Hitesh

sub_str = r'\w*[.]?\w*@\w*.\w*.?\w*?'

result = re.findall(sub_str, str_email)

print(result)

['ekoprasetyo.crb@outlook.com ', 'db.maulana@gmail.com.', 'dee.wien@yahoo.com.', 'deninainggolan@yahoo.co.', 'Fajar.rohita@hotmail.com,', 'firmansyah1404@gmail.com ', 'fransiscajw@gmail.com ', 'donny_tri_wardono@yahoo.co.']


In [85]:
#Geetha

a= r'\w+[.]?\w+@\w+.com?'
result=re.findall(a,str_email)
print(result)


['ekoprasetyo.crb@outlook.com', 'db.maulana@gmail.com', 'dee.wien@yahoo.com', 'deninainggolan@yahoo.co', 'Fajar.rohita@hotmail.com', 'firmansyah1404@gmail.com', 'fransiscajw@gmail.com', 'donny_tri_wardono@yahoo.co']


In [89]:
#Shivam

a= r'(\w+[.]?\w+?@\w+((.com)|(.co.id)))'
result=re.findall(a,str_email)
print(result)


[('ekoprasetyo.crb@outlook.com', '.com', '.com', ''), ('db.maulana@gmail.com', '.com', '.com', ''), ('dee.wien@yahoo.com', '.com', '.com', ''), ('deninainggolan@yahoo.co.id', '.co.id', '', '.co.id'), ('Fajar.rohita@hotmail.com', '.com', '.com', ''), ('firmansyah1404@gmail.com', '.com', '.com', ''), ('fransiscajw@gmail.com', '.com', '.com', ''), ('donny_tri_wardono@yahoo.co.id', '.co.id', '', '.co.id')]


In [94]:
#Vidya

sub_str = re.compile(r'[A-z.0-9]*@\w+[.](com|co[.]id)')   
result = sub_str.finditer(str_email)  
for match in result:
    print(match)


<re.Match object; span=(29, 56), match='ekoprasetyo.crb@outlook.com'>
<re.Match object; span=(85, 105), match='db.maulana@gmail.com'>
<re.Match object; span=(108, 126), match='dee.wien@yahoo.com'>
<re.Match object; span=(130, 156), match='deninainggolan@yahoo.co.id'>
<re.Match object; span=(182, 206), match='Fajar.rohita@hotmail.com'>
<re.Match object; span=(238, 262), match='firmansyah1404@gmail.com'>
<re.Match object; span=(286, 307), match='fransiscajw@gmail.com'>
<re.Match object; span=(351, 380), match='donny_tri_wardono@yahoo.co.id'>


In [None]:
#Ashwini

import re
str_email = '''boleh di kirim ke email saya ekoprasetyo.crb@outlook.com tks...
boleh minta kirim ke db.maulana@gmail.com. 
dee.wien@yahoo.com. .
deninainggolan@yahoo.co.id Senior Quantity Surveyor
Fajar.rohita@hotmail.com, terimakasih bu Cindy Hartanto
firmansyah1404@gmail.com saya mau dong bu cindy
fransiscajw@gmail.com 
Hi Cindy ...pls share the Salary guide to donny_tri_wardono@yahoo.co.id thank a'''


sub_str = r'\w+[.]?\w+[@]\w+[.com]\w+[.id]\w+'
result = re.findall(sub_str, str_email)
print(result)


In [None]:
#Archana

sub_str = r'\w+@\w+[.]\w+[.]\w+|\w+[._]?\w+@\w+[.]\w+'

result = re.findall(sub_str, str_email)
print(result)

In [None]:
#Sowjanya

result = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", str_email)
print(result)

In [None]:
#Apurva

sub_str=r'[^ \n]+[@]\w+[.]\w+[.]?\w*'
result=re.finditer(sub_str,str_email)
for i in result:
    print(i.group())


In [None]:
#Vinti

sub_str=r'[\w\._]+@+[\w\._]'

result = re.findall(sub_str, str_email)

print(result)


In [None]:
#Shilpy

sub_str=r'\w+[._]?\w*\@\w+.\w.*?\w+\w.*?'
result=re.finditer(sub_str,str_email)
for x in result:
    print(x.group())
