# RE Module Methods 

In [1]:
import re

In [2]:
t1 = 'This is a beautiful day'

In [3]:
# lets call search on this
# we need to pass pattern which we want to search (pass as r'pattern') as first parameter and the second is the object 
# in which we want to search
re.search(r'is', t1)

# it gives match object.

<re.Match object; span=(2, 4), match='is'>

In [4]:
m = re.search(r'is', t1)

In [5]:
# we can call some methods on this m object.
m.group()
# this will give the actual match from the string

'is'

In [6]:
# this will give the starting and ending index of the string.
m.start(), m.end()

(2, 4)

In [7]:
# same as above two functions
m.span()

(2, 4)

In [8]:
# match finds the pattern on the start of the string 
# in this case, it will give None as there is no 'is' on the start of our string. The character is 'Th'.
m = re.match(r'is', t1)

In [9]:
print(m)

None


In [10]:
# now lets look for 'Th'
m = re.match(r'Th', t1)

In [11]:
print(m)
# as we can see, it has found the pattern.

<re.Match object; span=(0, 2), match='Th'>


In [12]:
m.group(), m.start(), m.end(), m.span()

('Th', 0, 2, (0, 2))

In [13]:
# re.findall
t1 

'This is a beautiful day'

In [14]:
re.findall(r'is', t1)
# if we put r before any string, it will become a raw string. The difference between raw string and simple string is 
# a raw string is raw string does not interpret the backslash character. 
# Regular Expressions usually contain a lot of backslashes(\). 
# When using Python's “re” module , regular expressions are represented as strings.
# So, like all strings with a lot of backslashes, they are more readable when written in raw literal form.
# Raw Strings are amazing for regex.

# we got a list of characters.

['is', 'is']

In [15]:
t2 = 'abbbaaabbbbabababa'

In [16]:
re.findall(r'ba', t2)
# below are the matches we found.

['ba', 'ba', 'ba', 'ba', 'ba']

In [17]:
# lets use finditer method.
mat = re.finditer(r'ba', t2)

In [18]:
type(mat)
# as we can see, it returns iterable object. we can run for loop on this.

callable_iterator

In [19]:
for m in mat:
    print(m)
# as we can see, it returns match object and this itertor object can be only run once.

<re.Match object; span=(3, 5), match='ba'>
<re.Match object; span=(10, 12), match='ba'>
<re.Match object; span=(12, 14), match='ba'>
<re.Match object; span=(14, 16), match='ba'>
<re.Match object; span=(16, 18), match='ba'>


In [20]:
# lets find the match word with its start and end index.
for m in mat:
    print(m.group(), m.start(), m.end())

The finditer() is useful when we have a lot of matches to find as it conserves memory.

**sub()**  
sub substitutes the given pattern or words with the provided letter or string

In [21]:
re.sub(r'ba', 'xy', t2)

'abbxyaabbbxyxyxyxy'

In [22]:
# lets pass the counter in above
# counter will replace only n amount of characters.
re.sub(r'ba', 'xy', t2, count = 2)

'abbxyaabbbxybababa'

**compile()**   
Compile a regular expression pattern into a regular expression object, which can be used for matching using its match(), search() and other methods, described below.  
This is useful when we have a lot of patterns to find.

In [23]:
pat = re.compile(r'ba')

In [24]:
re.findall(pat, t2)

['ba', 'ba', 'ba', 'ba', 'ba']

**re.split(pattern, string, maxsplit=0, flags=0)**  
Split string by the occurrences of pattern. If capturing parentheses are used in pattern, then the text of all groups in the pattern are also returned as part of the resulting list. If maxsplit is nonzero, at most maxsplit splits occur, and the remainder of the string is returned as the final element of the list.  
In simple string, we can pass only simple delimiter, while in re.split, we can pass multiple delimiter.

In [25]:
t3 = 'akaks ksdkdkd; aksakks: ajsjss, shshs; ususu;    hshs'

In [26]:
# this is simple string split
t3.split()

['akaks', 'ksdkdkd;', 'aksakks:', 'ajsjss,', 'shshs;', 'ususu;', 'hshs']

In [27]:
# it says split on space, ;, :, ','. 
# \s* means multiple spaces.
re.split(r'[ ;:,]\s*', t3)

['akaks', 'ksdkdkd', 'aksakks', 'ajsjss', 'shshs', 'ususu', 'hshs']

# Writing REs - Greedy and Non Greedy Repetitions.

### Writing Patterns 1: Repetition(.*+?{m}{m, n})  
  
  
**'ab\*'** : a followed by 0 or more b's.      
**'ab+** : a followed by one or more b's.  
**'ab?** : a followed by zero or one b.  
**'ab{n}'** : a followed by n number of b's.  
**'ab{m,n}'** : a followed by min m and max n  b's.  
**'ab{m,}'** : a followed by min m and unlimited b's.  
**'ab.'** : ab followed by any single  non newline character.  


In [28]:
t = "ab abb a a a abbbb abbbbbbb"

In [29]:
re.findall(r'ab*', t) # means a must be followed by 0 or more times of b.

['ab', 'abb', 'a', 'a', 'a', 'abbbb', 'abbbbbbb']

In [30]:
re.findall(r'ab+', t) # means must be followed by one or more b.

['ab', 'abb', 'abbbb', 'abbbbbbb']

In [31]:
re.findall(r'ab?', t) # will return a followed by one b from the given expressions.

['ab', 'ab', 'a', 'a', 'a', 'ab', 'ab']

In [32]:
re.findall(r'ab{7}', t)  # will return a followed by 7 b's.

['abbbbbbb']

In [33]:
re.findall(r'ab{2,7}', t)  # will return a followed by minimum of 2 and maximum of 7 b's in an expression.

['abb', 'abbbb', 'abbbbbbb']

In [34]:
re.findall(r'ab{3,}', t) # will return a followed by minimum of 3 and maximum of infinite b's in an expression.

['abbbb', 'abbbbbbb']

**\* \+ and ? are Greedy Repetitors.  
Lets say the expression -> re.findall(r'ab*', t). It is a greedy expression. To make it non greedy, we add ? sign in it**

In [35]:
re.findall(r'ab*?', t)
# it will make it as "a followed by 0 b's".

['a', 'a', 'a', 'a', 'a', 'a', 'a']

In [36]:
# lets try the above with a +?
re.findall(r'ab+?', t)
# it will make it as "a followed by 1 b's."

['ab', 'ab', 'ab', 'ab']

In [37]:
# Lets try with ??
re.findall(r'ab??', t)
# it will make it as "a followed by 0 b's"

['a', 'a', 'a', 'a', 'a', 'a', 'a']

In [39]:
re.findall(r'ab.', t) # as b is non newline character, it will make it as "a followed by one b."

['ab ', 'abb', 'abb', 'abb']

# Writing REs  : Character Sets and Ranges

A character set is a group of characters enclosed in square brackets[], any one of which can match at that point in the pattern.  
  
As character sets grow larger, typing every character that should match could become very tedious. A more compact format is using character ranges.  

**Examples:**  
  
**a[xy]** : Would match either ax or ay.  
**a[^xy]** : would exclude ax and ay. (^ excludes characters in []).  
**[a-k]** : Match with any character between a and k.  
**[^p-z]** : Exclude matches with any character between p and z.  


In [40]:
t = 'xyyxxyyyzzzx'

In [41]:
re.findall(r'[xy]', t)    # here we are saying match either x or y.

['x', 'y', 'y', 'x', 'x', 'y', 'y', 'y', 'x']

In [42]:
# let us add a repitition character
re.findall(r'x[xy]', t)  # here we say match either xx or xy.

['xy', 'xx']

In [43]:
re.findall(r'x[xy]+', t)  # + means preceeding character. which in our case is x or y. 

['xyyxxyyy']

In [44]:
# now lets make the above expression non greedy.
re.findall(r'x[xy]+?', t)  # we made it non greedy means we say match either xx or xy.

['xy', 'xx']

In [45]:
t = 'xxy xyxyx xaxb xxyy aaxz'

In [47]:
re.findall(r'x[^xy]', t) # we say here find match with x followed by any character except xy

['x ', 'xa', 'xb', 'xz']

In [48]:
re.findall(r'x[^xy]+', t)   # it has to match with one or more pattern where it is not xx or xy.

['x ', 'xa', 'xb ', 'xz']

In [50]:
# making the above expression non greedy
re.findall(r'x[^xy]+?', t)

['x ', 'xa', 'xb', 'xz']

In [51]:
text = "This is a sample text. -- with some Punctuation marks!!!"

In [56]:
# lets say we want to match on 'This' and 'Punctuation'. 
# as we can see, This and Punctuation starts with capital letters, we use ranges.
re.findall(r'[A-Z][a-z]', text)  
# but this is not what we want, we want the whole character so we use *.

['Th', 'Pu']

In [57]:
# below means starting with captial letter followed by n small letters.
re.findall(r'[A-Z][a-z]*', text) 

['This', 'Punctuation']

In [61]:
# lets say we want to pull out all the words without puctuation marks.
# so we mention all the punctuation(including space) but it will give error.
# because this - is used in ranges, but here we dont want to use it as range character so we cancel it by adding \ before it
re.findall(r'[^.-! ]', text)

error: bad character range .-! at position 2

In [63]:
re.findall(r'[^.\-! ]', text)  
# so we are getting output as single character, this is not what we want. FOr that, we add +sign before it.

['T',
 'h',
 'i',
 's',
 'i',
 's',
 'a',
 's',
 'a',
 'm',
 'p',
 'l',
 'e',
 't',
 'e',
 'x',
 't',
 'w',
 'i',
 't',
 'h',
 's',
 'o',
 'm',
 'e',
 'P',
 'u',
 'n',
 'c',
 't',
 'u',
 'a',
 't',
 'i',
 'o',
 'n',
 'm',
 'a',
 'r',
 'k',
 's']

In [64]:
re.findall(r'[^.\-! ]+', text)  
# + is a greedy repetitior.

['This', 'is', 'a', 'sample', 'text', 'with', 'some', 'Punctuation', 'marks']

# Writing REs : Escape Codes, Anchoring and Flags.

### Writing Paterns: Escape Codes
  
We can use escape codes to find specific types of patterns in data, such as digits, non-digits, whitespaces etc.  
  
**\d** : Matches a single digit.  
**\D** : Matches a single non-digit.  
**\w** : Matches a single alphanumeric character.  
**\W** : Matches a non alphanumeric character.  
**\s** : Matches a single whitespace character ( tab, space, newline, etc).   
**\S** : Matches a single non whitespace character.  
**\b** : Matches on word boundary.  

In [65]:
text = "The cost of Python course is $125."

In [67]:
re.findall(r'\d', text)  # will look for single numerical character.

['1', '2', '5']

In [68]:
# making above expression as greedy
re.findall(r'\d+', text)

['125']

In [70]:
# if we use *, it will look on every expression and return 0 or more numeric character.
re.findall(r'\d*', text)

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '125',
 '',
 '']

In [71]:
re.findall(r'\D', text)  # will return single nondigits character.

['T',
 'h',
 'e',
 ' ',
 'c',
 'o',
 's',
 't',
 ' ',
 'o',
 'f',
 ' ',
 'P',
 'y',
 't',
 'h',
 'o',
 'n',
 ' ',
 'c',
 'o',
 'u',
 'r',
 's',
 'e',
 ' ',
 'i',
 's',
 ' ',
 '$',
 '.']

In [72]:
# making above expression as greedy
re.findall(r'\D+', text)

['The cost of Python course is $', '.']

In [73]:
# lets use \s
re.findall(r'\s', text)  # will return single whitespaces, tabs and newline character.  

[' ', ' ', ' ', ' ', ' ', ' ']

In [75]:
# lets make above expression as greedy.
re.findall(r'\s+', text)

[' ', ' ', ' ', ' ', ' ', ' ']

In [76]:
re.findall(r'\S', text)  # will return single non-whitespace character.

['T',
 'h',
 'e',
 'c',
 'o',
 's',
 't',
 'o',
 'f',
 'P',
 'y',
 't',
 'h',
 'o',
 'n',
 'c',
 'o',
 'u',
 'r',
 's',
 'e',
 'i',
 's',
 '$',
 '1',
 '2',
 '5',
 '.']

In [77]:
# lets make above expression as greedy
re.findall(r'\S+', text)

['The', 'cost', 'of', 'Python', 'course', 'is', '$125.']

In [78]:
# lets check for alphanumeric
re.findall(r'\w', text)  # it will return single alphanumeric(means alphabets and numbers 
                         # excluding punctuations and whitespaces and more)

['T',
 'h',
 'e',
 'c',
 'o',
 's',
 't',
 'o',
 'f',
 'P',
 'y',
 't',
 'h',
 'o',
 'n',
 'c',
 'o',
 'u',
 'r',
 's',
 'e',
 'i',
 's',
 '1',
 '2',
 '5']

In [79]:
# making it greedy
re.findall(r'\w+', text)

['The', 'cost', 'of', 'Python', 'course', 'is', '125']

In [80]:
re.findall(r'\W', text) # will return non-alphanumeric characters.

[' ', ' ', ' ', ' ', ' ', ' ', '$', '.']

In [81]:
# making above expression as greedy
re.findall(r'\W+', text)

[' ', ' ', ' ', ' ', ' ', ' $', '.']

# Writing Patterns : Anchoring

In addition to describing the content of a pattern to match, the relative location can be specified in the input text where the pattern should appear by using anchoring instructions.  
  
**^** : Start of a string  
**$** : End of a string  
**\A** : Start of a string.  
**\Z** : End of a string.  
**\b** : Empty string at the begining and end of a word.

In [82]:
text = "This is a beautiful day."

In [83]:
re.findall(r'is', text)

['is', 'is']

In [84]:
# to look 'is' at the begining of sentence, we add ^ sign at the 
re.findall(r'^is', text)

[]

In [85]:
re.findall(r'^T', text) # looking for t at the beginning of the string

['T']

In [87]:
# lets say we want to look at the dot at the end of the string
# it might be possible that this '.' has some meaning in the expression so we nullify the meaning by adding \
re.findall(r'\.$', text)

['.']

In [88]:
re.findall(r'K$', text)

[]

In [89]:
# let us see the word boundary
# our text is 
text

'This is a beautiful day.'

In [90]:
# lets say we want to look at 'is' which is a pure word and not the substring of the word.
# for that we use word boundary
re.findall(r'\bis\b', text)

['is']

In [91]:
# to know this 'is' is not the part of substring, we do search()
re.search(r'\bis\b', text)
# means we got the index of that particular 'is'.

<re.Match object; span=(5, 7), match='is'>

# Writing Patterns : Flags

Sometimes, we need to slightly tweak the behaviour of the regular expression. The regular expression engine in Python, offer a small number of flags that modify the behaviour of the entire expression.

**re.IGNORECASE or reI or 2 :** Makes the regular expression case-insensitive.  
  
**re.DOTALL or re.S or 16 :** Makes . character to include \n newline character.  
  
**re.MULTILINE or re.M or 8 :** Makes the ^ and $ characters, which normally would only match against the beginnig or end of the string, to instead match against the beginning or end of any line within the string.   
  
**re.VERBOSE or re.X or 64 :** Makes complicated regular expressions to be more readable. This flag does two things: First, it causes all whitespaces (other than in character classes) to be ignored, including line breaks. Second it treats the # character (again, unless it's inside a character class) as a comment character.    
  
**re.DEBUG or 128 :** Provides some debugging information while compiling a regular expression.   
  
**Multiple Flags :** Sometimes we may have to use multiple flags at the same time, This is done using bitwise OR operator. Example: re.I | re.S | re.M.  

In [92]:
text = "Python python PYTHON"

In [93]:
re.findall(r'Python', text)

['Python']

In [94]:
# if we want to make it case insensitive(match all the 3), we need to specify the flag
re.findall(r'Python', text, re.IGNORECASE)

['Python', 'python', 'PYTHON']

In [96]:
# we can also write the above expression as
re.findall(r'Python', text, re.I), re.findall(r'Python', text, 2)  # writing number is not recommended.

(['Python', 'python', 'PYTHON'], ['Python', 'python', 'PYTHON'])

In [97]:
# to find that number or case,
re.I

re.IGNORECASE

In [98]:
re.S

re.DOTALL

In [99]:
re.M

re.MULTILINE

In [100]:
text = 'Py\nthon'

In [101]:
re.findall(r'.+', text)
# dot above will match on any character except newline character.

['Py', 'thon']

In [102]:
re.findall(r'.+', text, re.S) # it will include all characters including newline characters.

['Py\nthon']

In [103]:
text = 'Python is fun. Learning python.'

In [104]:
re.sub(r'Py', 'My', text)
# sub is for substitution. In this we substitue the substring 'Py' with 'My' .

'Mython is fun. Learning python.'

In [105]:
# as we can see, it does not replace small py from last word. In this casem we can use ignorecase to replace all 'Py'
# in the sentence
re.sub(r'Py', 'My', text, re.I) 
# as we can see, it is still not substituted. For this we need to pass the syntax flags = re.I

'Mython is fun. Learning python.'

In [106]:
re.sub(r'Py', 'My', text, flags = re.I)

'Mython is fun. Learning Mython.'

In [108]:
# we can also specify the no of replacements. 
re.sub(r'Py', 'My', text, count = 1, flags = re.I)  # here we specified only one replacement

'Mython is fun. Learning python.'

# Writing REs : Groups and Named Groups

### Writing Patterns : Grouping  
Regular expressions provide a mechanism to split the expression into groups. When using groups, we will be able to select each individual group within the match in addition to getting the entire match. You can specify groups within a regular expression by using Parenthesis.

In [109]:
text = '123-4567 is my telephone.'

In [113]:
re.findall(r'[\d]{3}-[\d]{4}', text)
# [\d]{3}-[\d]{4} means [\d]{3} means for first 3 numbers then we need to find the -  
# then [\d]{4} for last 4 digits.  
# We use {} for repetition, means finding n values.

['123-4567']

In [116]:
# lets say we want to group the above numbers into two, so how do we do that? By adding parenthesis.
m = re.search(r'([\d]{3})-([\d]{4})', text)

In [118]:
print(m)

<re.Match object; span=(0, 8), match='123-4567'>


In [120]:
m.group()

'123-4567'

In [121]:
m.groups()
# as we see, it gave us two different groups.
# for seeing individual groups

('123', '4567')

In [123]:
m.group(1)

'123'

In [124]:
m.group(2)

'4567'

We can also give names to the groups.  
We need to write **?P< group-name>** inside the tuple.

In [125]:
m = re.search(r'(?P<first3>[\d]{3})-(?P<last4>[\d]{4})', text)

In [127]:
# after assigning name, we can access the group by name.
m.group('first3')

'123'

In [128]:
m.group('last4')

'4567'

# Writing a Regular Expression : An Example

In [4]:
text = ['123 456 7890', '(123) 456 7890']

In [5]:
# adding pattern to look for
pat = r'\d{3}\s\d{3}\s\d{4}'

In [6]:
# as it is a list, we will go for each string
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890


As we can see, it only prints the first string not the second because the second string starts with parenthesis and we didnt defined the parenthesis. To define, 

In [8]:
pat = r'\(\d{3}\)\s\d{3}\s\d{4}'

In [9]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

(123) 456 7890



To print both the characters, we need to add ? before parenthesis

In [10]:
pat = r'\(?\d{3}\)?\s\d{3}\s\d{4}'

In [11]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890


Lets add couple of more telephone numbers.  
Now to match last phone no, we need to add space and - in square bracket.

In [15]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890']

In [16]:
pat = r'\(?\d{3}\)?[\s\-]\d{3}[\s\-]\d{4}'

In [17]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890
123-456-7890


Let us add more telephone number.   
Changes in pattern will be very simple we have to add only dot in the square bracket.

In [18]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890']
pat = r'\(?\d{3}\)?[\s\-\.]\d{3}[\s\-\.]\d{4}'

In [19]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890
123-456-7890
123.456.7890


Let us add more telephone number.  
So we have no space here which means we need to make the square bracket as optional by adding ? after it.

In [20]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890', '1234567890']
pat = r'\(?\d{3}\)?[\s\-\.]?\d{3}[\s\-\.]?\d{4}'

In [21]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890
123-456-7890
123.456.7890
1234567890


let us add more telephone numbers.

In [52]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890', '1234567890', '+1 123 456 7890', '+1 (123) 456 7890']
pat = r'\W?\d?\s?\(?\d{3}\)?[\s\-\.]?\d{3}[\s\-\.]?\d{4}'

In [53]:
for dt in text:
    m = re.search(pat, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890
123-456-7890
123.456.7890
1234567890
+1 123 456 7890
+1 (123) 456 7890


Lets compile the above code.

In [54]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890', '1234567890', '+1 123 456 7890', '+1 (123) 456 7890']
pat = r'\W?\d?\s?\(?\d{3}\)?[\s\-\.]?\d{3}[\s\-\.]?\d{4}'
patc = re.compile(pat)

for dt in text:
    m = re.search(patc, dt)
    if m:
        print(m.group())

123 456 7890
(123) 456 7890
123-456-7890
123.456.7890
1234567890
+1 123 456 7890
+1 (123) 456 7890


Let us add a group to this

In [58]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890', '1234567890', '+1 123 456 7890', '+1 (123) 456 7890']
pat = r'(\W?\d?)\s?(\(?\d{3}\)?)[\s\-\.]?(\d{3})[\s\-\.]?(\d{4})'
patc = re.compile(pat)

for dt in text:
    m = re.search(patc, dt)
    if m:
        print(m.group(), "\t", m.group(1),"\t", m.group(2),"\t", m.group(3), "\t", m.group(4))

123 456 7890 	  	 123 	 456 	 7890
(123) 456 7890 	 ( 	 123) 	 456 	 7890
123-456-7890 	  	 123 	 456 	 7890
123.456.7890 	  	 123 	 456 	 7890
1234567890 	  	 123 	 456 	 7890
+1 123 456 7890 	 +1 	 123 	 456 	 7890
+1 (123) 456 7890 	 +1 	 (123) 	 456 	 7890


In [59]:
text = ['123 456 7890', '(123) 456 7890', '123-456-7890', '123.456.7890', '1234567890', '+1 123 456 7890', '+1 (123) 456 7890']
pat = r'(?P<add1>\W?\d?)\s?(?P<area>\(?\d{3}\)?)[\s\-\.]?(?P<first3>\d{3})[\s\-\.]?(?P<last4>\d{4})'
patc = re.compile(pat)

for dt in text:
    m = re.search(patc, dt)
    if m:
        print(m.group(), "\t", m.group('add1'),"\t", m.group('area'),"\t", m.group('first3'), "\t", m.group('last4'))

123 456 7890 	  	 123 	 456 	 7890
(123) 456 7890 	 ( 	 123) 	 456 	 7890
123-456-7890 	  	 123 	 456 	 7890
123.456.7890 	  	 123 	 456 	 7890
1234567890 	  	 123 	 456 	 7890
+1 123 456 7890 	 +1 	 123 	 456 	 7890
+1 (123) 456 7890 	 +1 	 (123) 	 456 	 7890


It is advisable to start building with small expression and then keep building it.