# Python Regular Expressions

In [None]:
#%%html
#<style>
#table {float:left; width:100%;} 
#    table th {float:center; color:blue;font-size:14pt;} 
#    table th:first-of-type {width: 12%;}
#    table th:nth-of-type(2) {width: 10%;text-align:center;} 
#    table th:nth-of-type(3) {width: 70%;}    
#    table tr {font-size:10pt;}  
#</style>

In [1]:
 from IPython.display import HTML

 # Read the stylesheet inside a context manager so the file handle is closed
 # as soon as the contents are loaded, rather than being left open until the
 # garbage collector gets to it.
 with open('./Markdown/stylesheets/style_table4.css') as css_file:
     css = css_file.read()

 # Last expression of the cell: IPython renders the resulting <style> element,
 # which injects the stylesheet into the notebook page.
 HTML('<style>{}</style>'.format(css))

<hr style="color:black; border: 1px solid gray;"><h3>Sequences</h3>

|Expression | | Explanations
|:--- |:--- |:---|
^ ||Matches the start of a string without consuming any characters. If multiline mode (re.MULTILINE) is used, this will also match immediately after a newline character.
$ | |Matches the end of a string without consuming any characters. If multiline mode is used, this will also match immediately before a newline character.
[a-z] || Matches any lowercase character between a and z, including a and z.
[A-Z] || Matches any uppercase characters between A and Z, including A and Z.
[0-9] || Matches any digit between 0 and 9, including 0 and 9.
[abc] || Matches either an a, b or c character.
[^abc] || Matches any character except for an a, b or c.
a\|b | |Matches either what is before the "\|" or what is after it - in this case 'a' or 'b'. 
(?:...) ||A non-capturing group allows you to apply quantifiers to part of your regex but does not capture/assign an ID. 
(...) || Capturing group. Captures everything enclosed.
\w | |Matches any word character. Equivalent to [a-zA-Z0-9_].
\W | |Matches any non-word character. Equivalent to [^a-zA-Z0-9_].
. | |Matches any single character except newline
\d | |Matches any digit [0-9] character
\D | |Matches any non-digit [^0-9] character
\s | |Matches any whitespace character, such as a space, tab or newline. Equivalent to [ \t\n\r\f\v].
\S | |Matches any character that is not whitespace. Equivalent to [^ \t\n\r\f\v].
xy | |Matches the string xy
a\|b | |Matches expression a or b. If a is matched first, b is left untried.

<hr style="color:black; border: 1px solid gray;"><h3>Quantifiers</h3>

|Expressions |	|Explanations
|:-|:- |:---
\+ ||Matches the expression to its left 1 or more times.
\* | |Matches the expression to its left 0 or more times.
\? | |Matches the expression to its left 0 or 1 times
{p}| |Matches the expression to its left exactly p times.
{p,q} ||Matches the expression to its left from p to q times, inclusive. Do not put spaces inside the braces: Python treats `{p, q}` as literal text, not as a quantifier.
{p,} ||Matches the expression to its left p or more times.
{,q} ||Matches the expression to its left up to q times (from 0 to q).

|Note |	|Description | 
|:--- |:--- |:--- |
(?!pat) |	|negative lookahead assertion |
(?<!pat) |	|negative lookbehind assertion  |
(?=pat) |	|positive lookahead assertion  |
(?<=pat) |	|positive lookbehind assertion  |
(?!pat1)(?=pat2) |	|multiple assertions can be specified next to each other in any order as they mark a matching location without consuming characters | 
((?!pat).)* |	|Negate a grouping, similar to negated character class  | 

<hr style="color:black; border: 1px solid gray;"><h2>Popular Python re Module Functions</h2>

|Function | | Description
|:--- |:--- |:---  
| re.search(A, B) | | Finds the first location where expression A matches in string B and returns it as a re match object (or None if there is no match). It scans the whole string, so in a multi-line string a match can be found on any line.
| re.match(A, B) | | Tries to match expression A only at the beginning of string B and returns a match object if it succeeds. If the only match is somewhere in the middle of the string, it returns None.
| re.split(A, B) | | Split a string B into a list using the delimiter A.
| re.sub(A, B, C) | | Replace A with B in the string C.
| re.findall(A, B) | | Matches all instances of an expression A in a string B and returns them in a list.

In [2]:
import re

<hr style="color:black; border: 1px solid gray;"><h3>re flags</h3>

<div class="table3col">

|Long|Short|Inline
|:---|:---|:---
|re.VERBOSE|re.X|"(?x)"
|re.IGNORECASE|re.I| "(?i)"
|re.MULTILINE|re.M| "(?m)"
|re.DOTALL|re.S|"(?s)"

</div>    

## Python re functions - examples

### Import **re** package

In [14]:
import re

### Setup regex expression and compile it

In [15]:
# Sample text used by the examples in this section. It is built line by line
# so the leading newline, the trailing space after "60060" and the stray
# double quote at the end stay visible.
text = (
    "\n"
    "My name is Tor and I live in Chicago.\n"
    "My dummy phone number is 224-323-1234\n"
    "and my zip code is 60060 \n"
    'My dummy email is: tor@gmail.com."\n'
)

# Case-sensitive literal pattern; it also occurs inside the word "dummy".
regex = r"my"

# Trailing expression, so the notebook echoes the compiled pattern's repr.
re.compile(regex)

re.compile(r'my', re.UNICODE)

### re.search()

In [16]:
# First match of the pattern: case-sensitive, case-insensitive, and
# case-insensitive combined with multiline mode.
result = re.search(regex, text)
result1 = re.search(regex, text, flags=re.IGNORECASE)
result2 = re.search(regex, text, flags=re.IGNORECASE | re.MULTILINE)

# Show the match objects first ...
print(result, result1, result2, sep="\n")

# ... then the (start, end) offsets of each match.
for found in (result, result1, result2):
    print(found.span())

<re.Match object; span=(45, 47), match='my'>
<re.Match object; span=(1, 3), match='My'>
<re.Match object; span=(1, 3), match='My'>
(45, 47)
(1, 3)
(1, 3)


In [None]:
# Sample text for the examples below. It is built line by line so the leading
# newline, the trailing space after "60060" and the stray double quote at the
# end stay visible.
text = (
    "\n"
    "My name is Tor and I live in Chicago.\n"
    "My dummy phone number is 224-323-1234\n"
    "and my zip code is 60060 \n"
    'My dummy email is: tor@gmail.com."\n'
)

### re.split()

In [19]:
# Split on every case-insensitive occurrence of the pattern.
result = re.split(regex, text, flags=re.IGNORECASE)

# Limit the number of splits. maxsplit is passed by keyword because passing it
# positionally is deprecated since Python 3.13, and the keyword makes clear
# that the number is a split limit rather than a flags value.
result1 = re.split(regex, text, maxsplit=1)
result3 = re.split(regex, text, maxsplit=3)
#print(result)
#print(result1)
print(result3)

['\nMy name is Tor and I live in Chicago.\nMy dum', ' phone number is 224-323-1234\nand ', ' zip code is 60060 \nMy dum', ' email is: tor@gmail.com."\n']


### re.match()

In [20]:
# re.match only attempts a match at the very start of the string, so each
# call below is tried with a different set of flags at position 0 only.
result = re.match(regex, text)
result1 = re.match(regex, text, flags=re.I)
result2 = re.match(regex, text, flags=re.I | re.M)

print(result, result1, result2, sep="\n")

None
None
None


### re.findall()

In [21]:
# Collect every non-overlapping match: case-sensitive, case-insensitive, and
# case-insensitive combined with multiline mode.
result = re.findall(regex, text)
result1 = re.findall(regex, text, flags=re.I)
result2 = re.findall(regex, text, flags=re.I | re.M)

print(result, result1, result2, sep="\n")

['my', 'my', 'my']
['My', 'My', 'my', 'my', 'My', 'my']
['My', 'My', 'my', 'my', 'My', 'my']


In [22]:
# Sample text for the digit-matching examples. It is built line by line so the
# leading newline, the trailing space after "60060" and the stray double quote
# at the end stay visible.
text = (
    "\n"
    "My name is Tor and I live in Chicago.\n"
    "My dummy phone number is 224-323-1234\n"
    "and my zip code is 60060 \n"
    'My dummy email is: tor@gmail.com."\n'
)

# Each pattern extends the previous one:
regex = r"\d+"            # a run of digits
regex1 = r"\d+[-]?"       # ... optionally followed by a hyphen
regex2 = r"\d+[-]?.*"     # ... then everything up to the end of the line
regex3 = r"\d+[-]?\W"     # ... then exactly one non-word character

# Compile the first three patterns; the compiled objects are not kept.
for pattern in (regex, regex1, regex2):
    re.compile(pattern)

# Trailing expression, so the notebook echoes the last compiled pattern's repr.
re.compile(regex3)

re.compile(r'\d+[-]?\W', re.UNICODE)

In [23]:
# Run each digit pattern over the sample text and collect all matches.
result = re.findall(regex, text)
result1 = re.findall(regex1, text)
result2 = re.findall(regex2, text)
result3 = re.findall(regex3, text)

# One list per line, in the order the patterns were defined.
print(result, result1, result2, result3, sep="\n")

['224', '323', '1234', '60060']
['224-', '323-', '1234', '60060']
['224-323-1234', '60060 ']
['224-', '323-', '1234\n', '60060 ']


In [24]:
import re

text = "Python is a programming language"

# With re.VERBOSE, whitespace in the pattern is ignored and "#" starts a
# comment, so the expression can be laid out and annotated line by line.
pattern = r'''
    ^Python     # match "Python" at the start of the string
    \s          # match a whitespace character
    is          # match "is"
    .*          # match any characters
'''
result = re.search(pattern, text, flags=re.VERBOSE)

print(result)

<re.Match object; span=(0, 32), match='Python is a programming language'>


In [25]:
import re

text = "Python is a programming language"

# The inline (?i) flag at the start of the pattern turns on case-insensitive
# matching without passing re.IGNORECASE to the call.
pattern = r"(?i)python"
result = re.search(pattern, text)

print(result)

<re.Match object; span=(0, 6), match='Python'>
