# Python Regex

## 1. Introduction

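The 'r' prefix marks a raw string literal.
r'string\n' -> a Python raw string: backslashes are kept as literal characters.
Here \n is not treated as a newline; it stays as the two characters '\' and 'n'.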

1. import re
2. re.search(pattern,string)

In [256]:
import re
re.search('n','\n')

In [257]:
re.search('n','\\n')

<_sre.SRE_Match object; span=(1, 2), match='n'>

#### Applying r to this pattern doesn't change the result (the regex engine itself also interprets \n as a newline)

In [258]:
re.search('\n','\n\n\n\n\n')

<_sre.SRE_Match object; span=(0, 1), match='\n'>

In [259]:
re.search(r'\n','\n\n\n\n\n')

<_sre.SRE_Match object; span=(0, 1), match='\n'>

#### Now, this will not produce a result: r applied to the searched string makes it a raw string (literal backslash + n, with no newline characters in it)

In [260]:
re.search(r'\n',r'\n\n\n\n\n')

## 2. Match,Search (Both will return Match object) & findall (Returns list)

### 2.1 re.search - *<font color=red>Searches through the whole string for the 1st match</font>*<br>
    re.search(pattern,string,flag)
   Note that
   1. Returns only the 1st match
   2. Keeps scanning past newlines (the match may be found on a later line)
    

### 2.2 re.match -  *<font color=red>Matches only at the start of the string</font>*
   
    re.match(pattern,string,flag)
   Note that
   1. Does not match at the start of later lines - only at the very beginning of the string (even with re.MULTILINE)
        
### 2.3 re.findall -   *<font color=red>Finds all matches of the pattern in the string</font>*
    re.findall(pattern,string,flag)        
   Note that
   1. If the pattern contains groups ( ), findall returns only the text captured by those groups
        
####  Match_object - span(start index,end index) match='match string'
     

<br>
<br>

### Examples

#### re.search()


In [14]:
re.search('c','abcdecf')

<_sre.SRE_Match object; span=(2, 3), match='c'>

In [13]:
re.search('c','a\ncdef')

<_sre.SRE_Match object; span=(2, 3), match='c'>

Start,End & Group -Extract information from match object

In [19]:
re.search('c','abcdecf').group()

'c'

In [20]:
re.search('c','abcdecf').start()

2

In [21]:
re.search('c','abcdecf').end()

3

In [261]:
re.search('c','abcdecf').span()

(2, 3)

In [25]:
re.search('cd','abcdecf')

<_sre.SRE_Match object; span=(2, 4), match='cd'>

In [28]:
re.search('c|d','abcdecf')

<_sre.SRE_Match object; span=(2, 3), match='c'>

#### re.match

In [262]:
re.match('c','abcdef')

In [263]:
bool(re.match('c','abcdef'))

False

In [264]:
bool(re.match('a','abcdef'))

True

Start,End & Group -Extract information from match object

In [15]:
re.match('a','abcdef').group()

'a'

In [18]:
re.match('ab','abcdef').group(0)

'ab'

In [23]:
re.match('ab','abcdef').start()

0

In [24]:
re.match('ab','abcdef').end()

2

#### re.findall

In [30]:
re.findall('a|n','abncdeafn')

['a', 'n', 'a', 'n']

In [31]:
re.search('abcd','abcdwerfc abcd')

<_sre.SRE_Match object; span=(0, 4), match='abcd'>

In [32]:
re.findall('abcd','abcdwerfc abcd')

['abcd', 'abcd']

## 3. Character Sets

### 3.1 Character classes

#### \w - Searches for <font color=red>1</font> instance of [A-Za-z0-9<font color=red>_</font>]  (Underscore also included)
#### \W - !(\w)


In [35]:
re.search('\w\w\w\w','abc_werfc abcd')

<_sre.SRE_Match object; span=(0, 4), match='abc_'>

In [38]:
re.search('\w\w\w\w','abc_werfc abcd').group()

'abc_'

In [37]:
re.search('\w\w\w\w','ab@_w')

#### \d - Searches for <font color=red>1</font> instance of [0-9]  
#### \D - !(\d)

In [39]:
re.search('\d','a7c_werfc abcd')

<_sre.SRE_Match object; span=(1, 2), match='7'>

In [41]:
re.search('\d\d','a7c_w9rfc abcd')

#### \s - Searches for <font color=red>1</font> instance of whitespace (' ', \t, \n, \r, \f, \v)  
#### \S - !(\s)

In [42]:
re.search('\s','a7c_w9rfc abcd')

<_sre.SRE_Match object; span=(9, 10), match=' '>

In [52]:
string = "Lorem ipsum dolor sit amet consectetur, adipisicing elit. Ratione quo veritatis obcaecati suscipit unde rerum animi sequi voluptates id possimus incidunt, facilis necessitatibus asperiores odio tempore perferendis sit? Est, blanditiis!"
','.join(re.findall('\S+',string))

'Lorem,ipsum,dolor,sit,amet,consectetur,,adipisicing,elit.,Ratione,quo,veritatis,obcaecati,suscipit,unde,rerum,animi,sequi,voluptates,id,possimus,incidunt,,facilis,necessitatibus,asperiores,odio,tempore,perferendis,sit?,Est,,blanditiis!'

#### . - Matches anything except newline(\n)

In [54]:
string = """Lorem ipsum dolor sit amet consectetur,
adipisicing elit. Ratione quo veritatis obcaecati
suscipit unde rerum animi sequi voluptates id possimus incidunt,
facilis necessitatibus asperiores odio tempore perferendis sit? Est, blanditiis!"""
re.findall('.+',string)

['Lorem ipsum dolor sit amet consectetur,',
 'adipisicing elit. Ratione quo veritatis obcaecati',
 'suscipit unde rerum animi sequi voluptates id possimus incidunt,',
 'facilis necessitatibus asperiores odio tempore perferendis sit? Est, blanditiis!']

In [59]:
#With re.DOTALL, '.' also matches newline, so the whole text is one match

string = """Lorem ipsum dolor sit amet consectetur,
adipisicing elit. Ratione quo veritatis obcaecati
suscipit unde rerum animi sequi voluptates id possimus incidunt,
facilis necessitatibus asperiores odio tempore perferendis sit? Est, blanditiis!"""
re.findall('.+',string,flags=re.DOTALL)

['Lorem ipsum dolor sit amet consectetur,\nadipisicing elit. Ratione quo veritatis obcaecati\nsuscipit unde rerum animi sequi voluptates id possimus incidunt,\nfacilis necessitatibus asperiores odio tempore perferendis sit? Est, blanditiis!']

### 3.2 Modifiers (quantifiers)

'+' = 1 or more

'?' = 0 or 1

'\*' = 0 or more

'{n,m}' = n to m repetitions

#### Matching 1 or more

In [44]:
# Finds the first occurrence of \w+
re.search('\w+','abcdedef def')

<_sre.SRE_Match object; span=(0, 8), match='abcdedef'>

In [46]:
 re.search('\w+\W+\w+','abcdedef def')

<_sre.SRE_Match object; span=(0, 12), match='abcdedef def'>

In [266]:
 re.findall('\w+','abcdedef def')

['abcdedef', 'def']

#### Matching 1 or none

In [47]:
 re.search('\w+\W?\w+','abcdedefdef')

<_sre.SRE_Match object; span=(0, 11), match='abcdedefdef'>

In [48]:
 re.search('\w+\W?\w+','abcdedef def')

<_sre.SRE_Match object; span=(0, 12), match='abcdedef def'>

#### Matching an exact number of occurrences

In [49]:
re.search('\w{3}','aaaaaa')

<_sre.SRE_Match object; span=(0, 3), match='aaa'>

#### Matching occurrences greedily (as many as possible, up to the upper bound)

In [50]:
re.search('\w{1,3}','aaaaaa')

<_sre.SRE_Match object; span=(0, 3), match='aaa'>

In [51]:
re.search('\w{1,3}','a')

<_sre.SRE_Match object; span=(0, 1), match='a'>

### 3.3 Creating our own character sets using [ ]


In [62]:
string = "Yo Whats up !!"

In [63]:
re.findall('[A-Z]|!',string)

['Y', 'W', '!', '!']

### 3.4 Quantifiers with custom sets

In [64]:
re.search('[A-Z]+',string)

<_sre.SRE_Match object; span=(0, 1), match='Y'>

In [65]:
re.findall('[A-Z]+',string)

['Y', 'W']

In [66]:
re.findall('[A-Z]?[a-z]+',string)

['Yo', 'Whats', 'up']

#### ^[ ] = Start of the line, [^ ] = Negation (matches any character NOT in the set)

In [68]:
re.findall('[^A-Z]+',string)

['o ', 'hats up !!']

## 4. Groups ( ) - Used with search

### 4.1 Introduction
Allows us to pull out sections of a match & store them

#### re.findall() - Capture only groups, if present else capture everything 
#### re.search() - Capture everything irrespective of groups

In [81]:
s1 = 'I will have 3 choclates and Sarthak will have 2 egg-rolls and Parkhi will have 4 oreo-shakes.'

In [145]:
 re.findall('\w+ \w+ \w+ \d+ \w+',s1)

['I will have 3 choclates',
 'Sarthak will have 2 egg',
 'Parkhi will have 4 oreo']

In [147]:
 re.findall('(\w+) \w+ \w+ \d+ \w+',s1)

['I', 'Sarthak', 'Parkhi']

In [148]:
 re.findall('\w+ \w+ \w+ \d+ (\w+)',s1)

['choclates', 'egg', 'oreo']

In [149]:
 re.findall('(\w+) \w+ \w+ \d+ (\w+)',s1)

[('I', 'choclates'), ('Sarthak', 'egg'), ('Parkhi', 'oreo')]

*By default, group() returns group 0 (the whole match)*

##### Now it's re.search, so the match object captures everything

In [150]:
re.search('(\w+) \w+ \w+ \d+ (\w+)',s1)

<_sre.SRE_Match object; span=(0, 23), match='I will have 3 choclates'>

In [151]:
re.search('(\w+) \w+ \w+ \d+ (\w+)',s1).group()

'I will have 3 choclates'

##### re.search(...).groups() behaves like re.findall (returns only the captured groups)

In [164]:
match = re.search('(\w+) \w+ \w+ (\d+) (\w+)',s1)

In [168]:
match.group()

'I will have 3 choclates'

In [169]:
match.group(0)

'I will have 3 choclates'

In [166]:
match.groups()

('I', '3', 'choclates')

In [130]:
match.group(1)


'I'

In [131]:
match.span(1)

(0, 1)

In [123]:
match.group(2)

'3'

In [132]:
match.span(2)

(12, 13)

In [124]:
match.group(3)

'choclates'

In [133]:
match.span(3)

(14, 23)

#### All subsets

In [153]:
re.findall('((\w+) \w+ \w+ (\d+) (\w+))',s1)

[('I will have 3 choclates', 'I', '3', 'choclates'),
 ('Sarthak will have 2 egg', 'Sarthak', '2', 'egg'),
 ('Parkhi will have 4 oreo', 'Parkhi', '4', 'oreo')]

##### re.finditer() - Iterating through matches (one match object at a time)

In [154]:
it = re.finditer('((\w+) \w+ \w+ (\d+) (\w+))',s1)

In [142]:
next(it).groups()

('I will have 3 choclates', 'I', '3', 'choclates')

In [143]:
next(it).groups()

('Sarthak will have 2 egg', 'Sarthak', '2', 'egg')

In [144]:
next(it).groups()

('Parkhi will have 4 oreo', 'Parkhi', '4', 'oreo')

### 4.2. Naming Groups

In [181]:
s2 = 'New York, New York 11369'

In [182]:
match = re.search('([A-Za-z\s]+), ([A-Za-z\s]+)(\d+)',s2)

##### match.group() = match.group(0)
##### match.groups() = (match.group(1), match.group(2), match.group(3), ...)

In [183]:
match.group(), match.group(0), match.groups() , match.group(1) , match.group(2) , match.group(3)

('New York, New York 11369',
 'New York, New York 11369',
 ('New York', 'New York ', '11369'),
 'New York',
 'New York ',
 '11369')

In [162]:
match.group(0)

'New York, New York 11369'

### ?P< > - Group name inside  < > followed by regex

#### re.compile - just a way to save the pattern

In [188]:
pattern = re.compile('(?P<City>[A-Za-z\s]+),(?P<State>[A-Za-z\s]+) (?P<ZipCode>\d+)')

In [189]:
match = re.search(pattern,s2)

In [191]:
match.group(1) , match.group(2) , match.group(3)

('New York', ' New York', '11369')

In [192]:
match.groupdict()

{'City': 'New York', 'State': ' New York', 'ZipCode': '11369'}

### 4.3 Groups with quantifiers
Finding repetition of a smaller pattern

In [197]:
s3 = 'ababababab'

In [208]:
match = re.search('(ab)+',s3)

In [209]:
re.search('[ab]+',s3)

<_sre.SRE_Match object; span=(0, 10), match='ababababab'>

In [210]:
s3 = 'abababbbb'

In [211]:
re.search('(ab)+',s3)

<_sre.SRE_Match object; span=(0, 6), match='ababab'>

In [212]:
re.search('[ab]+',s3)

<_sre.SRE_Match object; span=(0, 9), match='abababbbb'>

#### <font color=red>Beware</font> - A repeated group's value keeps being overwritten; only the last repetition is kept

##### Ex1

In [219]:
match.group(),match.group(0)

('ababababab', 'ababababab')

In [226]:
match.groups(),match.group(1)

(('ab',), 'ab')

##### Ex2

In [220]:
str = '123456789'

In [222]:
match=re.search('(\d)+',str)

In [225]:
match.group(),match.group(0)

('123456789', '123456789')

In [224]:
match.groups()

('9',)

#### 4.3.1 Quantifiers with groups within findall

In [229]:
str = '123456789'
re.findall('(\d)+',str)

['9']

In [230]:
str = '1234 56789'
re.findall('(\d)+',str)

['4', '9']

In [231]:
str = 'abbbbb abababababab'
re.findall('(ab)+',str)

['ab', 'ab']

In [232]:
str = 'abbbbb abababababab'
re.findall('((ab)+)',str)

[('ab', 'ab'), ('abababababab', 'ab')]

### 4.4 Group for word completion

In [233]:
re.search('Happy (Valentines|Birthday|Anniversary)','Happy Birthday')

<_sre.SRE_Match object; span=(0, 14), match='Happy Birthday'>

In [234]:
re.search('Happy (Valentines|Birthday|Anniversary)','Happy Valentines')

<_sre.SRE_Match object; span=(0, 16), match='Happy Valentines'>

In [235]:
re.search('Happy (Valentines|Birthday|Anniversary)','Happy Anniversary')

<_sre.SRE_Match object; span=(0, 17), match='Happy Anniversary'>

## 5. Non-capturing groups & Backreferences
Capturing groups - either save the result or output it

### Non-capturing groups (?:) 
We don't want to save result or output it

##### Ex1

In [243]:
str = '1234 56789'

In [249]:
re.findall('(?:\d)+',str),re.findall('\d+',str)

(['1234', '56789'], ['1234', '56789'])

In [239]:
re.findall('(?:\d)+(\d+)',str)

['4', '9']

##### Ex2 - Pull out names whose ID ends in one or more repetitions of 123 (so 123432 is not matched)

In [250]:
str = '123123 = Shantanu, 123123123 = Sarthak, 456456 = Parkhi, 123432 = Micky,'
re.findall('(?:123)+ = (\w+),',str)


['Shantanu', 'Sarthak']

##### Ex3 - Find digits preceded by 2 or more repetitions of '1*'

In [251]:
str = '1*1*1*1*2222 1*1*3333 2*1*2*1*222 1*2*2*2*333 3*3*3*444'
re.findall(r'(?:1\*){2,}\d+',str)

['1*1*1*1*2222', '1*1*3333']

### Backreferences - \1 
Reusing the text captured by a group later in the same pattern

#### Useful for finding duplicates

##### Ex1

In [253]:
re.search(r'(\w+) \1','Very Very Good')

<_sre.SRE_Match object; span=(0, 9), match='Very Very'>

##### Ex2

In [255]:
re.findall(r'(\w+) \1','Very Very Good marks for super super intelligent boy')

['Very', 'super']