# Master Python Regular Expression

## Section 1. Introduction to regular expression
### V2. Regular expression-examples

In [102]:
# Extract weights without using regular expressions

# Read the text file holding the weights. The context manager closes the
# file handle as soon as its contents have been read.
with open('weights.txt', 'r') as fp:
    text = fp.read()
text

'A weight is  46kg\nB weight is  54kg\nC weight is  60kg\nD weight is  70kg\n\n'

In [103]:
# Extract the weights without regular expressions: slide a two-character
# window over the text and keep each window that consists only of digits.
weights = [
    text[start:start + 2]
    for start in range(len(text) - 1)
    if text[start:start + 2].isdecimal()
]

# display the weights
print(weights)

['46', '54', '60', '70']


In [104]:
# Extract weights using regular expressions
import re

# Read the text file holding the weights; `with` closes the handle for us
with open('weights.txt', 'r') as fp:
    text = fp.read()

# Two consecutive digits; findall() collects every such pair as a string
pattern = r'\d\d'
regex = re.compile(pattern)
weights = regex.findall(text)

# display the weights
print(weights)

['46', '54', '60', '70']


## Section 2. re Module Basics
### V3. re Module Basics

In [105]:
# Basics of the re module: compile a pattern, then search a string with it

import re

text = 'Roll Number is 7004'

# the regular expression: four consecutive digits
pattern = r'\d\d\d\d'

# compile() builds a regex object from the pattern text
regex = re.compile(pattern)

# findall() on the regex object returns the matches as a list of strings
number = regex.findall(text)

# show what was extracted
print(number)

['7004']


## Section 3. re Module Functions

### V5. `findall()`

In [106]:
# findall(): collect every match as a list of strings

import re

text = 'Kalyan roll number is 1001. Meghana roll number is 1002'

# \d stands for one digit 0-9, so four of them match a 4-digit number
pattern = r'\d\d\d\d'

regex = re.compile(pattern)
rollNumbers = regex.findall(text)

# an empty list is falsy, so this falls back to the message when nothing matched
print(rollNumbers if rollNumbers else 'Roll Numbers are not found')

['1001', '1002']


### V6. `finditer()`

In [107]:
# finditer(): returns an iterator of match objects instead of a list

import re

text = 'Kalyan roll number is 1001. Meghana roll number is 1002'
pattern = r'\d\d\d\d'

regex = re.compile(pattern)
rollNumbers = regex.finditer(text)

# The truthiness test worked for findall() because it returns a list of the
# matched substrings. It does NOT work here: finditer() hands back an iterator
# object, so the first branch is always taken and the iterator itself is printed.
print(rollNumbers if rollNumbers else 'Roll Numbers are not found')

<callable_iterator object at 0x113bb4a58>


In [108]:
# Walk over the match objects produced by finditer().
# An iterator object is always truthy, so `if rollNumbers:` could never reach
# the "not found" branch. Track whether the loop body ran instead.
found = False
for r in rollNumbers:
    found = True
    print(r.group())        # the matched roll number
    print(r.start())        # starting index of the match
    print(r.end())          # ending index of the match
    print(r.span())         # (start, end) as a tuple
    print(r.re.pattern)     # the pattern that produced the match
    print(r.string)         # the entire string the roll numbers were extracted from

if not found:
    print('No roll numbers found')

1001
22
26
(22, 26)
\d\d\d\d
Kalyan roll number is 1001. Meghana roll number is 1002
1002
51
55
(51, 55)
\d\d\d\d
Kalyan roll number is 1001. Meghana roll number is 1002


### V7. `sub()`

In [109]:
# sub(): replace every match with a replacement string

import re

text = 'Kalyan roll number is 1001. Meghana roll number is 1002'
pattern = r'\d\d\d\d'
regex = re.compile(pattern)

# mask each roll number with ****
mtext = regex.sub('****', text)

print(mtext)

Kalyan roll number is ****. Meghana roll number is ****


### V8. `split()`

In [110]:
# split(): cut the string at every match of the pattern
import re

text = 'Kalyan is a good boy and he is going to marry Meghana'

# \s matches one whitespace character, so the sentence is cut at each space
pattern = r'\s'
regex = re.compile(pattern)

split_text = regex.split(text)
print(split_text)

['Kalyan', 'is', 'a', 'good', 'boy', 'and', 'he', 'is', 'going', 'to', 'marry', 'Meghana']


In [111]:
import re

# split() on the letter 'a': each 'a' is removed and the text is cut there
text = 'Kalyan is a good boy and he is going to marry Meghana'
pattern = r'a'
regex = re.compile(pattern)

split_text = regex.split(text)
print(split_text)

['K', 'ly', 'n is ', ' good boy ', 'nd he is going to m', 'rry Megh', 'n', '']


## Section 4. GROUPS

### V10. `Numbered groups`

In [112]:
# Numbered groups: each (...) is addressable by its position

import re

text = 'Kalyan roll number is CS1004'

# group 1 = branch code, group 2 = four-digit number
pattern = r'(CS)(\d\d\d\d)'
regex = re.compile(pattern)
mo = regex.search(text)

print(mo.group())    # whole match      -> CS1004
print(mo.group(0))   # same as group()  -> CS1004
print(mo.group(1))   # first group      -> CS
print(mo.group(2))   # second group     -> 1004
print(mo.groups())   # all groups       -> ('CS', '1004')

CS1004
CS1004
CS
1004
('CS', '1004')


### V11. `Named groups`

In [113]:
# Named groups: (?P<name>...) lets a group be fetched by name
import re

text = 'Kalyan roll number is CS1004'

# `branch` and `roll` are the names of the two groups
pattern = r'(?P<branch>CS)(?P<roll>\d\d\d\d)'
regex = re.compile(pattern)
mo = regex.search(text)

print(mo.group())            # whole match            -> CS1004
print(mo.group(0))           # same as group()        -> CS1004
print(mo.group('branch'))    # group fetched by name  -> CS
print(mo.group('roll'))      # group fetched by name  -> 1004
print(mo.groups())           # all groups             -> ('CS', '1004')

CS1004
CS1004
CS
1004
('CS', '1004')


### V12. `Non-Capturing groups`

In [117]:
# Non-capturing groups and their effect on what findall() returns

import re
text = 'My  personal number is 043-225431  and my office number is 043-225143'

pattern1 = r'\d\d\d-\d\d\d\d\d'            # no groups        -> list of strings
pattern2 = r'(\d\d\d)-(\d\d\d\d\d)'        # capturing groups -> list of tuples
pattern3 = r'(?:\d\d\d)-(?:\d\d\d\d\d)'    # non-capturing    -> list of strings

# run the three variants one after another and print each result
for current in (pattern1, pattern2, pattern3):
    regex = re.compile(current)
    numbers = regex.findall(text)
    print(numbers)

['043-22543', '043-22514']
[('043', '22543'), ('043', '22514')]
['043-22543', '043-22514']


In [119]:
import re

text = 'My  personal number is 043-225431  and my office number is 043-225143'

# {n} repeats the preceding token n times: 3 digits, a hyphen, 4 digits
pattern1 = r'\d{3}-\d{4}'
regex = re.compile(pattern1)
numbers = regex.findall(text)

print(numbers)

['043-2254', '043-2251']


## Section 5. META-CHARACTERS

### V14. `| (PIPE)` - Or

In [120]:
import re

text = 'My lcuky number is 06, and her lucky number is 121'

# two digits OR three digits
pattern = r'(\d{2}|\d{3})'
regex = re.compile(pattern)
number = regex.findall(text)

# Without `\b` the two-digit alternative succeeds first, so only the first
# two digits of 121 are returned.
print(number)

['06', '12']


In [121]:
import re

text = 'My lcuky number is 06, and her lucky number is 121'

# \b on both sides forces the alternation to cover the whole number
pattern = r'\b(\d{2}|\d{3})\b'
regex = re.compile(pattern)
number = regex.findall(text)

print(number)

['06', '121']


In [122]:
# 2nd example of the pipe metacharacter: a capturing group around the choices
import re

text = 'words are cat,mat,bat'
pattern = r'(c|m|b)at'
regex = re.compile(pattern)

# with a capturing group, findall() returns the group content only
words = regex.findall(text)
print(words)

['c', 'm', 'b']


Because the group is a capturing group, `findall()` returns only the content of the group, i.e. `c`, `m` or `b`, rather than the whole word. That is NOT the intention.

In [123]:
# 2nd example of the pipe metacharacter, now with a non-capturing group
import re

text = 'words are cat,mat,bat'

# (?:...) groups the alternatives without capturing them, so findall()
# returns the whole match
pattern = r'(?:c|m|b)at'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['cat', 'mat', 'bat']


In [124]:
# 2nd example, another way: alternate between the complete words
import re

text = 'words are cat,mat,bat'
pattern = r'cat|mat|bat'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['cat', 'mat', 'bat']


In [125]:
# 2nd example, yet another way: a character class for the first letter
import re

text = 'words are cat,mat,bat'

# Inside [...] every character is a literal alternative, so no pipe is needed.
# Writing [c|m|b] would also accept a literal '|' and match the string '|at'.
pattern = r'[cmb]at'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['cat', 'mat', 'bat']


### V17. `? (Question mark)` - 0 or 1

In [126]:
# The ? metacharacter: the preceding token may appear 0 or 1 time

import re

text = 'My lucky number is 06 and her lucky number is 121'

# two digits plus an optional third one (done earlier with `|`)
pattern = r'\d\d\d?'
regex = re.compile(pattern)

numbers = regex.findall(text)
print(numbers)

['06', '121']


In [127]:
import re

text = 'Sun will rise and Christ has risen'

# the trailing 'n' is optional, so both 'rise' and 'risen' match
pattern = r'risen?'
regex = re.compile(pattern)

# the matches are words, so the result is named accordingly
words = regex.findall(text)
print(words)

['rise', 'risen']


In [128]:
import re

text = 'My lucky number is -12,24,19,-64'

# an optional minus sign followed by two digits
pattern = r'-?\d\d'
regex = re.compile(pattern)

numbers = regex.findall(text)
print(numbers)

['-12', '24', '19', '-64']


### V20. `* (Asterisk mark)` - 0 or More

In [129]:
# The * metacharacter: the preceding token may appear 0 or more times

import re

text = 'words are abbbc  abbc abc  ac'

# 'a', then any number of 'b' (including none), then 'c'
pattern = r'ab*c'
regex = re.compile(pattern)

print(regex.findall(text))

['abbbc', 'abbc', 'abc', 'ac']


In [130]:
import re

text = 'words are c abc ababc abababc'

# the non-capturing group 'ab' may repeat 0 or more times before 'c'
pattern = r'(?:ab)*c'
regex = re.compile(pattern)

print(regex.findall(text))

['c', 'abc', 'ababc', 'abababc']


### V22. `+ (Plus)` - 1 or More

In [131]:
# The + metacharacter: the preceding token must appear 1 or more times

import re

text = 'abbbc abbc  abc  '

# 'a', at least one 'b', then 'c'
pattern = r'ab+c'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['abbbc', 'abbc', 'abc']


In [132]:
import re

text = '1,10,100,1000'

# one or more digits: matches numbers of any length
pattern = r'\d+'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['1', '10', '100', '1000']


In [133]:
import re

text = 'names are Kalyan and Meghna'

# one uppercase letter followed by one or more lowercase letters
pattern = r'[A-Z][a-z]+'
regex = re.compile(pattern)

words = regex.findall(text)
print(words)

['Kalyan', 'Meghna']


### V25. `. (Dot symbol)` - any character except \n

In [134]:
# The . metacharacter: any single character except the newline

import re

text = 'kalyan1234\n145'
pattern = r'.'
regex = re.compile(pattern)

# every character is returned on its own; the '\n' is skipped
characters = regex.findall(text)

print(characters)

['k', 'a', 'l', 'y', 'a', 'n', '1', '2', '3', '4', '1', '4', '5']


### V26. `Metacharacter` example-1

In [135]:
# Metacharacter example: an optional literal plus sign before two digits

import re

text = 'values are +10,20,+30,40'

# \+ is a literal '+', ? makes it optional, \d{2} is exactly two digits
pattern = r'\+?\d{2}'
regex = re.compile(pattern)

numbers = regex.findall(text)
print(numbers)

['+10', '20', '+30', '40']


`+` is a metacharacter (special character), so we can NOT use it directly to match a plus sign; we have to escape it with a backslash: `\+`

In [136]:
import re

text = 'values are +10.01,20.02,+30.433'

# optional '+', two digits, a literal dot, two digits
pattern = r'\+?\d{2}\.\d{2}'
regex = re.compile(pattern)

numbers = regex.findall(text)
print(numbers)

['+10.01', '20.02', '+30.43']


## Section 6. Character class
### V28. 

3 types of character class

1. Positive character class
2. Negative character class
3. Shorthand character class

## Section 8. Greedy & Non-Greedy Matching
### V41. Greedy Matching

In [149]:
# Greedy matching: a quantifier grabs as much text as it can

import re

text = 'The pattern is abcabcabcabc'

# 'a', one or more lowercase letters, then 'c'
pattern = r'a[a-z]+c'
regex = re.compile(pattern)
mo = regex.search(text)

print(mo.group())

abcabcabcabc


In [150]:
# Same greedy pattern, this time collecting the matches with findall().
# Uses `re` and `text` from the previous cell.
pattern = r'a[a-z]+c'
regex = re.compile(pattern)
mo2 = regex.findall(text)

print(mo2)

['abcabcabcabc']


**Greedy matching looks for MAXIMUM possible match**

The regular expression r'a[a-z]+c' suggests word will start with a, then 1 or more of any character between a to z and then end with c.

The string abcabcabcabc has a pattern of `abc`. This `abc` pattern is recognized by the regular expression. Regular expression can also recognize
abc, abcabc, abcabcabc, and abcabcabcabc. But it will recognize the maximum possible match.

Regular expressions are greedy by default, so the quantifiers `+`, `*` and `{3,}` look for the maximum possible match.

### V42. Non-Greedy Matching

In [151]:
# Non-greedy matching: `?` after a quantifier makes it take as little as possible

import re

text = 'The pattern is abcabcabcabc'

# +? stops at the first 'c' that completes a match
pattern = r'a[a-z]+?c'
regex = re.compile(pattern)
mo = regex.search(text)

print(mo.group())

abc


**Non-Greedy matching looks for MINIMUM possible match.**

The regular expression is r'a[a-z]+?c'. The `?` after `+` makes the quantifier NON-greedy. The regular expression says the word starts with a, then has one or more characters between a and z (as few as possible), and then ends with c.

The string abcabcabcabc has a pattern of abc. This abc pattern is recognized by the regular expression. Regular expression can also recognize abc, abcabc, abcabcabc, and abcabcabcabc. But it will recognize the MINIMUM possible match i.e. abc.

## Section 9. Back Reference
### V44. Numbered Back Reference

In [154]:
# Numbered back reference: \1 must repeat whatever group 1 matched

import re

text = 'The numbers are 1116,1414,4035,2020'

# two digits, followed by the same two digits again
pattern = r'(\d{2})\1'
regex = re.compile(pattern)

# findall(text) can NOT be used here: with a capturing group it would return
# only the group content (14 and 20). Use finditer(text) and iterate over the
# match objects to get the whole match.
mo = regex.finditer(text)
for match in mo:
    print(match.group())

1414
2020


1414 and 2020, there is repetitions of first 2 digits. So first 2 digits could be represented by \d{2}. We want groups and therefore (\d{2}). 

`(\d{2})\1`. The repetition of the digits is recognized by back referencing i.e. `\1`

In [155]:
# 2nd numbered back reference example: reuse the groups in a replacement

import re

text = 'Office Land Line number is 043405117 '

# group 1 = first three digits, group 2 = the next five digits
pattern = r'(\d{3})(\d{5})'
regex = re.compile(pattern)

# rewrite the match as "<group 1>-<group 2>"
text = regex.sub(r'\1-\2', text)

print(text)

Office Land Line number is 043-405117 


Here regular expression for 043405117 would be `r'(\d{3})(\d{5})'`. First 3 digits will represent area group i.e. 1st group and next 5 will represent landline number i.e. 2nd group.

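`r'\1-\2'` is the replacement string passed to `sub()`: `\1` inserts the text matched by the 1st group (3 digits) and `\2` the text matched by the 2nd group (5 digits), separated by a `-`. Any digits after the match are left unchanged, which is why the final `7` still follows the second group in the output.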

### V46. Named Back Reference

In [157]:
# 1st NAMED back reference example: (?P=name) repeats the named group

import re

text = 'The numbers are 1414,1618,2020,4038'

# two digits named `first`, followed by the same two digits again
pattern = r'(?P<first>\d{2})(?P=first)'
regex = re.compile(pattern)

numbers = regex.finditer(text)

for found in numbers:
    print(found.group())

1414
2020


1414 and 2020, there is repetitions of first 2 digits. So first 2 digits could be represented by \d{2}. We want groups and therefore (\d{2}).

Now we want to name the group. `?P<first>` is naming the group. where `first` is the name of the group

`(?P=first)` ensures next group is also similar to the first group as we are matching 1414 and 2020. Here first 2 digits of each number (1st group) is same as the next 2 number (2nd group). Thus we say next group is same as the 1st group.

In [158]:
# 2nd NAMED back reference example: reuse named groups in a replacement

import re

text = 'Office Land Line number is 043405117 '

# `area` = first three digits, `number` = the next five digits
pattern = r'(?P<area>\d{3})(?P<number>\d{5})'
regex = re.compile(pattern)

# \g<name> inserts the text matched by the named group
text = regex.sub(r'\g<area>-\g<number>', text)

print(text)

Office Land Line number is 043-405117 


First 3 digits of the number 043405117 would be represented by 1st group `(?P<area>\d{3})`, where the 1st group is named as `area`. Similarly next 5 digits would be represented by 2nd group `(?P<number>\d{5})`.

The replacement string that produces 043-405117 is `r'\g<area>-\g<number>'`. Here `\g<area>` inserts the text matched by the group `area` (the first 3 digits), followed by a `-`, and then `\g<number>` inserts the text matched by the group `number` (the next 5 digits).

## Section 10. Assertions
### V49. Positive Look Ahead Assertions

In [159]:
# Positive look-ahead: match only when followed by the given text

import re

text = 'Kalyan_cs,Meghana_cs,John,Jack'

# (?i) ignores case; (?=_cs) requires `_cs` right after the name
# without making it part of the match
pattern = r'(?i)[a-z]+(?=_cs)'
regex = re.compile(pattern)

names = regex.findall(text)

print(names)

['Kalyan', 'Meghana']


Here we have to extract names followed by something. Both Kalyan and Meghna are followed by `_cs`. To select name or anything that is followed by something we use `Positive Look Ahead Assertion`

`r'(?i)[a-z]+(?=_cs)'`: The name contains only alphabetic characters, and with `(?i)` we ignore case (upper/lower). `[a-z]+` then matches 1 or more alphabetic characters. `(?=_cs)` is the positive look ahead assertion, i.e. it only accepts names that are followed by `_cs`.

### V50. Negative Look Ahead Assertions

In [162]:
# Negative look-ahead: match only when NOT followed by the given text

import re

text = 'values are 12,13,14a,15b'

# two digits that are not followed by a lowercase letter
pattern = r'\d{2}(?![a-z])'
regex = re.compile(pattern)

values = regex.findall(text)
print(values)

['12', '13']


Here we want to extract the numbers that are NOT followed by any alphabets (exact opposite to the previous case, i.e. Positive Look Ahead Assertion). 

`r'\d{2}(?![a-z])'`: `\d{2}` will recognize 2 digit numbers. And after that we do NOT want it to be followed by any alphabets. We ensure that by Negative Look Ahead Assertions i.e. `(?![a-z])`

### V51. Positive Look Behind Assertions

In [164]:
# Positive look-behind: match only when preceded by the given text

import re

text = 'Values are CS1001,CS1002,CS1003,1989'

# Raw string, so that `\d` reaches the regex engine untouched instead of being
# treated as an (invalid) string escape. (?<=CS) requires `CS` right before
# the four digits without making it part of the match.
pattern = r'(?<=CS)\d{4}'
regex = re.compile(pattern)

values = regex.findall(text)
print(values)

['1001', '1002', '1003']


Here we want to extract numbers which are preceded by something (`CS` in this case). 

`(?<=CS)\d{4}`: Here `\d{4}` will recognize the 4 digit numbers. The positive look behind assertion `(?<=CS)` will look for the numbers that are preceded by `CS`

### V52. Negative Look Behind Assertions

In [165]:
# Negative look-behind: match only when NOT preceded by the given text

import re

text = 'Values are CS1001,CS1002,CS1003,1989'

# Raw string, so that `\d` reaches the regex engine untouched instead of being
# treated as an (invalid) string escape. (?<!CS) rejects four digits that come
# right after `CS`.
pattern = r'(?<!CS)\d{4}'
regex = re.compile(pattern)

values = regex.findall(text)
print(values)

['1989']


Here we want to extract numbers which are NOT preceded by something (`CS` in this case). 

`(?<!CS)\d{4}`: Here `\d{4}` will recognize the 4 digit numbers. The negative look behind assertion `(?<!CS)` will look for the numbers that are NOT preceded by `CS`. The `!` (not) ensures it recognizes only numbers that are NOT preceded by `CS`.

## Section 11. Case Study 1- URLs
### V55. URLs-Question 1

url.txt contains the following URLs. We have to extract the URLs of the pattern `www.abcd123.com`, i.e. those without the `https://` prefix.

www.google.com

www.facebook.com

https://www.yahoomail.com

https://www.microsoft.com

https://www.abcd123.in

www.amazon.in

www.pqrs.in

www.xyz.in

https://www.flipkart.in

https://www.nitt.edu

In [2]:
# URL Program 1: bare `www.` URLs that are NOT preceded by `https://`

import re

# read the text file; the context manager closes it once the text is loaded
with open('url.txt', 'r') as fp:
    text = fp.read()

# The negative look-behind rejects any `www` that directly follows `https://`.
# (A character class such as `[^https://]` would not do this: it excludes
# single characters, not the whole prefix.)
pattern = r'(?<!https://)www\.[a-z0-9]+\.[a-z]+'

# create the regex object
regex = re.compile(pattern)

# extract the urls and print one per line
urls = regex.findall(text)
for u in urls:
    print(u)

www.google.com
www.facebook.com
www.amazon.in
www.pqrs.in
www.xyz.in


pattern=`r'(?<!https://)www\.[a-z0-9]+\.[a-z]+'`

`\.` backslash because `.` is a special character and we want a literal `.` here.

`(?<!https://)` because we do NOT want `https://` and thats why we are using `negative look behind assertion`. 

### V57. URLs-Question 2

url.txt contains following URLs. And we have to extract urls of pattern:  `https://www.abcd123.com`

www.google.com

www.facebook.com

https://www.yahoomail.com

https://www.microsoft.com

https://www.abcd123.in

www.amazon.in

www.pqrs.in

www.xyz.in

https://www.flipkart.in

https://www.nitt.edu

In [3]:
# URL Program 2: only URLs that start with `https://`

import re

# read the text file; the context manager closes it once the text is loaded
with open('url.txt', 'r') as fp:
    text = fp.read()

# literal `https://www.`, then the site name, a literal dot and the suffix
pattern = r'https://www\.[a-z0-9]+\.[a-z]+'

# create the regex object
regex = re.compile(pattern)

# extract the urls and print one per line
urls = regex.findall(text)
for u in urls:
    print(u)

https://www.yahoomail.com
https://www.microsoft.com
https://www.abcd123.in
https://www.flipkart.in
https://www.nitt.edu


### V59. URLs-Question 3

url.txt contains the following URLs. We have to extract URLs of the pattern `www.abcd123.com` and also `https://www.abcd123.com`.
So this is combination of first 2 questions

www.google.com

www.facebook.com

https://www.yahoomail.com

https://www.microsoft.com

https://www.abcd123.in

www.amazon.in

www.pqrs.in

www.xyz.in

https://www.flipkart.in

https://www.nitt.edu

In [4]:
# URL Program 3: URLs with or without the `https://` prefix

import re

# read the text file; the context manager closes it once the text is loaded
with open('url.txt', 'r') as fp:
    text = fp.read()

# Pattern in use: optional `https://` prefix, then www.<name>.<suffix>.
# It will NOT work for multi-part suffixes such as gmail.co.in.
pattern = r'(?:https://)?www\.[a-z0-9]+\.[a-z]+'

# Alternatives noted by the author:
#   r'.*www\.[a-z0-9]+\.[a-z]+'                    will NOT work for gmail.co.in
#   r'.+\.[a-z0-9]+\.[a-z]+'                       will NOT work for gmail.co.in
#   r'(?:https://)?www\.[a-z0-9]+(?:\.[a-z]+)+'    will work for gmail.co.in
#   r'(?:https://)?www\S+'                         will work for gmail.co.in

# create the regex object
regex = re.compile(pattern)

# extract the urls and print one per line
urls = regex.findall(text)
for u in urls:
    print(u)

www.google.com
www.facebook.com
https://www.yahoomail.com
https://www.microsoft.com
https://www.abcd123.in
www.amazon.in
www.pqrs.in
www.xyz.in
https://www.flipkart.in
https://www.nitt.edu


`r'(?:https://)?www\.[a-z0-9]+\.[a-z]+'`  

**Focus `(?:https://)?`**

`(https://)` part is optional. It can be 0 OR 1 time. So we have to use `?` i.e. `(https://)?`
        
As we are trying to extract the entire url we will have to add non-capturing group here i.e. `?:`

So overall it becomes `(?:https://)?` 

______________________________________________________________________________________________________________________
In case we have website like `www.flipkart.co.in` then used pattern will not work.

`pattern = r'(?:https://)?www\.[a-z0-9]+(?:\.[a-z]+)+'` will work.

`(?:\.[a-z]+)+` : Here the literal dot `\.` matches the dot after `flipkart`, and `[a-z]+` matches the letters that follow it (`co`). The `.in` part needs exactly the same sub-pattern again, so we put a `+` after the group `(?:\.[a-z]+)` to let it repeat. The group is made non-capturing with `?:` so that the entire website address is returned.

Another alternative pattern would be `'(?:https://)?www\S+'`

### V61. URLs-Question 4

url.txt contains following URLs. And we have to extract urls of pattern:  `www.abcd123.in` and also `https://www.abcd123.in`

www.google.com

www.facebook.com

https://www.yahoomail.com

https://www.microsoft.com

https://www.abcd123.in

www.amazon.in

www.pqrs.in

www.xyz.in

https://www.flipkart.in

https://www.nitt.edu

In [11]:
# URL Program 4: URLs ending in `.in`, with or without `https://`

import re

# read the text file; the context manager closes it once the text is loaded
with open('url.txt', 'r') as fp:
    text = fp.read()

# optional `https://` prefix, then www.<name> and the fixed suffix `.in`
pattern = r'(?:https://)?www\.[a-z0-9]+\.in'

# create the regex object
regex = re.compile(pattern)

# extract the urls and print one per line
urls = regex.findall(text)
for u in urls:
    print(u)

https://www.abcd123.in
www.amazon.in
www.pqrs.in
www.xyz.in
https://www.flipkart.in


`r'(?:https://)?www\.[a-z0-9]+\.in'`  

**Focus `(?:https://)?`**

`(https://)` part is optional. It can be 0 OR 1 time. So we have to use `?` i.e. `(https://)?`
        
As we are trying to extract the entire url we will have to add non-capturing group here i.e. `?:`

So overall it becomes `(?:https://)?` 

### V63. URLs-Question 5

Trying to find urls with either `.in` or `.com`

In [7]:
# URL Question 5 Solution: URLs ending in `.in` or `.com`

import re

# read the text file having urls; the context manager closes it afterwards
with open('url.txt', 'r') as fp:
    text = fp.read()

# optional `https://` prefix, then www.<name>. and either `in` or `com`
pattern = r'(?:https://)?www\.[a-z0-9]+\.(?:in|com)'

# create the regex object
regex = re.compile(pattern)

# extract the urls and print one per line
urls = regex.findall(text)

for u in urls:
    print(u)

www.google.com
www.facebook.com
https://www.yahoomail.com
https://www.microsoft.com
https://www.abcd123.in
www.amazon.in
www.pqrs.in
www.xyz.in
https://www.flipkart.in


`(?:in|com)`. This `(in|com)` group allows matching to URLs with `in` or `com`. To have the entire URL we need to add **non-capturing group** i.e. `?:` at the beginning of the group

## Section 12. Case Study 2- Dates
### V66. Dates-Question 1

**Task** match dates in the format dd-mm-yyyy

In [21]:
# text = 'Dates are 20-06-1989, 12/04/1982, 12.04.1963'
# The text can be given inline as above, or read from a .txt file placed in
# the directory the Jupyter notebook was started from.
import re

# the context manager closes the file once the text is loaded
with open('dates.txt', 'r') as fp:
    text = fp.read()

# dd-mm-yyyy delimited by word boundaries
pattern = r'\b\d{2}-\d{2}-\d{4}\b'
regex = re.compile(pattern)
dates = regex.findall(text)

for d in dates:
    print(d)

20-06-1989


Here `\b` is the boundary.

`\b` matches a word boundary: a zero-width position between a word character and a non-word character (or the start/end of the string). Word characters are A-Z, a-z, 0-9 and _

`\bcat\b`  - means no word character at the beginning and  end of the word cat.

`\bcat\b` matches cat in cat, but not cat in catalyst.

`\bfoo\b` matches foo in (foo), foo. but not foo in fool, fool3.

### V66. Dates-Question 2

**Task** match dates in the format dd-mm-yyyy, dd/mm/yyyy, dd.mm.yyyy

In [22]:
# text = 'Dates are 20-06-1989, 12/04/1982, 12.04.1963'
import re

# the context manager closes the file once the text is loaded
with open('dates.txt', 'r') as fp:
    text = fp.read()

# The separator may be '-', '/' or '.'; inside [...] the dot is a literal.
# Alternative noted by the author: r'\b\d{2}\W\d{2}\W\d{4}\b'
pattern = r'\b\d{2}[-/.]\d{2}[-/.]\d{4}\b'

regex = re.compile(pattern)
dates = regex.findall(text)

for d in dates:
    print(d)

20-06-1989
12/04/1982
12.04.1963


`\W` means any non-word character

`[-/.]` putting `.` and rest in the `[ ]` makes it literal (strip them from being special characters)

### V70. Dates-Question 3

**Task** match dates in the format dd/mm/yyyy, dd/mm/yy

In [2]:
import re

# the context manager closes the file once the text is loaded
with open('dates1.txt', 'r') as fp:
    text = fp.read()

# The year is two digits plus an optional second pair, i.e. yy or yyyy.
# r'\b\d{2}/\d{2}/\d{2,4}\b' is not used because \d{2,4} will match
# yy, yyy and yyyy.
pattern = r'\b\d{2}/\d{2}/\d{2}(?:\d{2})?\b'

regex = re.compile(pattern)
dates = regex.findall(text)

for d in dates:
    print(d)

12/04/1982
12/04/67
12/04/1967


Here **trick** is years is present in the format `yyyy` and `yy`. So for first `yy` it will be `\d{2}`. But next 2 `yy` is optional i.e. the optional or extra `yy` can appear 0 or 1 time. So it will be `(\d{2})?`. Here `?` mark is signifying extra `\d` is optional i.e. it can appear 0 or 1 time.

Now as we are trying to get the entire date, we have to make the optional year group a `non-capturing group`. Thus it will be `(?:\d{2})?`

### V72. Dates-Question 4

**Task** match dates in the format dd-mm-yyyy where yyyy can be ONLY 1999

In [3]:
import re

text = 'Dates are 22-06-1999, 12/12/1999, 12-04-1999, 12-06-1963'

# dd-mm- followed by the fixed year 1999
pattern = r'\b\d{2}-\d{2}-1999\b'
regex = re.compile(pattern)
dates = regex.findall(text)

print(dates)

['22-06-1999', '12-04-1999']


### V74. Dates-Question 5

**Task** match dates in the format dd-mm-yyyy where yyyy can be from `1980` to `1999`

In [4]:
import re

text = 'Dates are 22-06-1980, 12-12-1984, 12-04-1999, 12-06-1963'

# year = 19, then 8 or 9, then any digit: 1980 to 1999
pattern = r'\b\d{2}-\d{2}-19[89][0-9]\b'
regex = re.compile(pattern)
dates = regex.findall(text)

print(dates)

['22-06-1980', '12-12-1984', '12-04-1999']


3rd `Y` of `yyyy` can be either 8 or 9 and therefore `[89]`. However, the 4th `y` of `yyyy` can be any digit and thus we use `[0-9]`

### V76. Dates-Question 6

**Task** match dates in the format dd-mm-yyyy where mm should be months where days can be 31

In [5]:
import re

text = 'Dates are 22-06-1980, 12-07-1984, 12-04-1999, 12-08-1963, 12-12, 1986'

# Months with 31 days are 01, 03, 05, 07, 08, 10 and 12. September (09) has
# only 30 days, so it is not part of the character class.
# Long form of the same pattern: r'\b\d{2}-(?:01|03|05|07|08|10|12)-\d{4}\b'
pattern = r'\b\d{2}-(?:0[13578]|1[02])-\d{4}\b'

regex = re.compile(pattern)
dates = regex.findall(text)

print(dates)

['12-07-1984', '12-08-1963']


For months with 31 days we know they have to be `01|03|05|07|08|10|12` (September, `09`, has only 30 days, so it must not be listed), so we can just put that. `?:` is used to make the group non-capturing, as we are trying to extract the entire date value.

**Trick** We see that for `01|03|05|07|08`, `0` is common, so the regular expression for these months can be `0[13578]`

For 10|12 we see `1` is common and the other digit can be `0 or 2`. So the regular expression for these months can be `1[02]`

So overall it will be `(?:0[13578]|1[02])`. As always the motivation is to extract the entire date, so the group is made non-capturing, and that's why `?:` is added

## Section 13. Case Study 3- Numbers
### V79. Numbers-Question 1

**Match all numbers between 10 and 99**

In [6]:
import re

text = 'The numbers are 10, 100, 140, 64, 62, 49, 200'

# first digit 1-9, second digit 0-9; \b keeps the match to exactly two digits
pattern = r'\b[1-9][0-9]\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

print(numbers)

['10', '64', '62', '49']


`r'[1-9][0-9]'` can match 99 as well as 199. To keep the limit or boundary of the digits within 2 we set up boundary by `\b`

### V81. Numbers-Question 2

**Match all even integers**

In [16]:
import re

text = 'The numbers are 2, 8 ,18, 109, 140, 44, 63, 59, 206'

# any number of leading digits (possibly none), then an even last digit
# Alternative noted by the author: r'\b[02468]\b|\b\d+[02468]\b'
pattern = r'\b\d*[02468]\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

print(numbers)

['2', '8', '18', '140', '44', '206']


**Be careful about digits which can be single or the containing several digits. Take both of them into account.**

### V83. Numbers-Question 3

**Match all odd integers**

In [8]:
import re

text= 'The numbers are 2, 8 ,7, 18, 109, \
                            140, 44, 63, 59, 206'

# either a single odd digit, or one or more digits ending in an odd digit
# Shorter alternative noted by the author: r'\b\d*[13579]\b'
pattern = r'\b[13579]\b|\b\d+[13579]\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

print(numbers)

['7', '109', '63', '59']


### V85. Numbers-Question 4

**Match numbers that have an even number of digits**

In [10]:
import re

text= 'The numbers are 2, 8 ,7, 18, 109, 140, \
                        44, 63, 59, 206, 1414, 4003'

# one or more pairs of digits between word boundaries, i.e. 2, 4, 6, ... digits
pattern = r'\b(?:\d{2})+\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

for number in numbers:
    print(number)

18
44
63
59
1414
4003


`\d{2}` means 2 digits. `+` means 1 or more times. 1 time gives 2 digits. 2 times gives `\d{2}` and `\d{2}`, that means 4 digits, and so on. Thus it always leads to an even number of digits.

`?:` is used for `non-capturing groups` so that the entire group is extracted.
    
`\b` is also necessary so that 2 digits are NOT extracted from a 3 digit number.


### V87. Numbers-Question 5

**Match all numbers between 100 and 150**

In [12]:
import re

text= 'The numbers are 2, 8 ,7, 18, 109, 140, \
                150, 100, 44, 63, 59, 206, 1414, 4003'

# 100-149 (1, then 0-4, then any digit) or exactly 150. The group makes both
# \b anchors apply to the whole alternation.
pattern = r'\b(1[0-4][0-9]|150)\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

for number in numbers:
    print(number)

109
140
150
100


Necessary to put pattern in the `()` or use group. Otherwise it will extract 141 from 1414 as well.

### V89. Numbers-Question 6

**Match all numbers below 1000**

In [13]:
import re

text= 'The numbers are 2, 8 ,7, 18, 109, 140, \
                150, 100, 44, 63, 59, 206, 1414,0, 999 4003'

# one, two or three digits between word boundaries
# Alternative noted by the author: r'\b([0-9]|[1-9][0-9]|[1-9][0-9][0-9])\b'
pattern = r'\b(\d|\d{2}|\d{3})\b'

regex = re.compile(pattern)
# the matches are numbers, so the result is named accordingly
numbers = regex.findall(text)

for number in numbers:
    print(number)

2
8
7
18
109
140
150
100
44
63
59
206
0
999


## Section 14. Case Study 4- Email IDs
### V91. Email IDs-Question 1

**Match all email IDs**

In [14]:
import re

text= 'The email ids include abcd.fg@yahoo.co.in, \
            xyz_abc12@yahoo.com, pqr-143@gmail.com, \
            jack_14@zoho.com, bacd_fg@outlook.com, \
            bad_boy@gmx.com, white.149@yahoo.co.in, \
            black_28@gmx.com, abcd.47@zoho.com, bad_boy2@gmx.com \
            white.19@yahoo.co.in, black_78@gmx.com. These are awesome'

# local part, '@', domain name, then one or more ".suffix" parts
pattern = r'\b([A-Za-z0-9._-]+@[a-z]+(?:[.][a-z]+)+)\b'
regex = re.compile(pattern)

emails = regex.findall(text)

# one address per line
for address in emails:
    print(address)

abcd.fg@yahoo.co.in
xyz_abc12@yahoo.com
pqr-143@gmail.com
jack_14@zoho.com
bacd_fg@outlook.com
bad_boy@gmx.com
white.149@yahoo.co.in
black_28@gmx.com
abcd.47@zoho.com
bad_boy2@gmx.com
white.19@yahoo.co.in
black_78@gmx.com


**Explanation** `pattern = r'\b([A-Za-z0-9._-]+@[a-z]+(?:[.][a-z]+)+)\b'` 

`[A-Za-z0-9._-]+@` matches for anything before `@`

`@[a-z]+` matches for anything after `@` like @gmail or @outlook

`(?:[.][a-z]+)+` has 2 parts: the inner `[.][a-z]+` matches .com or .co

`(?:[.][a-z]+)+` extra `+` ensures .in after .co or anything similar

?: for non-capturing groups

## Section 14. Case Study 4- Email IDs
### V94. Email IDs-Question 2

**Match all yahoo email addresses**

In [15]:
import re

text= 'The email ids include abcd.fg@yahoo.co.in, \
            xyz_abc12@yahoo.com, pqr-143@gmail.com, \
            jack_14@zoho.com, bacd_fg@outlook.com, \
            bad_boy@gmx.com, white.149@yahoo.co.in, \
            black_28@gmx.com, abcd.47@zoho.com, bad_boy2@gmx.com \
            white.19@yahoo.co.in, black_78@gmx.com. These are awesome'

# same as the general e-mail pattern, with the domain name fixed to `yahoo`
pattern = r'\b([A-Za-z0-9._-]+@yahoo(?:[.][a-z]+)+)\b'
regex = re.compile(pattern)

emails = regex.findall(text)

# one address per line
for address in emails:
    print(address)

abcd.fg@yahoo.co.in
xyz_abc12@yahoo.com
white.149@yahoo.co.in
white.19@yahoo.co.in


As yahoo is constant, so fix yahoo after @ and rest of the pattern remains same from the last question.

## Section 15. Project-Password checker
### V95. Password checker

![Screen%20Shot%202018-09-30%20at%205.20.56%20AM.png](attachment:Screen%20Shot%202018-09-30%20at%205.20.56%20AM.png)

**Password length is between 8 to 32 characters**

**1. Password length is between 8 to 32 characters** `r'^.{8,32}$'` 
`.` represents any character and `{8,32}` ensures 8-32 length of characters 

**2. Password must contain A-Z, a-z, 0-9 and special characters such as `*,#,@,!,&`**  

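`r'[^A-Za-z0-9*#@!&]'` matches any character that is NOT in the allowed set. If this pattern is found anywhere in the password, the password contains an invalid character and is rejected.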

**3. Password must contain atleast 1 uppercase letter and that can be in any position**

`r'^.*[A-Z].*$'` : Here `[A-Z]` ensures atleast 1 uppercase letter. `.*` before `[A-Z]` ensures there could be any character before the uppercase letter. `.*` after `[A-Z]` ensures there could be any character after the uppercase letter as well.

**4. Password must contain atleast 1 lowercase letter and that can be in any position**

`r'^.*[a-z].*$'` : Here `[a-z]` ensures atleast 1 lowercase letter. `.*` before `[a-z]` ensures there could be any character before the lowercase letter. `.*` after `[a-z]` ensures there could be any character after the lowercase letter as well.

**5. Password must contain atleast 1 number and that can be in any position**

`r'^.*[0-9].*$'` : Here `[0-9]` ensures atleast 1 number. `.*` before `[0-9]` ensures there could be any character before the number. `.*` after `[0-9]` ensures there could be any character after the number as well.

**6. Password should NOT contain any sequential identical characters**

Sequential character can exist anywhere in the password.

`r'^.*(.)\1{2,}.*$'`: Here `.*` at the start and at the end ensures there can be characters before or after the sequential characters. 
`(.)\1{2,}`: Here `.` represents any character and `\1` is a numbered back reference, i.e. a copy of whatever character `.` matched, which is what makes the characters identical. `{2,}` means the back reference `\1` must repeat 2 or more times, so the pattern matches a run of three or more identical characters such as `aaa` or `aaaa`; a doubled character such as `aa` is not flagged. (If doubled characters should be rejected as well, `(.)\1+` would do that.)

In [2]:
import re

# Validation rules; each one is applied with re.search() in check_password().
pattern1 = r'^.{8,32}$'              # whole password is 8-32 characters long
pattern2 = r'[^A-Za-z0-9*#@!&]'      # any character outside the allowed set
pattern3 = r'^.*[A-Z].*$'            # at least one uppercase letter
pattern4 = r'^.*[a-z].*$'            # at least one lowercase letter
pattern5 = r'^.*[0-9].*$'            # at least one digit
pattern6 = r'^.*[#@*!&].*$'          # at least one special character
pattern7 = r'^.*(.)\1{2,}.*$'        # a character followed by 2+ copies of itself


def check_password(pwd):
    """Return the validation message for the password ``pwd``.

    The rules are checked in order and the message of the first rule that
    fails is returned. When every rule passes, the returned message says
    that the password is valid.
    """
    if not re.search(pattern1, pwd):
        return 'Length is not between 8-32 characters.'
    # pattern2 and pattern7 describe what is forbidden, so a hit is a failure
    if re.search(pattern2, pwd):
        return 'Password contains an invalid character.'
    if not re.search(pattern3, pwd):
        return 'Password should atleast contain 1 uppercase letter'
    if not re.search(pattern4, pwd):
        return 'Password should atleast contain 1 lowercase letter'
    if not re.search(pattern5, pwd):
        return 'Password should atleast contain 1 digit'
    if not re.search(pattern6, pwd):
        return 'Password should atleast contain 1 special character'
    if re.search(pattern7, pwd):
        return 'Password should NOT contain any identical sequential character'
    # the leading space keeps the password and the message apart
    return pwd + ' is a valid password'


# In a notebook cell __name__ is '__main__', so the prompt runs as before;
# the guard only keeps input() from firing when the code is imported.
if __name__ == '__main__':
    pwd = input('Enter the password:')
    print(check_password(pwd))

Enter the password:Ducky#123
Ducky#123is a valid password
