In [1]:
import re

### Lookup
* .       - Any Character Except New Line
* \d      - Digit (0-9)
* \D      - Not a Digit (0-9)
* \w      - Word Character (a-z, A-Z, 0-9, _)
* \W      - Not a Word Character
* \s      - Whitespace (space, tab, newline)
* \S      - Not Whitespace (space, tab, newline)

* \b      - Word Boundary - eg. \bHa matches 'Ha' only at the start of a word (preceded by a non-word character or the start of the string)
* \B      - Not a Word Boundary - eg. \BHa matches 'Ha' only inside a word (preceded by a word character)
* ^       - Beginning of a String
* $       - End of a String

* []      - Matches Characters in brackets
* [^ ]    - Matches Characters NOT in brackets
* |       - Either Or
* ( )     - Group

Quantifiers:
* \*       - 0 or More
* \+       - 1 or More
* ?       - 0 or One
* {3}     - Exact Number
* {3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexes ####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

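\1 (backreference) - e.g. `re.sub(r'(\b[a-z]+) \1', r'\1', text)`
* The first \1 (inside the pattern) refers back to the first group - i.e. it must match the same text that the first bracketed expression (\b[a-z]+) matched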
* From the docs \number - 
    "Matches the contents of the group of the same number. Groups are numbered starting from 1. For example, (.+) \1 matches 'the the' or '55 55', but not 'thethe' (note the space after the group)"

* The second \1 is the replacement to use in case of a match, so a repeated word will be replaced by a single word.

In [77]:
# Sample text used by the regex examples in this notebook.
# It is a raw string so that the lone backslash on the metacharacter line is
# stored literally instead of relying on Python passing the unknown escape
# '\ ' through unchanged (which triggers an invalid-escape warning).

text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
321--555-4321
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

# Single-line sample used by the '^' / '$' anchor examples.
sentence = 'Start a sentence and then bring it to an end. What the hell'

In [78]:
# Raw strings: the r prefix stops Python from processing backslash escapes.

escaped = '\tab abc'   # '\t' is interpreted as a single TAB character
raw = r'\tab abc'      # the backslash and the 't' stay two literal characters
print(escaped)
print(raw)

	ab abc
\tab abc


In [79]:
# re.compile() builds a reusable pattern object; finditer() yields one match
# object for every non-overlapping occurrence of the pattern in the text.
pattern = re.compile(r'abc')
matches = pattern.finditer(text_to_search)
for match in matches:
    # span() is the (start, end) index pair of the match inside the text.
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(match.span())

(1, 4)


In [80]:
# The metacharacters . ^ $ * + ? { } [ ] \ | ( ) must be escaped with a
# backslash to be matched literally. Example: find 'coreyms.com' - without the
# backslash the '.' would match any character, not just a dot.
pattern = re.compile(r'coreyms\.com')
matches = pattern.finditer(text_to_search)
for match in matches:
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(match.span())

(139, 150)


In [81]:
# Anchors '^' and '$'
# '^' matches only at the very start of the string; a word that occurs anywhere
# else produces no match at all.
pattern = re.compile(r'^Start')
matches = pattern.finditer(sentence)
for match in matches:
    # str.format keeps the printed line identical on Python 2 and Python 3
    # (a multi-argument print() would print a tuple on Python 2).
    print('1st Eg: {}'.format(match.span()))

# 'What' is in the sentence but not at its start, so this loop prints nothing.
pattern = re.compile(r'^What')
matches = pattern.finditer(sentence)
for match in matches:
    print('2nd Eg: {}'.format(match.span()))

# '$' works the same way, but anchors the pattern to the end of the string.

1st Eg: (0, 5)


In [82]:
# Exercise: find phone numbers whose digit groups are separated by '.' or '-'.
# The pattern is a raw string so that '\d' reaches the regex engine untouched
# instead of being treated as an (invalid) Python string escape.
pattern = re.compile(r'\d\d\d[.-]\d\d\d[.-]\d\d\d\d')

# '321--555-4321' is not found: each [.-] matches exactly one separator
# character, so the doubled '-' breaks the match.
pattern.findall(text_to_search)

['321-555-4321', '123.555.1234', '800-555-1234', '900-555-1234']

#### A character set [] may also contain ranges, e.g. [1-5] or [a-zA-Z]

In [83]:
# Negation in a character set: a leading '^' inside [] inverts the set, so this
# matches every character that is NOT a letter, a digit or whitespace.
# Raw string keeps '\s' away from Python's own escape processing; '\s' already
# covers the newline, so '\n' does not need to be listed separately.
re.findall(r'[^a-zA-Z0-9\s]', text_to_search)

['(',
 ')',
 ':',
 '.',
 '^',
 '$',
 '*',
 '+',
 '?',
 '{',
 '}',
 '[',
 ']',
 '\\',
 '|',
 '(',
 ')',
 '.',
 '-',
 '-',
 '.',
 '.',
 '*',
 '*',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '.',
 '.',
 '.']

In [84]:
# Quantifiers: *, +, ?, {3}, {3,4}
# Patterns are raw strings so '\d' is handed to the regex engine unchanged, and
# print() is called with a single argument so the cell runs on Python 2 and 3.
print('digits with 3 or 4 chars')
# {3,4} is greedy: it takes 4 digits when available, otherwise 3.
print(re.findall(r'\d{3,4}', text_to_search))

print('\nTelephone numbers')
# Three digits, one separator, three digits, one separator, four digits.
print(re.findall(r'\d{3}[.-]\d{3}[.-]\d{4}', text_to_search))

digits with 3 or 4 chars
['1234', '5678', '321', '555', '4321', '123', '555', '1234', '123', '555', '1234', '800', '555', '1234', '900', '555', '1234', '321', '555', '4321']

Telephone numbers
['321-555-4321', '123.555.1234', '800-555-1234', '900-555-1234']


In [85]:
# Find names preceded by a title: 'M' + one of 'r'/'s', an optional '.',
# whitespace, then the name. Raw string keeps '\.', '\s' and '\w' intact.
# NOTE: [rs] matches exactly ONE character, so 'Mrs. Robinson' is not found by
# this pattern; r'M(?:rs?|s)\.?\s+\w+' would match 'Mr', 'Ms' and 'Mrs'.
re.findall(r'M[rs]\.?\s+\w+', text_to_search)

['Mr. Schafer', 'Mr Smith', 'Ms Davis', 'Mr. T']