In [1]:
#   Regular expressions are a powerful language for matching text patterns. This page gives a
#   basic introduction to regular expressions themselves sufficient for our Python exercises and 
#   shows how regular expressions work in Python. The Python "re" module provides regular expression support.
#   
#   In Python a regular expression search is typically written as:
#   
#     match = re.search(pat, str)
#   
#   The re.search() method takes a regular expression pattern and a string and searches for that pattern
#   within the string. If the search is successful, search() returns a match object; otherwise it returns None.
#   Therefore, the search is usually immediately followed by an if-statement to test if the search
#   succeeded, as shown in the following example which searches for the pattern 'word:' followed by 
#   a 3 letter word (details below):

In [3]:
import re

# NOTE: the name `str` shadows Python's built-in string type. It is kept
# because the rest of this notebook uses the same variable name.
str = 'an example word:cat!!'

# Look for the literal text 'word:' followed by exactly three word characters.
match = re.search(r'word:\w{3}', str)

# re.search() returns a match object on success and None on failure, so the
# result is truthy only when the pattern was found.
if match:
    # A single-argument print(...) prints the same text on Python 2 and 3.
    print('found result==> ' + match.group())  ## 'found result==> word:cat'
else:
    print('did not find')

found result==> word:cat


In [5]:
#The code match = re.search(pat, str) stores the search result in a variable named "match".
#Then the if-statement tests the match -- if true the search succeeded and match.group() is the 
#matching text (e.g. 'word:cat'). Otherwise if the match is false (None to be more specific),
#then the search did not succeed, and there is no matching text.
#
#The 'r' at the start of the pattern string designates a python "raw" string which passes through
#backslashes without change which is very handy for regular expressions (Java needs this feature badly!).
#I recommend that you always write pattern strings with the 'r' just as a habit.

In [8]:
#  Emails example:
#Suppose you want to find the email address inside the string 'purple alice-b@google.com monkey dishwasher'.
#We'll use this as a running example to demonstrate more regular expression features.
#Here's an attempt using the pattern r'\w+@\w+':

# Sample text for the email examples (the name `str` shadows the built-in;
# kept because the following cells refer to it).
str = 'purple alice-b@google.com monkey dishwasher'

# \w+ matches only word characters, so the match cannot extend across the
# '-' on the left of the @ or the '.' on the right.
match = re.search(r'\w+@\w+', str)
if match:
    # A single-argument print(...) prints the same text on Python 2 and 3.
    print(match.group())  ## 'b@google'

#The search does not get the whole email address in this case because the \w does not match
#the '-' or '.' in the address. We'll fix this using the regular expression features below.


b@google


In [10]:
#Square Brackets

#Square brackets can be used to indicate a set of chars, so [abc] matches 'a' or 'b' or 'c'. 
#The codes \w, \s etc. work inside square brackets too with the one exception that
#dot (.) just means a literal dot. For the emails problem, the square brackets are an easy way to 
#add '.' and '-' to the set of chars which can appear around the @ with the pattern r'[\w.-]+@[\w.-]+' 
#to get the whole email address:

# Restate the sample text (same value as in the previous cell) so this cell
# can be run on its own instead of relying on leftover kernel state.
str = 'purple alice-b@google.com monkey dishwasher'

# [\w.-] is a character set: word characters plus a literal '.' and '-'.
# The dash is placed last in the set so it is not read as a range.
match = re.search(r'[\w.-]+@[\w.-]+', str)
if match:
    # A single-argument print(...) prints the same text on Python 2 and 3.
    print(match.group())  ## 'alice-b@google.com'


#You can also use a dash to indicate a range, so [a-z] matches all lowercase letters.
#To use a dash without indicating a range, put the dash last, e.g. [abc-]

alice-b@google.com


In [11]:
#Group Extraction

#  The "group" feature of a regular expression allows you to pick out parts of the matching text.
#  Suppose for the emails problem that we want to extract the username and host separately. 
#  To do this, add parentheses ( ) around the username and host in the pattern, 
#  like this: r'([\w.-]+)@([\w.-]+)'. In this case, the parentheses do not change 
#  what the pattern will match, instead they establish logical "groups" inside of the match text.
#  On a successful search, match.group(1) is the match text corresponding to the 1st left parenthesis,
#  and match.group(2) is the text corresponding to the 2nd left parenthesis. 
#  The plain match.group() is still the whole match text as usual.
str = 'purple alice-b@google.com monkey dishwasher'

# Use a raw string (r'...') so the backslash in \w reaches the regex engine
# unchanged; a plain string literal treats '\w' as an invalid escape sequence.
# The two parenthesised groups capture the username and the host.
match = re.search(r'([\w.-]+)@([\w.-]+)', str)
if match:
    # A single-argument print(...) prints the same text on Python 2 and 3.
    print(match.group())   ## 'alice-b@google.com' (the whole match)
    print(match.group(1))  ## 'alice-b' (the username, group 1)
    print(match.group(2))  ## 'google.com' (the host, group 2)

alice-b@google.com
alice-b
google.com
