In [1]:
import re

# Find all occurrences with findall()

In [11]:
# Sample text with two digit runs; the second run is longer than 11 digits.
message = "Call me 01811563457 tomorrow, or at 0171871973546"

# Eleven consecutive digits, written as one \d per digit.
phoneNumRegex = re.compile(r'\d\d\d\d\d\d\d\d\d\d\d')

# findall() collects every non-overlapping match into a list of strings
# (so, despite its name, matchObject is a plain list here).
matchObject = re.findall(phoneNumRegex, message)
print(matchObject)

['01811563457', '01718719735']


# Use groups to capture the parts you are interested in

In [18]:
canadianPhoneNum = "My number is 415-555-4242"
# Each pair of parentheses is a capture group: group 1 is the first three
# digits, group 2 is the remaining ddd-dddd part.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search(canadianPhoneNum)  # Match object, or None if nothing matched
if mo is not None:  # identity comparison is the idiomatic None check (PEP 8)
    print(mo.group())   # whole match: 415-555-4242
    print(mo.group(1))  # first group: 415
    print(mo.group(2))  # second group: 555-4242

415-555-4242
415
555-4242


# escape special characters using backslash

In [24]:
# Parentheses normally start a group, so literal ones are escaped as \( and \).
phoneNumRegex = re.compile(r'\(\d\d\d\) \d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('My number is (415) 555-4242')
if mo is not None:  # search() returns None when the pattern is not found
    print(mo.group())

(415) 555-4242


# match one of many candidates

In [27]:
# The | inside the group means "any one of these alternatives".
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search("Batmobile lost a wheel")  # None if no alternative is found
if mo is not None:  # identity comparison is the idiomatic None check (PEP 8)
    # group() is the full match; group(1) is the alternative that was chosen.
    print(mo.group() + " found, matched: " + mo.group(1))

Batmobile found, matched: mobile


# Repetition in Regex

## ? -> optional (appears 0 or 1 times)

In [32]:
# (wo)? makes the "wo" group optional: it may be present once or not at all.
batRegex = re.compile(r'Bat(wo)?man')
mo = re.search(batRegex, "The adventure of Batman")
if mo is None:
    print('Pattern not matched')
else:
    print(mo.group())

Batman


In [44]:
# The leading three-digit group is optional, so a bare ddd-dddd number matches too.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
# search() reports only the first hit in the text.
matchObject = re.search(phoneRegex, "My phone number is 555-1234 or 415-555-1234")
if matchObject is None:
    print('No phone number found')
else:
    print(matchObject)

<re.Match object; span=(19, 27), match='555-1234'>


# * -> appears any number of times (zero or more)

In [45]:
# (wo)* allows the "wo" group to repeat any number of times, including zero.
batRegex = re.compile(r'Bat(wo)*man')
re.search(batRegex, 'The adventure of Batwowowoman')


<re.Match object; span=(17, 29), match='Batwowowoman'>

# + -> appears at least once (one or more)

In [47]:
# (wo)+ requires the "wo" group at least once, so plain "Batman" does not match.
batRegex = re.compile(r'Bat(wo)+man')
for sentence in ('The adventure of Batwowowoman', 'The adventure of Batman'):
    print(batRegex.search(sentence))


<re.Match object; span=(17, 29), match='Batwowowoman'>
None


# Escaping ? * +

In [50]:
# A backslash removes the special meaning of ?, * and + so they match literally;
# the trailing + then repeats the whole escaped "?*+" group.
regex = re.compile(r'(\?\*\+)+')
print(re.search(regex, "This sentence has ?*+?*+"))


<re.Match object; span=(18, 24), match='?*+?*+'>


## exactly {n}

In [51]:
# {3} demands exactly three consecutive repetitions of the group.
haRegex = re.compile(r'(HA){3}')
print(re.search(haRegex, 'He said, "HAHAHA"'))

<re.Match object; span=(10, 16), match='HAHAHA'>


## range of repetition {low,high}

In [56]:
# {3,5} accepts between three and five repetitions of "Ha", inclusive.
haRegex = re.compile(r'(Ha){3,5}')
print(re.search(haRegex, "He said HaHaHaHa"))

<re.Match object; span=(8, 16), match='HaHaHaHa'>


# greedy match
By default, regex quantifiers are greedy: they match as much text as possible.

In [57]:
# At the start of the string 123, 1234 and 12345 would all satisfy {3,5}.
# Quantifiers are greedy by default, so the longest candidate (12345) wins.
digitRegex = re.compile(r'(\d){3,5}')
print(re.search(digitRegex, '1234567890'))


<re.Match object; span=(0, 5), match='12345'>


In [60]:
# A ? placed right after the quantifier makes it lazy (non-greedy),
# so the shortest permitted repetition is taken instead of the longest.
digitRegex = re.compile(r'(\d){3,5}?')
print(re.search(digitRegex, '1234567890'))

<re.Match object; span=(0, 3), match='123'>


# more about findall method

In [67]:
# Note: this pattern has no capture groups at all.
phoneRegex = re.compile(
    r'\d\d\d-\d\d\d-\d\d\d\d'
)

In [73]:
# Sample text for the phone-number searches. The only substrings shaped like
# ddd-ddd-dddd are the two numbers on its final line.
text_to_search =  '''Canada has many three-digit area codes that you must dial depending on which province or territory you’re trying to reach. Some provinces have many different area codes because they have high populations.

    Alberta: 403, 587, 825, 780, 587, 825
    British Columbia: 604, 778, 236, 250, 778, 236
    Manitoba: 204, 431
    New Brunswick: 506
    Newfoundland: 709
    Northwest Territories: 867
    Nova Scotia: 902
    Nunavut: 867
    Ontario: 905, 289, 365, 519, 226, 548, 705, 249, 613, 343 , 807, 416, 647, 437
    Prince Edward Island: 902
    Quebec: 418, 581, 450, 579, 514, 438 , 819, 873
    Saskatchewan: 306, 639
    Yukon Territory: 867

For example, if calling a mobile phone from outside Canada, you would dial +61 4 and the 8-digit mobile number.
An example of a Canadian phone number

If you see: 890-444-4444, then dial +1 604-555-5555.'''

In [74]:

# When the pattern has no capture groups, findall() returns the matched
# strings themselves.
phoneNumbers = re.findall(phoneRegex, text_to_search)
print(phoneNumbers)

['890-444-4444', '604-555-5555']


In [76]:
# Unlike findall(), search() gives back a single Match object describing
# only the first match in the text.
phoneNumbers = re.search(phoneRegex, text_to_search)
print(phoneNumbers)

<re.Match object; span=(817, 829), match='890-444-4444'>


In [80]:
# Note: this version of the pattern has two capture groups.
phoneRegex = re.compile(
    r'(\d\d\d)-(\d\d\d-\d\d\d\d)'
)

In [82]:
# When the pattern has several capture groups, findall() returns one tuple
# per match, holding the text captured by each group.
phoneNumbers = re.findall(phoneRegex, text_to_search)
print(phoneNumbers)

[('890', '444-4444'), ('604', '555-5555')]


In [85]:
# Note: three capture groups here. Groups may nest: the outer pair of
# parentheses wraps the two inner groups and captures the whole number.
phoneRegex = re.compile(
    r'((\d\d\d)-(\d\d\d-\d\d\d\d))'
)

In [86]:
# findall() yields one tuple per match, with one entry for every capture
# group in the pattern.
phoneNumbers = re.findall(phoneRegex, text_to_search)
print(phoneNumbers)

[('890-444-4444', '890', '444-4444'), ('604-555-5555', '604', '555-5555')]


# Character class
<img src="4.png">

In [108]:
message = '10 policeman, 12 engineers, 50 doctors were transferred from District A to District B'
# \d+ = one or more digits, \s = one whitespace character, \w+ = one or more
# word characters; the outer group additionally captures the whole phrase.
professionRegex = re.compile(r'((\d+)\s(\w+))')
print(re.findall(professionRegex, message))

[('10 policeman', '10', 'policeman'), ('12 engineers', '12', 'engineers'), ('50 doctors', '50', 'doctors')]


In [90]:
# Square brackets define a character class: any single listed character matches.
vowelRegex = re.compile(r'[aeiouAEIOU]')
re.findall(vowelRegex, "Robocop eats baby food.")

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o']

In [91]:
# {2} after the class requires two vowels in a row.
vowelRegex = re.compile(r'[aeiouAEIOU]{2}')
re.findall(vowelRegex, "Robocop eats baby food.")

['ea', 'oo']

In [93]:
# A leading ^ negates the class: it matches any single character that is NOT
# a vowel (consonants, spaces and punctuation included).
vowelRegex = re.compile(r'[^aeiouAEIOU]')
re.findall(vowelRegex, "Robocop eats baby food.")

['R', 'b', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', ' ', 'f', 'd', '.']

# pattern matching at the beginning and at the end

In [94]:
# ^ anchors the pattern to the start of the string.
beginsWithHelloRegex = re.compile(r'^Hello')
re.search(beginsWithHelloRegex, "Hello There!")

<re.Match object; span=(0, 5), match='Hello'>

In [99]:
# $ anchors the pattern to the end of the string.
endsWithThereRegex = re.compile(r'There!$')
re.search(endsWithThereRegex, "Hello There!")

<re.Match object; span=(6, 12), match='There!'>

In [102]:
# With both ^ and $ the entire string must consist of digits.
allDigitsRegex = re.compile(r'^\d+$')
# The second sample ends in a single letter, so it is rejected.
for candidate in ("12345", "12345A"):
    print(allDigitsRegex.search(candidate))

<re.Match object; span=(0, 5), match='12345'>
None


# . (dot) any single character except newline

In [105]:
# The dot stands for exactly one character, so "flat" contributes only "lat".
atRegex = re.compile(r'.at')
mathResults = re.findall(atRegex, "The cat in the hat sat on the flat mat. ")
print(mathResults)

['cat', 'hat', 'sat', 'lat', 'mat']


# .* any character, any number of times
.* is greedy: it always tries to match as much as possible.

In [109]:
# Each (.*) captures whatever text sits in that position of the line.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
matchedNames = re.findall(nameRegex, "First Name: Farhan Last Name: Utshaw")
print(matchedNames)

[('Farhan', 'Utshaw')]
