## Detecting Phone Numbers: Without regex

In [122]:
def isPhoneNumber(text):
    """Return True when ``text`` has the exact shape ``DDD-DDD-DDDD``.

    The value must be exactly 12 characters long: three decimal digits, a
    hyphen, three decimal digits, a hyphen and four decimal digits.
    Anything else yields False.
    """
    # 'd' marks a position that must hold a decimal digit; '-' marks a
    # position that must hold a literal hyphen.
    layout = 'ddd-ddd-dddd'
    if len(text) != len(layout):
        return False

    # Walk the candidate and the layout side by side, left to right.
    for char, expected in zip(text, layout):
        if expected == 'd':
            if not char.isdecimal():
                return False
        elif char != '-':
            return False
    return True

In [123]:
# Show the label and the verdict on separate lines with a single print call.
print('415-555-4242 is a phone number:', isPhoneNumber('415-555-4242'), sep='\n')

415-555-4242 is a phone number:
True


In [124]:
# Show the label and the verdict on separate lines with a single print call.
print('Moshi moshi is a phone number:', isPhoneNumber('Moshi moshi'), sep='\n')

Moshi moshi is a phone number:
False


In [125]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'

# Slide a 12-character window across the message and keep every window that
# isPhoneNumber accepts. Windows near the end of the message are shorter
# than 12 characters and are therefore rejected.
phone_numbers = [
    window
    for window in (message[start:start + 12] for start in range(len(message)))
    if isPhoneNumber(window)
]

for phone_number in phone_numbers:
    print('Phone number found: ' + phone_number)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


## Detecting Phone Numbers: With regex

In [126]:
import re

In [127]:
# The following detects the first occurrence of a phone number in a string.
# Note: \d matches a single decimal digit.
# (These notes are comments rather than a bare triple-quoted string: in an
# ordinary, non-raw string literal "\d" is an invalid escape sequence, which
# recent Python versions warn about.)

phone_num_regex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# this time, we have two phone numbers that we want to detect
text = 'My cell number is 415-555-4242. My office phone is 415-555-4321'

# the search method returns only the first occurrence
match_object = phone_num_regex.search(text)

# search() returns None (not a match object) if there aren't any matches
if match_object is not None:
    print('Phone number found: ' + match_object.group())
else:
    print('No match found')

Phone number found: 415-555-4242


In [128]:
# shorter regex which does the same thing with less code!
phone_num_regex = re.compile(r'\d{3}-\d{3}-\d{4}')

text = 'My cell number is 415-555-4242. My office phone is 415-555-4321'

# the findall method returns all the occurrences in a string, as a list of
# strings (this pattern has no groups)
phone_numbers = phone_num_regex.findall(text)

# findall never returns None: when nothing matches it returns an empty list,
# so test the list's truthiness to detect the "no match" case
if phone_numbers:
    for phone_num in phone_numbers:
        print('Phone number found: ' + phone_num)
else:
    print('No match found')

Phone number found: 415-555-4242
Phone number found: 415-555-4321


In [129]:
# Parentheses inside the pattern create numbered groups
phone_num_regex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

text = 'My cell number is 415-555-4242. My office phone is 415-555-4321'

# search() stops at the first occurrence
match_object = phone_num_regex.search(text)

# group(0) is the entire match; group(1) and group(2) are the two
# parenthesised parts, in order
for group_index in (0, 1, 2):
    print(match_object.group(group_index))
print('')

# groups() hands back every captured group at once, as a tuple
captured = match_object.groups()
print(captured)

# The tuple can be unpacked straight into separate names
area_code, main_number = match_object.groups()

print(area_code)
print(main_number)

415-555-4242
415
555-4242

('415', '555-4242')
415
555-4242


In [130]:
# Escaped parentheses, \( and \), match literal parentheses in the text,
# while the unescaped ones still delimit the groups
phone_num_regex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')

text = 'My cell number is (415) 555-4242. My office phone is 415-555-4321'

# search() stops at the first occurrence
match_object = phone_num_regex.search(text)

# Print each captured group on its own line
print(*match_object.groups(), sep='\n')

(415)
555-4242


In [131]:
# The pipe operator matches any one of several alternative expressions
name_regex = re.compile(r'Thing 1|Thing 2')

# When both alternatives appear in the text, search() returns whichever
# occurs first
for sentence in ('Thing 1 and Thing 2', 'Thing 2 and Thing 1'):
    mo = name_regex.search(sentence)
    print(mo.group())

Thing 1
Thing 2


In [132]:
# A shared prefix ('Bat') followed by a group of alternative endings
# Note: if our text contains a |, we can escape it using \|

bat_regex = re.compile(r'Bat(man|mobile|copter|bat)')

mo = bat_regex.search('Batmobile lost a wheel')

# group(0) is the full matched text; group(1) is only the portion matched
# inside the first pair of parentheses
full_match, ending = mo.group(0, 1)
print(full_match)
print(ending)

Batmobile
mobile


In [133]:
# The ? makes the preceding group optional: it may appear zero or one time
bat_regex = re.compile(r'Bat(wo)?man')

for title in ('The Adventures of Batman', 'The Adventures of Batwoman'):
    mo = bat_regex.search(title)
    print(mo.group())

Batman
Batwoman


In [134]:
# An optional leading group lets the pattern match phone numbers with or
# without an area code
# Note: if our text contains a ?, we can escape it using \?
phone_regex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')

for sentence in ('My number is 415-555-4242', 'My number is 555-4242'):
    mo = phone_regex.search(sentence)
    print(mo.group())

415-555-4242
555-4242


In [135]:
# The asterisk matches zero or more repetitions of the preceding group
# Note: if our text contains a *, we can escape it using \*

bat_regex = re.compile(r'Bat(wo)*man')

titles = (
    'The Adventures of Batman',
    'The Adventures of Batwoman',
    'The Adventures of Batwowowowoman',
)
for title in titles:
    mo = bat_regex.search(title)
    print(mo.group())

Batman
Batwoman
Batwowowowoman


In [136]:
# we can match one or more with the +
# Unlike *, the group preceding + must appear at least once
# Note: if our text contains a +, we can escape it using \+

bat_regex = re.compile(r'Bat(wo)+man')

mo = bat_regex.search('The Adventures of Batwoman')
print(mo.group())

mo = bat_regex.search('The Adventures of Batwowowowoman')
print(mo.group())

# 'Batman' contains no 'wo', so search() returns None here.
# None is a singleton, so test for it with `is` rather than `==`.
mo = bat_regex.search('The Adventures of Batman')
print(mo is None)

Batwoman
Batwowowowoman
True


In [137]:
# we can match specific repetitions with curly brackets
# here we match exactly 3 repetitions of Ha
ha_regex = re.compile(r'(Ha){3}')

mo = ha_regex.search('HaHaHa')
print(mo.group())

# a single 'Ha' is too short, so search() returns None.
# None is a singleton, so test for it with `is` rather than `==`.
mo = ha_regex.search('Ha')
print(mo is None)

# the following matches 0 to 3 instances. Thus, Ha now works.
# Because zero repetitions are allowed, this pattern can also match the
# empty string, so search() with it never returns None.
ha_regex = re.compile(r'(Ha){,3}')

mo = ha_regex.search('Ha')
print(mo.group())

HaHaHa
True
Ha


In [138]:
text = 'Cell: 415-555-9999 Work: 212-555-0000'

phone_num_regex = re.compile(r'\d{3}-\d{3}-\d{4}')

# search() yields a match object describing the first hit only
mo = phone_num_regex.search(text)
first_number = mo.group()
print(first_number)

# findall() yields a list of the matched strings when the pattern
# contains no groups
mo = phone_num_regex.findall(text)
print(mo)

415-555-9999
['415-555-9999', '212-555-0000']


In [139]:
text = 'Cell: 415-555-9999 Work: 212-555-0000'

# This time the pattern contains two groups: area code and main number
phone_num_regex = re.compile(r'(\d{3})-(\d{3}-\d{4})')

# With groups in the pattern, findall() returns a list of tuples instead of
# a list of strings: one tuple per match, holding the text captured by each
# group in order
mo = phone_num_regex.findall(text)
print(mo)

[('415', '555-9999'), ('212', '555-0000')]


In [140]:
# grab some text from the nytimes frontpage

text = '''
The United States is adding fewer than 30,000 cases a day for the first time since June of last year, and deaths are as low as they’ve been since last summer. In much of the country, the virus outlook is improving.
Nearly 50 percent of Americans have received at least one vaccine shot, and though the pace has slowed, the share is still growing by about two percentage points per week.
“I think by June, we’re probably going to be at one infection per a hundred thousand people per day, which is a very low level,” Dr. Scott Gottlieb, former head of the Food and Drug Administration, said Sunday on the CBS program “Face the Nation.” The U.S. rate is now 8 cases per 100,000, down from 22 during the most recent peak, when new cases averaged about 71,000 on April 14.
The share of coronavirus tests coming back positive has fallen to below 3 percent for the first time since widespread testing began, and the number of hospitalized patients has fallen to the lowest point in 11 months, Dr. Eric Topol of the Scripps Research Translational Institute noted this week. For the first time since March 5 of last year, San Francisco General Hospital had no Covid-19 patients — “a truly momentous day,” Dr. Vivek Jain, an infectious disease physician at the hospital, said on Thursday.
Michigan, the state that reported one of the largest surges in the spring, has rapidly improved. About 1,400 cases were identified on Sunday, compared with about 7,800 cases a day in mid-April.
The virus remains dangerous in communities with low vaccination rates, and getting vaccines into these communities is crucial in continuing to curb the virus. As the virus continues to mutate, vaccines may need to be updated or boosters may need to be added.
The United States is reporting about 25,700 coronavirus cases daily, a 39 percent decrease from two weeks ago. Deaths are down 14 percent over the same period, to an average of 578 per day.
Since the Centers for Disease Control and Prevention issued guidance that said vaccinated people could forgo masks in most situations indoors and outside, states have followed suit.
Because of changing mask rules and guidance, people will need to rely on their own judgment in some circumstances, Dr. Gottlieb said Sunday. “We’re going to have to protect ourselves based on our own assessment of our risk and our own comfort,” he said
'''

# The regular expression \d+(?:,\d{3})*\s\w+ matches text with the following:
#
# - one or more numeric digits (\d+),
# - optionally followed by comma-separated groups of three digits
#   ((?:,\d{3})*), e.g. the ",000" in "30,000",
# - followed by a whitespace character (\s),
# - followed by one or more letter/digit/underscore characters (\w+).
#
# A pattern that starts with \d+,?\d+ would require at least two digits and
# silently skip single-digit counts such as "8 cases"; a leading \d+ with an
# optional thousands part matches those too.
#
# The findall() method returns all matching strings in a list.

text_regex = re.compile(r'\d+(?:,\d{3})*\s\w+')

text_regex.findall(text)

['30,000 cases',
 '50 percent',
 '8 cases',
 '22 during',
 '71,000 on',
 '3 percent',
 '11 months',
 '5 of',
 '19 patients',
 '1,400 cases',
 '7,800 cases',
 '25,700 coronavirus',
 '39 percent',
 '14 percent',
 '578 per']

In [141]:
# sub() performs string substitution: every match of the pattern is
# replaced by the given replacement text
names_regex = re.compile(r'Agent \w+')

sentence = 'Agent Alice gave the secret documents to Agent Bob.'
names_regex.sub('CENSORED', sentence)


'CENSORED gave the secret documents to CENSORED.'

In [142]:
# Group 1 captures only the first letter of the name; \1 in the replacement
# text inserts that captured letter back, followed by the asterisks
names_regex = re.compile(r'Agent (\w)\w*')

sentence = 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.'
names_regex.sub(r'\1****', sentence)

'A**** told C**** that E**** knew B**** was a double agent.'

In [143]:
# Writing our regex as follows increases clarity: with re.VERBOSE, whitespace
# and #-comments inside the pattern are ignored, so each part can sit on its
# own annotated line.
#
# The extension keyword is written as ext\.?|x so that the dot is a literal,
# optional period ("ext" or "ext."). An unescaped dot would match any
# character, accepting text such as "extZ 213" as an extension.

phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))?            # area code
(\s|-|\.)?                    # separator
\d{3}                         # first 3 digits
(\s|-|\.)                     # separator
\d{4}                         # last 4 digits
(\s*(ext\.?|x)\s*\d{2,5})?    # extension
)''', re.VERBOSE)

phoneRegex.search('My office number is 410-423-2323 ext 213')

<re.Match object; span=(20, 40), match='410-423-2323 ext 213'>

In [144]:
# Matching is case-sensitive by default. Pass the re.I option to enable
# case-insensitive matching.

robocop = re.compile(r'robocop', re.I)

sentences = (
    'RoboCop is part man, part machine, all cop.',
    'ROBOCOP protects the innocent.',
    'Al, why does your programming book talk about robocop so much?',
)

# group() returns the text exactly as it appears in each sentence, whatever
# its capitalisation
for sentence in sentences:
    print(robocop.search(sentence).group())

RoboCop
ROBOCOP
robocop


## Useful Links

[Regular Expressions Tutorial](https://www.guru99.com/python-regular-expressions-complete-tutorial.html#7)

[YouTube](https://www.youtube.com/watch?v=K8L6KVGG-7o)