In [1]:
text ="The agent's phone number is 408-555-1234!"

In [2]:
"408-555-1234" in text

True

In [3]:
import re

In [4]:
pattern = "408-555-1234"

In [5]:
re.search(pattern, text)

<re.Match object; span=(28, 40), match='408-555-1234'>

In [6]:
pattern = "not in text"

In [7]:
# "not in text" does not occur in text, so re.search returns None and the cell displays nothing
re.search(pattern, text)

In [8]:
pattern = "408-555-1234"

In [10]:
match = re.search(pattern, text)

In [12]:
match.span()

(28, 40)

In [13]:
match.start()

28

In [14]:
match.end()

40

In [15]:
text ="The agent's phone number is 408-555-1234! He can be called on 408-555-1234 anytime"

In [16]:
pattern = "408-555-1234"

In [17]:
match = re.search(pattern, text) # find only the first instance of the pattern

In [18]:
match

<re.Match object; span=(28, 40), match='408-555-1234'>

In [19]:
matches = re.findall(pattern, text) # returns a list of all matches

In [20]:
matches

['408-555-1234', '408-555-1234']

In [21]:
len(matches)

2

In [23]:
# re.finditer yields one match object per occurrence (the same kind of object re.search returns)
for match in re.finditer(pattern,text):
    print(match)

<re.Match object; span=(28, 40), match='408-555-1234'>
<re.Match object; span=(62, 74), match='408-555-1234'>


In [24]:
# iterate over every match object and show where each occurrence sits in the text
for match in re.finditer(pattern,text):
    print(match.span()) # (start, end) position of this match

(28, 40)
(62, 74)


In [28]:
# iterate over every match object and show the text each one matched
for match in re.finditer(pattern,text):
    print(match.group()) # returns the matched text (not the pattern itself)

408-555-1234
408-555-1234


## Identifiers for Characters in Patterns

https://docs.python.org/3/howto/regex.html

Note: Alphanumeric includes underscores

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [30]:
text = "my phone number is 408-555-1234"

In [31]:
phone = re.search('408-555-1234', text)

In [32]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [38]:
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

In [39]:
phone = re.search(pattern, text)

In [40]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

## Quantifiers

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [49]:
pattern = r'\d{3}-\d{3}-\d{4}' # Use quantifiers to indicate repetition of a character

In [50]:
phone = re.search(pattern, text)

In [51]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [52]:
phone.group()

'408-555-1234'

## Groups

What if we wanted to do two tasks: find phone numbers, and also be able to quickly extract their area code (the first three digits)? We can use groups for any general task that involves grouping together regular expressions (so that we can later break them down).

Using the phone number example, we can separate groups of regular expressions using parentheses:


In [54]:
# compile the regex into a reusable pattern object
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
# each pair of parentheses marks a group inside the single compiled expression
# the groups can be retrieved individually, or the complete match as a whole

In [55]:
# phone_pattern is already compiled, so call its own search() method directly;
# this returns the same match object as re.search(phone_pattern, text)
results = phone_pattern.search(text)

In [56]:
results

<re.Match object; span=(19, 31), match='408-555-1234'>

In [58]:
results.group()

'408-555-1234'

In [59]:
results.groups()

('408', '555', '1234')

In [69]:
# extract area code only i.e. first group
results.group(1) # group indexing starts at 1; group() with no argument is the whole match

'408'

In [70]:
results.group(2)

'555'

In [71]:
results.group(3)

'1234'

## Additional Regex Syntax

### Or operator |

Use the pipe operator to have an **or** statement. For example:

In [72]:
re.search(r'cat', "the cat is here") # match

<re.Match object; span=(4, 7), match='cat'>

In [73]:
re.search(r'cat', "the dog is here") # no match

In [75]:
re.search(r'cat|dog', "the cat is here") # OR match

<re.Match object; span=(4, 7), match='cat'>

### The Wildcard Character

Use a "wildcard" as a placeholder that will match any character placed there. You can use a simple period **.** for this. For example:

In [76]:
re.findall(r'at', 'The cat in the hat sat there')

['at', 'at', 'at']

In [77]:
re.findall(r'.at', 'The cat in the hat sat there') # Using wildcard "."

['cat', 'hat', 'sat']

In [81]:
re.findall(r'...at', 'The cat in the hat sat went splat') # the wildcard also matches whitespace, hence 'e cat' and 'e hat'

['e cat', 'e hat', 'splat']

### Starts with and Ends With

We can use the **^** to signal starts with, and the **$** to signal ends with:

In [82]:
re.findall(r'^\d', "1 is a number") # starts with a number

['1']

In [83]:
re.findall(r'^\d', "one is a number") # string does not start with a digit, so nothing matches

[]

In [87]:
re.findall(r'\d$', "the number is 2") # ends with a number

['2']

In [88]:
re.findall(r'\d$', "the number is 222") # \d$ matches only the single final digit

['2']

In [89]:
re.findall(r'\d{3}$', "the number is 222") # ends with exactly three digits in a row

['222']

In [90]:
re.findall(r'\d{2}$', "the number is 222") # matches only the last two digits before the end

['22']

### Exclusion

To exclude characters, we can use the **^** symbol at the start of a set of brackets **[]**. The pattern then matches any character that is *not* listed inside the brackets. For example:

In [92]:
phrase = 'there are 3 numbers in 34 inside 5 this sentence'

In [94]:
# exclude using ^ and [ ]
pattern = r'[^\d]' # matches any single character that is not a digit

In [95]:
re.findall(pattern, phrase) # returns list of all characters that match regex pattern

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 'i',
 'n',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [96]:
"".join(re.findall(pattern, phrase))

'there are  numbers in  inside  this sentence'

In [97]:
pattern = r'[^\d]+' # + means occurs 1 or more times

In [98]:
re.findall(pattern, phrase) 

['there are ', ' numbers in ', ' inside ', ' this sentence']

In [99]:
"".join(re.findall(pattern, phrase))

'there are  numbers in  inside  this sentence'

In [100]:
# exclusion is a common way to remove punctuation from a sentence
test_phrase = "This is a string! But it has punctuation. How can we remove it?"

In [108]:
re.findall(r'[^!.?]+',test_phrase)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [107]:
"".join(re.findall(r'[^!.?]+',test_phrase))

'This is a string But it has punctuation How can we remove it'

In [110]:
re.findall(r'[^!.? ]+',test_phrase) # remove spaces as well

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [122]:
# fragments after the first keep their leading space, so joining with " " produces double spaces
" ".join(re.findall(r'[^!.?]+',test_phrase)) # join with a space

'This is a string  But it has punctuation  How can we remove it'

## Brackets for Grouping

As we showed above we can use brackets to group together options, for example if we wanted to find hyphenated words:

In [114]:
text= 'only find the hyphen-words in this seentence, but you dont know how long-ish they are'

In [119]:
re.findall(r'(\w+)-(\w+)',text)

[('hyphen', 'words'), ('long', 'ish')]

In [116]:
re.findall(r'\w+-\w+',text)

['hyphen-words', 'long-ish']

In [120]:
re.findall(r'[\w]+',text)

['only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'seentence',
 'but',
 'you',
 'dont',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [123]:
# [\w] is a character set containing only \w, so this finds the same matches as r'\w+-\w+'
re.findall(r'[\w]+-[\w]+',text)

['hyphen-words', 'long-ish']

## Parentheses for Multiple Options

If we have multiple options for matching, we can use parentheses to list out these options. For example:

In [124]:
# Sample sentences for finding words that start with 'cat' and end with one of several
# alternatives, e.g. 'fish', 'nap', 'erpillar' or 'claw'
text = 'Hello, would you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [125]:
re.search(r'cat(fish|nap|erpillar)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [126]:
re.search(r'cat(fish|nap|erpillar)', texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [127]:
re.search(r'cat(fish|nap|erpillar)', textthree)

<re.Match object; span=(26, 37), match='caterpillar'>

In [130]:
re.search(r'cat(fish|nap|claw)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [131]:
# 'caterpillar' does not continue with 'fish', 'nap' or 'claw', so this search finds no match (None)
re.search(r'cat(fish|nap|claw)', textthree)