# Regular Expressions

In [4]:
import re

In [2]:
# sample sentence containing a phone number to search for
text = "The phone number of the agent is 408-555-1234. Call soon!"

In [3]:
"408-555-1234" in text

True

In [5]:
# a pattern with no special characters simply matches that exact text
pattern = "phone"

In [7]:
# re.search(pattern, string) scans the string and returns a Match object
# for the first place the pattern matches (or None if there is no match)
re.search(pattern, text)

<re.Match object; span=(4, 9), match='phone'>

In [8]:
# keep the Match object so its position can be inspected
my_match = re.search(pattern, text)

In [11]:
my_match.span()  # (start, end) indices of the match; end is exclusive

(4, 9)

In [12]:
my_match.start()  # index where the match begins

4

In [13]:
my_match.end()  # index just past the last matched character

9

In [15]:
# new sample text in which the word "phone" appears twice
text = "My phone is a new phone"

In [16]:
# re.search stops at the first occurrence, even when there are several
match = re.search(pattern, text)

In [17]:
match.span()  # position of the first "phone" only

(3, 8)

In [18]:
# re.findall returns a list with every non-overlapping match as a string
all_matches = re.findall("phone", text)

In [19]:
len(all_matches)  # number of times "phone" occurs in text

2

In [21]:
# re.finditer yields one Match object per occurrence, so the position of
# every match is available (note: this loop rebinds the name `match`)
for match in re.finditer("phone", text):
    print(match.span())

(3, 8)
(18, 23)


In [23]:
# sample text with a phone number in ddd-ddd-dddd form
text = "My phone number is 777-555-1234"

In [24]:
text  # echo the string to check its contents

'My phone number is 777-555-1234'

In [25]:
# \d matches a single digit; the raw string (r'...') keeps the backslashes
# intact for the regex engine. Layout: 3 digits, dash, 3 digits, dash, 4 digits
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

In [26]:
# first substring of text that has the phone-number shape
phone_number = re.search(pattern, text)

In [27]:
phone_number  # display the Match object (span and matched text)

<re.Match object; span=(19, 31), match='777-555-1234'>

In [29]:
# .group() with no arguments returns the full matched text as a string
phone_number.group()

'777-555-1234'

## Quantifiers

In [30]:
# {n} repeats the preceding token exactly n times, so \d{3} means \d\d\d
pattern = r'\d{3}-\d{3}-\d{4}'

In [31]:
# the quantifier form finds the same phone number in text
re.search(pattern, text)

<re.Match object; span=(19, 31), match='777-555-1234'>

In [1]:
# parentheses create capture groups: the text matched by each (...) can be
# retrieved separately; groups are numbered from 1, left to right
pattern = r"(\d{3})-(\d{3})-(\d{4})"

In [2]:
# sample text with a phone number, used for the grouping example
text = "My phone number is 777-555-1234"

In [5]:
# Match object whose three groups hold the three runs of digits
mymatch = re.search(pattern, text)

In [10]:
# .group() (equivalent to .group(0)) returns the entire matched text
mymatch.group()

'777-555-1234'

In [13]:
# .group(1) returns only the text captured by the first pair of parentheses
mymatch.group(1)

'777'

In [17]:
# | is alternation (logical OR): the pattern matches if either side matches
# re.search(pattern, string being searched)
re.search(r"man|woman", "This woman was here")

<re.Match object; span=(5, 10), match='woman'>

In [19]:
# re.findall returns every non-overlapping match of the pattern
# . is a wildcard matching any single character (except a newline by default)
re.findall(r".at", "The cat in the hat sat")

['cat', 'hat', 'sat']

In [26]:
# ^ anchors the match to the start of the string: a digit in first position
re.findall(r"^\d", "1 is the loniest number")

['1']

In [27]:
# $ anchors the match to the end of the string: a digit in last position
re.findall(r"\d$", "This ends with a number 2")

['2']

In [29]:
# sample sentence with digits mixed in among the words
phrase = "there are 3 numbers inside 5 this sentence"

In [34]:
# ^ as the first character inside [] negates the set:
# [^\d] matches any single character that is NOT a digit
re.findall(r"[^\d]", phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [40]:
# + means "one or more of the preceding item", so [^\d]+ matches each
# maximal run of consecutive non-digit characters instead of one at a time
re.findall(r"[^\d]+", phrase)

['there are ', ' numbers inside ', ' this sentence']

In [41]:
# sample sentence containing !, . and ? to be stripped out
test_phrase = "This is a string! but it has punctuation. How to remove it?"

In [45]:
# match runs of characters that are not !, ., ? or a space;
# this yields the words with punctuation and spaces left out
re.findall(r"[^!.? ]+", test_phrase)

['This',
 'is',
 'a',
 'string',
 'but',
 'it',
 'has',
 'punctuation',
 'How',
 'to',
 'remove',
 'it']

In [46]:
# keep the extracted words so they can be joined back into a string
mylist = re.findall(r"[^!.? ]+", test_phrase)

In [49]:
# join the list back into one string, with a single space between elements
' '.join(mylist)

'This is a string but it has punctuation How to remove it'

In [54]:
# sample text with two hyphenated words to extract
text = "Only find the hyphen-words. Where are the long-ish dash words?"

In [58]:
# hyphenated words: one or more word characters (\w: letters, digits,
# underscore), a literal dash, then one or more word characters.
# The brackets around \w are redundant; \w+-\w+ is the same pattern.
re.findall(r"[\w]+-[\w]+", text)

['hyphen-words', 'long-ish']