# Regex

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# re.findall(pattern, string): returns a list of every match of pattern in string

In [3]:
# Literals: an ordinary character in the pattern matches itself
subject = 'abc'
# look for the letter a
re.findall(pattern=r'a', string=subject)

['a']

In [4]:
# find the letter d (subject has no d, so the result is an empty list)
re.findall(r'd', subject)

[]

In [5]:
# A longer sentence to search through
subject = 'Mary had a little lamb. 1x little lamb. Not 10 lambs, not 12, not 22, just one ~'
# Matching is case-sensitive by default, so 'not' and 'Not' are two separate searches
re.findall(pattern=r'not', string=subject), re.findall(pattern=r'Not', string=subject)


(['not', 'not'], ['Not'])

In [6]:
# regex flag: re.IGNORECASE makes the match case-insensitive, so 'Not' is found as well
regexp= r'not'
re.findall(regexp,subject,re.IGNORECASE)

['Not', 'not', 'not']

In [7]:
# find lamb (the third hit is the 'lamb' inside 'lambs')
re.findall(r'lamb',subject,re.IGNORECASE)

['lamb', 'lamb', 'lamb']

# Metacharacters

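* \w: any letter, digit, or underscore (a "word" character)
* \W: anything that is not a letter, digit, or underscore
* \d: any digit
* \D: anything that is not a digit
* \s: any whitespace character (space, tab, newline, ...)
* . : any character except a newline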

In [8]:
# Sample text mixing letters, digits, punctuation and a space
subject = 'abcccC. 123!w'
# \w picks out every letter and digit, one character per match
re.findall(pattern=r'\w', string=subject)

['a', 'b', 'c', 'c', 'c', 'C', '1', '2', '3', 'w']

In [9]:
# six \w in a row: matches a run of six consecutive letters/digits
re.findall(r'\w\w\w\w\w\w', subject)

['abcccC']

In [10]:
# try \W: every character that is not a letter or digit (here the punctuation and the space)
re.findall(r'\W', subject)

['.', ' ', '!']

In [11]:
# three \d in a row: matches three consecutive digits
re.findall(r'\d\d\d',subject)

['123']

In [12]:
# try \D: every character that is not a digit
re.findall(r'\D',subject)

['a', 'b', 'c', 'c', 'c', 'C', '.', ' ', '!', 'w']

In [13]:
# try \s: whitespace characters (here the single space)
re.findall(r'\s',subject)

[' ']

In [14]:
# try . : matches every character of subject, one character per match
re.findall(r'.',subject)

['a', 'b', 'c', 'c', 'c', 'C', '.', ' ', '1', '2', '3', '!', 'w']

Mini-Exercise
* Match the string 'c 1' using only metacharacters
* The returned list should have only one element in it
* Find 3 different syntax combinations

In [15]:
subject = 'c 1'
# baseline: a purely literal pattern (the metacharacter answers follow in the next cells)
re.findall(r'c 1',subject)

['c 1']

In [16]:
# any character, then a non-letter/digit (the space), then a digit
re.findall(r'.\W\d',subject)

['c 1']

In [17]:
# any three characters
re.findall(r'...',subject)

['c 1']

In [18]:
# a letter/digit, then whitespace, then a digit
re.findall(r'\w\s\d',subject)

['c 1']

In [19]:
# a non-digit, then whitespace, then a digit
re.findall(r'\D\s\d', subject)

['c 1']

In [20]:
# a letter/digit, then a non-letter/digit, then a letter/digit
re.findall(r'\w\W\w',subject)

['c 1']

# Metacharacters

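* \w: any letter, digit, or underscore (a "word" character)
* \W: anything that is not a letter, digit, or underscore
* \d: any digit
* \D: anything that is not a digit
* \s: any whitespace character (space, tab, newline, ...)
* . : any character except a newline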

# Repeating
* {}: custom number of repetitions
     * {x}: exactly x repetitions
     * {x,}: x or more
     * {x,y}: between x and y repetitions
* *: zero or more
* +: one or more
* ?: optional (zero or one); placed after another quantifier (e.g. `+?`, `*?`) it makes that quantifier non-greedy

In [21]:
# sample text for the repetition examples
subject = 'ccccc! 123 ccc! 99!'

In [22]:
# find the whole string: . (any character) repeated one or more times
re.findall(r'.+',subject)

['ccccc! 123 ccc! 99!']

In [23]:
# find all the c groups: exactly three c's ({3}) versus three or more c's ({3,})
re.findall(r'c{3}',subject), re.findall(r'c{3,}',subject)

(['ccc', 'ccc'], ['ccccc', 'ccc'])

In [24]:
# find 123 and anything after it 

In [25]:
# show the current subject before searching it
subject

'ccccc! 123 ccc! 99!'

In [26]:
# 123: the literal characters 123
# .  : any character
# +  : repeat the preceding . one or more times, i.e. everything after 123
re.findall(r'123.+', subject)

['123 ccc! 99!']

In [27]:
# find the exclamation points, and everything in between the first two
# (.+? is non-greedy, so the match stops at the nearest following !)
re.findall(r'!',subject), re.findall(r'!.+?!',subject)

(['!', '!', '!'], ['! 123 ccc!'])

Mini Exercise
From the below string, find the following information:

* Find all the numbers
* Find the number that has exactly 5 digits
* Find numbers that have 4 or more digits
* Find the sentences contained in quotes
* Find http:// or https://

In [28]:
subject = '''
Codeup, founded in 2014, is located at 600 Navarro St. 
Suite 350, San Antonio, TX 78230. 
tagline: "launch your career in tech!" 
You can find us online at http://codeup.com 
and our alumni portal is located at https://alumni.codeup.com
and "codeup is a great school"! 100,000
'''

# Collapse the multi-line text onto a single line. Splitting on whitespace and
# re-joining with one space keeps a separator at every line break; simply deleting
# '\n' glues 'codeup.com' to the following 'and' because that line has no trailing space.
subject = ' '.join(subject.split())
subject

'Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, San Antonio, TX 78230. tagline: "launch your career in tech!" You can find us online at http://codeup.com and our alumni portal is located at https://alumni.codeup.com and "codeup is a great school"! 100,000'

In [29]:
# find all numbers: runs of one or more digits (the comma splits 100,000 into two matches)
re.findall(r'\d+',subject)

['2014', '600', '350', '78230', '100', '000']

In [30]:
# find a number that has exactly 5 digits
# \b on both sides (word boundary, covered in the Anchors section) stops \d{5}
# from matching the first five digits of a longer number
re.findall(r'\b\d{5}\b',subject)

['78230']

In [31]:
# find numbers that have 4 or more digits
re.findall(r'\d{4,}',subject)

['2014', '78230']

In [32]:
# find the sentences contained in quotes
# (.+? is non-greedy, so each match stops at the nearest closing quote)
re.findall(r'".+?"',subject)

['"launch your career in tech!"', '"codeup is a great school"']

In [33]:
# find http:// or https://
# Five ways to write it, gathered into ONE parenthesised tuple so that every result
# is displayed. Written as separate lines each ending in a comma, every line is its
# own statement and the notebook only echoes the last one.
(
    re.findall(r'http://|https://', subject),  # explicit alternation
    re.findall(r'htt.+?//', subject),          # non-greedy: one or more characters up to the nearest //
    re.findall(r'http.*?//', subject),         # non-greedy: zero or more characters up to the nearest //
    re.findall(r'https?://', subject),         # optional s (the most precise form)
    re.findall(r'https*://', subject),         # zero or more s
)

(['http://', 'https://'],
 ['http://', 'https://'],
 ['http://', 'https://'],
 ['http://', 'https://'],
 ['http://', 'https://'])

# Metacharacters

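* \w: any letter, digit, or underscore (a "word" character)
* \W: anything that is not a letter, digit, or underscore
* \d: any digit
* \D: anything that is not a digit
* \s: any whitespace character (space, tab, newline, ...)
* . : any character except a newline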


# Repeating
* {}: custom number of repetitions
     * {x}: exactly x repetitions
     * {x,}: x or more
     * {x,y}: between x and y repetitions
* *: zero or more
* +: one or more
* ?: optional (zero or one); placed after another quantifier (e.g. `+?`, `*?`) it makes that quantifier non-greedy


# Any of / None of 

* []: matches any single character listed inside the brackets
* [^]: matches any single character NOT listed inside the brackets
* [-]: matches any single character in the given range (e.g. [2-4], [A-Z])

In [34]:
# match using brackets: sample text for the character-set examples
subject = 'abc 123745 1bc'

In [36]:
# find a or b: alternation (a|b) and a character set ([ab]) give the same result
re.findall(r'a|b',subject), re.findall(r'[ab]',subject)

(['a', 'b', 'b'], ['a', 'b', 'b'])

In [37]:
# find characters that are NOT a or b
re.findall(r'[^ab]',subject)

['c', ' ', '1', '2', '3', '7', '4', '5', ' ', '1', 'c']

In [38]:
# find digits that are between 2 and 4 (inclusive)
re.findall(r'[2-4]',subject)

['2', '3', '4']

Anchors
* ^: start of the string
* $: end of the string
* \b: word boundary

In [49]:
# sample text for the anchor examples: eight space-separated words
subject = 'kiwi aardvark banana codeup data science academy extra'

In [44]:
# first attempt at "all words that start with a vowel": with no anchor the pattern
# starts at ANY vowel, so it also returns the tails of words ('iwi', 'anana', ...)
re.findall(r'[aeiou]\w+',subject)

['iwi', 'aardvark', 'anana', 'odeup', 'ata', 'ience', 'academy', 'extra']

In [45]:
# using a boundary: \b forces the vowel to sit at the start of a word
re.findall(r'\b[aeiou]\w+',subject)

['aardvark', 'academy', 'extra']

In [50]:
# using a caret: ^ anchors to the start of the whole string, which begins with 'k',
# so nothing matches
re.findall(r'^[aeiou]\w+',subject)

[]

In [58]:
# show the current subject before splitting it into words
subject

'kiwi aardvark banana codeup data science academy extra'

In [57]:
# split on whitespace into a list with one word per element
subjects = subject.split()
subjects

['kiwi', 'aardvark', 'banana', 'codeup', 'data', 'science', 'academy', 'extra']

In [59]:
# with one word per string, ^ now anchors to the start of each word
# ([aioue] is the same set of vowels as [aeiou]; order inside [] does not matter)
for sub in subjects:
    print(re.findall(r'^[aioue]\w+',sub))

[]
['aardvark']
[]
[]
[]
[]
['academy']
['extra']


In [60]:
# match all words that end with a vowel (\b after the vowel marks the end of the word)
re.findall(r'\w+[aeiou]\b',subject)

['kiwi', 'banana', 'data', 'science', 'extra']

In [61]:
# match the word extra: $ anchors to the end of the string, so only the last word qualifies
re.findall(r'\w+[aeiou]$',subject)

['extra']

In [62]:
# match kiwi and aardvark: first letter k or a, second letter i or a, then the rest of the word
re.findall(r'[ka][ia]\w+',subject)

['kiwi', 'aardvark']

In [65]:
# alternation: a literal i, OR a j followed by a space; subject contains no j,
# so only the three i's are returned
re.findall(r'i|j ',subject)

['i', 'i', 'i']

Mini Exercise

Write regular expressions to find the following values

* Find any even digits (regardless of whether they are part of a bigger number)
* Find entire numbers that are even
* Find 2 or more odd digits in a row
* Find all the capital letters
* Find all words that start with a capital letter

In [67]:
subject = '''
Codeup, founded in 2014, is located at 600 Navarro St. 
Suite 350, San Antonio, TX 78230. 
tagline: "launch your career in tech!" 
You can find us online at http://codeup.com 
and our alumni portal is located at https://alumni.codeup.com
and "codeup is a great school"!
'''

# Collapse the multi-line text onto a single line. Splitting on whitespace and
# re-joining with one space keeps a separator at every line break; simply deleting
# '\n' glues 'codeup.com' to the following 'and' because that line has no trailing space.
subject = ' '.join(subject.split())
subject

'Codeup, founded in 2014, is located at 600 Navarro St. Suite 350, San Antonio, TX 78230. tagline: "launch your career in tech!" You can find us online at http://codeup.com and our alumni portal is located at https://alumni.codeup.com and "codeup is a great school"!'

In [68]:
# find any even digits (regardless of whether they are part of a bigger number)
re.findall(r'[24680]',subject)

['2', '0', '4', '6', '0', '0', '0', '8', '2', '0']

In [70]:
# find entire numbers that are even: any leading digits, then an even digit that is
# the last character of the number (\b right after it)
re.findall(r'\d*[24680]\b',subject)

['2014', '600', '350', '78230']

In [72]:
# find 2 or more odd digits in a row (regardless of whether they are part of a bigger number)
# the first pattern matches exactly two odd digits; the second ({2,}) matches two or more
re.findall(r'[13579][13579]',subject), re.findall(r'[13579]{2,}',subject)

(['35'], ['35'])

In [73]:
# find all the capital letters (any single character in the range A-Z)
re.findall(r'[A-Z]',subject)

['C', 'N', 'S', 'S', 'S', 'A', 'T', 'X', 'Y']

In [74]:
# find all words that start with a capital letter
# \b pins the capital to the start of a word (so a capital in the middle of a word
# is not picked up) and \w* also accepts one-letter words
re.findall(r'\b[A-Z]\w*',subject)

['Codeup', 'Navarro', 'St', 'Suite', 'San', 'Antonio', 'TX', 'You']

In [None]:
# Capture Group