In [None]:
# REGEX IN PYTHON
s = 'foo123bar'
print('123' in s)

True


In [None]:
# If you want to know not only whether '123' exists in s but also where it exists, 
# then you can use .find() or .index().
#  Each of these returns the character position within s where the substring resides:
print(s.find('123'))
print(s.index('123'))

3
3


In [None]:
import re
# For example, rather than searching for a fixed substring like '123',
#  suppose you wanted to determine whether a string contains any three consecutive decimal 
# digit characters, as in the strings 'foo123bar', 'foo456bar', '234baz', and 'qux678'.

# Strict character comparisons won’t cut it here. This is where regexes in Python 
# come to the rescue.
s = 'foo123bar'
print(re.search('123', s))

if re.search('123', s):
     print('Found a match.')
else:
     print('No match.')


# span=(3, 6) indicates the portion of <string> in which the match was found. 
# This means the same thing as it would in slice notation:

<re.Match object; span=(3, 6), match='123'>
Found a match.


In [None]:
print(s[3:6])

123


In [None]:
print(re.search('[0-9][0-9][0-9]', 'foo456bar'))

<re.Match object; span=(3, 6), match='456'>


In [None]:
print(re.search('[0-9][0-9][0-9]', '234baz'))
print(re.search('[0-9][0-9][0-9]', 'qux678'))
print(print(re.search('[0-9][0-9][0-9]', '12foo34')))


<re.Match object; span=(0, 3), match='234'>
<re.Match object; span=(3, 6), match='678'>
None
None


In [None]:
# The dot (.) metacharacter matches any character except a newline, 
# so it functions like a wildcard
s = 'foo123bar'
print(re.search('1.3', s))


s = 'foo13bar'
print(re.search('1.3', s))

<re.Match object; span=(3, 6), match='123'>
None


In [None]:
re.search('ba[artz]', 'foobarqux')

<re.Match object; span=(3, 6), match='bar'>

In [None]:
re.search('ba[artz]', 'foobazqux')

<re.Match object; span=(3, 6), match='baz'>

In [None]:
re.search('[a-z]', 'FOObar')

<re.Match object; span=(3, 4), match='b'>

In [None]:
re.search('[0-9][0-9]', 'foo123bar')

<re.Match object; span=(3, 5), match='12'>

In [None]:
# [0-9a-fA-F] matches any hexadecimal digit character:
re.search('[0-9a-fA-f]', '--- a0 ---')

<re.Match object; span=(4, 5), match='a'>

In [None]:
# [^0-9] matches any character that isn’t a digit:
re.search('[^0-9]', '12345foo')

<re.Match object; span=(5, 6), match='f'>

In [None]:
# If a ^ character appears in a character class but isn’t the first character,
#  then it has no special meaning and matches a literal '^' character:
re.search('[#:^]', 'foo^bar:baz#qux')



<re.Match object; span=(3, 4), match='^'>

In [None]:
re.search('[-abc]', '123-456')


<re.Match object; span=(3, 4), match='-'>

In [None]:
re.search('[abc-]', '123-456')

<re.Match object; span=(3, 4), match='-'>

In [None]:
re.search('[ab\-c]', '123-456')

<re.Match object; span=(3, 4), match='-'>

In [None]:
re.search('[]]', 'foo[1]')

<re.Match object; span=(5, 6), match=']'>

In [None]:
re.search('[ab\]cd]', 'foo[1]')

<re.Match object; span=(5, 6), match=']'>

In [None]:
# Other regex metacharacters lose their special meaning inside a character class:
re.search('[)*+|]', '123*456')

<re.Match object; span=(3, 4), match='*'>

In [None]:
re.search('[)*+|]', '123+456')

<re.Match object; span=(3, 4), match='+'>

In [None]:
# The . metacharacter matches any single character except a newline:
re.search('foo.bar', 'fooxbar')

<re.Match object; span=(0, 7), match='fooxbar'>

In [None]:
print(re.search('foo.bar', 'foobar'))
print(re.search('foo.bar', 'foo\nbar'))

None
None


In [None]:
# \w matches any alphanumeric word character. Word characters are
#  uppercase and lowercase letters, digits, and the underscore (_) character,
#  so \w is essentially shorthand for [a-zA-Z0-9_]:
re.search('\w', '#(.a$@&')

<re.Match object; span=(3, 4), match='a'>

In [None]:
re.search('[a-zA-Z0-9_]', '#(.a$@&')

<re.Match object; span=(3, 4), match='a'>

In [None]:
# \W is the opposite. It matches any non-word character and is equivalent to [^a-zA-Z0-9_]:
re.search('\W', 'a_1*3Qb')

<re.Match object; span=(3, 4), match='*'>

In [None]:
re.search('[^a-zA-Z0-9_]', 'a_1*3Qb')

<re.Match object; span=(3, 4), match='*'>

In [None]:
# \d matches any decimal digit character. \D is the opposite. It matches any character that isn’t a decimal digit:
re.search('\d', 'abc4def')

<re.Match object; span=(3, 4), match='4'>

In [None]:
# \d is essentially equivalent to [0-9], and \D is equivalent to [^0-9].
re.search('\D', '234Q678')

<re.Match object; span=(3, 4), match='Q'>

In [None]:
# \s matches any whitespace character:
re.search('\s', 'foo\nbar baz')

<re.Match object; span=(3, 4), match='\n'>

In [None]:
# \S is the opposite of \s. It matches any character that isn’t whitespace:
re.search('\S', '  \n foo  \n  ')

<re.Match object; span=(4, 5), match='f'>

In [None]:
# backslash (\) Removes the special meaning of a metacharacter.
print(re.search('.', 'foo.bar'))


print(re.search('\.', 'foo.bar'))

<re.Match object; span=(0, 1), match='f'>
<re.Match object; span=(3, 4), match='.'>


In [None]:
s = r'foo\bar'
print(s)

foo\bar


In [None]:
re.search('\\', s)

error: ignored

In [None]:
re.search(r'\\', s)

<re.Match object; span=(3, 4), match='\\'>

In [None]:
# regex ^foo stipulates that 'foo' must be present not just any old place in the search string, but at the beginning:
re.search('^foo', 'foobar')

<re.Match object; span=(0, 3), match='foo'>

In [None]:
print(re.search('^foo', 'barfoo'))

None


In [None]:
re.search('\Afoo', 'foobar')

<re.Match object; span=(0, 3), match='foo'>

In [None]:
print(re.search('\Afoo', 'barfoo'))

None


In [None]:
# Whatever precedes $ or \Z must constitute the end of the search string:
re.search('bar$', 'foobar')

<re.Match object; span=(3, 6), match='bar'>

In [None]:
print(re.search('bar$', 'barfoo'))

None


In [None]:
re.search('bar\Z', 'foobar')

<re.Match object; span=(3, 6), match='bar'>

In [None]:
print(re.search('bar\Z', 'barfoo'))

None


In [None]:
# \b Anchors a match to a word boundary.
print(re.search(r'\bbar', 'foo bar'))

print(re.search(r'\bbar', 'foo.bar'))


print(re.search(r'\bbar', 'foobar'))


print(re.search(r'foo\b', 'foo bar'))

print(re.search(r'foo\b', 'foo.bar'))

print(re.search(r'foo\b', 'foobar'))


<re.Match object; span=(4, 7), match='bar'>
<re.Match object; span=(4, 7), match='bar'>
None
<re.Match object; span=(0, 3), match='foo'>
<re.Match object; span=(0, 3), match='foo'>
None


In [None]:
re.search(r'\bbar\b', 'foo bar baz')


<re.Match object; span=(4, 7), match='bar'>

In [None]:
re.search(r'\bbar\b', 'foo(bar)baz')

<re.Match object; span=(4, 7), match='bar'>

In [None]:
print(re.search(r'\bbar\b', 'foobarbaz'))

None


In [None]:
# a* matches zero or more 'a' characters. That means it would match an empty string, 'a', 'aa', 'aaa', and so on.
re.search('foo-*bar', 'foobar')                     # Zero dashes

<re.Match object; span=(0, 6), match='foobar'>

In [None]:
re.search('foo-*bar', 'foo-bar')                    # One dash

<re.Match object; span=(0, 7), match='foo-bar'>

In [None]:
re.search('foo-*bar', 'foo--bar')                   # Two dashes

<re.Match object; span=(0, 8), match='foo--bar'>

In [None]:
# .* matches everything between 'foo' and 'bar':
re.search('foo.*bar', '# foo $qux@grault % bar #')

<re.Match object; span=(2, 23), match='foo $qux@grault % bar'>

In [None]:
# This is similar to *, but the quantified regex must occur at least once:
print(re.search('foo-+bar', 'foobar'))              # Zero dashes

print(re.search('foo-+bar', 'foo-bar'))                 # One dash

print(re.search('foo-+bar', 'foo--bar'))                # Two dashes


None
<re.Match object; span=(0, 7), match='foo-bar'>
<re.Match object; span=(0, 8), match='foo--bar'>


In [None]:
# Again, this is similar to * and +, but in this case there’s only a match if the preceding regex occurs once or not at all:
print(re.search('foo-?bar', 'foobar'))                 # Zero dashes

print(re.search('foo-?bar', 'foo-bar'))                   # One dash

print(re.search('foo-?bar', 'foo--bar'))            # Two dashes

<re.Match object; span=(0, 6), match='foobar'>
<re.Match object; span=(0, 7), match='foo-bar'>
None


In [None]:
print(re.match('foo[1-9]*bar', 'foobar'))

print(re.match('foo[1-9]*bar', 'foo42bar'))


print(re.match('foo[1-9]+bar', 'foobar'))

print(re.match('foo[1-9]+bar', 'foo42bar'))


print(re.match('foo[1-9]?bar', 'foobar'))

print(re.match('foo[1-9]?bar', 'foo42bar'))

<re.Match object; span=(0, 6), match='foobar'>
<re.Match object; span=(0, 8), match='foo42bar'>
None
<re.Match object; span=(0, 8), match='foo42bar'>
<re.Match object; span=(0, 6), match='foobar'>
None


In [None]:
re.search('<.*>', '%<foo> <bar> <baz>%')

<re.Match object; span=(1, 18), match='<foo> <bar> <baz>'>

In [None]:
re.search('<.*?>', '%<foo> <bar> <baz>%')

<re.Match object; span=(1, 6), match='<foo>'>

In [None]:
re.search('ba?', 'baaaa')

<re.Match object; span=(0, 2), match='ba'>

In [None]:
re.search('ba??', 'baaaa')

<re.Match object; span=(0, 1), match='b'>

In [None]:
# {m} Matches exactly m repetitions of the preceding regex.
print(re.search('x-{3}x', 'x--x'))                # Two dashes


print(re.search('x-{3}x', 'x---x'))                   # Three dashes


print(re.search('x-{3}x', 'x----x'))              # Four dashes

None
<re.Match object; span=(0, 5), match='x---x'>
None


In [None]:
# {m,n}  Matches any number of repetitions of the preceding regex from m to n, inclusive.

for i in range(1, 6):
    s = f"x{'-' * i}x"
    print(f'{i}  {s:10}', re.search('x-{2,4}x', s))

1  x-x        None
2  x--x       <re.Match object; span=(0, 4), match='x--x'>
3  x---x      <re.Match object; span=(0, 5), match='x---x'>
4  x----x     <re.Match object; span=(0, 6), match='x----x'>
5  x-----x    None


In [None]:
# In this case, a{3,5} produces the longest possible match, so it matches five 'a' characters.
#  a{3,5}? produces the shortest match, so it matches three.
print(re.search('a{3,5}', 'aaaaaaaa'))


print(re.search('a{3,5}?', 'aaaaaaaa'))



<re.Match object; span=(0, 5), match='aaaaa'>
<re.Match object; span=(0, 3), match='aaa'>


In [None]:
# (<regex>)Defines a subexpression or group.
print(re.search('(bar)', 'foo bar baz'))


print(re.search('bar', 'foo bar baz'))

<re.Match object; span=(4, 7), match='bar'>
<re.Match object; span=(4, 7), match='bar'>


In [None]:
print(re.search('(ba[rz]){2,4}(qux)?', 'bazbarbazqux'))

print(re.search('(ba[rz]){2,4}(qux)?', 'barbar'))


<re.Match object; span=(0, 12), match='bazbarbazqux'>
<re.Match object; span=(0, 6), match='barbar'>


In [None]:
print(re.search('(foo(bar)?)+(\d\d\d)?', 'foofoobar'))

print(re.search('(foo(bar)?)+(\d\d\d)?', 'foofoobar123'))

print(re.search('(foo(bar)?)+(\d\d\d)?', 'foofoo123'))

<re.Match object; span=(0, 9), match='foofoobar'>
<re.Match object; span=(0, 12), match='foofoobar123'>
<re.Match object; span=(0, 9), match='foofoo123'>


In [None]:
regex = r'(\w+),\1'

m = re.search(regex, 'foo,foo')
print(m)

print(m.group(1))


m = re.search(regex, 'qux,qux')
print(m)

print(m.group(1))


m = re.search(regex, 'foo,qux')
print(m)


<re.Match object; span=(0, 7), match='foo,foo'>
foo
<re.Match object; span=(0, 7), match='qux,qux'>
qux
None


In [None]:

m = re.search('(\w+),(\w+),(\w+)', 'foo,quux,baz')
print(m.groups())


print(m.group(1, 2, 3))

('foo', 'quux', 'baz')
('foo', 'quux', 'baz')


In [None]:
# reference the matched group by its given symbolic <name> instead of by its number.
m = re.search('(?P<w1>\w+),(?P<w2>\w+),(?P<w3>\w+)', 'foo,quux,baz')
print(m.groups())

print(m.group('w1', 'w2', 'w3'))

('foo', 'quux', 'baz')
('foo', 'quux', 'baz')


In [None]:
import re
# ^(###)? indicates that the search string optionally begins with '###'. If it does, 
# then the grouping parentheses around ### will create a group numbered 1. Otherwise, no such group will exist.
# The next portion, foo, literally matches the string 'foo'.
# Lastly, (?(1)bar|baz) matches against 'bar' if group 1 exists and 'baz' if it doesn’t.
regex = r'^(###)?foo(?(1)bar|baz)'
re.search(regex, '###foobar')
print(re.search(regex, '###foobaz'))
print(re.search(regex, 'foobar'))
re.search(regex, 'foobaz')

None
None


<re.Match object; span=(0, 6), match='foobaz'>

In [None]:
regex = r'^(?P<ch>\W)?foo(?(ch)(?P=ch)|)$'
print(re.search(regex, 'foo'))

print(re.search(regex, '#foo#'))

print(re.search(regex, '@foo@'))


print(re.search(regex, '#foo'))

print(re.search(regex, 'foo@'))

print(re.search(regex, '#foo@'))

print(re.search(regex, '@foo#'))


<re.Match object; span=(0, 3), match='foo'>
<re.Match object; span=(0, 5), match='#foo#'>
<re.Match object; span=(0, 5), match='@foo@'>
None
None
None
None


In [None]:
# The lookahead assertion (?=[a-z]) specifies that what follows 'foo' must be a lowercase alphabetic character. 
# In this case, it’s the character 'b', so a match is found.
print(re.search('foo(?=[a-z])', 'foobar'))
print(re.search('foo(?=[a-z])', 'foo123'))

<re.Match object; span=(0, 3), match='foo'>
None


# Searching Functions

In [2]:
import re
print(re.search(r'(\d+)', 'foo123bar'))

print(re.search(r'[a-z]+', '123FOO456', flags=re.IGNORECASE))


print(re.search(r'\d+', 'foo.bar'))

<re.Match object; span=(3, 6), match='123'>
<re.Match object; span=(3, 6), match='FOO'>
None


In [3]:
print(re.search(r'\d+', '123foobar'))

print(re.search(r'\d+', 'foo123bar'))


print(re.match(r'\d+', '123foobar'))

print(re.match(r'\d+', 'foo123bar'))

<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(3, 6), match='123'>
<re.Match object; span=(0, 3), match='123'>
None


In [4]:
s = 'foo\nbar\nbaz'

print(re.search('^foo', s))

print(re.search('^bar', s, re.MULTILINE))


<re.Match object; span=(0, 3), match='foo'>
<re.Match object; span=(4, 7), match='bar'>


In [5]:
print(re.fullmatch(r'\d+', '123foo'))

print(re.fullmatch(r'\d+', 'foo123'))

print(re.fullmatch(r'\d+', 'foo123bar'))

print(re.fullmatch(r'\d+', '123'))


print(re.search(r'^\d+$', '123'))

None
None
None
<re.Match object; span=(0, 3), match='123'>
<re.Match object; span=(0, 3), match='123'>


In [6]:
print(re.findall(r'#(\w+)#', '#foo#.#bar#.#baz#'))

['foo', 'bar', 'baz']


In [7]:
print(re.findall(r'\w+', '...foo,,,,bar:%$baz//|'))

['foo', 'bar', 'baz']


In [8]:
for i in re.finditer(r'\w+', '...foo,,,,bar:%$baz//|'):
    print(i)

<re.Match object; span=(3, 6), match='foo'>
<re.Match object; span=(10, 13), match='bar'>
<re.Match object; span=(16, 19), match='baz'>


# Substitution Functions

In [9]:
s = 'foo.123.bar.789.baz'

print(re.sub(r'\d+', '#', s))

print(re.sub('[a-z]+', '(*)', s))

foo.#.bar.#.baz
(*).123.(*).789.(*)


In [10]:
print(re.sub(r'(\w+),bar,baz,(\w+)',
       r'\2,bar,baz,\1',
       'foo,bar,baz,qux'))

qux,bar,baz,foo


# Substitution by Function

In [11]:
def f(match_obj):
    s = match_obj.group(0)  # The matching string
    # print(s)

    # s.isdigit() returns True if all characters in s are digits
    if s.isdigit():
        return str(int(s) * 10)
    else:
        return s.upper()

re.sub(r'\w+', f, 'foo.10.bar.20.baz.30')

foo
10
bar
20
baz
30


'FOO.100.BAR.200.BAZ.300'

### In this example, f() gets called for each match. As a result, re.sub() converts each alphanumeric portion of <string> to all uppercase and multiplies each numeric portion by 10.

In [12]:
re.sub(r'\w+', 'xxx', 'foo.bar.baz.qux')

'xxx.xxx.xxx.xxx'

In [13]:
re.sub(r'\w+', 'xxx', 'foo.bar.baz.qux', count=2)

'xxx.xxx.baz.qux'

# Utility Functions

In [14]:
re.split('\s*[,;/]\s*', 'foo,bar  ;  baz / qux')

['foo', 'bar', 'baz', 'qux']

In [15]:
re.split('(\s*[,;/]\s*)', 'foo,bar  ;  baz / qux')

['foo', ',', 'bar', '  ;  ', 'baz', ' / ', 'qux']

In [16]:
print(re.match('foo^bar(baz)|qux', 'foo^bar(baz)|qux'))
print(re.match('foo\^bar\(baz\)\|qux', 'foo^bar(baz)|qux'))

None
<re.Match object; span=(0, 16), match='foo^bar(baz)|qux'>


In [17]:
re.escape('foo^bar(baz)|qux') == 'foo\^bar\(baz\)\|qux'

True

In [18]:
re.match(re.escape('foo^bar(baz)|qux'), 'foo^bar(baz)|qux')

<re.Match object; span=(0, 16), match='foo^bar(baz)|qux'>

 # Compiled Regex Objects in Python

In [19]:
re.search(r'(\d+)', 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [20]:
re_obj = re.compile(r'(\d+)')

In [21]:
re.search(re_obj, 'foo123bar')

<re.Match object; span=(3, 6), match='123'>

In [22]:
re_obj.search('foo123bar')

<re.Match object; span=(3, 6), match='123'>

# Why Bother Compiling a Regex?

In [23]:
s1, s2, s3, s4 = 'foo.bar', 'foo123bar', 'baz99', 'qux & grault'

import re
print(re.search('\d+', s1))
print(re.search('\d+', s2))

print(re.search('\d+', s3))

print(re.search('\d+', s4))

None
<re.Match object; span=(3, 6), match='123'>
<re.Match object; span=(3, 5), match='99'>
None


In [24]:
s1, s2, s3, s4 = 'foo.bar', 'foo123bar', 'baz99', 'qux & grault'
regex = '\d+'

print(re.search(regex, s1))
print(re.search(regex, s2))

print(re.search(regex, s3))

print(re.search(regex, s4))

None
<re.Match object; span=(3, 6), match='123'>
<re.Match object; span=(3, 5), match='99'>
None
