In [1]:
import re

# Pattern

In [80]:
# Sample news sentence used as search text; kept as a single line (no newline
# characters) by joining adjacent string literals.
string = (
    "The Euro STOXX 600 index, which tracks all stock markets across Europe "
    "including the FTSE, fell by 11.48% – the worst day since it launched in 1998. "
    "The panic selling prompted by the coronavirus has wiped £2.7tn off the value "
    "of STOXX 600 shares since its all-time peak on 19 February."
)

A pattern uses the backslash ```\``` to introduce escape sequences, so raw strings ```r''``` are preferred (they stop Python from interpreting the backslash before the regex engine sees it):

```\w``` represents a word character (a letter, a digit or the underscore ```_```)

```\d``` represents a digit

```.``` represents any character except a newline

Braces ```{ }``` placed after a token give the number of times that token must repeat, e.g. ```\d{4}``` matches four consecutive digits and ```\d{4,}``` matches four or more.

In [54]:
# \d{4} -- exactly four digits in a row
pattern = r"\d{4}"

In [None]:
# \d{4,} -- a run of four or more digits (no upper limit)
pattern = r"\d{4,}"

In [None]:
# \w{4} -- four consecutive word characters: letters, digits or the
# underscore. Whitespace is not a word character, so it ends a run.
pattern = r"\w{4}"

In [None]:
# .{4} -- any four consecutive characters (letters, digits, underscore,
# whitespace, punctuation ...) as long as none of them is a newline \n
pattern = r".{4}"

In [None]:
# [A-Z]{4} -- four consecutive uppercase letters in the range A to Z
pattern = r"[A-Z]{4}"

In [None]:
# [a-z]{4} -- four consecutive lowercase letters in the range a to z
pattern = r"[a-z]{4}"

# Search

In [2]:
# Demo text: four-letter words alternating with four-digit numbers
string = "abcd 1234 efgh 5678"

In [3]:
# Four digits in a row
pattern = r"\d{4}"

In [4]:
re.search(pattern, string)  # returns the first occurrence only
# The span shown in the Match repr is a (start, end) pair that can be used
# directly as a slice of the searched string.

<re.Match object; span=(5, 9), match='1234'>

In [5]:
# Slicing with the span (5, 9) reported by the search gives back the matched text
string[5:9]

'1234'

In [6]:
# Four word characters (letters, digits or underscore) in a row
pattern = r"\w{4}"

In [7]:
re.search(pattern, string)  # returns the first occurrence only
# The span shown in the Match repr is a (start, end) pair that can be used
# directly as a slice of the searched string.

<re.Match object; span=(0, 4), match='abcd'>

In [8]:
# Slicing with the span (0, 4) reported by the search gives back the matched text
string[0:4]

'abcd'

In [9]:
# Four digits, one literal space, then four word characters
pattern = r"\d{4} \w{4}"

In [10]:
re.search(pattern, string)  # returns the first occurrence only
# The span shown in the Match repr is a (start, end) pair that can be used
# directly as a slice of the searched string.

<re.Match object; span=(5, 14), match='1234 efgh'>

In [11]:
# Slicing with the span (5, 14) reported by the search gives back the matched text
string[5:14]

'1234 efgh'

# Findall

In [61]:
# Demo text for the findall examples
string = "abcd 1234 efgh 5678"

In [62]:
pattern = r"\d{4}"  # four digits in a row

In [63]:
# findall returns every non-overlapping match as a list of strings,
# in the order they occur in the text
re.findall(pattern, string)

['1234', '5678']

In [64]:
pattern = r'\w{4}'  # 4 consecutive word characters (letters, digits or underscore), not only digits

In [65]:
# With \w{4} both the four-letter words and the four-digit numbers are returned
re.findall(pattern, string)

['abcd', '1234', 'efgh', '5678']

# Match

```re.match``` only matches at the beginning of the string; it returns ```None``` when the pattern does not match there.

In [15]:
# Demo text for the match examples; it starts with letters, not digits
string = "abcd 1234 efgh 5678"

In [16]:
# Four word characters (letters, digits or underscore) in a row
pattern = r"\w{4}"

In [17]:
# re.match only tries the pattern at position 0; the text starts with four
# word characters, so a Match object is returned
re.match(pattern, string)

<re.Match object; span=(0, 4), match='abcd'>

In [18]:
# Four digits in a row
pattern = r"\d{4}"

In [19]:
# The text does not start with four digits, so re.match returns None and the
# cell displays nothing (re.search would still find '1234' further along)
re.match(pattern, string)

# Full Match

The pattern must match the entire string, from the first character to the last.

In [20]:
# Demo text for the fullmatch examples (19 characters long)
string = "abcd 1234 efgh 5678"

In [21]:
# Four space-separated groups of four characters; the last group uses \w,
# which also accepts digits
pattern = r"\w{4} \d{4} \w{4} \w{4}"

In [24]:
# fullmatch succeeds only when the pattern accounts for the whole string,
# here all 19 characters
re.fullmatch(pattern, string)

<re.Match object; span=(0, 19), match='abcd 1234 efgh 5678'>

In [40]:
# Same 19-character demo text, matched next with a length-only pattern
string = "abcd 1234 efgh 5678"

In [41]:
# Exactly 19 characters of any kind except newline
pattern = r".{19}"

In [42]:
# The text is exactly 19 characters long with no newline, so .{19} matches it in full
re.fullmatch(pattern, string)

<re.Match object; span=(0, 19), match='abcd 1234 efgh 5678'>

# Split

In [43]:
# Demo text for the split examples; items are separated by single spaces
string = "abcd 1234 efgh 5678"

In [49]:
# \s stands for one whitespace character: the regular space as well as
# \n, \t and \r
pattern = r"\s"

In [50]:
# Plain str.split on a literal space, shown for comparison with re.split
string.split(' ')

['abcd', '1234', 'efgh', '5678']

In [51]:
# re.split cuts the string at every match of the pattern and returns the pieces as a list
re.split(pattern, string)

['abcd', '1234', 'efgh', '5678']

In [52]:
# A run of exactly three spaces, written as a repetition so the count is visible
pattern = " " * 3

In [53]:
# The separator never occurs in the text, so the whole string comes back
# unsplit as a one-item list
re.split(pattern, string)

['abcd 1234 efgh 5678']

# Sub

Substitute: replace the matches of a pattern with a replacement string.

In [69]:
# Demo text extended with two uppercase words for the substitution examples
string = "abcd 1234 efgh 5678 IJKL MNOP"

In [70]:
# A run of two or more uppercase letters A to Z
pattern = r"[A-Z]{2,}"

In [71]:
# Replacement text inserted in place of each match
repl = "Hello"

In [72]:
# With no count given, every match of the pattern is replaced
re.sub(pattern, repl, string)

'abcd 1234 efgh 5678 Hello Hello'

In [73]:
# Limit the number of replacements to the first match only.
# count is passed by keyword: passing it positionally is deprecated since
# Python 3.13 and is easy to confuse with the flags argument.
re.sub(pattern, repl, string, count=1)

'abcd 1234 efgh 5678 Hello MNOP'

In [74]:
# Allow at most two replacements; the demo text holds only two matches, so
# the result equals the unlimited substitution.
# count is passed by keyword: passing it positionally is deprecated since
# Python 3.13 and is easy to confuse with the flags argument.
re.sub(pattern, repl, string, count=2)

'abcd 1234 efgh 5678 Hello Hello'

# Subn

Returns a tuple containing the new string and the number of substitutions made.

In [75]:
# Demo text with two uppercase words for the subn example
string = "abcd 1234 efgh 5678 IJKL MNOP"

In [76]:
# A run of two or more uppercase letters A to Z
pattern = r"[A-Z]{2,}"

In [77]:
# Replacement text inserted in place of each match
repl = "Hello"

In [78]:
# subn performs the same replacement as sub but returns a tuple:
# (new_string, number_of_substitutions_made)
re.subn(pattern, repl, string)

('abcd 1234 efgh 5678 Hello Hello', 2)

# Group and Groups

In [79]:
# Demo text for the capture-group examples
string = "abcd 1234 efgh 5678 IJKL MNOP"

In [None]:
# Pattern with two capture groups:
#   group 1  (.+cd)       text ending in "cd" that follows a whitespace character
#   group 2  (\d\d\s.+)   two digits, a whitespace character and the text after
#                         them, up to but excluding the final character
# The raw-string prefix is written r'...' with no parenthesis after the r.
# NOTE(review): in 'abcd 1234 efgh 5678 IJKL MNOP' the only "cd" sits before
# the first whitespace, so re.search with this pattern returns None on that
# text -- confirm which text this pattern is meant to be run against.
pattern = r'.+\s(.+cd).+(\d\d\s.+).'

In [None]:
#result.groups()

In [None]:
#result.group(1)

In [None]:
#result.group(2)

In [None]:
#result.group(1, 2)

In [None]:
#result.start(1)

In [None]:
#result.end(1)

In [None]:
#string[result.start(1): result.end(1)]

In [None]:
#result.span(1)

# Flag

In [None]:
# re.I # ignore case

In [None]:
# Without a flag the search is case-sensitive: only lowercase 'the' is found.
# NOTE(review): on a top-to-bottom run `string` still holds
# 'abcd 1234 efgh 5678 IJKL MNOP', which contains no 'the'; this demo
# presumably targets the news sentence assigned near the top of the notebook
# -- confirm and re-assign `string` before this cell.
re.findall(r'the', string)

In [None]:
# re.I (re.IGNORECASE) makes the match case-insensitive, so 'The' and 'THE'
# are found as well as 'the'.
# NOTE(review): presumably meant to run on the news sentence rather than the
# 'abcd ...' demo text that `string` holds on a top-to-bottom run -- confirm.
re.findall(r'the', string, re.I)

In [None]:
# re.S # all including new line

In [None]:
# Without re.S the dot does not match a newline, so .+ stops at the end of
# the first line.
# NOTE(review): none of the sample strings in the visible cells contains a
# newline, so this call and the re.S variant return the same match; a
# multi-line sample would be needed to show the difference.
re.search(r'.+', string)

In [None]:
# re.S (re.DOTALL) lets the dot match newline characters too, so .+ can run
# across line breaks.
# NOTE(review): none of the sample strings in the visible cells contains a
# newline, so the result is the same as without the flag.
re.search(r'.+', string, re.S)

In [None]:
# re.X # verbose

In [None]:
# re.X (re.VERBOSE) ignores unescaped whitespace inside the pattern and treats
# "#" to end-of-line as a comment, so the expression can be spread over several
# annotated lines.
# The first token is \s (a whitespace character), not a literal "s": in the
# news sentence no lowercase "s" precedes "index", so a literal "s" could
# never match there.
# NOTE(review): on a top-to-bottom run `string` holds
# 'abcd 1234 efgh 5678 IJKL MNOP' at this point, not the news sentence this
# pattern is written for -- confirm and re-assign `string` before this cell.
re.search(r'''.+\s        # beginning of the string, up to a whitespace character
            (.+ex)      # group 1: searching for "index"
            .+          # middle of the string
            (\d\d\s.+). # group 2: date at the end, without the final character
            ''', string, re.X)
