# Regex Cheatsheet

## Regular expressions are used to match patterns in strings. The Python `re` module provides support for regular expressions in Python.

### The following are some of the most commonly used functions and character classes in the re module. 

In [1]:
# import the necessary packages
import re

### Containers

#### 1. [a-z], [A-Z], [0-9]
Matches any single character within the specified range:
- `[a-z]` matches any lowercase letter from `a` to `z`.
- `[A-Z]` matches any uppercase letter from `A` to `Z`.
- `[0-9]` matches any digit from `0` to `9`.

In [2]:
# Range-based character classes: each class matches ONE character that falls
# inside the given range.
text = "abcDEF123ghiJKL456"
pattern_lower = r'[a-z]'
pattern_upper = r'[A-Z]'
pattern_digit = r'[0-9]'

# findall returns every non-overlapping match, in the order it appears.
matches_lower = re.findall(pattern_lower, text)
matches_upper = re.findall(pattern_upper, text)
matches_digit = re.findall(pattern_digit, text)

# The labels name the exact range each pattern covers.
print("Lowercase a-z:", matches_lower)  # Output: ['a', 'b', 'c', 'g', 'h', 'i']
print("Uppercase A-Z:", matches_upper)  # Output: ['D', 'E', 'F', 'J', 'K', 'L']
print("Digits 0-9:", matches_digit)     # Output: ['1', '2', '3', '4', '5', '6']

Lowercase a-k: ['a', 'b', 'c', 'g', 'h', 'i']
Uppercase A-Z: ['D', 'E', 'F', 'J', 'K', 'L']
Digits 0-9: ['1', '2', '3', '4', '5', '6']


#### 2. [dyma ]
Matches either `d`, `y`, `m`, `a`, or a space. Each character in the character class is matched individually, not as a sequence.

In [3]:
text = "dynamically typed language"
pattern = r'[dyma ]'

# A character class is a set, not a sequence: every single 'd', 'y', 'm', 'a'
# or space in the text is reported on its own, from left to right.
matches = re.compile(pattern).findall(text)
print(matches)

['d', 'y', 'a', 'm', 'a', 'y', ' ', 'y', 'd', ' ', 'a', 'a']


#### 3. [^ab]
Matches any character excluding `a` or `b`.

In [4]:
text = "abcdefgaabbccdad"
pattern = r'[^ab]'

# A leading ^ inside the brackets negates the set, so every character other
# than 'a' and 'b' is collected one at a time, in order of appearance.
matches = re.compile(pattern).findall(text)
print(matches)

['c', 'd', 'e', 'f', 'g', 'c', 'c', 'd', 'd']


### Regular Expression Character Classes

#### 1. \w
Matches an alphanumeric character, i.e., `a-z`, `A-Z`, `0-9`, and underscore, `_`.

In [5]:
# The space, '#', ';' and ',' are not word characters, so \w skips them.
text = "Hello _ world#1;,"
pattern = r'\w'
matches = re.findall(pattern, text)
print(matches)  # Output: ['H', 'e', 'l', 'l', 'o', '_', 'w', 'o', 'r', 'l', 'd', '1']

['H', 'e', 'l', 'l', 'o', '_', 'w', 'o', 'r', 'l', 'd', '1']


#### 2. \d, \d+, \d*, \d{3}
- `\d` matches a single digit `0-9`.
- `\d+` matches one or more digits.
- `\d*` matches zero or more digits.
- `\d{3}` matches exactly three digits 

In [6]:

text = "There are 300 cats, 15 dogs, and 7 birds."

# The four patterns being compared, from a lone digit to a fixed-width run.
pattern_single = r'\d'
pattern_one_or_more = r'\d+'
pattern_zero_or_more = r'\d*'
pattern_exactly_three = r'\d{3}'

# \d: every digit is reported separately.
matches_single = re.compile(pattern_single).findall(text)
print(f"Single digit matches: {matches_single}")

# \d+: each maximal run of digits is one match.
matches_one_or_more = re.compile(pattern_one_or_more).findall(text)
print(f"One or more digits matches: {matches_one_or_more}")

# \d*: also succeeds with an empty string wherever no digit is present.
matches_zero_or_more = re.compile(pattern_zero_or_more).findall(text)
print(f"Zero or more digits matches: {matches_zero_or_more}")

# \d{3}: only runs of exactly three digits qualify.
matches_exactly_three = re.compile(pattern_exactly_three).findall(text)
print(f"Exactly three digits matches: {matches_exactly_three}")



Single digit matches: ['3', '0', '0', '1', '5', '7']
One or more digits matches: ['300', '15', '7']
Zero or more digits matches: ['', '', '', '', '', '', '', '', '', '', '300', '', '', '', '', '', '', '', '15', '', '', '', '', '', '', '', '', '', '', '', '7', '', '', '', '', '', '', '', '']
Exactly three digits matches: ['300']


#### 3. \D, \D+, \D*, \D{3}

- `\D` matches a single non-digit character.
- `\D+` matches one or more non-digit characters.
- `\D*` matches zero or more non-digit characters.
- `\D{3}` matches exactly three consecutive non-digit characters.

In [7]:
text = "There are 300 cats, 15 dogs, and 7 birds."

# \D matches a single non-digit character
pattern_single = r'\D'
matches_single = re.findall(pattern_single, text)
print("Single non-digit matches:", matches_single)

# \D+ matches one or more non-digit characters
pattern_one_or_more = r'\D+'
matches_one_or_more = re.findall(pattern_one_or_more, text)
print("One or more non-digits matches:", matches_one_or_more)

# \D* matches zero or more non-digit characters, so it also yields an empty
# string at every digit position and once more at the end of the string.
pattern_zero_or_more = r'\D*'
matches_zero_or_more = re.findall(pattern_zero_or_more, text)
print("Zero or more non-digits matches:", matches_zero_or_more)
# Output: ['There are ', '', '', '', ' cats, ', '', '', ' dogs, and ', '', ' birds.', '']

# \D{3} matches exactly three non-digit characters; leftovers shorter than
# three characters between digits are dropped.
pattern_exactly_three = r'\D{3}'
matches_exactly_three = re.findall(pattern_exactly_three, text)
print("Exactly three non-digits matches:", matches_exactly_three)

Single non-digit matches: ['T', 'h', 'e', 'r', 'e', ' ', 'a', 'r', 'e', ' ', ' ', 'c', 'a', 't', 's', ',', ' ', ' ', 'd', 'o', 'g', 's', ',', ' ', 'a', 'n', 'd', ' ', ' ', 'b', 'i', 'r', 'd', 's', '.']
One or more non-digits matches: ['There are ', ' cats, ', ' dogs, and ', ' birds.']
Zero or more non-digits matches: ['There are ', '', '', '', ' cats, ', '', '', ' dogs, and ', '', ' birds.', '']
Exactly three non-digits matches: ['The', 're ', 'are', ' ca', 'ts,', ' do', 'gs,', ' an', ' bi', 'rds']


#### 4. \s, \s{2}, \s+, \s*
- `\s` matches a single whitespace character (space, tab, newline).
- `\s{2}` matches exactly two consecutive whitespace characters.
- `\s+` matches one or more consecutive whitespace characters.
- `\s*` matches zero or more consecutive whitespace characters.

In [8]:
# Note the double space after "3": it is what separates \s, \s{2} and \s+.
text = "There are 3  cats and 4 dogs."

# \s: each whitespace character is its own match.
pattern = r'\s'
matches = re.compile(pattern).findall(text)
print(f"Single whitespace matches: {matches}")

# \s{2}: only a pair of adjacent whitespace characters qualifies.
pattern = r'\s{2}'
matches = re.compile(pattern).findall(text)
print(f"Exactly two whitespace matches: {matches}")

# \s+: every maximal run of whitespace is a single match.
pattern = r'\s+'
matches = re.compile(pattern).findall(text)
print(f"One or more whitespace matches: {matches}")

# \s*: like \s+, but it also matches the empty string at every position
# that holds a non-whitespace character.
pattern = r'\s*'
matches = re.compile(pattern).findall(text)
print(f"Zero or more whitespace matches: {matches}")


Single whitespace matches: [' ', ' ', ' ', ' ', ' ', ' ', ' ']
Exactly two whitespace matches: ['  ']
One or more whitespace matches: [' ', ' ', '  ', ' ', ' ', ' ']
Zero or more whitespace matches: ['', '', '', '', '', ' ', '', '', '', ' ', '', '  ', '', '', '', '', ' ', '', '', '', ' ', '', ' ', '', '', '', '', '', '']


#### 5. \S, \S{2}, \S+, \S*
- `\S` matches any single non-whitespace character.
- `\S{2}` matches exactly two consecutive non-whitespace characters.
- `\S+` matches one or more consecutive non-whitespace characters.
- `\S*` matches zero or more consecutive non-whitespace characters.

In [9]:
# Define the sample text here so this cell runs on its own instead of relying
# on a variable left behind by an earlier cell.
text = "There are 3  cats and 4 dogs."

# \S matches any non-whitespace character
pattern = r'\S'
matches = re.findall(pattern, text)
print("Single non-whitespace matches:", matches)

# \S{2} matches exactly two non-whitespace characters; a trailing odd
# character of a word (or a one-character word) is left out.
pattern = r'\S{2}'
matches = re.findall(pattern, text)
print("Exactly two non-whitespace matches:", matches)

# \S+ matches one or more non-whitespace characters, i.e. whole "words"
pattern = r'\S+'
matches = re.findall(pattern, text)
print("One or more non-whitespace matches:", matches)

# \S* matches zero or more non-whitespace characters, so it additionally
# yields empty strings at whitespace positions and at the end of the string.
pattern = r'\S*'
matches = re.findall(pattern, text)
print("Zero or more non-whitespace matches:", matches)

Single non-whitespace matches: ['T', 'h', 'e', 'r', 'e', 'a', 'r', 'e', '3', 'c', 'a', 't', 's', 'a', 'n', 'd', '4', 'd', 'o', 'g', 's', '.']
Exactly two non-whitespace matches: ['Th', 'er', 'ar', 'ca', 'ts', 'an', 'do', 'gs']
One or more non-whitespace matches: ['There', 'are', '3', 'cats', 'and', '4', 'dogs.']
Zero or more non-whitespace matches: ['There', '', 'are', '', '3', '', '', 'cats', '', 'and', '', '4', '', 'dogs.', '']


### Special Characters

#### 1. `.` (dot)
- `.` matches any single character except newline (`\n`).

In [10]:
text = "catty mat bat ratty pattty hat sat"
pattern = r'.at'

# The dot stands for any one character, so each hit is exactly three
# characters long and ends in "at"; hits are listed left to right.
matches = re.compile(pattern).findall(text)
print(matches)

['cat', 'mat', 'bat', 'rat', 'pat', 'hat', 'sat']


#### 2. `^` (caret)
- `^` matches the beginning of a string.

In [11]:
text = "apple banana cherry"
pattern = r'^apple'

# ^ anchors the pattern to the start of the string; "apple" is the first
# word here, so exactly one match is found.
matches = re.compile(pattern).findall(text)
print(matches)

['apple']


#### 3. `$` (dollar sign)
- `$` matches the end of a string.

In [12]:
text = "apple banana cherry"
pattern = r'cherry$'

# $ anchors the pattern to the end of the string; "cherry" is the last
# word here, so exactly one match is found.
matches = re.compile(pattern).findall(text)
print(matches)

['cherry']


#### 4. `\` (backslash)
- `\` escapes special characters in regular expressions.

In [13]:
text = "The price is $10.99. & The item is in stock."
pattern = r'\$|\&|\.'
matches = re.findall(pattern, text)
print(matches)

# The pattern \$|\&|\. matches a literal dollar sign "$", ampersand "&" or period ".".
# The backslash strips "$" and "." of their special regex meaning.
# Each occurrence is returned separately, in the order it appears in the string.

['$', '.', '.', '&', '.']


#### 5. `*` (asterisk)
- `*` greedily matches the expression to its left 0 or more times.

In [14]:
text = "abcccccdef cat ccc" 
pattern = r'c*'
matches = re.findall(pattern, text)
print(matches)

# The pattern c* matches zero or more consecutive 'c' characters.
# Being greedy, it takes each whole run of 'c' as ONE match ('ccccc', 'c', 'ccc').
# Because zero repetitions are allowed, it also yields an empty string at every
# position that holds a different character, and once more at the end of the string.

['', '', 'ccccc', '', '', '', '', 'c', '', '', '', 'ccc', '']


#### 6. `+` (plus sign)
- `+` greedily matches the expression to its left 1 or more times.

In [15]:
text = "abcccccdef"
pattern = r'c+'
matches = re.findall(pattern, text)
print(matches)

# The pattern c+ matches one or more consecutive 'c' characters.
# Being greedy, it consumes the whole run at once, so the five 'c's come back
# as the single match 'ccccc' rather than as five separate matches.
# At least one 'c' is required, so no empty strings are produced.

['ccccc']


#### 7. `*?` or `+?` (non-greedy matching)
- `*?` and `+?` match the expression to their left 0 or more times and 1 or more times, respectively, but in a non-greedy or minimal fashion.

In [16]:

text = "abcccccdef"

# Non-greedy matching with *
pattern_star = r'c*?'
matches_star = re.findall(pattern_star, text)
print(matches_star)  # Output: ['', '', '', 'c', '', 'c', '', 'c', '', 'c', '', 'c', '', '', '', '']

# Non-greedy matching with +
pattern_plus = r'c+?'
matches_plus = re.findall(pattern_plus, text)
print(matches_plus)  # Output: ['c', 'c', 'c', 'c', 'c']

# The pattern c*? matches zero or more 'c' characters but prefers as FEW as possible,
# so its first choice at every position is the empty string.
# findall must still advance through the text, which is why each 'c' shows up as its
# own one-character match, interleaved with empty strings (behaviour of Python 3.7+).

# The pattern c+? needs at least one 'c' and, being non-greedy, stops after exactly one.
# The run of five 'c's is therefore returned as five separate 'c' matches,
# unlike the greedy c+, which returns the single match 'ccccc'.

['', '', '', 'c', '', 'c', '', 'c', '', 'c', '', 'c', '', '', '', '']
['c', 'c', 'c', 'c', 'c']


#### 8. `{m}` (quantifier)
- `{m}` matches the expression to its left exactly m times.

In [17]:
text = "abcccccdef"
pattern = r'c{4}'

# Five 'c's are available; {4} takes exactly four of them and the single
# leftover 'c' is too short to form a second match.
matches = re.compile(pattern).findall(text)
print(matches)

['cccc']


### Basic RE Module Functions

#### 1. `re.match(x, y)`
- `re.match(x, y)` attempts to match the pattern `x` at the beginning of the string `y`.

In [18]:
text = "Hello, World!"
pattern = r'Hello'

# re.match only tries the pattern at the very start of the string. It returns
# a match object on success and None otherwise.
match = re.match(pattern, text)

if match is None:
    print("No match found")
else:
    # group() gives back the text that was matched.
    print(f"Match found: {match.group()}")

Match found: Hello


#### 2. re.search(x, y)
- `re.search(x, y)` searches for the first occurrence of the pattern `x` in the string `y`.

In [19]:
text = "Python is fun"
pattern = r"is"

# re.search scans the whole string and stops at the first place where the
# pattern matches. The result is a match object, or None when nothing is found.
match = re.search(pattern, text)

# group() gives back the text that was matched.
print(f"Match found: {match.group()}" if match else "No match found")

Match found: is


#### 3. re.finditer(x, y)
`re.finditer(x, y)` returns an iterator of all match objects of the pattern `x` in the string `y`.

In [20]:
text = "Hello, hello, hello"
pattern = r'hello'

# re.finditer yields one match object per occurrence, lazily. Matching is
# case-sensitive, so the capitalised "Hello" is not among the results.
matches = re.finditer(pattern, text)

for match in matches:
    # group() gives back the text that was matched.
    print(f"Match found: {match.group()}")

Match found: hello
Match found: hello


#### 4. re.subn(x, y, z)
`re.subn(x, y, z)` replaces occurrences of pattern `x` with `y` in the string `z`, returning a tuple of the new string and the number of substitutions made.

In [21]:
text = "Python is great. Python is awesome."
pattern = r'Python'

# subn works like sub but reports how much it did: the result is a
# (new_string, number_of_replacements) pair, unpacked here.
new_text, count = re.compile(pattern).subn('Java', text)

print(f"Modified text: {new_text}")
print(f"Number of substitutions: {count}")

Modified text: Java is great. Java is awesome.
Number of substitutions: 2


#### 5. re.split(x, y)
`re.split(x, y)` splits the string `y` by the occurrences of pattern `x`.

In [22]:
text = "apple, banana, cherry"
pattern = r', '

# Every comma-plus-space is treated as a delimiter; the pieces found between
# the delimiters are returned as a list.
parts = re.compile(pattern).split(text)

print(f"Split parts: {parts}")

Split parts: ['apple', 'banana', 'cherry']


#### 6. re.compile(x)
`re.compile(x)` compiles the pattern `x` into a regular expression object for efficient reuse.

In [23]:
# Compile once, then call methods on the resulting pattern object; this is
# handy when the same expression is applied many times.
pattern = re.compile(r'\d+')
text = "There are 123 cats and 456 dogs."
matches = pattern.findall(text)

print(f"Matches: {matches}")

Matches: ['123', '456']


#### 7. re.fullmatch(x, y)
`re.fullmatch(x, y)` matches the pattern `x` against the entire string `y`.

In [24]:
# A compiled pattern object offers the same operations as the module-level
# functions: findall(), search(), fullmatch(), and so on.
pattern = re.compile(r'\d{3}')
text = "123"

# fullmatch succeeds only when the pattern accounts for the entire string.
match = pattern.fullmatch(text)

if match is None:
    print("No full match found")
else:
    print(f"Full match found: {match.group()}")

Full match found: 123


#### 8. re.escape(x)
`re.escape(x)` escapes special characters in the string `x`, making it safe to use as a literal in a regex.

In [25]:
text = "Escaping + special. characters?"

# re.escape puts a backslash in front of the characters that could be read
# as regex syntax, so the result matches the original text literally.
escaped_text = re.escape(text)

print(f"Escaped text: {escaped_text}")

Escaped text: Escaping\ \+\ special\.\ characters\?


#### 9. re.purge()
`re.purge()` clears the regular expression cache

In [26]:
# The re module keeps recently compiled patterns in an internal cache so that
# repeated use of the same expression is cheap. Compile two patterns so the
# cache has something in it (the returned pattern objects are not needed here).
re.compile(r'\d+')
re.compile(r'\w+')

# Empty that internal cache. purge() takes no arguments and returns None;
# pattern objects that were already created keep working.
re.purge()

#### 10. re.Scanner(x)
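`re.Scanner(x)` creates a scanner object from the lexicon `x`, a list of `(pattern, action)` pairs, for advanced tokenizing. Note that `re.Scanner` is not described in the official `re` module documentation.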

In [27]:
# A Scanner is built from a lexicon: a list of (pattern, action) pairs. The
# action receives the scanner and the matched text and returns the token.
scanner = re.Scanner([
    (r'\w+', lambda scanner, token: ("WORD", token)),
])
text = "hello 123"

# scan() tokenizes from the start of the string and stops at the first
# position no pattern matches, returning (tokens, unscanned_remainder).
tokens, remainder = scanner.scan(text)

print(f"Tokens: {tokens}")
print(f"Remainder: {remainder}")

# Same idea with a digits-only lexicon: scanning stops at the space.
scanner = re.Scanner([
    (r'\d+', lambda scanner, token: ("NUMBER", token)),
])
text = "123 hello"
tokens, remainder = scanner.scan(text)

print(f"Tokens: {tokens}")
print(f"Remainder: {remainder}")

Tokens: [('WORD', 'hello')]
Remainder:  123
Tokens: [('NUMBER', '123')]
Remainder:  hello


#### 11. re.sub(x, y, z, count)
`re.sub(x, y, z, count)` replaces up to count occurrences of pattern `x` with `y` in the string `z`.

In [28]:
text = "apple, banana, apple, cherry, apple"
pattern = r'apple'

# count caps the number of replacements: only the first two "apple"s are
# swapped and the third one is left untouched.
new_text = re.compile(pattern).sub('orange', text, count=2)

print(f"Modified text: {new_text}")

Modified text: orange, banana, orange, cherry, apple


#### 12. re.subn(x, y, z, count)
`re.subn(x, y, z, count)` replaces up to count occurrences of pattern `x` with `y` in the string `z`, returning a tuple of the new string and the number of substitutions made.

In [29]:
text = "apple, banana, apple, cherry, apple"
pattern = r'apple'
# NOTE: no count argument is passed in this call, so every occurrence is
# replaced (3 here). To cap the replacements, pass it by keyword, e.g.
# re.subn(pattern, 'orange', text, count=2).
new_text, count = re.subn(pattern, 'orange', text)

print("Modified text:", new_text)
print("Number of substitutions:", count)

# The re.subn() function substitutes occurrences of a pattern in a string with a replacement string.
# It returns a tuple containing the modified string and the number of substitutions made.
# Here the name `count` holds that returned number of substitutions, not the optional limit.

Modified text: orange, banana, orange, cherry, orange
Number of substitutions: 3


#### 13. re.split(x, y, maxsplit)
`re.split(x, y, maxsplit)` splits the string `y` by the occurrences of pattern `x`, with a maximum of maxsplit splits.

In [30]:
text = "apple-banana-cherry-date-elderberry"
pattern = r'-'

# maxsplit=2 allows at most two cuts, so the result has three items and the
# last one keeps the rest of the string, hyphens included.
parts = re.compile(pattern).split(text, maxsplit=2)

print(f"Split parts: {parts}")

Split parts: ['apple', 'banana', 'cherry-date-elderberry']


#### 14. re.findall(x, y)
Matches all instances of an expression `x` in a string `y` and returns them in a list.

In [31]:
text = "The rain in Spain falls mainly in the plain."
pattern = r'\bin\b'
matches = re.findall(pattern, text)
print(matches)

# The \b metacharacter matches a word boundary.
# A word boundary is not a character: it is a zero-width POSITION and consumes no text.
# \b matches the position between a word character (letter, digit or underscore) and a
# non-word character (like a space or punctuation), or the beginning/end of the string.
# Here is how \bin\b works in a string:
#   \b at the beginning: ensures that the match starts at the beginning of a word.
#   in: matches the literal string "in".
#   \b at the end: ensures that the match ends at the boundary of a word.
#   So \bin\b matches "in" only as a standalone word, not inside another word
#   (the "in" in "rain", "Spain", "mainly" and "plain" is skipped).

['in', 'in']


#### 15. re.sub(x, y, z)
Replaces all occurrences of the expression `x` with the expression `y` in the string `z` and returns the modified string.

In [32]:
text = "The rain in Spain falls mainly in the plain."
pattern = r'\brain\b'
replacement = 'snow'
result = re.sub(pattern, replacement, text)
print(result)

# The \b metacharacter matches a word boundary.
# A word boundary is not a character: it is a zero-width POSITION and consumes no text.
# \b matches the position between a word character (letter, digit or underscore) and a
# non-word character (like a space or punctuation), or the beginning/end of the string.
# Here is how \brain\b works in a string:
#   \b at the beginning: ensures that the match starts at the beginning of a word.
#   rain: matches the literal string "rain".
#   \b at the end: ensures that the match ends at the boundary of a word.
#   So \brain\b matches "rain" only as a standalone word, not inside another word
#   (like "brain" or "brainstorm").

The snow in Spain falls mainly in the plain.
