# Imports

In [6]:
import re

# Topics

## re module 

### matching and substitution

#### re.match

In [7]:
# 1 - re.match : finds a match at the beginning of the string
pattern = r'\d+'
text = "123abc"
match = re.match(pattern, text)
if match:
    print(f"Matched: {match.group()}")
else:
    print("No match")

Matched: 123


#### re.search

In [8]:
# 2- re.search : finds a match anywhere in the string
pattern = r'\d+'
text = "abc123xyz567"
match = re.search(pattern, text)
if match:
    print(f"Matched: {match.group()}")
else:
    print("No match")

Matched: 123


#### re.findall vs re.finditer

In [9]:
# 3- re.findall : finds all the matches and returns them as a list of strings
print("FINDALL")

pattern = r'\d+'
text = "abc123xyz567"
matches = re.findall(pattern, text)
if match:
    print(f"Matches: {matches}")
else:
    print("No match")

print("FINDITER")
# 4- re.finditer : finds all the matches and returns them as an iterator of match objects
pattern = r'\d+'
text = "abc123xyz567"
matches = re.finditer(pattern, text)
for match in matches:
    print(f"Matched: {match.group()}")
    print(f"Start: {match.start()}")
    print(f"End: {match.end()}")
    print(f"Span: {match.span()}")
    print()

FINDALL
Matches: ['123', '567']
FINDITER
Matched: 123
Start: 3
End: 6
Span: (3, 6)

Matched: 567
Start: 9
End: 12
Span: (9, 12)



#### re.sub

In [10]:
# 5- re.sub : replaces the matches with a string
pattern = r'\d+'
replacement = '?'
text = "abc123xyz456"
result = re.sub(pattern, replacement, text)
print(f"Result: {result}")

Result: abc?xyz?


### splitting

In [11]:
# 6- re.split : splits the string where there is a match and returns a list of strings
pattern = r'\d+'
text = "abc123xyz456"
split_result = re.split(pattern, text)
print(f"Split result: {split_result}")

Split result: ['abc', 'xyz', '']


### grouping and capturing

In [12]:
pattern = r'(\d+)-(\w+)-(\W+)'
text = "123-abc-!!!"
match = re.search(pattern, text)
if match:
    print(f"Group 1: {match.group(1)}")
    print(f"Group 2: {match.group(2)}")
    print(f"Group 3: {match.group(3)}")

Group 1: 123
Group 2: abc
Group 3: !!!


### lookahead and lookbehind (with their negatives)
Lookahead and lookbehind assertions allow for more complex patterns without consuming characters in the string

In [13]:
# 7- lookahead (?=...)
pattern = r'\d+(?=abc)'
text = "123abc456"
match = re.search(pattern, text)
if match:
    print(f"Lookahead match: {match.group()}")

# 8- lookbehind (?<=...)
pattern = r'(?<=abc)\d+'
text = "123abc456"
match = re.search(pattern, text)
if match:
    print(f"Lookbehind match: {match.group()}")

# 9- negative lookahead (?!...)
pattern = r'\d+(?!abc)'
text = "123abc456"
matches = re.findall(pattern, text)
if matches:
    print(f"Negative lookahead match: {matches}")
else:
    print("No match")


# 10- negative lookbehind (?<!...)
pattern = r'(?<!abc)\d+'
text = "123abc456"
matches = re.findall(pattern, text)
if matches:
    print(f"Negative lookbehind match: {matches}")
else:
    print("No match")


Lookahead match: 123
Lookbehind match: 456
Negative lookahead match: ['12', '456']
Negative lookbehind match: ['123', '56']


### Special Sequences and Characters Classes: 

- \d: Matches any digit. Equivalent to [0-9].
- \D: Matches any non-digit.
- \w: Matches any alphanumeric character. Equivalent to [a-zA-Z0-9_].
- \W: Matches any non-alphanumeric character.
- \s: Matches any whitespace character.
- \S: Matches any non-whitespace character.
- [abc]: Matches any of the characters inside the brackets.
- [^abc]: Matches any character not inside the brackets.
- a|b: Matches either a or b.

### Applications

#### Email Extraction

In [14]:
pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
text = "example@example.com"
match = re.match(pattern, text)
if match:
    print("Valid email")
else:
    print("Invalid email")

Valid email


#### Phone No Extraction

In [15]:
pattern = r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
text = "Contact me at 123-456-7890 or 987.654.3210"
matches = re.findall(pattern, text)
print(f"Phone numbers: {matches}")

Phone numbers: ['123-456-7890', '987.654.3210']


#### Parsing Logs

In [16]:
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}),(\d+) - (\w+) - (.*)'
log_entry = "2024-06-03 12:34:56,789 - INFO - This is a log message"
match = re.match(pattern, log_entry)
if match:
    print(f"Date: {match.group(1)}")
    print(f"Time: {match.group(2)}")
    print(f"Milliseconds: {match.group(3)}")
    print(f"Level: {match.group(4)}")
    print(f"Message: {match.group(5)}")

Date: 2024-06-03
Time: 12:34:56
Milliseconds: 789
Level: INFO
Message: This is a log message


### Advanced Pattern Matching

#### greedy matching vs non-greedy matching

In [17]:
text = "<div>content</div><div>another content</div>"
pattern_greedy = r'<div>.*</div>' # match as much as possible
pattern_non_greedy = r'<div>.*?</div>' # match as little as possible
match_greedy = re.findall(pattern_greedy, text)
match_non_greedy = re.findall(pattern_non_greedy, text)
print(f"Greedy match: {match_greedy}")
print(f"Non-Greedy match: {match_non_greedy}")

Greedy match: ['<div>content</div><div>another content</div>']
Non-Greedy match: ['<div>content</div>', '<div>another content</div>']


#### backreferences

In [18]:
# Backreferences allow you to reuse part of the matched text. 
# They are created by capturing groups and then referenced using \1, \2, etc

pattern = r'(\b\w+)\s+\1'
text = "hello hello world world"
matches = re.findall(pattern, text)
print(f"Backreferences match: {matches}")

Backreferences match: ['hello', 'world']


#### conditional matching (expressions)

In [19]:
# Conditional expressions in regular expressions allow for more complex logic 
# by testing for the presence of a specific capturing group.

pattern = r'(a)?b(?(1)c|d)'
text1 = "abc"
text2 = "bd"
match1 = re.match(pattern, text1)
match2 = re.match(pattern, text2)
print(f"Conditional match 1: {match1.group() if match1 else 'No match'}")
print(f"Conditional match 2: {match2.group() if match2 else 'No match'}")

Conditional match 1: abc
Conditional match 2: bd


### Compiling Regular Expressions

In [20]:
# Compiling a regular expression can improve performance when the same pattern is used multiple times.
pattern = re.compile(r'\d+')
text = "123 456 789"
matches = pattern.findall(text)
print(f"Compiled matches: {matches}")

Compiled matches: ['123', '456', '789']


### Using Raw Strings

In [21]:
# Raw strings (prefix r) prevent Python from interpreting backslashes as escape characters
# making it easier to write and read regular expressions.

pattern = r'\b\d{3}\b'
text = "100 200 300"
matches = re.findall(pattern, text)
print(f"Raw string matches: {matches}")

Raw string matches: ['100', '200', '300']


#### Validating Passwords

In [22]:
pattern = r'^(?=.*[A-Z])(?=.*[a-z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$'
passwords = ["Password1!", "pass", "PASSWORD1!", "Pass1!", "ValidPass123!"]
for pwd in passwords:
    match = re.match(pattern, pwd)
    print(f"Password: {pwd} - {'Valid' if match else 'Invalid'}")

Password: Password1! - Valid
Password: pass - Invalid
Password: PASSWORD1! - Invalid
Password: Pass1! - Invalid
Password: ValidPass123! - Valid


#### URL Extraction 

In [23]:
pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
text = "Visit https://www.linkedin.com/in/gaurav-kumar007/ and https://topmate.io/gaurav_kumar_quant for more info. Also check https://docs.python.org/3/howto/regex.html."
matches = re.findall(pattern, text)
print(f"URLs: {matches}")

URLs: ['https://www.linkedin.com/in/gaurav-kumar007/', 'https://topmate.io/gaurav_kumar_quant', 'https://docs.python.org/3/howto/regex.html.']
