In [None]:
#!/usr/bin/env python3
"""
Shared Sample Text for All RegEx Categories
This text contains various patterns for testing all regex categories
"""

SAMPLE_TEXT = """
On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com
regarding the server issues. The main database server at IP address 192.168.1.100
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
WARNING: System memory usage exceeded 90% threshold.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web addresses: https://www.example.com, http://backup-site.org/status
Emergency hotline available 24/7 at emergency@support-center.com

System specifications:
- CPU cores: 8
- RAM: 32GB
- Storage: 2TB SSD
- Network: 1000Mbps connection
- OS version: Ubuntu 22.04.3 LTS

The incident report #INC-2024-0315 was filed by J.Anderson@security.com
Status: RESOLVED on 2024/03/16 at 16:45:00 UTC.
"""

def display_sample_text():
    """Print the shared sample text between two rules, then list what it contains."""
    divider = "=" * 60
    pattern_kinds = (
        "Email addresses (various formats)",
        "IP addresses",
        "Dates (multiple formats)",
        "Phone numbers",
        "Names and usernames",
        "URLs",
        "Timestamps",
        "Log levels (ERROR, WARNING, INFO, DEBUG)",
        "Numbers and measurements",
        "File paths and system info",
    )

    # Framed dump of the text itself
    print("SAMPLE TEXT FOR REGEX DEMONSTRATIONS")
    print(divider)
    print(SAMPLE_TEXT)
    print(divider)

    # Bulleted summary, one line per kind of pattern
    print("\nThis text contains:")
    for kind in pattern_kinds:
        print(f"- {kind}")

if __name__ == "__main__":
    display_sample_text()

SAMPLE TEXT FOR REGEX DEMONSTRATIONS

On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com 
regarding the server issues. The main database server at IP address 192.168.1.100 
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org 
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55 
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15  
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web ad

**Category 1** : Character Classes and Basic Matching





In [None]:
import re

# Category 1 walkthrough: character classes and their shorthand escapes.
# Every example scans SAMPLE_TEXT one character at a time and prints a short
# preview of what matched. All patterns are raw strings so the backslashes
# reach the regex engine untouched.
# NOTE: on str patterns \d, \w and \s are Unicode-aware by default; they are
# exactly [0-9] / [a-zA-Z0-9_] / ASCII whitespace only when re.ASCII is set.

# Example 1: Find all digits
print("1. [0-9] - Find all digits:")
digits = re.findall(r'[0-9]', SAMPLE_TEXT)
print(f"   Found: {digits[:10]}... (showing first 10)")

# Example 2: Find all lowercase letters
print("\n2. [a-z] - Find all lowercase letters:")
lowercase = re.findall(r'[a-z]', SAMPLE_TEXT)
print(f"   Found: {lowercase[:10]}... (showing first 10)")

# Example 3: Find all uppercase letters
print("\n3. [A-Z] - Find all uppercase letters:")
uppercase = re.findall(r'[A-Z]', SAMPLE_TEXT)
print(f"   Found: {uppercase}")

# Example 4: Find vowels
print("\n4. [aeiou] - Find all lowercase vowels:")
vowels = re.findall(r'[aeiou]', SAMPLE_TEXT)
print(f"   Found: {vowels[:15]}... (showing first 15)")

# Example 5: Find non-digits (a leading ^ inside the brackets negates the class)
print("\n5. [^0-9] - Find all non-digit characters:")
non_digits = re.findall(r'[^0-9]', SAMPLE_TEXT)
print(f"   Found: {non_digits[:10]}... (showing first 10)")

print("\n" + "=" * 60)
print("Using predefined character classes:")

# Using \d for digits
print("\\d - Find all digits (same as [0-9]):")
digits_shorthand = re.findall(r'\d', SAMPLE_TEXT)
print(f"   Found: {digits_shorthand[:10]}... (showing first 10)")

# Using \w for word characters (raw string, consistent with every other pattern here)
print("\\w - Find all word characters [a-zA-Z0-9_]:")
word_chars = re.findall(r'\w', SAMPLE_TEXT)
print(f"   Found: {word_chars[:15]}... (showing first 15)")

# Using \s for whitespace
print("\\s - Find all whitespace characters:")
whitespace = re.findall(r'\s', SAMPLE_TEXT)
print(f"   Found {len(whitespace)} whitespace characters: spaces, tabs, newlines")



1. [0-9] - Find all digits:
   Found: ['1', '5', '2', '0', '2', '4', '1', '9', '2', '1']... (showing first 10)

2. [a-z] - Find all lowercase letters:
   Found: ['n', 'a', 'n', 'u', 'a', 'r', 'y', 'r', 'a', 'r']... (showing first 10)

3. [A-Z] - Find all uppercase letters:
   Found: ['O', 'J', 'D', 'S', 'J', 'T', 'I', 'P', 'E', 'M', 'W', 'T', 'I', 'P', 'M', 'F', 'F', 'D', 'T', 'P', 'M', 'A', 'S', 'B', 'D', 'B', 'W', 'D', 'C', 'J', 'L', 'E', 'R', 'R', 'O', 'R', 'W', 'A', 'R', 'N', 'I', 'N', 'G', 'S', 'I', 'N', 'F', 'O', 'S', 'D', 'E', 'B', 'U', 'G', 'P', 'C', 'P', 'W', 'E', 'S', 'C', 'P', 'U', 'R', 'A', 'M', 'G', 'B', 'S', 'T', 'B', 'S', 'S', 'D', 'N', 'M', 'O', 'S', 'U', 'L', 'T', 'S', 'T', 'I', 'N', 'C', 'J', 'A', 'S', 'R', 'E', 'S', 'O', 'L', 'V', 'E', 'D', 'U', 'T', 'C']

4. [aeiou] - Find all lowercase vowels:
   Found: ['a', 'u', 'a', 'a', 'a', 'o', 'o', 'e', 'a', 'e', 'a', 'i', 'o', 'o', 'o']... (showing first 15)

5. [^0-9] - Find all non-digit characters:
   Found: ['\n', 'O', 

**Backslash Plague and Raw Strings**

> Sample text =  """
The file path is C:\documents\new_folder\data.txt
Some regex patterns: \d matches digits, \n is newline, \t is tab
Special sequences: \b for word boundary, \s for space
Directory: C:\users\admin\desktop\file.log
Text with literal backslashes: This is \d not a digit pattern
Code snippet: if re.search('\\d+', text): print("found digits")
"""

```text
============================================================
SUMMARY OF THE BACKSLASH PLAGUE
============================================================

The Problem:
Python processes strings BEFORE regex sees them
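\d in a normal string is an unrecognized escape: Python keeps the backslash (so the regex still works) but emits a SyntaxWarning (DeprecationWarning before Python 3.12)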
\n becomes actual newline character
\b becomes backspace character (ASCII 8)

The Solution - Raw Strings (r'...'):
r'\d' - Raw string preserves \d for regex
r'\\d' - To match literal \d in text
r'\n' - To match newline with regex
r'\\n' - To match literal \n text

Rule of Thumb:
✓ Always use raw strings for regex: r'pattern'
✓ Double backslashes in raw strings for literal matches: r'\\d'
✗ Avoid normal strings with backslashes in regex

============================================================
COMPARISON TABLE
============================================================
Want to match   | Wrong (Normal String) | Right (Raw String) | Literal Match
--------------- | -------------------- | ------------------ | ---------------
Any digit       | '\d+'                | r'\d+'             | r'\\d+'
Word boundary   | '\bword\b'           | r'\bword\b'        | r'\\bword\\b'
Newline char    | '\n'                 | r'\n'              | r'\\n'
Tab character   | '\t'                 | r'\t'              | r'\\t'
File path C:\dir | 'C:\\dir'            | r'C:\\dir'         | r'C:\\\\dir'
```




In [None]:
#!/usr/bin/env python3
# """
# Backslash Plague vs Raw Strings - 4 Examples
# Demonstrating the difference between normal strings and raw strings in regex
# """

import re

SAMPLE_TEXT = r"""
The file path is C:\documents\new_folder\data.txt
Some regex patterns: \d matches digits, \n is newline, \t is tab
Special sequences: \b for word boundary, \s for space
Directory: C:\users\admin\desktop\file.log
Text with literal backslashes: This is \d not a digit pattern
Code snippet: if re.search('\\d+', text): print("found digits")
Some digits here: 07845w36742
"""

In [None]:
print("BACKSLASH PLAGUE vs RAW STRINGS")
# Example 1: Matching actual digits vs literal \d
print("EXAMPLE 1: Matching digits vs literal '\\d'")
print("\n" + "=" * 60)

# Normal string with the backslash doubled by hand. After Python's own string
# processing this is backslash + d + plus, i.e. exactly the same pattern as
# r'\d+' below: correct, just harder to read. No try/except is needed because
# this pattern always compiles, and a blanket handler would only hide real
# errors such as SAMPLE_TEXT being undefined.
pattern_wrong = '\\d+'
matches_wrong = re.findall(pattern_wrong, SAMPLE_TEXT)
print(f"Normal string '\\d+': {matches_wrong}")

# Preferred spelling - the raw string hands \d to the regex engine unchanged
pattern_right = r'\d+'
matches_right = re.findall(pattern_right, SAMPLE_TEXT)
print(f"Raw string r'\\d+': {matches_right[:5]}... (first 5 numbers)")


# Finding literal \d in text: the regex needs an escaped backslash, then "d"
literal_d = r'\\d'
matches_literal = re.findall(literal_d, SAMPLE_TEXT)
print(f"Literal '\\\\d' pattern r'\\\\d': {matches_literal}")

BACKSLASH PLAGUE vs RAW STRINGS
EXAMPLE 1: Matching digits vs literal '\d'

Normal string '\d+': ['07845', '36742']
Raw string r'\d+': ['07845', '36742']... (first 5 numbers)
Literal '\\d' pattern r'\\d': ['\\d', '\\d', '\\d', '\\d', '\\d', '\\d']


In [None]:
# Example 2: Word boundaries vs literal \b
print("EXAMPLE 2: Word boundaries vs literal '\\b'")
print("\n" + "=" * 60)

# Word boundary (what we usually want): the raw string delivers \b to the regex engine
word_boundary = r'\bword\b'
boundary_matches = re.findall(word_boundary, SAMPLE_TEXT)
print(f"Word boundary r'\\bword\\b': {boundary_matches}")

# In a normal string Python turns \b into the backspace character (ASCII 8)
# before the regex engine ever sees it, so the pattern means
# "backspace, w, o, r, d, backspace". It is still a valid pattern, so nothing
# is raised here; it simply finds nothing in ordinary text.
wrong_boundary = '\bword\b'
wrong_matches = re.findall(wrong_boundary, SAMPLE_TEXT)
print(f"Normal string '\\bword\\b': {wrong_matches}")

# Finding literal \b in text: escaped backslash followed by "b"
literal_b = r'\\b'
literal_matches = re.findall(literal_b, SAMPLE_TEXT)
print(f"Literal '\\\\b' pattern r'\\\\b': {literal_matches}")

EXAMPLE 2: Word boundaries vs literal '\b'

Word boundary r'\bword\b': ['word']
Normal string '\bword\b': []
Literal '\\b' pattern r'\\b': ['\\b']


In [None]:
# Example 3: File paths with backslashes
print("EXAMPLE 3: Windows file paths")
print("\n" + "=" * 60)

# Wrong - the regex engine receives C:\w+\w+, where \w is the word-character
# class, so no literal backslash is ever matched. The backslashes are doubled
# here only so Python does not warn about an invalid "\w" string escape; the
# resulting pattern text is unchanged. The pattern is valid, so no try/except
# is needed - it just finds nothing.
wrong_path = 'C:\\w+\\w+'
wrong_results = re.findall(wrong_path, SAMPLE_TEXT)
print(f"Wrong pattern 'C:\\w+\\w+': {wrong_results}")

# Right - raw string with escaped backslashes for literal match
right_path = r'C:\\[^\\]+\\[^\\]+\\[^\s]+'  # Matches C:\folder\folder\file
right_results = re.findall(right_path, SAMPLE_TEXT)
print(f"Raw string r'C:\\\\[^\\\\]+\\\\[^\\\\]+\\\\[^\\s]+': {right_results}")

# Alternative - take everything after "C:" up to whitespace and require a known
# extension. The group is non-capturing: with a capturing group, findall()
# returns only the group text ('txt', 'log') instead of the whole path.
simple_path = r'C:[^\s]+\.(?:txt|log)'
simple_results = re.findall(simple_path, SAMPLE_TEXT)
print(f"Simpler approach r'C:[^\\s]+\\.(?:txt|log)': {simple_results}")

EXAMPLE 3: Windows file paths

Wrong pattern 'C:\w+\w+': []
Raw string r'C:\\[^\\]+\\[^\\]+\\[^\s]+': ['C:\\documents\\new_folder\\data.txt', 'C:\\users\\admin\\desktop\\file.log']
Simpler approach r'C:[^\s]+\.(txt|log)': ['txt', 'log']


  wrong_path = 'C:\w+\w+'  # \w becomes regex word character


In [None]:
# Example 4: real newline characters versus the two-character text \n and \t
print("EXAMPLE 4: Newlines and tabs vs literal \\n, \\t")
print("\n" + "=" * 60)

# r'\n' reaches the regex engine as the escape \n, which matches a line break
newlines = r'\n'
newline_hits = re.findall(newlines, SAMPLE_TEXT)
newline_count = len(newline_hits)
print(f"Actual newlines r'\\n': Found {newline_count}")

# Doubling the backslash makes the engine look for a backslash followed by
# the letter "n" (or "t") as ordinary text
literal_newline, literal_tab = r'\\n', r'\\t'
literal_n_matches = re.findall(literal_newline, SAMPLE_TEXT)
print(f"Literal '\\\\n' text r'\\\\n': {literal_n_matches}")

literal_t_matches = re.findall(literal_tab, SAMPLE_TEXT)
print(f"Literal '\\\\t' text r'\\\\t': {literal_t_matches}")



EXAMPLE 4: Newlines and tabs vs literal \n, \t

Actual newlines r'\n': Found 8
Literal '\\n' text r'\\n': ['\\n', '\\n']
Literal '\\t' text r'\\t': ['\\t']


**Anchors and Boundaries**

In [None]:
#!/usr/bin/env python3


import re

SAMPLE_TEXT = """
On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com
regarding the server issues. The main database server at IP address 192.168.1.100
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
WARNING: System memory usage exceeded 90% threshold.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web addresses: https://www.example.com, http://backup-site.org/status
Emergency hotline available 24/7 at emergency@support-center.com

System specifications:
- CPU cores: 8
- RAM: 32GB
- Storage: 2TB SSD
- Network: 1000Mbps connection
- OS version: Ubuntu 22.04.3 LTS

The incident report #INC-2024-0315 was filed by J.Anderson@security.com
Status: RESOLVED on 2024/03/16 at 16:45:00 UTC.
"""

def demonstrate_anchors_boundaries():
    """Print worked examples of anchors and word boundaries using SAMPLE_TEXT.

    The text is split into lines so the start and end anchors can be shown
    line by line. The word-boundary and non-word-boundary examples search the
    whole text instead.
    """
    print("Category 3: Anchors and Boundaries")
    print("=" * 60)

    # Show the stripped text, then the same text as a list of lines
    stripped_text = SAMPLE_TEXT.strip()
    print(stripped_text)
    lines = stripped_text.split('\n')
    print(lines)

    # Example 1: ^ anchors the pattern to the start of each line
    print("1. ^ - Lines starting with specific patterns:")
    for pattern in ('^On', '^Team', '^Status', '^The'):
        hits = [
            f"Line {number}: {row.strip()}"
            for number, row in enumerate(lines, 1)
            if re.search(pattern, row)
        ]
        print(f"\n   Pattern '{pattern}':")
        if not hits:
            print("   No matches found")
        for hit in hits:
            print(f"   {hit}")

    # Example 2: $ anchors the pattern to the end of each (stripped) line
    print("\n2. $ - Lines ending with specific patterns:")
    for pattern in ('com$', 'org$', r'\.$', r'UTC\.$'):
        hits = [
            f"Line {number}: {row.strip()}"
            for number, row in enumerate(lines, 1)
            if re.search(pattern, row.strip())
        ]
        print(f"\n   Pattern '{pattern}':")
        if not hits:
            print("   No matches found")
        for hit in hits:
            print(f"   {hit}")

    # Example 3: word boundaries on both sides, searched case-insensitively
    print("\n3. \\b - Word boundaries (complete words only):")
    for pattern in (r'\buser\b', r'\bserver\b', r'\binfo\b', r'\berror\b'):
        found = re.findall(pattern, SAMPLE_TEXT, flags=re.IGNORECASE)
        print(f"\n   Pattern '{pattern}': Found {len(found)} matches")
        if found:
            print(f"   Matches: {found}")

    # Example 4: non-word boundaries - the letters must sit inside a longer word
    print("\n4. \\B - Non-word boundaries (inside words):")
    er_inside = re.findall(r'\Ber\B', SAMPLE_TEXT)
    print(f"\n   Pattern '\\Ber\\B': {er_inside}")
    or_inside = re.findall(r'\Bor\B', SAMPLE_TEXT)
    print(f"   Pattern '\\Bor\\B': {or_inside}")

    # Example 5: both anchors together, so the pattern must cover the whole line
    print("\n5. ^...$ - Exact line matches:")
    dashed_line = re.compile(r'^-.*-.*$')  # starts with "-" and contains another "-"
    blank_line = re.compile(r'^\s*$')      # empty or whitespace only
    for number, row in enumerate(lines, 1):
        trimmed = row.strip()
        if dashed_line.match(trimmed):
            print(f"   Line {number} matches '^-.*-.*$': {trimmed}")
        if blank_line.match(row):
            print(f"   Line {number} is empty or whitespace only")

if __name__ == "__main__":
    demonstrate_anchors_boundaries()

Category 3: Anchors and Boundaries
On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com 
regarding the server issues. The main database server at IP address 192.168.1.100 
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org 
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55 
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15  
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web addre

**Groups**

In [None]:
#!/usr/bin/env python3
"""
Category 4: Groups and Capturing
Using the shared sample text
"""

import re

SAMPLE_TEXT = """
On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com
regarding the server issues. The main database server at IP address 192.168.1.100
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
WARNING: System memory usage exceeded 90% threshold.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web addresses: https://www.example.com, http://backup-site.org/status
Emergency hotline available 24/7 at emergency@support-center.com

System specifications:
- CPU cores: 8
- RAM: 32GB
- Storage: 2TB SSD
- Network: 1000Mbps connection
- OS version: Ubuntu 22.04.3 LTS

The incident report #INC-2024-0315 was filed by J.Anderson@security.com
Status: RESOLVED on 2024/03/16 at 16:45:00 UTC.
"""

def demonstrate_groups():
    """Print worked examples of capturing groups run against SAMPLE_TEXT.

    Covers positional groups (email parts, YYYY-MM-DD dates, dotted quads of
    1-3 digit numbers), an alternation of three phone formats, and named
    groups.
    """
    print("Category 4: Groups and Capturing")
    print("=" * 60)

    # Example 1: three positional groups split an email address
    print("1. Basic Groups - Extract email parts:")
    email_pattern = r'([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})'
    print(f"   Pattern: {email_pattern}")
    print("   Groups: (username) @ (domain) . (tld)")
    for index, found in enumerate(re.finditer(email_pattern, SAMPLE_TEXT), start=1):
        username, domain, tld = found.groups()
        print(f"   Email {index}: username='{username}', domain='{domain}', tld='{tld}'")

    # Example 2: year / month / day from dash-separated dates
    print("\n2. Date Groups - Extract year, month, day:")
    date_pattern = r'(\d{4})-(\d{2})-(\d{2})'
    print(f"   Pattern: {date_pattern}")
    print("   Groups: (year)-(month)-(day)")
    for index, found in enumerate(re.finditer(date_pattern, SAMPLE_TEXT), start=1):
        year, month, day = found.groups()
        print(f"   Date {index}: year='{year}', month='{month}', day='{day}'")

    # Example 3: one group per dotted number
    print("\n3. IP Address Groups - Extract octets:")
    ip_pattern = r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})'
    print(f"   Pattern: {ip_pattern}")
    print("   Groups: (octet1).(octet2).(octet3).(octet4)")
    for index, octets in enumerate(re.findall(ip_pattern, SAMPLE_TEXT), start=1):
        print(f"   IP {index}: {'.'.join(octets)}")

    # Example 4: three alternatives, each with its own trio of groups; the
    # groups belonging to the alternatives that did not match are None
    print("\n4. Phone Number Groups:")
    phone_pattern = r'\((\d{3})\)\s(\d{3})-(\d{4})|(\d{3})-(\d{3})-(\d{4})|(\d{3})\.(\d{3})\.(\d{4})'
    print(f"   Pattern: {phone_pattern}")
    print("   Multiple formats: (xxx) xxx-xxxx | xxx-xxx-xxxx | xxx.xxx.xxxx")
    for index, found in enumerate(re.finditer(phone_pattern, SAMPLE_TEXT), start=1):
        phone_parts = [part for part in found.groups() if part is not None]
        if len(phone_parts) < 3:
            continue
        area, exchange, number = phone_parts[:3]
        print(f"   Phone {index}: area={area}, exchange={exchange}, number={number}")

    # Example 5: the email pattern again, with names instead of positions
    print("\n5. Named Groups - More readable:")
    named_email_pattern = r'(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})'
    for found in re.finditer(named_email_pattern, SAMPLE_TEXT):
        print(f"   Email: {found.groupdict()}")

if __name__ == "__main__":
    demonstrate_groups()

Category 4: Groups and Capturing
1. Basic Groups - Extract email parts:
   Pattern: ([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})
   Groups: (username) @ (domain) . (tld)
   Email 1: username='john.doe', domain='techcorp', tld='com'
   Email 2: username='Mike.Wilson', domain='support', tld='org'
   Email 3: username='admin', domain='company', tld='net'
   Email 4: username='alice.smith', domain='project-team', tld='com'
   Email 5: username='bwilson', domain='dev-team', tld='org'
   Email 6: username='c.johnson+design', domain='creative.co', tld='uk'
   Email 7: username='emergency', domain='support-center', tld='com'
   Email 8: username='J.Anderson', domain='security', tld='com'

2. Date Groups - Extract year, month, day:
   Pattern: (\d{4})-(\d{2})-(\d{2})
   Groups: (year)-(month)-(day)
   Date 1: year='2024', month='04', day='15'

3. IP Address Groups - Extract octets:
   Pattern: (\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})
   Groups: (octet1).(octet2).(octet3).(octet4)
   I

**Lookahead**

In [None]:
#!/usr/bin/env python3
"""
Category 5: Advanced Features
Using the shared sample text
"""

import re

SAMPLE_TEXT = """
On January 15, 2024, Dr. Sarah Johnson sent an email to john.doe@techcorp.com
regarding the server issues. The main database server at IP address 192.168.1.100
went down at 14:30:22. Emergency contacts were notified: Mike.Wilson@support.org
(phone: 555-123-4567) and admin@company.net. The backup servers at 10.0.0.55
and 172.16.254.1 remained operational.

Important dates to remember:
- Project start: March 3rd, 2024
- First review: 2024-04-15
- Final deadline: Dec 31, 2024

Team members involved:
- Project Manager: Alice Smith-Brown (alice.smith@project-team.com)
- Developer: Bob_Wilson123 (bwilson@dev-team.org)
- Designer: Carol.Johnson (c.johnson+design@creative.co.uk)

Log entries show ERROR messages at timestamps 09:15:33, 10:22:45, and 11:59:59.
WARNING: System memory usage exceeded 90% threshold.
INFO: Scheduled backup completed successfully.
DEBUG: Processing user authentication for admin_user_2024.

Contact information:
Phone numbers: (555) 987-6543, 555.234.5678, +1-800-555-0199
Web addresses: https://www.example.com, http://backup-site.org/status
Emergency hotline available 24/7 at emergency@support-center.com

System specifications:
- CPU cores: 8
- RAM: 32GB
- Storage: 2TB SSD
- Network: 1000Mbps connection
- OS version: Ubuntu 22.04.3 LTS

The incident report #INC-2024-0315 was filed by J.Anderson@security.com
Status: RESOLVED on 2024/03/16 at 16:45:00 UTC.
"""

def demonstrate_advanced_features(text=None):
    """Print worked examples of lookaheads, flags, verbose patterns and re.sub.

    Args:
        text: String to run the examples against. When omitted or None the
            module-level SAMPLE_TEXT is used, which is what the zero-argument
            call has always done.
    """
    if text is None:
        text = SAMPLE_TEXT

    print("Category 5: Advanced Features")
    print("=" * 60)

    # Example 1: Lookahead assertions
    print("1. Lookahead Assertions - Numbers followed by specific units:")

    # Positive lookahead: numbers followed by "GB"
    gb_numbers = re.findall(r'\d+(?=GB)', text)
    print(f"   Numbers followed by 'GB': {gb_numbers}")

    # Negative lookahead: numbers NOT followed by "GB".
    # The lookahead also rejects a following digit. Without that, the engine
    # backtracks inside "32GB", gives up the "2", and reports "3" as a number
    # that is "not followed by GB".
    non_gb_numbers = re.findall(r'\d+(?!\d|GB)', text)
    print(f"   Numbers NOT followed by 'GB': {non_gb_numbers[:10]}... (first 10)")

    # Example 2: Case-insensitive matching
    print("\n2. Case-insensitive Matching:")

    # Find all instances of "error" regardless of case
    error_matches = re.findall(r'error', text, re.IGNORECASE)
    print(f"   'error' (case-insensitive): {error_matches}")

    # Find all instances of "info" regardless of case
    info_matches = re.findall(r'info', text, re.IGNORECASE)
    print(f"   'info' (case-insensitive): {info_matches}")

    # Example 3: Multiline mode
    print("\n3. Multiline Mode - Find lines starting with patterns:")

    # With MULTILINE, ^ matches at the start of every line, not just the string
    line_starts = re.findall(r'^[A-Z]\w*', text, re.MULTILINE)
    print(f"   Words at start of lines: {line_starts}")

    # Example 4: Verbose patterns with comments
    print("\n4. Verbose Patterns - Complex email pattern with comments:")

    verbose_email_pattern = re.compile(r'''
        ^                       # Start of string
        ([a-zA-Z0-9._%+-]+)     # Username: letters, numbers, dots, etc.
        @                       # Literal @ symbol
        ([a-zA-Z0-9.-]+)        # Domain: letters, numbers, dots, hyphens
        \.                      # Literal dot
        ([a-zA-Z]{2,})          # TLD: 2 or more letters
        $                       # End of string
    ''', re.VERBOSE)

    sample_emails = ["john.doe@techcorp.com", "admin@company.net", "invalid-email"]

    print("   Testing emails with verbose pattern:")
    for email in sample_emails:
        match = verbose_email_pattern.match(email)
        if match:
            username, domain, tld = match.groups()
            print(f"   '{email}' -> username: {username}, domain: {domain}, tld: {tld}")
        else:
            print(f"   '{email}' -> No match")

    # Example 5: Search and Replace with groups
    print("\n5. Search and Replace - Reformatting dates:")

    # Callback for re.sub: receives each match and returns its replacement,
    # turning YYYY-MM-DD into MM/DD/YYYY
    def date_replacer(match):
        year, month, day = match.groups()
        return f"{month}/{day}/{year}"

    original_dates = re.findall(r'\d{4}-\d{2}-\d{2}', text)
    print(f"   Original dates found: {original_dates}")

    # Replace dates in a fixed sample string (independent of `text`)
    sample_text = "Meeting on 2024-03-15 and 2024-04-15"
    reformatted = re.sub(r'(\d{4})-(\d{2})-(\d{2})', date_replacer, sample_text)
    print(f"   Original: {sample_text}")
    print(f"   Reformatted: {reformatted}")

    print("\n" + "=" * 60)
    print("Practical combination example:")

    # Combine multiple advanced features: VERBOSE for the commented layout,
    # MULTILINE so ^ and $ apply per line, named groups for readable access
    log_pattern = re.compile(r'''
        ^                           # Start of line
        (?P<level>ERROR|WARNING|INFO|DEBUG)  # Log level
        :\s+                        # Colon and whitespace
        (?P<message>.+)             # Message content
        $                           # End of line
    ''', re.VERBOSE | re.MULTILINE)

    log_matches = log_pattern.finditer(text)
    print("   Advanced log parsing (verbose + multiline + named groups):")

    for match in log_matches:
        data = match.groupdict()
        print(f"   Level: {data['level']}, Message: {data['message']}")

if __name__ == "__main__":
    demonstrate_advanced_features()

Category 5: Advanced Features
1. Lookahead Assertions - Numbers followed by specific units:
   Numbers followed by 'GB': ['32']
   Numbers NOT followed by 'GB': ['15', '2024', '192', '168', '1', '100', '14', '30', '22', '555']... (first 10)

2. Case-insensitive Matching:
   'error' (case-insensitive): ['ERROR']
   'info' (case-insensitive): ['INFO', 'info']

3. Multiline Mode - Find lines starting with patterns:

4. Verbose Patterns - Complex email pattern with comments:
   Testing emails with verbose pattern:
   'john.doe@techcorp.com' -> username: john.doe, domain: techcorp, tld: com
   'admin@company.net' -> username: admin, domain: company, tld: net
   'invalid-email' -> No match

5. Search and Replace - Reformatting dates:
   Original dates found: ['2024-04-15']
   Original: Meeting on 2024-03-15 and 2024-04-15
   Reformatted: Meeting on 03/15/2024 and 04/15/2024

Practical combination example:
   Advanced log parsing (verbose + multiline + named groups):
   Level: INFO, Message: 


**Need to find a pattern?**

- Just check if it exists? → re.search()

- Must start at beginning? → re.match()

- Want all matches?

    ─ Just the text? → re.findall()

    ─ With positions/groups? → re.finditer()

- Replace matches? → re.sub()

- Split text? → re.split()

- Reuse pattern many times? → re.compile() first


In [None]:
# re.search scans the whole string and stops at the first place the pattern fits
text = "The email is user@domain.com and backup is admin@site.org"
match = re.compile(r'\w+@\w+\.\w+').search(text)

if match is not None:
    print(f"Found email: {match[0]}")  # Output: user@domain.com

Found email: user@domain.com


In [None]:
# Validate if entire string is a valid email
email = "user@domain.com"
# fullmatch() must consume the whole string. match() with ^...$ would also
# accept a trailing newline, because $ matches just before a final "\n".
if re.fullmatch(r'\w+@\w+\.\w+', email):
    print("Valid email format")

# This would fail - match() only checks from start
text = "Email: user@domain.com"
match = re.match(r'\w+@\w+\.\w+', text)  # None - doesn't start with email

Valid email format


In [None]:
# findall returns every non-overlapping match as a plain list of strings
text = "Phones: 555-1234, 555-5678, 555-9999"
short_phone = re.compile(r'\d{3}-\d{4}')
phones = short_phone.findall(text)
print(phones)  # ['555-1234', '555-5678', '555-9999']

['555-1234', '555-5678', '555-9999']


In [None]:
# finditer yields match objects, so positions and groups are both available
text = "Emails: john@test.com, jane@demo.org"
email_parts = re.compile(r'(\w+)@(\w+)\.(\w+)')
for match in email_parts.finditer(text):
    user, domain, tld = match.groups()
    print(f"Email: {match.group(0)} at position {match.span()}")
    print(f"  User: {user}, Domain: {domain}, TLD: {tld}")

Email: john@test.com at position (8, 21)
  User: john, Domain: test, TLD: com
Email: jane@demo.org at position (23, 36)
  User: jane, Domain: demo, TLD: org


In [None]:
# Build the pattern object a single time and reuse it for every candidate
email_pattern = re.compile(r'\w+@\w+\.\w+')

texts = ["user@site.com", "invalid", "admin@test.org"]
for text in texts:
    if email_pattern.search(text) is None:
        continue  # nothing email-like in this entry
    print(f"Found email in: {text}")

Found email in: user@site.com
Found email in: admin@test.org


In [None]:
text = "Call 555-123-4567 or 555-987-6543"
# Rewrite each number using backreferences to the three captured digit groups
dashed_phone = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
formatted = dashed_phone.sub(r'(\1) \2-\3', text)
print(formatted)  # Call (555) 123-4567 or (555) 987-6543

Call (555) 123-4567 or (555) 987-6543


In [None]:
# Split on any one of several single-character delimiters
data = "apple,banana;orange:grape"
delimiters = re.compile(r'[,;:]')
parts = delimiters.split(data)
print(parts)  # ['apple', 'banana', 'orange', 'grape']

['apple', 'banana', 'orange', 'grape']


In [None]:
#!/usr/bin/env python3
"""
Complete Explanation Script for the RegEx Notebook
This script explains all the regex concepts demonstrated in the notebook
"""

import re

def explain_notebook_content():
    """Print the title banner and intro, then run each section printer in order."""
    banner = "\n".join(("REGEX NOTEBOOK EXPLANATION", "=" * 70))
    print(banner)

    intro = (
        "\nThis notebook demonstrates 5 key regex categories using a consistent\n"
        "sample text containing emails, IPs, dates, phone numbers, and logs."
    )
    print(intro)

    # Sections run strictly in notebook order
    section_1_character_classes()
    section_2_backslash_plague()
    section_3_anchors_boundaries()
    section_4_groups()
    section_5_advanced_features()
    section_6_function_usage()

def section_1_character_classes():
    """Print the explanation for Section 1 (character classes).

    Recaps the notebook results, lists the key insights, and finishes
    with a live demo on a short sample string that contrasts matching a
    single character with matching a quantified run of characters.
    """
    print("\n" + "=" * 70)
    print("SECTION 1: CHARACTER CLASSES - EXPLANATION")
    print("=" * 70)

    print("\nWhat you saw in the notebook:")
    print("- [0-9] found individual digits: ['1', '5', '2', '0', '2', '4'...]")
    print("- [a-z] found lowercase letters: ['n', 'a', 'n', 'u', 'a', 'r'...]")
    print("- [A-Z] found uppercase letters: ['O', 'J', 'D', 'S', 'J'...]")
    print("- [aeiou] found vowels: ['a', 'u', 'a', 'a', 'a'...]")
    print("- [^0-9] found non-digits: ['\\n', 'O', 'n', ' ', 'J'...]")

    print("\nKey Insights:")
    print("1. Character classes match ONE character at a time")
    print("2. [0-9] and \\d are equivalent - both match single digits")
    print("3. [^...] means NOT - matches anything except what's inside")
    print("4. Predefined classes like \\w, \\s are shortcuts for common patterns")

    print("\nWhy the results look scattered:")
    print("- Individual character matching gives you every single occurrence")
    print("- To get complete numbers, you'd need \\d+ (one or more digits)")
    print("- To get complete words, you'd need [a-z]+ (one or more letters)")

    # Demonstration
    sample = "Hello123 World!"
    # The searches run outside the f-strings on purpose: a backslash inside
    # an f-string expression is a SyntaxError before Python 3.12, and a
    # doubled backslash in a raw pattern (r'\\d+') looks for a literal
    # backslash followed by 'd' characters, so it would never find a digit.
    single_digits = re.findall(r'[0-9]', sample)
    digit_runs = re.findall(r'\d+', sample)
    single_letters = re.findall(r'[a-z]', sample)
    letter_runs = re.findall(r'[a-z]+', sample)

    print(f"\nExample with '{sample}':")
    print(f"[0-9] matches: {single_digits} (each digit separately)")
    print(f"\\d+ matches: {digit_runs} (complete numbers)")
    print(f"[a-z] matches: {single_letters} (each letter separately)")
    print(f"[a-z]+ matches: {letter_runs} (complete words)")

def section_2_backslash_plague():
    """Print the explanation for Section 2 (the backslash plague).

    Recaps why raw strings matter for regex patterns and runs a small
    demo in which the same pattern text is written once as a normal
    string literal and once as a raw string literal.
    """
    print("\n" + "=" * 70)
    print("SECTION 2: BACKSLASH PLAGUE - EXPLANATION")
    print("=" * 70)

    print("\nWhat you saw in the notebook:")
    print("- Raw strings r\"\"\" prevent Python from interpreting backslashes")
    print("- Normal strings cause Unicode errors with \\users paths")
    print("- Raw strings preserve literal backslashes for regex")

    print("\nThe Core Problem:")
    print("Python processes strings BEFORE regex sees them")

    # Show the issue
    print("\nDemonstration:")
    # \b is used because Python recognises it as an escape (backspace), so
    # the normal and raw literals really differ. Writing "\\d" next to
    # r"\d" would produce two identical strings and demonstrate nothing.
    normal_string = "\bcat\b"  # Python converts each \b to chr(8) here
    raw_string = r"\bcat\b"  # regex receives \b and reads it as a word boundary
    probe = "a cat sat"

    print(f"Normal string: {repr(normal_string)}")
    print(f"Raw string: {repr(raw_string)}")
    print(f"Normal pattern finds: {re.findall(normal_string, probe)}")
    print(f"Raw pattern finds: {re.findall(raw_string, probe)}")

    print("\nIn regex patterns:")
    print("- Normal '\\b' becomes a backspace before regex sees it (pattern silently breaks)")
    print("- Normal '\\d+' is kept as-is, but Python warns about the invalid escape")
    print("- Raw r'\\d+' preserves \\d for regex engine")
    print("- To find literal \\d text: use r'\\\\d'")

    print("\nWhy notebook examples worked:")
    print("1. Used r\"\"\" for sample text to prevent Unicode errors")
    print("2. Used raw strings r'pattern' for all regex patterns")
    print("3. Used r'\\\\d' to find literal \\d characters in text")

    print("\nRule: Always use raw strings r'...' for regex patterns!")

def section_3_anchors_boundaries():
    """Print the explanation for Section 3 (anchors and word boundaries).

    Recaps the notebook results, lists the key insights, and runs a live
    demo on a short sentence comparing an unanchored search with the
    word-boundary, non-boundary and start-of-string variants.
    """
    print("\n" + "=" * 70)
    print("SECTION 3: ANCHORS AND BOUNDARIES - EXPLANATION")
    print("=" * 70)

    print("\nWhat you saw in the notebook:")
    print("- ^ found lines starting with 'On', 'Team', 'Status', 'The'")
    print("- $ found lines ending with 'com', 'org', '.', 'UTC.'")
    print("- \\b found complete words: 'user', 'server', 'info', 'error'")
    print("- \\B found letters inside words: 'er', 'or'")

    print("\nKey Insights:")
    print("1. Anchors don't match characters - they match POSITIONS")
    print("2. ^ matches the position at start of string/line")
    print("3. $ matches the position at end of string/line")
    print("4. \\b matches positions between word and non-word characters")

    # Demonstrate the difference
    text = "The server serves users"
    # The searches run outside the f-strings on purpose: a backslash inside
    # an f-string expression is a SyntaxError before Python 3.12, and a
    # doubled backslash in a raw pattern (r'\\bserver\\b') looks for a
    # literal backslash followed by 'b', not for a word boundary.
    anywhere = re.findall(r'server', text)
    whole_word = re.findall(r'\bserver\b', text)
    # Expected to be empty: 'server' never sits inside a longer word here.
    inside_word = re.findall(r'\Bserver\B', text)
    at_start = re.findall(r'^The', text)

    print(f"\nExample with '{text}':")
    print(f"'server' anywhere: {anywhere}")
    print(f"\\bserver\\b (whole word): {whole_word}")
    print(f"\\Bserver\\B (inside word): {inside_word}")
    print(f"^The (starts with): {at_start}")

    print("\nWhy this matters:")
    print("- Without anchors: 'user@domain.com.evil' would match email pattern")
    print("- With anchors: ^pattern$ ensures exact match of entire string")
    print("- Word boundaries prevent matching parts of larger words")

def section_4_groups():
    """Print the explanation for Section 4 (capturing groups).

    Recaps the notebook results, lists the key insights, and runs a live
    demo that splits an ISO-style date into year, month and day groups.
    """
    print("\n" + "=" * 70)
    print("SECTION 4: GROUPS - EXPLANATION")
    print("=" * 70)

    print("\nWhat you saw in the notebook:")
    print("- Email groups: extracted username, domain, TLD separately")
    print("- Date groups: extracted year, month, day from YYYY-MM-DD")
    print("- IP groups: extracted 4 octets from IP addresses")
    print("- Phone groups: handled multiple formats with alternation")
    print("- Named groups: made groups more readable with (?P<name>...)")

    print("\nKey Insights:")
    print("1. Parentheses () create groups that capture matched text")
    print("2. Groups are numbered from left to right starting at 1")
    print("3. Group 0 is always the entire match")
    print("4. findall() with groups returns tuples of group contents")

    # Demonstrate group numbering
    text = "2024-03-15"
    # Single backslashes: in a raw string, a doubled backslash before 'd'
    # would look for a literal backslash and the date would never match,
    # silently skipping every group line below.
    pattern = r'(\d{4})-(\d{2})-(\d{2})'
    match = re.search(pattern, text)

    print(f"\nExample with '{text}' and pattern '{pattern}':")
    if match:
        print(f"Group 0 (full match): '{match.group(0)}'")
        print(f"Group 1 (year): '{match.group(1)}'")
        print(f"Group 2 (month): '{match.group(2)}'")
        print(f"Group 3 (day): '{match.group(3)}'")
        print(f"All groups: {match.groups()}")

    print("\nWhy groups are powerful:")
    print("- Extract specific parts of complex patterns")
    print("- Rearrange data (YYYY-MM-DD → MM/DD/YYYY)")
    print("- Validate and parse structured data")
    print("- Named groups make code more readable")

def section_5_advanced_features():
    """Print the explanation for Section 5 (lookaheads and flags).

    Recaps the notebook results, lists the key insights, and runs a live
    demo contrasting a lookahead (unit not captured) with a plain pattern
    (unit captured) on a short hardware-spec string.
    """
    print("\n" + "=" * 70)
    print("SECTION 5: ADVANCED FEATURES - EXPLANATION")
    print("=" * 70)

    print("\nWhat you saw in the notebook:")
    print("- Lookahead (?=GB): found '32' before 'GB' but not 'GB' itself")
    print("- Negative lookahead (?!GB): found numbers NOT followed by 'GB'")
    print("- Case-insensitive: found 'ERROR' and 'INFO' regardless of case")
    print("- Multiline mode: found words at start of each line")
    print("- Verbose patterns: allowed comments in complex regex")

    print("\nKey Insights:")
    print("1. Lookaheads check what comes after without including it")
    print("2. Flags modify how patterns work (case, multiline, verbose)")
    print("3. re.sub() can use functions for complex replacements")
    print("4. Combining features creates powerful parsing tools")

    # Demonstrate lookahead
    text = "32GB RAM, 2TB SSD, 8 cores"
    # The searches run outside the f-strings on purpose: a backslash inside
    # an f-string expression is a SyntaxError before Python 3.12, and a
    # doubled backslash in a raw pattern (r'\\d+') looks for a literal
    # backslash followed by 'd' characters, so it would never find a digit.
    before_gb = re.findall(r'\d+(?=GB)', text)
    with_gb = re.findall(r'\d+GB', text)

    print(f"\nLookahead example with '{text}':")
    print(f"\\d+(?=GB): {before_gb} (numbers before GB)")
    print(f"\\d+GB: {with_gb} (numbers with GB)")

    print("\nWhy advanced features matter:")
    print("- Lookaheads solve complex matching problems")
    print("- Flags make patterns more flexible")
    print("- Verbose mode makes complex patterns maintainable")
    print("- Combining features handles real-world data complexity")

def section_6_function_usage():
    """Print the Section 6 recap: which re function fits which job.

    Emits a banner, a three-column comparison table, the decision
    flowchart and the performance notes to stdout. Returns nothing.
    """
    divider = "=" * 70
    print(f"\n{divider}")
    print("SECTION 6: FUNCTION USAGE - EXPLANATION")
    print(divider)

    print("\nWhat you saw in the notebook examples:")

    # Each row: (function, what the notebook did with it, when to use it).
    rows = (
        ("re.search()", "Found first email: user@domain.com", "Check if pattern exists"),
        ("re.match()", "Validated email format from string start", "Validate entire string"),
        ("re.findall()", "Got all phone numbers as list", "Extract all occurrences"),
        ("re.finditer()", "Got emails with positions and groups", "Need match details"),
        ("re.compile()", "Reused email pattern efficiently", "Pattern used multiple times"),
        ("re.sub()", "Reformatted phone numbers", "Replace/transform text"),
        ("re.split()", "Split on multiple delimiters", "Break text into parts"),
    )

    print("\nFunction comparison:")
    for name, seen_in_notebook, when_to_use in rows:
        # Column order on screen is: function | usage | notebook example.
        print(f"{name:15} | {when_to_use:25} | {seen_in_notebook}")

    print("\nDecision flowchart from notebook:")
    for step in (
        "- Just check existence? → re.search()",
        "- Validate format? → re.match()",
        "- Get all matches? → re.findall() or re.finditer()",
        "- Transform text? → re.sub()",
        "- Split text? → re.split()",
        "- Reuse pattern? → re.compile() first",
    ):
        print(step)

    print("\nPerformance insights:")
    for note in (
        "- Module functions (re.search) are simple for one-time use",
        "- re.compile() is faster when reusing the same pattern",
        "- re.finditer() is memory efficient for large texts",
        "- Choose the right function based on what you need from results",
    ):
        print(note)

def notebook_summary():
    """Print the closing summary: progression, takeaways, uses, next steps.

    Everything goes to stdout as a banner followed by four headed lists.
    Returns nothing.
    """
    divider = "=" * 70
    print(f"\n{divider}")
    print("NOTEBOOK LEARNING SUMMARY")
    print(divider)

    # (heading, lines printed beneath it), in display order.
    outline = (
        (
            "Progression through the notebook:",
            (
                "1. Character Classes: Building blocks - match single characters",
                "2. Backslash Plague: Critical syntax - always use raw strings",
                "3. Anchors/Boundaries: Position matching - where patterns occur",
                "4. Groups: Data extraction - capture parts of matches",
                "5. Advanced Features: Power tools - complex pattern matching",
                "6. Function Usage: Right tool - choose appropriate function",
            ),
        ),
        (
            "Key takeaways:",
            (
                "- Start simple with character classes and quantifiers",
                "- Always use raw strings r'pattern' to avoid backslash issues",
                "- Use anchors to ensure patterns match where you expect",
                "- Groups let you extract structured data from text",
                "- Advanced features solve complex real-world problems",
                "- Choose functions based on what you need: existence, all matches, etc.",
            ),
        ),
        (
            "Practical application:",
            (
                "- Email validation: anchors + character classes + groups",
                "- Log parsing: multiline + named groups + case insensitive",
                "- Data extraction: groups + findall/finditer",
                "- Text transformation: groups + re.sub() + replacement functions",
            ),
        ),
        (
            "Next steps:",
            (
                "- Practice with your own data",
                "- Combine multiple techniques",
                "- Build reusable patterns with re.compile()",
                "- Test edge cases and validate results",
            ),
        ),
    )

    for heading, entries in outline:
        # A blank line separates each headed list from the one before it.
        print(f"\n{heading}")
        for entry in entries:
            print(entry)

# Entry point: print the section-by-section explanation, then the summary.
if __name__ == "__main__":
    explain_notebook_content()
    notebook_summary()

REGEX NOTEBOOK EXPLANATION

This notebook demonstrates 5 key regex categories using a consistent
sample text containing emails, IPs, dates, phone numbers, and logs.

SECTION 1: CHARACTER CLASSES - EXPLANATION

What you saw in the notebook:
- [0-9] found individual digits: ['1', '5', '2', '0', '2', '4'...]
- [a-z] found lowercase letters: ['n', 'a', 'n', 'u', 'a', 'r'...]
- [A-Z] found uppercase letters: ['O', 'J', 'D', 'S', 'J'...]
- [aeiou] found vowels: ['a', 'u', 'a', 'a', 'a'...]
- [^0-9] found non-digits: ['\n', 'O', 'n', ' ', 'J'...]

Key Insights:
1. Character classes match ONE character at a time
2. [0-9] and \d are equivalent - both match single digits
3. [^...] means NOT - matches anything except what's inside
4. Predefined classes like \w, \s are shortcuts for common patterns

Why the results look scattered:
- Individual character matching gives you every single occurrence
- To get complete numbers, you'd need \d+ (one or more digits)
- To get complete words, you'd need [a-z