### 1. Introduction to Regular Expressions
Basic pattern matching with the re module.

In [None]:
import re

# Sample sentence scanned by the examples below.
text = "Hello, my email is john@example.com"

# search() looks through the whole string and returns the first match,
# or None when the pattern does not occur anywhere.
pattern = r"email"
match = re.compile(pattern).search(text)

if match is None:
    print("Pattern not found")
else:
    print(f"Pattern found at position {match.start()}-{match.end()}")
    print(f"Matched text: '{match.group()}'")

# match() only succeeds when the pattern matches at the very start.
starts_with_hello = re.match(r"Hello", text) is not None
if starts_with_hello:
    print("\nText starts with 'Hello'")

### 2. Basic Metacharacters
Understanding special characters in regex.

In [None]:
import re

text = "The price is $100.50 and the discount is 20%"
digit_sample = "abc123def456"

# "." stands for any single character except a newline.
print("Using . (dot):")
dot_hits = re.findall(r"p...e", text)  # 'p', any three characters, then 'e'
print(dot_hits)

# "^" anchors the pattern to the start of the string.
print("\nUsing ^ (caret):")
start_match = re.search(r"^The", text)
print(start_match)

# "$" anchors the pattern to the end of the string.
print("\nUsing $ (dollar):")
end_match = re.search(r"20%$", text)
print(end_match)

# "*" allows zero or more repetitions. Zero digits is a valid match, so the
# result contains empty strings as well as the digit runs.
print("\nUsing * (asterisk):")
star_hits = re.findall(r"\d*", digit_sample)
print(star_hits)

# "+" requires at least one repetition, so only the digit runs are returned.
print("\nUsing + (plus):")
plus_hits = re.findall(r"\d+", digit_sample)
print(plus_hits)

# "?" makes the preceding item optional (zero or one occurrence).
print("\nUsing ? (question):")
spellings = re.findall(r"colou?r", "color colour")
print(spellings)

### 3. Character Classes
Matching specific sets of characters.

In [None]:
import re

text = "Contact: +1-555-1234 or email@example.com"

# \d: one decimal digit per match; \d+ groups consecutive digits together.
print("Digits (\\d):")
single_digits = re.findall(r"\d", text)
digit_runs = re.findall(r"\d+", text)
print(single_digits)
print(digit_runs)

# \D: anything that is not a digit.
print("\nNon-digits (\\D):")
non_digit_runs = re.findall(r"\D+", text)
print(non_digit_runs)

# \w: letters, digits and the underscore.
print("\nWord characters (\\w):")
word_runs = re.findall(r"\w+", text)
print(word_runs)

# \W: anything that is not a word character.
print("\nNon-word characters (\\W):")
non_word_chars = re.findall(r"\W", text)
print(non_word_chars)

# \s: whitespace characters.
print("\nWhitespace (\\s):")
whitespace_runs = re.findall(r"\s+", text)
print(whitespace_runs)

# \S: anything that is not whitespace.
print("\nNon-whitespace (\\S):")
tokens = re.findall(r"\S+", text)
print(tokens)

# [...] lists the exact characters that are allowed at that position.
print("\nCustom character set [aeiou]:")
vowels = re.findall(r"[aeiou]", text)
print(vowels)

# A range matches runs of lowercase letters only, so the capital 'C' of
# 'Contact' is not part of the first match.
print("\nRange [a-z]:")
lowercase_runs = re.findall(r"[a-z]+", text)
print(lowercase_runs)

# A leading ^ inside the brackets negates the set.
print("\nNegated set [^0-9]:")
not_digits = re.findall(r"[^0-9]+", text)
print(not_digits)

### 4. Quantifiers
Specifying how many times a pattern should match.

In [None]:
import re

text = "Phone: 555-1234, 555-12345, 555-123456"

# {n}: exactly n repetitions. Longer digit runs are consumed in chunks of 3.
print("Exactly 3 digits {3}:")
exactly_three = re.compile(r"\d{3}")
print(exactly_three.findall(text))

# {n,}: at least n repetitions, with no upper bound.
print("\n3 or more digits {3,}:")
three_or_more = re.compile(r"\d{3,}")
print(three_or_more.findall(text))

# {n,m}: at least n and at most m repetitions.
print("\nBetween 3 and 5 digits {3,5}:")
three_to_five = re.compile(r"\d{3,5}")
print(three_to_five.findall(text))

# Greedy quantifiers take as much as they can; adding "?" makes them take
# as little as they can.
html = "<div>Content 1</div><div>Content 2</div>"

print("\nGreedy match (.*):")
greedy_hits = re.findall(r"<div>.*</div>", html)  # one match spanning both divs
print(greedy_hits)

print("\nNon-greedy match (.*?):")
lazy_hits = re.findall(r"<div>.*?</div>", html)  # one match per div
print(lazy_hits)

### 5. Groups and Capturing
Extracting specific parts of a match.

In [None]:
import re

# Parentheses create numbered capture groups (group 0 is the whole match).
text = "John Doe, age 30, email: john@example.com"
pattern = r"(\w+)\s(\w+),\sage\s(\d+)"

match = re.search(pattern, text)
if match is not None:
    first_name, last_name, age = match.groups()
    print("Full match:", match[0])
    print("First name:", first_name)
    print("Last name:", last_name)
    print("Age:", age)

# (?P<name>...) names a group so it can be looked up by name, not by index.
pattern = r"(?P<first>\w+)\s(?P<last>\w+),\sage\s(?P<age>\d+)"
match = re.search(pattern, text)
if match is not None:
    named = match.groupdict()
    print("\nUsing named groups:")
    print("First name:", named['first'])
    print("Last name:", named['last'])
    print("Age:", named['age'])

# (?:...) groups alternatives without capturing them, so findall() returns
# the whole match rather than the group contents.
text = "Email: john@example.com or jane@test.org"
pattern = r"\w+@(?:example|test)\.(?:com|org)"
emails = re.compile(pattern).findall(text)
print("\nEmails found:", emails)

### 6. Common Patterns: Email Validation

In [None]:
import re

def validate_email(email):
    """Return True if *email* is a syntactically plausible email address.

    The check is purely structural: a local part made of letters, digits and
    ``._%+-``, an ``@``, one or more dot-separated domain labels, and an
    alphabetic top-level domain of at least two letters. It does not verify
    that the domain or mailbox exists.

    Args:
        email: The candidate address. Non-string values are rejected.

    Returns:
        bool: True when the whole string matches the address shape.
    """
    if not isinstance(email, str):
        return False

    pattern = (
        r'[a-zA-Z0-9._%+-]+'  # local part
        r'@'
        # One or more labels, each starting and ending with a letter or digit
        # and followed by a dot. This rejects empty labels ("a@b..com") and
        # labels with a leading or trailing hyphen.
        r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+'
        r'[a-zA-Z]{2,}'  # top-level domain
    )
    # fullmatch() must consume the entire string. match() with "$" would also
    # accept an address followed by a trailing newline.
    return re.fullmatch(pattern, email) is not None

# Sample addresses: a mix of well-formed and malformed inputs.
test_emails = [
    "user@example.com",
    "user.name@example.co.uk",
    "invalid.email",
    "@example.com",
    "user@.com",
    "user@example",
    "user+tag@example.com"
]

# Pair every address with its verdict and print one aligned row per address.
for email, is_valid in zip(test_emails, map(validate_email, test_emails)):
    print(f"{email:30} -> {'Valid' if is_valid else 'Invalid'}")

### 7. Common Patterns: Phone Numbers

In [None]:
import re

def extract_phone_numbers(text):
    """Return all 10-digit phone numbers found in *text*.

    Supported layouts: ``(123) 456-7890``, ``123-456-7890``,
    ``123.456.7890`` and ``1234567890``. The separator between the digit
    groups may be a hyphen, a dot, a whitespace character, or absent.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched numbers exactly as written, in order of
        appearance. Empty when nothing matches.
    """
    # The (?<!\d) and (?!\d) guards stop the pattern from carving a 10-digit
    # slice out of a longer run of digits (order numbers, IDs, ...).
    pattern = r'(?<!\d)\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)'
    return re.findall(pattern, text)

# Sample input: one line per supported layout plus one fragment that is too
# short to be a phone number.
text = """
Contact us at:
Office: (555) 123-4567
Mobile: 555-987-6543
Fax: 555.111.2222
Direct: 5551234567
Invalid: 12-345
"""

phones = extract_phone_numbers(text)
# Format every hit as a bullet first, then print the header and the bullets.
bullet_lines = [f"  - {phone}" for phone in phones]
print("Phone numbers found:")
for bullet in bullet_lines:
    print(bullet)

### 8. Common Patterns: URLs

In [None]:
import re

def extract_urls(text):
    """Return every http:// or https:// URL found in *text*.

    A URL runs from the scheme up to the next whitespace character.
    Punctuation that directly follows the URL (``. , ; : ! ?`` and quotes) is
    treated as part of the surrounding sentence and is not included.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The URLs in order of appearance. Empty when none are found.
    """
    # The final character class forces the match to end on a character that is
    # not sentence punctuation, so the greedy \S* backs off past a trailing
    # "." or "," instead of swallowing it.
    pattern = r'https?://\S*[^\s.,;:!?\'"]'
    return re.findall(pattern, text)

def parse_url(url):
    """Split an http(s) URL into protocol, domain and path.

    Args:
        url: The URL string. It must start with ``http://`` or ``https://``.

    Returns:
        dict | None: A dict with the keys ``protocol``, ``domain`` and
        ``path``, or None when *url* does not start with an http(s) scheme.
        ``path`` is everything from the first ``/`` after the domain to the
        end of the string (including any query string or fragment), or None
        when the URL has no path.
    """
    # The domain stops at "/", "?" or "#", so a query string or fragment that
    # directly follows the host ("https://example.com?x=1") is not reported as
    # part of the domain.
    pattern = r'(?P<protocol>https?)://(?P<domain>[^/?#]+)(?P<path>/.*)?'
    match = re.match(pattern, url)
    return match.groupdict() if match else None

# Sample input with one URL per line.
text = """
Visit our websites:
https://www.example.com
http://blog.example.com/post/123
https://api.example.com/v1/users
"""

urls = extract_urls(text)
print("URLs found:")
for url in urls:
    print(f"\nURL: {url}")
    parts = parse_url(url)
    if not parts:
        # Nothing was parsed for this URL, so there are no components to show.
        continue
    print(f"  Protocol: {parts['protocol']}")
    print(f"  Domain: {parts['domain']}")
    print(f"  Path: {parts.get('path', 'None')}")

### 9. Substitution and Replacement

In [None]:
import re

# Replace every occurrence of a pattern with a fixed string.
text = "The quick brown fox jumps over the lazy dog"
new_text = re.compile(r"fox").sub("cat", text)
print("Original:", text)
print("Modified:", new_text)

# count= caps the number of replacements, counted from the left.
text = "apple apple apple banana apple"
result = re.compile(r"apple").sub("orange", text, count=2)
print("\nReplace first 2:", result)

# \1, \2 and \3 in the replacement refer back to the captured groups, which
# lets the date be reordered from YYYY-MM-DD to DD/MM/YYYY.
text = "Date: 2023-12-25"
iso_date = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
result = iso_date.sub(r"\3/\2/\1", text)
print("\nDate format:", result)

# re.sub() also accepts a callable: it receives each match object and returns
# the replacement text for that match.
def double_number(match):
    """Return twice the integer matched by *match*, as a string."""
    return str(2 * int(match.group(0)))

text = "I have 5 apples and 10 oranges"
number_run = re.compile(r"\d+")
result = number_run.sub(double_number, text)
print("\nDoubled numbers:", result)

# Collapse each run of whitespace into a single space: split on the runs,
# then glue the pieces back together with one space between them.
text = "Too    many      spaces"
result = " ".join(re.split(r"\s+", text))
print("\nCleaned:", result)

### 10. Splitting Text

In [None]:
import re

# Split wherever one or more whitespace characters occur.
text = "apple banana cherry date"
words = re.compile(r"\s+").split(text)
print("Split on space:", words)

# A character set lets several different delimiters split the string.
text = "apple,banana;cherry:date"
fruits = re.compile(r"[,;:]").split(text)
print("\nSplit on multiple delimiters:", fruits)

# maxsplit= limits the number of splits; the remainder stays in one piece.
text = "one,two,three,four,five"
parts = re.compile(r",").split(text, maxsplit=2)
print("\nSplit max 2:", parts)

# Wrapping the delimiter in a capture group keeps it in the result list.
text = "apple1banana2cherry3"
parts = re.compile(r"(\d)").split(text)
print("\nSplit keeping separator:", parts)

# Split after sentence-ending punctuation; empty pieces are dropped before
# printing (the final "!" leaves an empty string at the end).
text = "Hello world. How are you? I'm fine!"
sentences = re.compile(r"[.!?]\s*").split(text)
print("\nSentences:", list(filter(None, sentences)))

### 11. Lookahead and Lookbehind

In [None]:
import re

# Positive lookahead (?=...): the text after the match must fit the
# sub-pattern, but it is not consumed or returned.
text = "Price: $100, Sale: $80"
# Find numbers followed by a comma
result = re.findall(r"\d+(?=,)", text)
print("Numbers before comma:", result)

# Negative lookahead (?!...): the text after the match must NOT fit the
# sub-pattern.
text = "apple1 banana2 cherry date3"
# Find words NOT followed by a digit.
# The word itself is matched with letters only; \w+ would swallow the trailing
# digit and make the lookahead pointless. The lookahead also rejects a
# following letter, so the engine cannot backtrack to a shorter prefix such
# as 'appl' and report that instead.
result = re.findall(r"\b[a-zA-Z]+(?![a-zA-Z\d])", text)
print("\nWords not followed by digit:", result)

# Positive lookbehind (?<=...): the text before the match must fit the
# sub-pattern. Here: digit runs that come directly after a dollar sign.
text = "Price: $100, Discount: $20"
result = [hit.group() for hit in re.finditer(r"(?<=\$)\d+", text)]
print("\nNumbers after $:", result)

# Negative lookbehind (?<!...): the text before the match must NOT fit the
# sub-pattern. Here: extensions whose file name is not 'file1'.
text = "file1.txt file2.jpg file3.txt"
result = [hit.group() for hit in re.finditer(r"(?<!file1)\.\w+", text)]
print("\nExtensions (not file1):", result)

### 12. Compiling Regular Expressions

In [None]:
import re

# re.compile() builds a pattern object once so it can be reused by name.
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

text = """
Contact: john@example.com
Support: support@company.org
Sales: sales@business.net
"""

# The compiled object offers the same operations as the module functions.
emails = [hit.group() for hit in email_pattern.finditer(text)]
print("Emails found:", emails)

# Flags change how the pattern is interpreted; re.I ignores letter case.
pattern = re.compile(r'hello', flags=re.I)
text = "Hello world, HELLO Python, hello regex"
matches = [hit.group() for hit in pattern.finditer(text)]
print("\nCase-insensitive matches:", matches)

# With re.M, "^" matches at the start of every line, not only at the start
# of the string.
text = """
First line
Second line
Third line
"""
pattern = re.compile(r'^\w+', flags=re.M)
matches = [hit.group() for hit in pattern.finditer(text)]
print("\nFirst word of each line:", matches)

### 13. Practical Example: Log File Parsing

In [None]:
import re
from collections import Counter

# Six sample log lines in the form "<date> <time> <LEVEL> <message>".
log_data = """
2023-12-25 10:15:30 ERROR Database connection failed
2023-12-25 10:16:45 INFO User login successful
2023-12-25 10:17:20 WARNING High memory usage detected
2023-12-25 10:18:10 ERROR File not found: config.json
2023-12-25 10:19:05 INFO Request processed
2023-12-25 10:20:30 ERROR Network timeout
"""

# One named group per field; the message takes the rest of the line.
log_pattern = re.compile(
    r'(?P<date>\d{4}-\d{2}-\d{2})\s+'
    r'(?P<time>\d{2}:\d{2}:\d{2})\s+'
    r'(?P<level>\w+)\s+'
    r'(?P<message>.+)'
)

# Try the pattern on every line and keep the field dicts of the lines that
# match; lines that do not match are skipped.
candidates = (log_pattern.match(line) for line in log_data.strip().splitlines())
log_entries = [hit.groupdict() for hit in candidates if hit]

# Echo every parsed entry.
print("Parsed Log Entries:")
for entry in log_entries:
    print(f"[{entry['date']} {entry['time']}] {entry['level']}: {entry['message']}")

# Tally how often each level occurs.
levels = [entry['level'] for entry in log_entries]
level_counts = Counter()
level_counts.update(levels)

print("\nLog Level Summary:")
for level, count in level_counts.items():
    print(f"  {level}: {count}")

# Collect the message text of the ERROR entries only.
error_entries = (entry for entry in log_entries if entry['level'] == 'ERROR')
errors = [entry['message'] for entry in error_entries]
print("\nError Messages:")
for error in errors:
    print(f"  - {error}")

### 14. Practical Example: Data Cleaning

In [None]:
import re

def clean_text(text):
    """Strip contact details and punctuation from *text* and tidy whitespace.

    The steps run in this order: remove http(s) URLs, remove email-like
    tokens, remove 10-digit phone numbers, drop every character that is not a
    letter, digit or whitespace, collapse whitespace runs to a single space,
    and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned text; an empty string when nothing is left.
    """
    # URLs go first: they contain "@"-free text but also dots and slashes that
    # the later steps would otherwise break into fragments.
    text = re.sub(r'https?://\S+', '', text)

    # Any whitespace-delimited token containing "@" is treated as an email.
    text = re.sub(r'\S+@\S+', '', text)

    # Phone numbers. The digit guards on both sides keep the pattern from
    # cutting 10 digits out of a longer number (IDs, order numbers).
    text = re.sub(r'(?<!\d)\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)', '', text)

    # Keep letters, digits and whitespace. \w is Unicode-aware, so accented
    # and non-Latin letters survive; the underscore that \w also covers is
    # removed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)

    # Collapse whitespace runs (including newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Sample input containing an email address, a phone number, a URL, a price
# and repeated punctuation.
messy_text = """
Hello!!! Contact me at john@example.com or call (555) 123-4567.
Visit https://example.com for more info... 
Price: $99.99 (discount available!!!)
"""

cleaned = clean_text(messy_text)
# Each print() call emits its heading and the text on separate lines.
print("Original:", messy_text, sep="\n")
print("\nCleaned:", cleaned, sep="\n")