# Quantifiers

Quantifiers in regular expressions specify how many times a character, group, 
or character class should be matched.

They are essential for matching patterns of varying lengths and making regex
patterns more flexible and powerful.

## Basic Quantifiers

The three basic quantifiers are `*` (zero or more), `+` (one or more), and `?` (zero or one).

In [3]:
import re

In [9]:
# Sample text shared by the digit examples below
text = "The year 2023 had 365 days. File names: doc1.txt, img001.jpg, and report12345.pdf"
print(f"Sample text: '{text}'")
print("-" * 80)

# --------------------------------------------- #

# Example of * (zero or more occurrences)
print("Example of * (zero or more occurrences):")
star_results = re.findall(r"a*b", "b ab aaab hello abcd")
print(f"  Pattern 'a*b' matches: {star_results}")
# This matches: b (0 'a's), ab (1 'a'), aaab (3 'a's),
# and the 'ab' at the start of 'abcd' (findall searches inside words too)

# All digit sequences (including empty ones):
# \d* is allowed to match nothing, so findall also reports an empty
# match at every position where no digit starts.
star_digit_results = re.findall(r"\d*", text)
# Filter out empty strings for clarity
non_empty_star_results = [match for match in star_digit_results if match]
print(f"  Pattern '\\d*' matches (non-empty): {non_empty_star_results}")

# --------------------------------------------- #

# Example of + (one or more occurrences)
print("\nExample of + (one or more occurrences):")

plus_results = re.findall(r"a+b", "b ab aaab")
print(f"  Pattern 'a+b' matches: {plus_results}")
# This matches: ab (1 'a'), aaab (3 'a's), but not b (0 'a's)

# All digit sequences (at least one digit), so no empty matches to filter
plus_digit_results = re.findall(r"\d+", text)
print(f"  Pattern '\\d+' matches: {plus_digit_results}")
# This matches: 2023, 365, 1, 001, 12345

# --------------------------------------------- #

# Example of ? (zero or one occurrence)
print("\nExample of ? (zero or one occurrence):")

question_results = re.findall(r"colou?r", "I prefer color, but others write it as colour")
print(f"  Pattern 'colou?r' matches: {question_results}")
# This matches both 'color' and 'colour'

# Optional parentheses and optional hyphen around the area code
phone_text = "Call 123-456-7890 or (987) 654-3210"
optional_area_code = re.findall(r"\(?\d{3}\)?-?\d{3}-\d{4}", phone_text)
print(f"  Pattern '\\(?\\d{{3}}\\)?-?\\d{{3}}-\\d{{4}}' matches: {optional_area_code}")
# Only '123-456-7890' matches. The parentheses are optional, but the
# separator after the area code may only be a hyphen (or nothing), so the
# space in '(987) 654-3210' prevents a match. Replacing -? with [-\s]?
# would accept both formats.

Sample text: 'The year 2023 had 365 days. File names: doc1.txt, img001.jpg, and report12345.pdf'
--------------------------------------------------------------------------------
Example of * (zero or more occurrences):
  Pattern 'a*b' matches: ['b', 'ab', 'aaab', 'ab']
  Pattern '\d*' matches (non-empty): ['2023', '365', '1', '001', '12345']

Example of + (one or more occurrences):
  Pattern 'a+b' matches: ['ab', 'aaab']
  Pattern '\d+' matches: ['2023', '365', '1', '001', '12345']

Example of ? (zero or one occurrence):
  Pattern 'colou?r' matches: ['color', 'colour']
  Pattern '\(?\d{3}\)?-?\d{3}-\d{4}' matches: ['123-456-7890']


## Specific Quantifiers

Python regex also supports specific quantifiers for more precise control:

- `{n}` — matches exactly n occurrences

- `{n,}` — matches n or more occurrences

- `{n,m}` — matches between n and m occurrences (inclusive)

In [11]:
# Sample text shared by the digit examples below
text = "The year 2023 had 365 days. File names: doc1.txt, img001.jpg, and report12345.pdf"
print(f"Sample text: '{text}'")
print("-" * 80)

# Example of {n} (exactly n occurrences)
print("Example of {n} (exactly n occurrences):")
exact_results = re.findall(r"\d{3}", text)
print(f"  Pattern '\\d{{3}}' matches: {exact_results}")
# This matches non-overlapping 3-digit chunks: 202 (from 2023), 365, 001,
# and 123 (from 12345). \d{3} alone does not require the digits to stand
# alone; (?<!\d)\d{3}(?!\d) would keep only 365 and 001.

# Example of {n,} (n or more occurrences)
print("\nExample of {n,} (n or more occurrences):")
min_results = re.findall(r"\d{4,}", text)
print(f"  Pattern '\\d{{4,}}' matches: {min_results}")
# This matches: 2023, 12345 (4 or more digits)

# Example of {n,m} (between n and m occurrences)
print("\nExample of {n,m} (between n and m occurrences):")
range_results = re.findall(r"\d{2,4}", text)
print(f"  Pattern '\\d{{2,4}}' matches: {range_results}")
# This matches: 2023, 365, 001, 1234 (between 2 and 4 digits).
# 1234 is the greedy first four digits of 12345; the leftover '5' is a
# single digit and is too short to match.

# Practical examples with specific quantifiers
print("\nPractical examples with specific quantifiers:")

# Matching US zip codes (exactly 5 digits or 5+4):
# (?:-\d{4})? is an optional non-capturing group for the +4 extension
zip_codes = "Zip codes: 12345 and 12345-6789"
zip_matches = re.findall(r"\b\d{5}(?:-\d{4})?\b", zip_codes)
print(f"  US zip codes: {zip_matches}")

# Matching 3-letter words (\b on both sides makes it whole words only)
words_text = "The fox ran to the big red hen"
three_letter_words = re.findall(r"\b[a-zA-Z]{3}\b", words_text)
print(f"  Three-letter words: {three_letter_words}")

# Matching words with 3-5 letters.
# Every word of 3+ letters in words_text has exactly 3 letters, so this
# returns the same list as the {3} example above ('to' is excluded by both).
three_to_five_letter_words = re.findall(r"\b[a-zA-Z]{3,5}\b", words_text)
print(f"  Words with 3-5 letters: {three_to_five_letter_words}")

Sample text: 'The year 2023 had 365 days. File names: doc1.txt, img001.jpg, and report12345.pdf'
--------------------------------------------------------------------------------
Example of {n} (exactly n occurrences):
  Pattern '\d{3}' matches: ['202', '365', '001', '123']

Example of {n,} (n or more occurrences):
  Pattern '\d{4,}' matches: ['2023', '12345']

Example of {n,m} (between n and m occurrences):
  Pattern '\d{2,4}' matches: ['2023', '365', '001', '1234']

Practical examples with specific quantifiers:
  US zip codes: ['12345', '12345-6789']
  Three-letter words: ['The', 'fox', 'ran', 'the', 'big', 'red', 'hen']
  Words with 3-5 letters: ['The', 'fox', 'ran', 'the', 'big', 'red', 'hen']


## Greedy vs Non-Greedy Matching

By default, quantifiers are greedy, meaning they match as much as possible.

Adding a ? after a quantifier makes it non-greedy (lazy), meaning it will
match as little as possible.

In [13]:
# Two adjacent <div> elements: the classic case where greedy and lazy differ
html_text = "<div>Content 1</div><div>Content 2</div>"
print(f"HTML text: '{html_text}'")

# Greedy matching (default)
print("\nGreedy matching (default):")
greedy_match = re.search(r"<div>.*</div>", html_text)
print(f"  Pattern '<div>.*</div>' matches: '{greedy_match.group()}'")
# This matches: "<div>Content 1</div><div>Content 2</div>"
# The .* matches as much as possible, going to the last </div>

# Non-greedy (lazy) matching
print("\nNon-greedy (lazy) matching:")
lazy_match = re.search(r"<div>.*?</div>", html_text)
print(f"  Pattern '<div>.*?</div>' matches: '{lazy_match.group()}'")
# This matches: "<div>Content 1</div>"
# The .*? matches as little as possible, stopping at the first </div>

# More examples of greedy vs. non-greedy matching
print("\nMore examples of greedy vs. non-greedy matching:")

text = "Start 123 Middle 456 End"

# Greedy matching with .* (as many characters as possible)
greedy_digits = re.search(r"Start.*End", text)
print(f"  Greedy 'Start.*End': '{greedy_digits.group()}'")
# Matches "Start 123 Middle 456 End" (the entire text)

# Non-greedy matching with .*? (as few characters as possible)
lazy_digits = re.search(r"Start.*?End", text)
print(f"  Non-greedy 'Start.*?End': '{lazy_digits.group()}'")
# Still matches "Start 123 Middle 456 End" because there's only one "End"

# Example where the difference is more apparent
html_more = "<p>First paragraph</p><p>Second paragraph</p>"
greedy_p = re.findall(r"<p>.*</p>", html_more)
lazy_p = re.findall(r"<p>.*?</p>", html_more)
print(f"  HTML text: '{html_more}'")
print(f"  Greedy '<p>.*</p>': {greedy_p}")
print(f"  Non-greedy '<p>.*?</p>': {lazy_p}")
# Greedy matches the whole thing once
# Non-greedy matches each paragraph separately

HTML text: '<div>Content 1</div><div>Content 2</div>'

Greedy matching (default):
  Pattern '<div>.*</div>' matches: '<div>Content 1</div><div>Content 2</div>'

Non-greedy (lazy) matching:
  Pattern '<div>.*?</div>' matches: '<div>Content 1</div>'

More examples of greedy vs. non-greedy matching:
  Greedy 'Start.*End': 'Start 123 Middle 456 End'
  Non-greedy 'Start.*?End': 'Start 123 Middle 456 End'
  HTML text: '<p>First paragraph</p><p>Second paragraph</p>'
  Greedy '<p>.*</p>': ['<p>First paragraph</p><p>Second paragraph</p>']
  Non-greedy '<p>.*?</p>': ['<p>First paragraph</p>', '<p>Second paragraph</p>']


## Combining Quantifiers with Groups and Classes

Quantifiers become even more powerful when combined with groups and
character classes, allowing for complex pattern matching.

In [14]:
# Combining quantifiers with character classes
print("Combining quantifiers with character classes:")
text = "The quick brown fox jumps over the lazy dog"
# The text is lowercased first because the class lists lowercase vowels only
vowel_sequences = re.findall(r"[aeiou]+", text.lower())
print(f"  Sequences of vowels [aeiou]+: {vowel_sequences}")

# Finding words that contain 2+ consecutive consonants.
# [^aeiou\W] means "a word character that is not a lowercase vowel", so it
# also accepts digits and '_' and treats 'y' as a consonant (hence 'lazy').
consonant_seq_words = re.findall(r"\b\w*[^aeiou\W]{2,}\w*\b", text.lower())
print(f"  Words with 2+ consecutive consonants: {consonant_seq_words}")

# Combining quantifiers with groups
print("\nCombining quantifiers with groups:")
repeated_words_text = "The the quick quick brown fox fox jumps over over the lazy dog"
# ( \1)+ is "a space plus the same word again", one or more times.
# The pattern has two capturing groups, so findall returns tuples;
# element 0 of each tuple is the repeated word itself.
repeated_words = re.findall(r"\b(\w+)( \1)+\b", repeated_words_text.lower())
print(f"  Repeated words: {[match[0] for match in repeated_words]}")

# Alternation with bounded quantifiers: day/month/year or year/month/day,
# each part separated by '-' or '/' (the two separators are not required
# to be the same character).
date_text = "Dates: 01/02/2023, 1-2-2023, 2023-01-02"
dates = re.findall(r"\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b", date_text)
print(f"  Flexible date formats: {dates}")


Combining quantifiers with character classes:
  Sequences of vowels [aeiou]+: ['e', 'ui', 'o', 'o', 'u', 'o', 'e', 'e', 'a', 'o']
  Words with 2+ consecutive consonants: ['the', 'quick', 'brown', 'jumps', 'the', 'lazy']

Combining quantifiers with groups:
  Repeated words: ['the', 'quick', 'fox', 'over']
  Flexible date formats: ['01/02/2023', '1-2-2023', '2023-01-02']


## Practical Applications

In [15]:
# Example 1: Email validation
def is_valid_email(email):
    """Return True if *email* has a simple ``local@domain.tld`` shape.

    Quantifiers used:
    - ``+`` after each character class: local part and domain need at
      least one character.
    - ``{2,}``: the final label (TLD) needs at least two letters.

    This is a tutorial-grade check, not a full RFC 5322 validator.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch anchors both ends of the string. With re.match and a
    # trailing "$", an address followed by "\n" was accepted, because "$"
    # also matches just before a final newline.
    return re.fullmatch(pattern, email) is not None

# Two well-formed addresses followed by three malformed ones
# (empty domain, domain starting with '.', '#' in the local part)
emails = ["user@example.com", "john.doe@company.co.uk", "invalid@", 
         "missing-domain@.com", "special#char@domain.com"]

print("Email validation:")
for email in emails:
    print(f"  {email}: {'Valid' if is_valid_email(email) else 'Invalid'}")

# Example 2: URL extraction
text = "Visit our website at https://www.example.com or http://subdomain.example.org/path?query=value"
# s?             - 'http' or 'https'
# (?:www\.)?     - optional 'www.' prefix
# {2,}           - top-level domain of at least two letters
# (?:/[^\s]*)?   - optional path, running up to the next whitespace
# All groups are non-capturing, so findall returns the whole URL.
url_pattern = r"https?://(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?"
urls = re.findall(url_pattern, text)
print(f"\nURLs found: {urls}")

# Example 3: Phone number validation
def is_valid_phone(phone):
    """Validate North American phone numbers in various formats.

    Accepted shape, left to right:
    - optional ``+1`` country prefix, optionally followed by '-' or whitespace
    - optional 3-digit area code, either bare or fully parenthesised,
      optionally followed by '-' or whitespace
    - 3 digits, an optional '-' or whitespace, then 4 digits

    Returns True when the whole string matches, False otherwise.
    """
    pattern = (
        r"(?:\+1[-\s]?)?"                         # optional +1 prefix
        # Area code: "(123)" or "123". Spelling out both alternatives
        # rejects unbalanced input such as "(123-456-7890", which the
        # independent \(? ... \)? form accepted.
        r"(?:(?:\([0-9]{3}\)|[0-9]{3})[-\s]?)?"
        r"[0-9]{3}[-\s]?[0-9]{4}"                 # exchange + line number
    )
    # fullmatch instead of match + "$": "$" also matches before a trailing
    # newline, which let "123-456-7890\n" through.
    return re.fullmatch(pattern, phone) is not None

# Sample inputs covering the accepted separators and the rejected cases
phone_numbers = [
    "123-456-7890", 
    "(123) 456-7890", 
    "+1 123-456-7890",
    "123.456.7890",  # Invalid: '.' is not an accepted separator
    "1234567890",
    "123-456-78",  # Invalid: last group has only 2 digits
    "+2 123-456-7890"  # Invalid: only the +1 prefix is accepted
]

print("\nPhone number validation:")
for phone in phone_numbers:
    print(f"  {phone}: {'Valid' if is_valid_phone(phone) else 'Invalid'}")

# Example 4: Password strength with quantifiers
def is_strong_password(password):
    """
    Report whether a password meets all strength requirements.

    A password passes when it:
    - has 8 or more characters
    - includes an uppercase ASCII letter
    - includes a lowercase ASCII letter
    - includes an ASCII digit
    - includes a character that is neither an ASCII letter nor a digit

    Returns True if every requirement holds, otherwise False.
    """
    # One pattern per required character category; each must be found
    # somewhere in the password.
    required_classes = (
        r"[A-Z]+",
        r"[a-z]+",
        r"[0-9]+",
        r"[^a-zA-Z0-9]+",
    )

    if len(password) >= 8:
        for char_class in required_classes:
            if re.search(char_class, password) is None:
                # First missing category settles the answer
                return False
        return True

    return False

# Each sample adds one more character category than the previous one:
# lowercase only -> + uppercase -> + digit -> + special character.
# Only the last one satisfies every requirement.
passwords = ["password", "Password", "Password1", "P@ssw0rd"]
print("\nPassword strength validation:")
for password in passwords:
    print(f"  {password}: {'Strong' if is_strong_password(password) else 'Weak'}")

Email validation:
  user@example.com: Valid
  john.doe@company.co.uk: Valid
  invalid@: Invalid
  missing-domain@.com: Invalid
  special#char@domain.com: Invalid

URLs found: ['https://www.example.com', 'http://subdomain.example.org/path?query=value']

Phone number validation:
  123-456-7890: Valid
  (123) 456-7890: Valid
  +1 123-456-7890: Valid
  123.456.7890: Invalid
  1234567890: Valid
  123-456-78: Invalid
  +2 123-456-7890: Invalid

Password strength validation:
  password: Weak
  Password: Weak
  Password1: Weak
  P@ssw0rd: Strong
