# Regular Expressions in Python: Zero to Hero Guide

## Module 1: Introduction & Basics

## 1.1 What are Regular Expressions?

Regular expressions (regex) are sequences of characters that define search patterns for text processing.

## 1.2 Basic Pattern Matching

In [None]:
import re
# Example 1: Simple text search
text = "Hello, my name is Alice. Hello everyone!"
pattern = "Helo"

# Search for first occurrence
match = re.search(pattern, text)
print(f"Found at position: {match.start() if match else 'Not found'}")  # Output: 0
# Find all occurrences and returns the results as a list
matches = re.findall(pattern, text)
print(f"All occurrences: {matches}")  # Output: ['Hello', 'Hello']
# Check if pattern exists
if re.search(pattern, text):
    print("Pattern found!")

Found at position: Not found
All occurrences: []


# 2. Metacharacters you must know

.  — any character except newline

^  — start of string (or line with re.M)

$  — end of string (or line with re.M)

\d  — digit (0–9)

\D  — non-digit

\w  — word char (letters, digits, underscore)

\W  — non-word char

\s  — whitespace

\S  — non-whitespace

\b  — word boundary

\B  — not a word boundary

[...]  — character class

[^...]  — negated class

|  — alternation (OR)

(...)  — capture group

In [32]:
print(re.findall(r"\w{3}", "one two three four"))  # ['one', 'two', 'thr', 'fou']
re.findall(r"\b\w{3}\b", "one two three four")  # words of length 3 -> ['one','two']


['one', 'two', 'thr', 'fou']


['one', 'two']

In [35]:
re.findall(r"\d", "a12b3")     # ['12','3']

['1', '2', '3']

## 3. Quantifiers: repetition

\* — 0 or more

\+ — 1 or more

? — 0 or 1

{n} — exactly n

{n,} — n or more

{n,m} — between n and m

In [None]:
import re

text = "My email address is john.doe@example.com and my phone number is 555-123-4567."
email_regex = r"\b[A-Za-z0-9._-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
matches = re.findall(email_regex, text)
print(matches)  # Output: ['john.doe@example.com']


['john.doe@example.cOm']


## 4. Greedy and Lazy

### Greedy quantifiers
\*    # 0 or more (greedy)

\+    # 1 or more (greedy)

?    # 0 or 1 (greedy)

{n,} # n or more (greedy)


### Lazy quantifiers  
*?   # 0 or more (lazy)

+?   # 1 or more (lazy)

??   # 0 or 1 (lazy)

{n,}? # n or more (lazy)


In [80]:
# "How would you extract all HTML tags from a string?"
import re

html = "<>Paragraph <b>bold</bhi> text <i>italic</i></p>"

# WRONG - Greedy approach (gets too much)
greedy_tags = re.findall(r'<.*>', html)
print("Greedy (wrong):", greedy_tags)
# Output: ['<p>Paragraph <b>bold</b> text <i>italic</i></p>']

# CORRECT - Lazy approach
lazy_tags = re.findall(r'</?.*?>', html)
print("Lazy (correct):", lazy_tags)
# Output: ['<p>', '<b>', '</b>', '<i>', '</i>', '</p>']
# CORRECT - Lazy approach
lazy_tags = re.findall(r'<.*?>', html)
print("Lazy (correct):", lazy_tags)
# Output: ['<p>', '<b>', '</b>', '<i>', '</i>', '</p>']
# Even better - more specific pattern
specific_tags = re.findall(r'</?[a-z]+>', html)
print("Specific:", specific_tags)
# Same output but more efficient

Greedy (wrong): ['<>Paragraph <b>bold</bhi> text <i>italic</i></p>']
Lazy (correct): ['<>', '<b>', '</bhi>', '<i>', '</i>', '</p>']
Lazy (correct): ['<>', '<b>', '</bhi>', '<i>', '</i>', '</p>']
Specific: ['<b>', '</bhi>', '<i>', '</i>', '</p>']


In [81]:
html = "<html> <p>Paragraph <b>bold</bhi> text <i>italic</i></p> </html>"
html = re.sub(r"<.*?>", "",html)
print(html)

 Paragraph bold text italic 


In [None]:
import re

def remove_html_tags(text, replace_with=' '):
    """
    Remove HTML tags, replacing with space to avoid joined words.
    Handles self-closing tags, multiline content, and malformed tags.
    """
    # Pattern explanation:
    # <           - Start of tag
    # [^>]*       - Zero or more characters that are NOT >
    # >           - End of tag
    # |           - OR
    # <!--.*?-->  - HTML comments
    # |           - OR  
    # <[^>]*$     - Incomplete tag at end of string
    pattern = r'<[^>]*>|<!--.*?-->'
    return re.sub(pattern, replace_with, text, flags=re.DOTALL)

# Example
html = """
<div class="main">
    <h1>Title</h1>
    <p>Paragraph with <b>bold</b> text.</p>
    <!-- This is a comment -->
    <br/>
    <img src="test.jpg" alt="test">
    Unclosed </tag
</div>
"""

result = remove_html_tags(html)
print(result)
# Output: 
# "
#     Title
#     Paragraph with bold text.
#     
#     
#     Unclosed 
# "


 
     Title 
     Paragraph with  bold  text. 
     
     
     
    Unclosed  



In [69]:
# For serious HTML parsing, use BeautifulSoup instead of regex
from bs4 import BeautifulSoup

def remove_html_tags_bs4(html):
    """
    Using BeautifulSoup - more reliable than regex for HTML
    """
    soup = BeautifulSoup(html, 'html.parser')
    
    # Get all text
    text = soup.get_text(separator=' ', strip=True)
    
    return text

# Example
html = "<p>Hello <b>World</b>!<br/>Next line.</p>"
result = remove_html_tags_bs4(html)
print(result)  # Output: "Hello World! Next line."

ModuleNotFoundError: No module named 'bs4'

In [44]:
s = "<b>first</b><b>second</b> <b>first</b><b>second</b>"
print(re.findall(r"<b>.*</b>", s))   # greedy -> matches whole string
print(re.findall(r"<b>.*?</b>", s))  # lazy  -> ['<b>first</b>', '<b>second</b>']


['<b>first</b><b>second</b> <b>first</b><b>second</b>']
['<b>first</b>', '<b>second</b>', '<b>first</b>', '<b>second</b>']


In [12]:
for i in range(*match.span()):
    print(text[i], end="")

Hello

In [92]:
import re
s = "aa"
p = r".*"
pattern = r"^"+p+r"$"
res = re.match(pattern,s)
if res:
    print("True", res)
else:
    print("False")

True <re.Match object; span=(0, 2), match='aa'>


In [None]:
print(re.findall(r"[aeiou]", "hello"))    # all vowels: ['e','o']
print(re.findall(r"[^aeiou]", "hello"))    # Except vowels: ['h', 'l', 'l']
print(re.findall(r"[A-Za-z]+", "123 abc DEF"))  # ['abc','DEF']
print(re.findall(r"[^0-9]+", "12ab34 ab67"))   # non-digits: ['ab', 'ab']


['e', 'o']
['h', 'l', 'l']
['abc', 'DEF']
['ab', ' ab']


In [None]:
print(re.match(r"Hello", "world Hello"))      # match at start
print(re.search(r"^Hello", "Hello world"))      # ^ represents the starting of the string. look at the begining0
print(re.search(r"world$", "Hello world"))     # match at end
print(re.findall(r"^\d{4}$", "2025"))          # full string of 4 digits -> ['2025'] (use re.fullmatch for full-string)


None
<re.Match object; span=(0, 5), match='Hello'>
<re.Match object; span=(6, 11), match='world'>
['2025']


In [4]:
import re 
text_list = ['My name is Suresh kannan', 'My name is Naresh', 'My name is Thamil']
pattern = re.compile(r"(?<=My name is )\w+")
for text in text_list:
    print(re.findall(pattern,text))

['Suresh']
['Naresh']
['Thamil']


In [None]:
import re

input_text = '''The rain had been falling since dawn, a steady rain that whispered against the rooftops and painted the streets silver. Mira stepped outside, letting the rain cool her warm skin as the world blurred behind a curtain of rain. She didn’t mind the rain—in fact, she loved how the rain washed away the noise of the city, leaving only the soft rhythm of droplets. Every worry felt lighter in the rain, every thought clearer, as if the rain itself carried a quiet kind of magic.'''

# Normalize text (lowercase + remove punctuation)
words = re.findall(r'\b\w+\b', input_text.lower())
print(words)

['the', 'rain_in', 'had', 'been', 'falling', 'since', 'dawn', 'a', 'steady', 'rain', 'that', 'whispered', 'against', 'the', 'rooftops', 'and', 'painted', 'the', 'streets', 'silver', 'mira', 'stepped', 'outside', 'letting', 'the', 'rain', 'cool', 'her', 'warm', 'skin', 'as', 'the', 'world', 'blurred', 'behind', 'a', 'curtain', 'of', 'rain', 'she', 'didn', 't', 'mind', 'the', 'rain', 'in', 'fact', 'she', 'loved', 'how', 'the', 'rain', 'washed', 'away', 'the', 'noise', 'of', 'the', 'city', 'leaving', 'only', 'the', 'soft', 'rhythm', 'of', 'droplets', 'every', 'worry', 'felt', 'lighter', 'in', 'the', 'rain', 'every', 'thought', 'clearer', 'as', 'if', 'the', 'rain', 'itself', 'carried', 'a', 'quiet', 'kind', 'of', 'magic']
