Introduction to regex

Importing re module

Raw strings (r"pattern")

# Metacharacters and quantifiers
# Flags / Modifiers

## Practical Use Cases
- Email validation
- Phone number matching
- Extracting hashtags or mentions
- Splitting text by multiple delimiters
- Cleaning unwanted characters from strings

In [2]:
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Match object: re.search() scans the string for the first place the pattern
# matches and returns a Match object describing that hit (its span and the
# matched text). When nothing matches it returns None instead.

txt = "The rain in Spain"
x = re.compile("ra").search(txt)

print(x)

<re.Match object; span=(4, 6), match='ra'>


In [None]:
 #.       Metacharacters and quantifiers
# Metacharacters
#     .   -> any character except newline
#     ^   -> start of string
#     $   -> end of string
#     []  -> character set
#     |   -> OR operator
#     ()  -> grouping
# Quantifiers
#     *      -> zero or more occurrences
#     +      -> one or more occurrences
#     ?      -> zero or one occurrence
#     {n}    -> exactly n occurrences
#     {n,}   -> n or more
#     {n,m}  -> between n and m

txt = "Hello Singapore. is it still raining there"

# Each pattern is run through re.findall() against txt, in order, and the
# resulting list is bound to the matching name a..k.
# Note: "." stands for ANY single character (not only letters), and the
# quantifiers are greedy, so ".*" / ".+" / ".{2,}" stretch to the LAST "o"
# they can reach.
a, b, c, d, e, f, g, h, i, j, k = (
    re.findall(regex, txt)
    for regex in (
        "He..o",      # a: "He", exactly two characters, then "o"
        "^Hello",     # b: only matches if the string starts with "Hello"
        "there$",     # c: only matches if the string ends with "there" (else [])
        "[a-t]",      # d: every single character in the range a..t (lowercase)
        "Hello|Hey",  # e: either "Hello" or "Hey"
        "He.*o",      # f: zero or more characters between "He" and "o"
        "He.+o",      # g: one or more characters between "He" and "o"
        "He.?o",      # h: zero or one character between "He" and "o"
        "He.{2}o",    # i: exactly two characters between "He" and "o"
        "He.{2,}o",   # j: two or more characters between "He" and "o"
        "He.{1,3}o",  # k: one to three characters between "He" and "o"
    )
)

# One result per line, in the order a..k.
print(a, b, c, d, e, f, g, h, i, j, k, sep="\n")


['Hello']
['Hello']
['there']
['e', 'l', 'l', 'o', 'i', 'n', 'g', 'a', 'p', 'o', 'r', 'e', 'i', 's', 'i', 't', 's', 't', 'i', 'l', 'l', 'r', 'a', 'i', 'n', 'i', 'n', 'g', 't', 'h', 'e', 'r', 'e']
['Hello']
['Hello Singapo']
['Hello Singapo']
[]
['Hello']
['Hello Singapo']
['Hello']


In [None]:
# () -> grouping, example 1 (words, no numbers)
# Every parenthesised part of the pattern is a numbered capture group;
# index 0 always refers to the whole match.

Txt = "Hello world"
pattern = r"(Hello) (world)"
match = re.search(pattern, Txt)

# match[n] is shorthand for match.group(n)
print("Full Match:", match[0])
print("First word:", match[1])
print("second word:", match[2])

Full Match: Hello world
First word: Hello
second word: world


In [5]:
# Grouping, example 2 (with numbers)
# Three capture groups pull day, month and year out of a DD-MM-YYYY date.

date = "Today is 05-08-2025"

pattern = r"(\d{2})-(\d{2})-(\d{4})"
match = re.search(pattern, date)

# Index 0 is the whole date; 1..3 are the groups in left-to-right order.
print("Full Date:", match[0])
print("Day:", match[1])
print("Month:", match[2])
print("Year:", match[3])

Full Date: 05-08-2025
Day: 05
Month: 08
Year: 2025


# Flags / Modifiers

In [None]:

# 1. IGNORECASE (re.I, long form re.IGNORECASE) -> letter case is ignored,
#    so the lowercase pattern still finds the capitalised text.
text = "Hellow world"
pattern = r"hello"
match = re.search(pattern, text, flags=re.IGNORECASE)
print(match is not None)

True


In [23]:
# 2. MULTILINE (re.M, long form re.MULTILINE) -> ^ and $ also match at the
#    start / end of every line, not just of the whole string.
text1 = """first line
second line"""

pattern = r"^second"
# Without the flag "^second" would not match, because "second" is not at the
# very beginning of text1.
match1 = re.search(pattern, text1, flags=re.MULTILINE)
print(match1[0])

second


In [24]:
# 3. DOTALL (re.S, long form re.DOTALL) -> "." matches the newline as well,
#    so the single dot can bridge the line break between the two words.
text2 = '''hello
world'''

pattern = r"hello.world"

match2 = re.search(pattern, text2, flags=re.DOTALL)
print(match2[0])

hello
world


In [25]:
# 4. VERBOSE (re.X) → whitespace in the pattern is ignored and "#" starts a
#    comment, so a long pattern can be laid out and annotated line by line.
#
# The pattern is written as a RAW string (r"""..."""): in a normal string
# literal "\d" is an invalid escape sequence, which Python reports as a
# DeprecationWarning / SyntaxWarning. The raw prefix hands the backslashes to
# the regex engine untouched; the compiled pattern is exactly the same.
pattern = re.compile(r"""
    (\d{4}) #year
     -      #seperate
    (\d{2}) # month
     -      #seperate
    (\d{2}) #day                                   
""", re.X)

# search() returns a Match; groups() gives the three captured parts as a tuple.
match = pattern.search("2025-08-05")
print(match.groups())

('2025', '08', '05')


In [26]:
# 5. ASCII (re.A, long form re.ASCII) -> \w, \d, etc. only match ASCII
#    characters instead of the full Unicode range.
text3 = "Café"
pattern = r"\w+"

# Default for str patterns: Unicode-aware, so the accented letter counts as \w.
print(re.findall(pattern, text3))
# With the ASCII flag the accented letter is no longer a word character.
print(re.findall(pattern, text3, flags=re.ASCII))

['Café']
['Caf']


# Use cases

In [27]:
# Email validation
def validate_email(email):
    """Return True if ``email`` has the shape ``local@label.label.tld``.

    This is a simple syntactic check for teaching purposes, not a full
    RFC 5322 validator.

    Args:
        email: The string to check.

    Returns:
        bool: True when the WHOLE string matches the pattern, else False.

    The local part is one or more of ``[a-zA-Z0-9._%+-]``. The domain is one
    or more non-empty labels of ``[a-zA-Z0-9-]``, each followed by a dot, and
    ends with a top-level domain of at least two letters.
    """
    # Each domain label must contain at least one character, so inputs such as
    # "a@.com" or "a@..com" (empty labels) are rejected.
    pattern = r"[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}"
    # fullmatch() anchors both ends of the string. match() combined with "$"
    # would still accept a trailing newline, because "$" also matches just
    # before a final "\n".
    return re.fullmatch(pattern, email) is not None
# Run the validator over a few sample addresses and report each verdict.
# The loop variable is called `email` (not `e`) so that it does not overwrite
# the `e` result list created in the metacharacters cell earlier in the notebook.
emails = ["use@example.com", "hello.world@domain.co.in", "invalid-email@.com"]
for email in emails:
    print(email, "->", "valid" if validate_email(email) else "invalid")

use@example.com -> valid
hello.world@domain.co.in -> valid
invalid-email@.com -> invalid


In [None]:
# Cleaning unwanted characters from a string

text4 = "Hello!!! 123, Welcome_to@Python."

# Inner sub(): every character that is neither an ASCII letter nor whitespace
#              is replaced by a space.
# Outer sub(): each run of whitespace is collapsed to a single space, and
#              strip() then removes the leading / trailing space.
cleaning_txt = re.sub(r"\s+", " ", re.sub(r"[^a-zA-Z\s]", " ", text4)).strip()

print(cleaning_txt)

Hello Welcome to Python
