In [12]:
# Perfect! Learning regex (regular expressions) in Pandas is a game-changer for real-world data cleaning and transformation
# Common Regex Symbols

'''
^  : Matches the beginning of a string.  
$  : Matches the end of a string.  
.  : Matches any single character except newline.  
*  : Matches 0 or more repetitions of the previous character.  
+  : Matches 1 or more repetitions of the previous character.  
?  : Matches 0 or 1 of the previous character.  
[] : Matches any one character inside the brackets.  
[^]: Matches any one character NOT inside the brackets.  
() : Groups patterns together.  
|  : Acts like OR between expressions.  

'''

#  Extract Email Username & Domain
import pandas as pd
df = pd.DataFrame({'Email': ['john.doe@gmail.com', 'alice_smith@yahoo.com']})



In [8]:
print(df)


                   Email
0     john.doe@gmail.com
1  alice_smith@yahoo.com


In [15]:
# explantion
'''
Regex part	    What it does
[^@]+	        Everything before @
@	            The @ symbol itself
.+	            Everything after @
'''

df[['Username','Domain']] = df['Email'].str.extract(r'([^@]+)@(.+)')
print(df)
# Extract username and domain

                   Email     Username     Domain
0     john.doe@gmail.com     john.doe  gmail.com
1  alice_smith@yahoo.com  alice_smith  yahoo.com


In [23]:
#  Extract Phone Numbers
df = pd.DataFrame({'Text': ['Call me at 9876543210', 'My number is 9123456789']})
df['Phnum'] = df['Text'].str.extract(r'(\b\d{10}\b)') 
# \b Meaning: This is a word boundary. eg:abc9876543210xyz----->  \b9876543210\b, output: 9876543210
# \d{10} matches exactly 10 digits (9876543210).


print(df)


                      Text       Phnum
0    Call me at 9876543210  9876543210
1  My number is 9123456789  9123456789


In [32]:
#  Extract Dates in Various Formats
df = pd.DataFrame({"Info": ['Date: 2025-04-12', 'DOB: 12/04/1999']})
df['date'] = df['Info'].str.extract(r'(\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})')
# carefull space will not come and format should correct that time only output is come / very easy
# r'(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4})'

print(df)

               Info        date
0  Date: 2025-04-12  2025-04-12
1   DOB: 12/04/1999  12/04/1999


In [40]:
#  Extract Alphanumeric Codes (e.g., EMP001, ID456)
df = pd.DataFrame({'EmpId': ['EMP001', 'ID456', 'EMP999']})
df[['prefix', 'Id']] = df['EmpId'].str.extract(r'([A-Z]+)(\d+)')
                                               
print(df)

    EmpId prefix   Id
0  EMP001    EMP  001
1   ID456     ID  456
2  EMP999    EMP  999


In [5]:
# Extract Website Domain from URL
import pandas as pd
'''
df = pd.DataFrame({
    'URL': ['https://www.google.com', 'http://facebook.com']
})

df['Domain'] = df['URL'].str.extract(r'https?://(?:www\.)?([^/]+)')
print(df)'''

df = pd.DataFrame({'URL': ['https://www.google.com', 'http://facebook.com']})
df['Domain'] = df['URL'].str.extract(r"https?://(?:www\.)?([^/]+)")    # easy but little bit confused incase if u have time just learn again.
print(df)

                      URL        Domain
0  https://www.google.com    google.com
1     http://facebook.com  facebook.com


  '''


In [7]:
#🔑 Summary: findall vs extract
#Method	Returns	Use when...

#) .str.findall()	A list of all matches	You want all matching names/words
#) .str.extract()	The first match only	You want to capture a single item per row

# Extract Capitalized Words (like names)
df = pd.DataFrame({'Text': ['My name is John Doe', 'Hello from Alice Johnson']})
df['Names'] = df['Text'].str.findall(r'\b[A-Z][a-z]+\b')
print(df)


                       Text                    Names
0       My name is John Doe          [My, John, Doe]
1  Hello from Alice Johnson  [Hello, Alice, Johnson]


In [13]:
# Remove Special Characters

df = pd.DataFrame({'Product': ['iPhone@12#Pro!', 'Galaxy$S21*Ultra']})
df['Cleaned'] = df['Product'].str.replace(r'[^A-Za-z0-1]+', '', regex = True)

print(df)

            Product        Cleaned
0    iPhone@12#Pro!     iPhone1Pro
1  Galaxy$S21*Ultra  GalaxyS1Ultra


In [14]:
# Extract Email Addresses
import re

text = "Contact us at support@example.com or hr@company.org" # emails = re.findall(r'\b[\w.-]+@[\w.-]+\.\w+\b', text)
emails = re.findall(r"\b[\w.-]+@[\w.-]+\.\w+\b", text) # very easy
print(emails)


['support@example.com', 'hr@company.org']


In [16]:
# Replace Multiple Spaces with a Single Space
text = "This    is   a    messy   sentence."  # cleaned = re.sub(r'\s+', ' ', text), # sub mean substitute
cleaned = re.sub(r'\s+', ' ', text)
print(cleaned)


This is a messy sentence.


In [17]:
# Standardize Date Format (DD-MM-YYYY)
text = "Today's date is 2025/04/13 and yesterday was 2025-04-12." # standardized = re.sub(r'(\d{4})[-/](\d{2})[-/](\d{2})', r'\3-\2-\1', text)
standardize = re.sub(r'(\d{4})[-/](\d{2})[-/](\d{2})', r'\3-\2-\1', text) # very easy
print(standardize)

Today's date is 13-04-2025 and yesterday was 12-04-2025.


In [18]:
# Extract All Prices from Text
text = "Products: iPhone - $999, AirPods - $199, MacBook - $1299." # prices = re.findall(r'\$\d+', text)
prices = re.findall(r'\$\d+', text)
print(prices)


['$999', '$199', '$1299']


In [19]:
# Mask Phone Numbers
text = "Call me at 987-654-3210 or 123-456-7890." # re.sub(r'\d{3}-\d{3}-\d{4}', 'XXX-XXX-XXXX', text)
masked = re.sub(r'\d{3}-\d{3}-\d{4}', 'xxx-xxx-xxx', text) 
print(masked)


Call me at xxx-xxx-xxx or xxx-xxx-xxx.


In [22]:
#  Extract Domain Names from URLs

text = "Visit https://www.google.com or http://example.org for more info."
#domains = re.findall(r'https?://(www\.)?([\w.-]+)', text) # easy tha once again paru
#print([match[1] for match in domains])

domains = re.findall(r'https?://(wwww/.)?([\w.-]+)', text)
print(domains)
print([match[1] for match in domains])

[('', 'www.google.com'), ('', 'example.org')]
['www.google.com', 'example.org']


In [4]:
#Remove HTML Tags
import re
text = "<div>Hello <b>world</b>!</div>"
cleaned = re.sub(r'<[^>]+>','', text)  #< literaaly, [^>] matching except this >, + one or more character, > literally.
print(cleaned)


Hello world!
