In [12]:
# Perfect! Learning regex (regular expressions) in Pandas is a game-changer for real-world data cleaning and transformation
# Common Regex Symbols

'''
^  : Matches the beginning of a string.  
$  : Matches the end of a string.  
.  : Matches any single character except newline.  
*  : Matches 0 or more repetitions of the preceding element (character, class, or group).  
+  : Matches 1 or more repetitions of the preceding element.  
?  : Matches 0 or 1 occurrence of the preceding element (makes it optional).  
[] : Matches any one character inside the brackets.  
[^]: Matches any one character NOT inside the brackets.  
() : Groups patterns together.  
|  : Acts like OR between expressions.  

'''

#  Extract Email Username & Domain
import pandas as pd
# Two sample addresses held in a single 'Email' column.
df = pd.DataFrame(
    data={
        'Email': [
            'john.doe@gmail.com',
            'alice_smith@yahoo.com',
        ],
    },
)



In [8]:
# Print the plain-text representation of the current `df`.
print(df)


                   Email
0     john.doe@gmail.com
1  alice_smith@yahoo.com


In [15]:
# Explanation of the pattern used below
'''
Regex part	    What it does
[^@]+	        Everything before @
@	            The @ symbol itself
.+	            Everything after @
'''

# Extract username and domain: capture group 1 (text before '@') fills
# 'Username', capture group 2 (text after '@') fills 'Domain'.
df[['Username', 'Domain']] = df['Email'].str.extract(
    r'([^@]+)@(.+)',
    expand=True,  # one output column per capture group (this is the default)
)
print(df)

                   Email     Username     Domain
0     john.doe@gmail.com     john.doe  gmail.com
1  alice_smith@yahoo.com  alice_smith  yahoo.com


In [23]:
#  Extract Phone Numbers
# Pull the first standalone 10-digit number out of each sentence.
#   \d{10}  exactly ten consecutive digits
#   \b      word boundary on both sides: the digits must not be glued to a
#           letter, another digit or '_'.
#           'abc 9876543210 xyz' -> 9876543210
#           'abc9876543210xyz'   -> no match (NaN), because there is no
#                                   boundary between a letter and a digit
df = pd.DataFrame(
    {'Text': ['Call me at 9876543210', 'My number is 9123456789']}
).assign(
    Phnum=lambda frame: frame['Text'].str.extract(r'(\b\d{10}\b)', expand=False)
)

print(df)


                      Text       Phnum
0    Call me at 9876543210  9876543210
1  My number is 9123456789  9123456789


In [32]:
#  Extract Dates in Various Formats
# Accept either NN/NN/NNNN (slashes) or NNNN-NN-NN (hyphens).
# The digit counts and separators must match exactly, with no spaces inside
# the date; a row that fits neither layout gets NaN.
# The two alternatives can never match at the same position, so writing them
# in the opposite order gives the same result.
df = pd.DataFrame({"Info": ['Date: 2025-04-12', 'DOB: 12/04/1999']})
df['date'] = df['Info'].str.extract(
    r'(\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})',
    expand=False,  # single capture group -> plain Series
)

print(df)

               Info        date
0  Date: 2025-04-12  2025-04-12
1   DOB: 12/04/1999  12/04/1999


In [40]:
#  Extract Alphanumeric Codes (e.g., EMP001, ID456)
# Split each code into its leading capital letters ('prefix') and the digits
# that follow ('Id'). The digits stay strings, so leading zeros are kept.
df = pd.DataFrame({'EmpId': ['EMP001', 'ID456', 'EMP999']})
df = df.join(
    df['EmpId']
    .str.extract(r'([A-Z]+)(\d+)')
    .set_axis(['prefix', 'Id'], axis=1)
)

print(df)

    EmpId prefix   Id
0  EMP001    EMP  001
1   ID456     ID  456
2  EMP999    EMP  999


In [5]:
# Extract Website Domain from URL
import pandas as pd
# Capture the host part of each URL.
#   https?://    scheme: "http://" or "https://"
#   (?:www\.)?   optional leading "www." (non-capturing, so it is dropped)
#   ([^/:?#]+)   the host: everything up to the first '/', ':', '?' or '#',
#                so a port, query string or fragment never ends up in Domain
#                (e.g. 'https://example.com?ref=1' -> 'example.com')
# NOTE: URLs that embed credentials (user:pass@host) are not handled.
DOMAIN_PATTERN = r"https?://(?:www\.)?([^/:?#]+)"

df = pd.DataFrame({'URL': ['https://www.google.com', 'http://facebook.com']})
# expand=False: a single capture group comes back as a Series, ready to assign.
df['Domain'] = df['URL'].str.extract(DOMAIN_PATTERN, expand=False)
print(df)

                      URL        Domain
0  https://www.google.com    google.com
1     http://facebook.com  facebook.com


  '''


In [7]:
#🔑 Summary: findall vs extract
#Method	Returns	Use when...

#) .str.findall()	A list of all matches	You want all matching names/words
#) .str.extract()	The first match only	You want to capture a single item per row

# Extract Capitalized Words (like names)
# Collect every capitalised word per row: one uppercase letter followed by
# one or more lowercase letters, delimited by word boundaries.
# The pattern cannot tell names from other capitalised words, so
# sentence-initial words such as 'My' and 'Hello' are returned as well.
df = pd.DataFrame(
    {'Text': ['My name is John Doe', 'Hello from Alice Johnson']}
).assign(
    Names=lambda frame: frame['Text'].str.findall(r'\b[A-Z][a-z]+\b')
)
print(df)


                       Text                    Names
0       My name is John Doe          [My, John, Doe]
1  Hello from Alice Johnson  [Hello, Alice, Johnson]
