## REGEX FUNDAMENTALS

#### Basic Pattern Matching

In [6]:
import re
# The sentence to search and the literal substring to look for in it.
text1 = "My name is Vamsi, I am from IIT Bhubaneswar"
text2 = r"Bhubaneswar"
# findall returns every non-overlapping occurrence as a list of strings.
matches = re.compile(text2).findall(text1)
print(matches)

['Bhubaneswar']


text = "I started Learning learning regex fundas"
# The character class [Ll] accepts either case for the first letter,
# so both "Learning" and "learning" are matched.
pattern = r"[Ll]earning"
matches = [hit.group() for hit in re.finditer(pattern, text)]
print(matches)

In [20]:
text = "color, colour, colours, colors, colos, colrs, cs"
# "u?" makes the "u" optional and "s?" makes a trailing "s" optional;
# the "o" and "r" are still required.
pattern = r"colou?rs?"
matches = re.compile(pattern).findall(text)
print(matches)

['color', 'colour', 'colours', 'colors']


#### Special Characters and Anchors


##### start and end anchors

In [22]:
texts = ["hello World", "world hello", "hello"]
# "^" anchors the match to the start of the string, so only strings that
# begin with "hello" report True.
pattern = r"^hello"
for text in texts:
    # re.search returns a Match object on success and None otherwise.
    print(re.search(pattern, text) is not None)

True
False
True


In [25]:
text = "The cat scattered his food all over the room."
# \b on both sides requires word boundaries, so the "cat" inside
# "scattered" is not matched.
pattern = r"\bcat\b"
matches = [hit.group(0) for hit in re.finditer(pattern, text)]
print(matches)

['cat']


In [38]:
text = "My date of birth is 28-02-2005 , 28-03-2009, 24-05-2000 or 28/02/2007"
# Two alternatives (dd-mm-yyyy or dd/mm/yyyy), each with its own three
# capture groups. Every match therefore yields a 6-tuple in which the groups
# of the alternative that did not match are empty strings.
pattern = r"(\d{2})-(\d{2})-(\d{4})|(\d{2})/(\d{2})/(\d{4})"
matches = [hit.groups(default='') for hit in re.finditer(pattern, text)]
print(matches)

[('28', '02', '2005', '', '', ''), ('28', '03', '2009', '', '', ''), ('24', '05', '2000', '', '', ''), ('', '', '', '28', '02', '2007')]


#### Positive Lookahead 

In [39]:
text = "100 dollars, 200 euros, 300 yen"
# Positive lookahead: the digits are matched only when optional whitespace
# and the word "dollars" follow; the lookahead text is not part of the match.
pattern = r"\d+(?=\s*dollars)"
matches = re.compile(pattern).findall(text)
print(matches)

['100']


In [43]:
text = "paid $100, received $200, lost $50"
# Negative lookbehind: a dollar amount is matched only when it is NOT
# immediately preceded by "paid" plus one whitespace character.
pattern = r"(?<!paid\s)\$\d+"
matches = [hit.group() for hit in re.finditer(pattern, text)]
print(matches)

['$200', '$50']


In [44]:
import re

text = "Mr. O'Neill said: 'Don't worry!' (It's fine.)"
# Split points, in order of the alternatives:
#   1. whitespace that follows ".", "?" or "!" unless the two lookbehinds
#      recognise an abbreviation-like prefix (e.g. "Mr.") just before it;
#   2. any single quote/bracket/punctuation character from the class;
#   3. whitespace that is directly followed by "-".
pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s+|[\"\'\(\)\[\];:,\–\-]|\s+(?=\-)"
tokens = re.compile(pattern).split(text)
# Adjacent delimiters produce empty strings; drop them before printing.
print(list(filter(None, tokens)))


['Mr. O', 'Neill said', ' ', 'Don', 't worry!', ' ', 'It', 's fine.']


In [45]:
text = "Contact John Doe at john.doe@example.com or call 555-123-4567 by 2023-12-31."

# One regex per entity type; each is run over the whole text with findall.
patterns = {
    # The TLD class is [A-Za-z]. Writing it as [A-Z|a-z] would also accept a
    # literal '|' because '|' is not an alternation operator inside [...].
    'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    # 3 digits, 3 digits, 4 digits, each pair optionally separated by '-' or '.'.
    'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
    # yyyy-mm-dd.
    'date': r'\b\d{4}-\d{2}-\d{2}\b',
    # NOTE: this matches any two adjacent capitalised words, so on the text
    # above it yields 'Contact John' rather than 'John Doe'.
    'name': r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'
}

# Map each entity type to the list of all its matches in the text.
entities = {
    entity_type: re.findall(regex, text)
    for entity_type, regex in patterns.items()
}

print(entities)

{'email': ['john.doe@example.com'], 'phone': ['555-123-4567'], 'date': ['2023-12-31'], 'name': ['Contact John']}


In [49]:
mail2 = 'eshant@gmail.com'
# user part, "@", host, a literal dot, then three word characters.
# The dot is escaped: an unescaped "." matches ANY character, which would
# let strings such as "eshant@gmailXcom" pass.
p = r'[A-Za-z0-9.]+@[\w]+\.\w{3}'
print(re.search(p, mail2))

<re.Match object; span=(0, 16), match='eshant@gmail.com'>


In [51]:
txt = "Eshant $ is happ@y !,"
# [^!,$@]+ matches runs of characters that are none of "!", ",", "$", "@",
# i.e. the text that remains between those symbols.
kept_runs = re.findall('[^!,$@]+', txt)
print(kept_runs)
# Gluing the runs back together gives the text with those symbols removed.
print(''.join(kept_runs))

['Eshant ', ' is happ', 'y ']
Eshant  is happy 


In [52]:
txt = "I'm Ashish and 24"
# \D matches any single non-digit character. The pattern is a raw string:
# in a normal literal "\D" is an invalid escape sequence and triggers a
# warning when the code is compiled.
print(''.join(re.findall(r'\D', txt)))

I'm Ashish and 


  print(''.join(re.findall('\D',txt)))


In [56]:
mail1 = 'eshant@gfg.org'
mail2 = 'eshant@gmail.com'
# Accept only addresses on gfg.org. Two fixes over a naive pattern:
#   * the dot is escaped so it matches a literal "." and not any character;
#   * the user part allows all digits (0-9), not just 0-2.
p = r'[A-Za-z0-9]+@(gfg)\.(org)'
print(re.search(p, mail1))
print(re.search(p, mail2))

<re.Match object; span=(0, 14), match='eshant@gfg.org'>
None


In [60]:
mail1 = "eshant@gfg.org"
mail2 = "eshant@gfg.net"
mail3 = "eshant@gmail.com"
# Accept gfg addresses whose suffix is org, net or in. The dot is escaped so
# it matches only a literal ".", and the user part allows all digits (0-9).
p = r"[A-Za-z0-9]+@(gfg)\.(org|net|in)"
print(re.search(p, mail1))
print(re.search(p, mail2))
print(re.search(p, mail3))

<re.Match object; span=(0, 14), match='eshant@gfg.org'>
<re.Match object; span=(0, 14), match='eshant@gfg.net'>
None


#### Email Extraction

In [63]:

# Sample addresses to take apart
mails = ['eshant@gmail.com','eshant@gfg.org','eshant@yahoo.com','eshant@aap.gov.in','esh@geeksforgeeks.com','esh@mail.com','eshant@orkut.com']

# Parallel result lists: entry i of each list belongs to mails[i]
user_id = []
host_name = []
domain_type = []

for address in mails:
    # Split once on '@': piece 0 is the user id, piece 1 is the host part
    at_pieces = address.split('@')
    user_id.append(at_pieces[0])

    # Split the host part on '.': the first label is the host name and the
    # remaining labels, re-joined with '.', form the domain type
    labels = at_pieces[1].split('.')
    host_name.append(labels[0])
    domain_type.append('.'.join(labels[1:]))

# Show the three result lists
print(user_id)
print(host_name)
print(domain_type)

['eshant', 'eshant', 'eshant', 'eshant', 'esh', 'esh', 'eshant']
['gmail', 'gfg', 'yahoo', 'aap', 'geeksforgeeks', 'mail', 'orkut']
['com', 'org', 'com', 'gov.in', 'com', 'com', 'com']


## Processing some Text to extract date and price

In [64]:
class TextProcessor:
    """Regex-based text cleaner and simple date/price extractor.

    All patterns are compiled once in ``__init__`` and stored in
    ``self.patterns`` keyed by name.
    """

    def __init__(self):
        self.patterns = {
            # Scheme + host, then an optional path. The path class must not
            # contain a space, otherwise the URL match would swallow the
            # plain words that follow it.
            'url': re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w.-]*/?'),
            'hashtag': re.compile(r'#\w+'),
            'mention': re.compile(r'@\w+'),
            'html': re.compile(r'<[^>]+>'),
            # Anything that is neither a word character nor whitespace.
            'special_chars': re.compile(r'[^\w\s]'),
            'extra_whitespace': re.compile(r'\s+'),
            # yyyy-mm-dd, or d/m/yyyy with one- or two-digit day and month.
            'date': re.compile(r'\b\d{4}-\d{2}-\d{2}\b|\b\d{1,2}/\d{1,2}/\d{4}\b'),
            # "$" + digits, optional ",ddd" groups, optional ".dd" cents.
            'money': re.compile(r'\$\d+(?:,\d{3})*(?:\.\d{2})?'),
        }

    def clean_text(self, text: str) -> str:
        """Return *text* with URLs, hashtags and mentions replaced by
        placeholder words, HTML tags and punctuation removed, and runs of
        whitespace collapsed to single spaces.
        """
        # Order matters: URLs, hashtags and mentions must be replaced before
        # the special-character pass strips the '/', '#' and '@' that
        # identify them.
        substitutions = (
            ('url', ' URL '),
            ('hashtag', ' HASHTAG '),
            ('mention', ' MENTION '),
            ('html', ' '),
            ('special_chars', ' '),
            ('extra_whitespace', ' '),
        )
        for key, replacement in substitutions:
            text = self.patterns[key].sub(replacement, text)
        return text.strip()

    def extract_entities(self, text: str) -> dict:
        """Return ``{'dates': [...], 'money': [...]}`` with every date and
        dollar amount found in *text* (run this on the raw, uncleaned text).
        """
        return {
            'dates': self.patterns['date'].findall(text),
            'money': self.patterns['money'].findall(text),
        }

# Demo input containing a URL, a mention, a date, a price and a hashtag.
sample = "Check out https://example.com @user! Sale ends 2023-12-31. Price: $1,299.99 #deal"
processor = TextProcessor()

# Extraction is run on the raw sample, not on the cleaned text.
cleaned_sample = processor.clean_text(sample)
print("Cleaned:", cleaned_sample)
found_entities = processor.extract_entities(sample)
print("Entities:", found_entities)

Cleaned: Check out URL MENTION Sale ends 2023 12 31 Price 1 299 99 HASHTAG
Entities: {'dates': ['2023-12-31'], 'money': ['$1,299.99']}
