## A Regular Expression (Regex) is a pattern used to search, match, or extract text.

In [1]:
import re

In [4]:
# Look for the first occurrence of the literal pattern "cat" in the sentence.
text = "The cat is black"
match = re.compile("cat").search(text)

# The printed Match object shows the span and the matched text.
print(match)

<re.Match object; span=(4, 7), match='cat'>


In [9]:
text = "I am learning Python for AI"

# re.search returns a Match object for the first hit, or None when nothing matches.
match = re.search("Python", text)
# print() already separates its arguments with a space, so a plain string label
# is enough; no f-string is needed when there is nothing to interpolate.
print("Found", match)

Found <re.Match object; span=(14, 20), match='Python'>


In [14]:
text = "I have a cat, a cort, and a cut."

# "." matches any single character (except a newline), so "cort" -- which has
# two characters between "c" and "t" -- is not found.
matches = re.compile("c.t").findall(text)
print(matches)

['cat', 'cut']


In [15]:
# Same wildcard idea: "b", exactly one arbitrary character, then "t".
text = "bat bit but bet boat boot"
match = re.compile("b.t").findall(text)  # note: a list of strings, not a Match
print(match)

['bat', 'bit', 'but', 'bet']


In [16]:
text = "bat bet bit bot but"

# [aeiou] is a character class: exactly one of the listed vowels.
print(re.compile("b[aeiou]t").findall(text))

['bat', 'bet', 'bit', 'bot', 'but']


In [17]:
text = "cat cot cut cit coat"

# Only "a" or "o" is allowed between "c" and "t".
print(re.compile("c[ao]t").findall(text))

['cat', 'cot']


In [21]:
text = "My marks are 85 and 90, 890"
# Raw string: the backslash in \d reaches the regex engine untouched and Python
# does not warn about an invalid string escape sequence.
numbers = re.findall(r"\d+", text)
numbers

['85', '90', '890']

In [22]:
## Extract all numbers from this:

text = "Order 123 costs 450 rupees, discount 50"
# \d+ = one or more digits; written as a raw string so "\d" is not treated as
# a (invalid) string escape by Python.
numbers = re.findall(r"\d+", text)
numbers

['123', '450', '50']

In [23]:
## Find every word shaped like g, one or more o, then gle.

text = "gogle google gooogle gogole"
# "o+" repeats only the letter o, so "gogole" (an extra "o" after the second g) is skipped.
re.compile("go+gle").findall(text)

['gogle', 'google', 'gooogle']

In [25]:
## Extract email addresses:

text = "Contact us at help@gmail.com or admin@company.in"
# word chars, "@", word chars, a literal dot, word chars. The raw string keeps
# \w and \. as regex escapes instead of (invalid) Python string escapes.
emails = re.findall(r"\w+@\w+\.\w+", text)
emails

['help@gmail.com', 'admin@company.in']

# 🧹 NLP Cleaning with Regex

In [27]:
import re

text = "Visit https://openai.com for AI research"
# Match an http(s):// or www. URL together with any whitespace right before it.
# \S+ = all non-space characters after the prefix. There are no spaces around
# "|": a space inside a pattern is matched literally, which would make the URL
# removable only when a space happens to sit next to it.
url_pattern = re.compile(r"\s*(?:https?://\S+|www\.\S+)")
clean = url_pattern.sub("", text)

clean

'Visit for AI research'

In [30]:
# Remove URLs from:

text = "Read blog at https://medium.com and www.google.com"
# An http(s):// or www. URL plus the whitespace in front of it. No literal
# spaces around "|", so a URL at the very end of the text is removed as well.
url_pattern = re.compile(r"\s*(?:https?://\S+|www\.\S+)")
clean = url_pattern.sub("", text)
clean

'Read blog at and'

In [32]:
## Remove Special Characters

text = "AI is awesome!!! #ML @2026"

# The negated class [^...] matches everything that is NOT an ASCII letter or whitespace.
clean = re.compile(r"[^a-zA-Z\s]").sub("", text)
clean

'AI is awesome ML '

In [34]:
text = "NLP > ML??? 100% sure!!!"

# Digits and symbols are deleted; the spaces around them stay, hence the double gaps.
clean = re.sub(pattern=r"[^a-zA-Z\s]", repl="", string=text)
clean

'NLP  ML  sure'

In [35]:
## Remove Numbers

text = "My score is 95 in 2026"
# Each run of digits is replaced by the empty string.
clean = re.compile(r"\d+").sub("", text)
clean

'My score is  in '

In [36]:
text = "Room no 402, Floor 9"
# Only the digits go; punctuation and spacing are left as they were.
clean = re.sub(pattern=r"\d+", repl="", string=text)
clean

'Room no , Floor '

In [38]:
# Remove Extra Spaces

text = "AI    is   powerful"
# Every run of whitespace collapses into a single space.
clean = re.compile(r"\s+").sub(" ", text)
clean

'AI is powerful'

In [39]:
text = "Deep     Learning   is   cool"
# \s+ = one or more whitespace characters, replaced by exactly one space.
clean = re.sub(pattern=r"\s+", repl=" ", string=text)
clean

'Deep Learning is cool'

In [42]:
# Remove Hashtags & Mentions (marker and the word attached to it)

text = "Loving #AI and #NLP thanks @OpenAI"
# "@" or "#" followed by word characters: the whole tag is deleted.
clean = re.compile(r"[@#]\w+").sub("", text)
clean

'Loving  and  thanks '

In [43]:
# Remove Hashtags & Mentions -- variant that strips only the markers

text = "Loving #AI and #NLP thanks @OpenAI"
# Without \w+ only the "@" / "#" symbols are deleted; the tagged words stay.
clean = re.compile(r"[@#]").sub("", text)
clean

'Loving AI and NLP thanks OpenAI'

In [44]:
text = "Follow @user123 for #MachineLearning updates"
# \w also covers digits, so "@user123" is removed as a whole.
clean = re.sub(pattern=r"[@#]\w+", repl="", string=text)
clean

'Follow  for  updates'

## 🔥 FINAL NLP CLEANING PIPELINE

In [54]:
import re

# Alternative sample to experiment with:
# text = "Hey Vishal!!! ðŸ˜ƒ Visit https://openai.com now!! Call me at 9876543210 #AI #ML"

text = "Hey!!! Visit https://site.com #AI score=100 ðŸ˜ƒ"

# The steps run in a fixed order: URLs and tags are removed while their markers
# (":", "/", "#", "@") are still present, before the catch-all symbol rule runs.
text = re.compile(r"http\S+|www\S+").sub("", text)   # 1. URLs
text = re.compile(r"[@#]\w+").sub("", text)          # 2. hashtags / mentions
text = re.compile(r"\d+").sub("", text)              # 3. numbers
text = re.compile(r"[^a-zA-Z\s]").sub("", text)      # 4. anything but letters / whitespace
text = re.compile(r"\s+").sub(" ", text)             # 5. squeeze whitespace runs
text = text.strip()                                  #    and trim both ends
# Displayed lowercased; `text` itself keeps its original casing.
text.lower()

'hey visit score'

## 🧠 Mini Project

In [55]:
sample = "Wow!!! AI is 100% awesome ðŸ˜Ž Visit www.ai.com #future @bot"


## Exercise: fill in the body of this function.
def clean_text(text):
    """Exercise placeholder: hands ``text`` back unchanged until steps are added."""
    # TODO: your regex cleaning steps go here
    return text

In [65]:
!pip install emoji
import emoji

def clean_text(text):
    """Normalise raw text into lowercase, letters-only words.

    Steps, in order: replace emoji with the empty string (via
    ``emoji.replace_emoji``), remove ``http...`` / ``www...`` URLs, strip the
    ``@`` / ``#`` markers (the tagged word itself is kept), remove digits,
    remove every remaining character that is not an ASCII letter or
    whitespace, collapse whitespace runs to one space, trim, and lowercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[@#]", "", text)
    # \d+ targets digits; a bare "d+" would delete the letter "d" from words
    # (e.g. "deep" -> "eep").
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()




In [66]:
# Run the cleaning function on the mini-project sample; as the last expression
# of the cell, its return value is what the notebook displays.
clean_text(sample)

'wow ai is awesome visit future bot'

# 🧩 PART 1 — Regex for Word Tokenization
Instead of simple .split(), regex gives smarter tokenization.

\w+  : Matches runs of word characters (letters, digits, underscore), so punctuation is left out of the tokens automatically.

Used before:
Bag of Words,
TF-IDF,
Word2Vec

In [67]:
import re

text = "AI is powerful, isn't it?"
# Each maximal run of word characters becomes one token; the apostrophe is not
# a word character, so "isn't" is split into "isn" and "t".
tokens = [found.group() for found in re.finditer(r"\w+", text)]
tokens

['AI', 'is', 'powerful', 'isn', 't', 'it']

In [68]:
text = "Deep-learning models are amazing!!!"

# The hyphen is not a word character, so "Deep-learning" yields two tokens.
tokens = re.compile(r"\w+").findall(text)
tokens

['Deep', 'learning', 'models', 'are', 'amazing']

# 🧩 PART 2 — Sentence Tokenization
Split text/para into sentences.

Pattern:   [.!?]+

In [69]:
para = "AI is great. NLP is fun! Do you agree?"

# Split the paragraph defined above (not a leftover `text` from another cell)
# on runs of sentence-ending punctuation. The "+" sits outside the brackets so
# it means "one or more"; inside the brackets it would be a literal plus sign.
sentence = re.split(r"[.!?]+", para)
sentence

['AI is great', ' NLP is fun', ' Do you agree', '']

In [70]:
text = "Machine learning is evolving! AI is the future. Are you ready?"

# [.!?]+ = one or more sentence terminators. Keeping "+" outside the class means
# "?!" or "..." counts as one separator and a plain "+" in the text is not a
# split point.
sentence = re.split(r"[.!?]+", text)
sentence

['Machine learning is evolving', ' AI is the future', ' Are you ready', '']

# 🧩 PART 3 — Extract Emails (Dataset Cleaning)

In [71]:
text = "Contact: user@gmail.com, admin@company.in"
# The dot before the domain suffix is escaped (\.) so it matches a literal "."
# rather than any character.
email = re.findall(r"\w+@\w+\.\w+", text)
email

['user@gmail.com', 'admin@company.in']

In [72]:
text = "Send mail to hr@startup.ai or support@helpdesk.org"
# \. matches a literal dot; an unescaped "." would accept any character there.
email = re.findall(r"\w+@\w+\.\w+", text)
email

['hr@startup.ai', 'support@helpdesk.org']

# 🧩 PART 4 — Extract Phone Numbers

Pattern for 10-digit numbers: \d{10}


In [73]:
text = "Call me at 9876543210 or 9123456780"
# {10} is an exact repetition count: ten consecutive digits.
number = re.compile(r"\d{10}").findall(text)
number

['9876543210', '9123456780']

In [74]:
text = "Emergency: 9988776655, Office: 8877665544"
# Ten consecutive digits; the labels and commas around them are ignored.
re.compile(r"\d{10}").findall(text)

['9988776655', '8877665544']

# 🧩 PART 5 — Remove Stopwords Using Regex
Sometimes we remove common words using a regex pattern.

\b = word boundary

In [None]:
text = "this is a sample sentence for NLP"
# \b on both sides restricts the match to whole words, so the "a" inside
# "sample" is left alone. The surrounding spaces remain in the result.
text = re.compile(r"\b(is|a|for|this)\b").sub("", text)

text

'   sample sentence  NLP'

In [77]:
# Remove the stopwords "is", "the" and "and" (whole words only):
text = "AI is powerful and the future is bright"
re.compile(r"\b(is|the|and)\b").sub("", text)

'AI  powerful   future  bright'

# 🚀 FINAL MINI PROJECT (Dataset Cleaning)

In [3]:
sample = "AI is changing the world, and NLP is leading the revolution!"


# Exercise: complete the function below.
def preprocess(text):
    """Exercise placeholder: hands ``text`` back unchanged until implemented.

    Planned steps: tokenize, remove stopwords, return the list of words.
    """
    return text

In [19]:
import re

def preprocess(text, *, as_tokens=False):
    """Lowercase ``text``, keep letters only, and drop the stopwords is/the/and.

    Args:
        text: The sentence to preprocess.
        as_tokens: When true, return the remaining words as a list instead of
            one space-separated string.

    Returns:
        The cleaned string by default, or the list of words when
        ``as_tokens`` is true.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)            # keep letters and whitespace only
    text = re.sub(r"\b(?:is|the|and)\b", "", text)  # remove stopwords as whole words
    text = re.sub(r"\s+", " ", text).strip()        # tidy the gaps left behind
    if as_tokens:
        return re.findall(r"\w+", text)
    return text

In [20]:
# Run the preprocessing function on the sample sentence; as the last expression
# of the cell, its return value is what the notebook displays.
preprocess(sample)

'ai changing world nlp leading revolution'