In [7]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
paragraph ="In recent years, remote work has completely reshaped how many of us go about our daily lives. Instead of long commutes and office desks, people are working from their kitchens, living rooms, or even local coffee shops. It’s made juggling personal responsibilities and professional tasks a bit more manageable for some, while others miss the structure and social side of the traditional workplace. Video calls, messaging apps, and flexible hours have become the norm, and they’re changing not just how we work, but how we live. Still, finding that balance between being “on” for work and having personal time is something many are still figuring out."

In [9]:
print("Original Text:")
print(paragraph)

Original Text:
In recent years, remote work has completely reshaped how many of us go about our daily lives. Instead of long commutes and office desks, people are working from their kitchens, living rooms, or even local coffee shops. It’s made juggling personal responsibilities and professional tasks a bit more manageable for some, while others miss the structure and social side of the traditional workplace. Video calls, messaging apps, and flexible hours have become the norm, and they’re changing not just how we work, but how we live. Still, finding that balance between being “on” for work and having personal time is something many are still figuring out.


In [10]:
lowercase_text = paragraph.lower()
no_punct_text = re.sub(r'[^\w\s]', '', lowercase_text)
print("Lowercase without punctuation:")
print(no_punct_text)


Lowercase without punctuation:
in recent years remote work has completely reshaped how many of us go about our daily lives instead of long commutes and office desks people are working from their kitchens living rooms or even local coffee shops its made juggling personal responsibilities and professional tasks a bit more manageable for some while others miss the structure and social side of the traditional workplace video calls messaging apps and flexible hours have become the norm and theyre changing not just how we work but how we live still finding that balance between being on for work and having personal time is something many are still figuring out


In [11]:
sentences = sent_tokenize(paragraph)
words = word_tokenize(no_punct_text)

print("\nSentences tokenization:")
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")

print("\nWord tokenization (first 20 words):")
print(words[:20])


Sentences tokenization:
Sentence 1: In recent years, remote work has completely reshaped how many of us go about our daily lives.
Sentence 2: Instead of long commutes and office desks, people are working from their kitchens, living rooms, or even local coffee shops.
Sentence 3: It’s made juggling personal responsibilities and professional tasks a bit more manageable for some, while others miss the structure and social side of the traditional workplace.
Sentence 4: Video calls, messaging apps, and flexible hours have become the norm, and they’re changing not just how we work, but how we live.
Sentence 5: Still, finding that balance between being “on” for work and having personal time is something many are still figuring out.

Word tokenization (first 20 words):
['in', 'recent', 'years', 'remote', 'work', 'has', 'completely', 'reshaped', 'how', 'many', 'of', 'us', 'go', 'about', 'our', 'daily', 'lives', 'instead', 'of', 'long']


In [12]:
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

print("\nAfter stopword removal (first 15 words):")
print(filtered_words[:15])


After stopword removal (first 15 words):
['recent', 'years', 'remote', 'work', 'completely', 'reshaped', 'many', 'us', 'go', 'daily', 'lives', 'instead', 'long', 'commutes', 'office']


In [13]:
word_freq = Counter(filtered_words)
print("\nWord frequency distribution (top 10):")
for word, freq in word_freq.most_common(10):
    print(f"{word}: {freq}")


Word frequency distribution (top 10):
work: 3
many: 2
personal: 2
still: 2
recent: 1
years: 1
remote: 1
completely: 1
reshaped: 1
us: 1


In [14]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [15]:
print("Words after stopword removal:")
print(filtered_words[:10], "...\n")

Words after stopword removal:
['recent', 'years', 'remote', 'work', 'completely', 'reshaped', 'many', 'us', 'go', 'daily'] ...



In [16]:
porter_stems = [porter.stem(word) for word in filtered_words]
lancaster_stems = [lancaster.stem(word) for word in filtered_words]

In [17]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

In [18]:
print("Comparison of stemming and lemmatization (first 15 words):")
print("{:<15} {:<15} {:<15} {:<15}".format("Original", "Porter", "Lancaster", "WordNet"))
print("-" * 60)
for i in range(15):
    print("{:<15} {:<15} {:<15} {:<15}".format(
        filtered_words[i],
        porter_stems[i],
        lancaster_stems[i],
        lemmatized_words[i]
    ))

Comparison of stemming and lemmatization (first 15 words):
Original        Porter          Lancaster       WordNet        
------------------------------------------------------------
recent          recent          rec             recent         
years           year            year            year           
remote          remot           remot           remote         
work            work            work            work           
completely      complet         complet         completely     
reshaped        reshap          reshap          reshaped       
many            mani            many            many           
us              us              us              u              
go              go              go              go             
daily           daili           dai             daily          
lives           live            liv             life           
instead         instead         instead         instead        
long            long            long            

In [19]:
print("Original text:")
print(paragraph[:100], "...\n")

Original text:
In recent years, remote work has completely reshaped how many of us go about our daily lives. Instea ...



In [20]:
long_words = re.findall(r'\b\w{6,}\b', paragraph)
print("Words with more than 5 letters:")
print(long_words[:10], "...\n")

Words with more than 5 letters:
['recent', 'remote', 'completely', 'reshaped', 'Instead', 'commutes', 'office', 'people', 'working', 'kitchens'] ...



In [21]:
numbers = re.findall(r'\b\d+\b', paragraph)
print("Numbers found in text:")
print(numbers if numbers else "No numbers found in the text.\n")

Numbers found in text:
No numbers found in the text.



In [22]:
number_text = "AI research has grown by 200% since 2010, with over 50,000 papers published and 123 major breakthroughs."
numbers = re.findall(r'\b\d+\b', number_text)
print("Numbers in demo sentence:")
print(numbers, "\n")

Numbers in demo sentence:
['200', '2010', '50', '000', '123'] 



In [23]:
cap_words = re.findall(r'\b[A-Z][a-zA-Z]*\b', paragraph)
print("Capitalized words:")
print(cap_words, "\n")

Capitalized words:
['In', 'Instead', 'It', 'Video', 'Still'] 



In [24]:
alpha_only = re.findall(r'\b[a-zA-Z]+\b', paragraph)
print("Words containing only alphabets (first 10):")
print(alpha_only[:10], "...\n")

Words containing only alphabets (first 10):
['In', 'recent', 'years', 'remote', 'work', 'has', 'completely', 'reshaped', 'how', 'many'] ...



In [25]:
vowel_words = re.findall(r'\b[aeiouAEIOU][a-zA-Z]*\b', paragraph)
print("Words starting with a vowel:")
print(vowel_words, "\n")

Words starting with a vowel:
['In', 'of', 'us', 'about', 'our', 'Instead', 'of', 'and', 'office', 'are', 'or', 'even', 'It', 'and', 'a', 'others', 'and', 'of', 'apps', 'and', 'and', 'on', 'and', 'is', 'are', 'out'] 



In [30]:
sample_text = paragraph + """
Natural Language Processing (NLP) is a field in AI. It's becoming more advanced with state-of-the-art techniques.
Many people use emails like user@example.com for communication.
Websites like https://www.example.com provide resources on NLP.
For inquiries, call us at +91 9876543210 or 123-456-7890.
The value of π is approximately 3.14159.
"""
def custom_tokenize(text):

    text = re.sub(r"(\w+)'(\w+)", r"\1'_\2", text)
    text = re.sub(r"(\w+)-(\w+)-(\w+)", r"\1-\2-\3_", text)
    text = re.sub(r"(\w+)-(\w+)", r"\1-\2_", text)
    text = re.sub(r"(\d+)\.(\d+)", r"\1_DOT_\2", text)
    text = re.sub(r'[^\w\s\'_]', ' ', text)
    tokens = text.split()
    tokens = [token.replace("'_", "'").replace("-", "-").replace("_DOT_", ".").replace("_", "") for token in tokens]
    return tokens

tokens = custom_tokenize(sample_text)
print("\nCustom tokenization results:")
print(tokens, "\n")


Custom tokenization results:
['In', 'recent', 'years', 'remote', 'work', 'has', 'completely', 'reshaped', 'how', 'many', 'of', 'us', 'go', 'about', 'our', 'daily', 'lives', 'Instead', 'of', 'long', 'commutes', 'and', 'office', 'desks', 'people', 'are', 'working', 'from', 'their', 'kitchens', 'living', 'rooms', 'or', 'even', 'local', 'coffee', 'shops', 'It', 's', 'made', 'juggling', 'personal', 'responsibilities', 'and', 'professional', 'tasks', 'a', 'bit', 'more', 'manageable', 'for', 'some', 'while', 'others', 'miss', 'the', 'structure', 'and', 'social', 'side', 'of', 'the', 'traditional', 'workplace', 'Video', 'calls', 'messaging', 'apps', 'and', 'flexible', 'hours', 'have', 'become', 'the', 'norm', 'and', 'they', 're', 'changing', 'not', 'just', 'how', 'we', 'work', 'but', 'how', 'we', 'live', 'Still', 'finding', 'that', 'balance', 'between', 'being', 'on', 'for', 'work', 'and', 'having', 'personal', 'time', 'is', 'something', 'many', 'are', 'still', 'figuring', 'out', 'Natural', '

In [31]:
email_replaced = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '<EMAIL>', sample_text)

In [32]:
url_replaced = re.sub(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '<URL>', email_replaced)

In [33]:
phone_replaced = re.sub(r'(\+\d{1,3}\s)?\d{10}|\d{3}-\d{3}-\d{4}', '<PHONE>', url_replaced)

print("Text after regex substitutions:")
print(phone_replaced)

Text after regex substitutions:
In recent years, remote work has completely reshaped how many of us go about our daily lives. Instead of long commutes and office desks, people are working from their kitchens, living rooms, or even local coffee shops. It’s made juggling personal responsibilities and professional tasks a bit more manageable for some, while others miss the structure and social side of the traditional workplace. Video calls, messaging apps, and flexible hours have become the norm, and they’re changing not just how we work, but how we live. Still, finding that balance between being “on” for work and having personal time is something many are still figuring out.
Natural Language Processing (NLP) is a field in AI. It's becoming more advanced with state-of-the-art techniques.
Many people use emails like <EMAIL> for communication. 
Websites like <URL> provide resources on NLP.
For inquiries, call us at <PHONE> or <PHONE>.
The value of π is approximately 3.14159.

