# Assignment 9

### Q1.

In [4]:
# Step 0: Install and download NLTK data
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Step 1: Import libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import string

# Step 2: Input paragraph (about your favorite topic)
paragraph = """Technology has transformed the way we live and work. From smartphones to smart homes,
innovation drives modern life. Artificial intelligence is revolutionizing industries, making tasks
faster and more efficient. The internet connects people globally, allowing instant communication and
access to information. Technology continues to evolve rapidly, shaping the future of humanity."""

# Step 3: Convert to lowercase and remove punctuation
lower_nopunct = paragraph.lower().translate(str.maketrans('', '', string.punctuation))

# Step 4: Tokenize into words and sentences
words = word_tokenize(lower_nopunct)
sentences = sent_tokenize(paragraph)

# Step 5: Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# Step 6: Display word frequency distribution
fdist = FreqDist(filtered_words)

# Output
print("Sentences:")
for s in sentences:
    print("-", s)

print("\nFiltered Words (after removing stopwords and punctuation):")
print(filtered_words)

print("\nWord Frequency Distribution:")
for word, freq in fdist.items():
    print(f"{word}: {freq}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentences:
- Technology has transformed the way we live and work.
- From smartphones to smart homes, 
innovation drives modern life.
- Artificial intelligence is revolutionizing industries, making tasks 
faster and more efficient.
- The internet connects people globally, allowing instant communication and 
access to information.
- Technology continues to evolve rapidly, shaping the future of humanity.

Filtered Words (after removing stopwords and punctuation):
['technology', 'transformed', 'way', 'live', 'work', 'smartphones', 'smart', 'homes', 'innovation', 'drives', 'modern', 'life', 'artificial', 'intelligence', 'revolutionizing', 'industries', 'making', 'tasks', 'faster', 'efficient', 'internet', 'connects', 'people', 'globally', 'allowing', 'instant', 'communication', 'access', 'information', 'technology', 'continues', 'evolve', 'rapidly', 'shaping', 'future', 'humanity']

Word Frequency Distribution:
technology: 2
transformed: 1
way: 1
live: 1
work: 1
smartphones: 1
smart: 1
home

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Q2.

In [6]:
# Step 0: Install and download required NLTK data
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Step 1: Initialize stemmers and lemmatizer
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Step 2: Use filtered_words from Question 1
# If you didn't run Q1 in the same notebook, you can reuse this list here:
filtered_words = ['technology', 'transformed', 'way', 'live', 'work', 'smartphones', 'smart',
                  'homes', 'innovation', 'drives', 'modern', 'life', 'artificial',
                  'intelligence', 'revolutionizing', 'industries', 'making', 'tasks',
                  'faster', 'efficient', 'internet', 'connects', 'people', 'globally',
                  'allowing', 'instant', 'communication', 'access', 'information',
                  'continues', 'evolve', 'rapidly', 'shaping', 'future', 'humanity']

# Step 3: Apply Stemming and Lemmatization
print(f"{'Word':<20}{'PorterStemmer':<20}{'LancasterStemmer':<20}{'Lemmatizer':<20}")
print("-" * 80)

for word in filtered_words:
    porter_stem = porter.stem(word)
    lancaster_stem = lancaster.stem(word)
    lemma = lemmatizer.lemmatize(word)  # Default pos='n' (noun)
    print(f"{word:<20}{porter_stem:<20}{lancaster_stem:<20}{lemma:<20}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Word                PorterStemmer       LancasterStemmer    Lemmatizer          
--------------------------------------------------------------------------------
technology          technolog           technolog           technology          
transformed         transform           transform           transformed         
way                 way                 way                 way                 
live                live                liv                 live                
work                work                work                work                
smartphones         smartphon           smartphon           smartphones         
smart               smart               smart               smart               
homes               home                hom                 home                
innovation          innov               innov               innovation          
drives              drive               driv                drive               
modern              modern  

### Q3.

In [7]:
import re

# Original paragraph from Q1
paragraph = """Technology has transformed the way we live and work. From smartphones to smart homes,
innovation drives modern life. Artificial intelligence is revolutionizing industries, making tasks
faster and more efficient. The internet connects people globally, allowing instant communication and
access to information. Technology continues to evolve rapidly, shaping the future of humanity."""

# --- 2a. Extract all words with more than 5 letters ---
words_more_than_5 = re.findall(r'\b\w{6,}\b', paragraph)

# --- 2b. Extract all numbers ---
numbers = re.findall(r'\b\d+\b', paragraph)

# --- 2c. Extract all capitalized words ---
capitalized_words = re.findall(r'\b[A-Z][a-z]*\b', paragraph)

# --- 3a. Split text into alphabetic-only words (no digits or special chars) ---
alphabetic_words = re.findall(r'\b[a-zA-Z]+\b', paragraph)

# --- 3b. Extract words starting with a vowel (case insensitive) ---
vowel_words = re.findall(r'\b[aeiouAEIOU][a-zA-Z]*\b', paragraph)

# --- Display results ---
print("Words with more than 5 letters:\n", words_more_than_5)
print("\nNumbers in text:\n", numbers)
print("\nCapitalized words:\n", capitalized_words)
print("\nAlphabetic-only words:\n", alphabetic_words)
print("\nWords starting with a vowel:\n", vowel_words)


Words with more than 5 letters:
 ['Technology', 'transformed', 'smartphones', 'innovation', 'drives', 'modern', 'Artificial', 'intelligence', 'revolutionizing', 'industries', 'making', 'faster', 'efficient', 'internet', 'connects', 'people', 'globally', 'allowing', 'instant', 'communication', 'access', 'information', 'Technology', 'continues', 'evolve', 'rapidly', 'shaping', 'future', 'humanity']

Numbers in text:
 []

Capitalized words:
 ['Technology', 'From', 'Artificial', 'The', 'Technology']

Alphabetic-only words:
 ['Technology', 'has', 'transformed', 'the', 'way', 'we', 'live', 'and', 'work', 'From', 'smartphones', 'to', 'smart', 'homes', 'innovation', 'drives', 'modern', 'life', 'Artificial', 'intelligence', 'is', 'revolutionizing', 'industries', 'making', 'tasks', 'faster', 'and', 'more', 'efficient', 'The', 'internet', 'connects', 'people', 'globally', 'allowing', 'instant', 'communication', 'and', 'access', 'to', 'information', 'Technology', 'continues', 'to', 'evolve', 'rapi

### Q4.

In [9]:
import re

# Sample text
text = """
Contact us at support@example.com or visit our website https://www.example.com.
Call us at 123-456-7890 or +91 9876543210 for more info.
This isn't a drill – state-of-the-art devices cost about 3.14 times more now!
"""

# Step 1: Regex substitutions
def regex_substitutions(text):
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', text)               # Email
    text = re.sub(r'https?://[^\s]+', '<URL>', text)                            # URL
    text = re.sub(r'\b(?:\+91\s?)?\d{10}\b|\d{3}-\d{3}-\d{4}', '<PHONE>', text)  # Phone numbers
    return text

# Step 2: Custom tokenization
def custom_tokenizer(text):
    pattern = r"""
        \b\w+(?:-\w+)+\b          # Hyphenated words
        | \b\d+\.\d+\b            # Decimal numbers
        | \b\w+'\w+\b             # Contractions
        | \b\w+\b                 # Normal words
    """
    return re.findall(pattern, text, re.VERBOSE)

# Apply substitutions
clean_text = regex_substitutions(text)

# Apply tokenization
tokens = custom_tokenizer(clean_text)

# Display output
print("Cleaned Text:\n", clean_text)
print("\nCustom Tokens:\n", tokens)

Cleaned Text:
 
Contact us at <EMAIL> or visit our website <URL>
Call us at <PHONE> or +91 <PHONE> for more info.
This isn't a drill – state-of-the-art devices cost about 3.14 times more now!


Custom Tokens:
 ['Contact', 'us', 'at', 'EMAIL', 'or', 'visit', 'our', 'website', 'URL', 'Call', 'us', 'at', 'PHONE', 'or', '91', 'PHONE', 'for', 'more', 'info', 'This', "isn't", 'a', 'drill', 'state-of-the-art', 'devices', 'cost', 'about', '3.14', 'times', 'more', 'now']
