# NLP Assignment
The exercises below use a short sample paragraph on **Technology**.

## Q1. Text Processing and Word Frequency

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Fetch the tokenizer models and the stop-word list used in this cell.
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from 'punkt_tab' rather than
# 'punkt'; requesting both keeps the cell working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

text = """Technology is evolving faster than ever. Innovations like artificial intelligence, quantum computing, 
and blockchain are transforming industries. I find it fascinating how algorithms impact daily life. 
From smartphones to smart homes, tech makes everything more connected. The future promises even more 
integration of digital systems into society."""

# Split into sentences BEFORE removing punctuation: the sentence tokenizer
# relies on terminators such as '.', so running it on the punctuation-free
# text would hand back the whole paragraph as one "sentence".
# Whitespace is collapsed so the line breaks of the literal above do not
# show up inside the printed sentences.
sentences = [" ".join(sentence.split()) for sentence in sent_tokenize(text)]

# Word-level processing works on a lowercased, punctuation-free copy.
lower_text = text.lower().translate(str.maketrans('', '', string.punctuation))
words = word_tokenize(lower_text)

# Keep purely alphabetic tokens that are not English stop words.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words and word.isalpha()]

# Count how often each remaining word occurs.
word_freq = Counter(filtered_words)

print("Tokenized Sentences:", sentences)
print("Filtered Words:", filtered_words)
print("Word Frequencies:", word_freq)


## Q2. Stemming and Lemmatization

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
# The WordNet lemmatizer needs the WordNet corpus and the Open Multilingual
# WordNet data; request both before first use.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# One normalizer of each kind, so their outputs can be compared side by side.
lemmatizer = WordNetLemmatizer()
lancaster = LancasterStemmer()
porter = PorterStemmer()

# Run every filtered word through each normalizer, keeping the word order.
# lemmatize() receives only the word, so NLTK's default part of speech applies.
porter_result = list(map(porter.stem, filtered_words))
lancaster_result = list(map(lancaster.stem, filtered_words))
lemma_result = list(map(lemmatizer.lemmatize, filtered_words))

print("Porter Stemming:", porter_result)
print("Lancaster Stemming:", lancaster_result)
print("Lemmatization:", lemma_result)


## Q3. Regular Expressions and Splitting

In [None]:
import re

# Pull several classes of tokens out of the raw paragraph with regular
# expressions. All searches run on the original (cased, punctuated) text.

# Runs of alphabetic characters only, delimited by word boundaries.
only_alpha = re.findall(r'\b[a-zA-Z]+\b', text)

# An uppercase letter followed by zero or more lowercase letters.
capitalized = re.findall(r'\b[A-Z][a-z]*\b', text)

# Stand-alone runs of digits.
numbers = re.findall(r'\b\d+\b', text)

# Six or more word characters, i.e. strictly more than five.
words_5plus = re.findall(r'\b\w{6,}\b', text)

# Alphabetic words whose first letter is a vowel, compared case-insensitively.
vowel_words = list(
    filter(lambda token: token[0].lower() in 'aeiou', only_alpha)
)

print("Words with >5 letters:", words_5plus)
print("Numbers:", numbers)
print("Capitalized Words:", capitalized)
print("Only Alphabet Words:", only_alpha)
print("Words Starting with Vowel:", vowel_words)


## Q4. Custom Tokenization & Regex Cleaning

In [None]:
def custom_tokenizer(text):
    """Split *text* into decimal-number and word tokens.

    Characters other than ASCII letters, digits, '.', whitespace, '-',
    the straight apostrophe (') and the typographic apostrophe (’) are
    first replaced by spaces. Tokens are then extracted as either:

    * a decimal number such as ``3.14``, or
    * a word that starts with a word character and may contain hyphens
      and apostrophes internally (``state-of-the-art``, ``isn't``, ``isn’t``).

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance; empty if nothing
        matches.
    """
    # Blank out every character that can never be part of a token.
    text = re.sub(r'[^a-zA-Z0-9.\s\-\'’]', ' ', text)
    # The typographic apostrophe survives the cleaning step above, so the
    # token pattern must accept it as well; otherwise "isn’t" would be
    # split into "isn" and "t" while "isn't" stays whole.
    tokens = re.findall(r"\d+\.\d+|\b\w[\w\-'’]*\b", text)
    return tokens

sample_text = """Email me at hello@example.com or visit https://technews.com. 
Call +91 9876543210 for more info. Our system is state-of-the-art, and isn't it amazing?"""

# Mask e-mail addresses with a placeholder.
cleaned_text = re.sub(r'\b[\w.-]+?@\w+?\.\w+\b', '<EMAIL>', sample_text)

# Mask URLs. The lazy match plus lookahead stops before any trailing
# sentence punctuation, so "visit https://technews.com." becomes
# "visit <URL>." instead of losing the full stop to the placeholder.
cleaned_text = re.sub(
    r'https?://\S+?(?=[.,;:!?]*(?:\s|$))', '<URL>', cleaned_text
)

# Mask phone numbers: an optional country code followed by ten digits in a
# 3-3-4 grouping with optional '-', '.' or whitespace separators.
# The separator after the country code lives inside the optional group so a
# number without a country code does not swallow the space in front of it,
# and the look-arounds keep the match from starting or ending in the middle
# of a longer alphanumeric run.
cleaned_text = re.sub(
    r'(?<!\w)(?:\+?\d{1,3}[-.\s]?)?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)',
    '<PHONE>',
    cleaned_text,
)

# Tokenize the masked text with the custom tokenizer defined in this cell.
tokens = custom_tokenizer(cleaned_text)

print("Cleaned Text:", cleaned_text)
print("Custom Tokens:", tokens)
