<a href="https://colab.research.google.com/github/TAruna-SP/NLP/blob/week-1/Stop_Words_%26_Stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))
# Initialize the stemmer
stemmer = PorterStemmer()

print("A sample of English stopwords:", list(stop_words)[:10])
print() # Empty line

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


A sample of English stopwords: ['now', 'be', 't', 'there', 'him', 'your', 'as', "it's", 'itself', 'wasn']



[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
# Re-use our text from the Micro-Project
user_text = "Natural Language Processing (NLP) is amazing! It helps computers understand human language. Don't you think it's fascinating?"

# 1. Tokenize and clean (as before)
from nltk.tokenize import word_tokenize
import string
words = word_tokenize(user_text)
clean_words = [w.lower() for w in words if w not in string.punctuation]

# 2. NEW: Remove stopwords
filtered_words = [w for w in clean_words if w not in stop_words]
print("Step 1 - Removing Stopwords:")
print(f"Original words ({len(clean_words)}): {clean_words}")
print(f"Filtered words ({len(filtered_words)}): {filtered_words}")
print() # Empty line

Step 1 - Removing Stopwords:
Original words (19): ['natural', 'language', 'processing', 'nlp', 'is', 'amazing', 'it', 'helps', 'computers', 'understand', 'human', 'language', 'do', "n't", 'you', 'think', 'it', "'s", 'fascinating']
Filtered words (14): ['natural', 'language', 'processing', 'nlp', 'amazing', 'helps', 'computers', 'understand', 'human', 'language', "n't", 'think', "'s", 'fascinating']



In [5]:
# 3. NEW: Apply stemming to the filtered list
stemmed_words = [stemmer.stem(w) for w in filtered_words]
print("Step 2 - Applying Stemming:")
print(f"Filtered words: {filtered_words}")
print(f"Stemmed words : {stemmed_words}")
print() # Empty line

Step 2 - Applying Stemming:
Filtered words: ['natural', 'language', 'processing', 'nlp', 'amazing', 'helps', 'computers', 'understand', 'human', 'language', "n't", 'think', "'s", 'fascinating']
Stemmed words : ['natur', 'languag', 'process', 'nlp', 'amaz', 'help', 'comput', 'understand', 'human', 'languag', "n't", 'think', "'s", 'fascin']



In [6]:
print("=== Text Inspector 2.0 (With Stopwords & Stemming) ===\n")
# Analyze the FILTERED list (more meaningful)
from collections import Counter
word_freq = Counter(filtered_words)

print("Top 5 Meaningful Words:")
for word, freq in word_freq.most_common(5):
    print(f"   '{word}' appears {freq} time(s)")

# Bonus: See the effect of stemming on frequency
stemmed_freq = Counter(stemmed_words)
print("\nTop 5 Stems (grouping similar words):")
for stem, freq in stemmed_freq.most_common(5):
    print(f"   '{stem}' appears {freq} time(s)")

=== Text Inspector 2.0 (With Stopwords & Stemming) ===

Top 5 Meaningful Words:
   'language' appears 2 time(s)
   'natural' appears 1 time(s)
   'processing' appears 1 time(s)
   'nlp' appears 1 time(s)
   'amazing' appears 1 time(s)

Top 5 Stems (grouping similar words):
   'languag' appears 2 time(s)
   'natur' appears 1 time(s)
   'process' appears 1 time(s)
   'nlp' appears 1 time(s)
   'amaz' appears 1 time(s)
