In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download necessary NLTK resources, one package at a time in a fixed order.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

def nlp_pipeline(sentence):
    """Tokenize, filter and stem a sentence, printing each intermediate stage.

    The sentence is split with ``word_tokenize``. Tokens that are English
    stopwords (compared case-insensitively) or that consist solely of
    punctuation characters are dropped, and the remaining tokens are reduced
    with the Porter stemmer. The token list is printed after each stage.

    Args:
        sentence: Text to process.

    Returns:
        The list of stemmed tokens (the same list that is printed last).
    """
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    print("Original Tokens:", tokens)

    # Remove stopwords and punctuation. A token counts as punctuation when
    # every one of its characters is a punctuation mark; a plain substring
    # test against string.punctuation would let multi-character strings such
    # as "..." or "--" slip through.
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens_no_stop = [
        word for word in tokens
        if word.lower() not in stop_words
        and not all(char in punctuation for char in word)
    ]
    print("Tokens Without Stopwords:", tokens_no_stop)

    # Apply stemming
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in tokens_no_stop]
    print("Stemmed Words:", stemmed)

    # Hand the result back so callers can use it, not just read the printout.
    return stemmed

# Run the pipeline on a sample sentence.
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
nlp_pipeline(sentence)

# --- Short Answer ---
# Each entry is emitted with its own print call, in the order listed.
for _answer_line in (
    "\nQ1 Short Answers:",
    "1. Stemming vs Lemmatization:",
    "   - Stemming cuts off prefixes/suffixes (e.g., 'running' -> 'run'), but might result in non-words (e.g., 'studies' -> 'studi').",
    "   - Lemmatization uses vocabulary and rules to return the base form (e.g., 'running' -> 'run'), so it's more accurate.",
    "2. Removing stopwords:",
    "   - Useful to reduce noise and focus on important words (e.g., for classification).",
    "   - Can be harmful when stopwords carry meaning (e.g., in sentiment analysis).",
):
    print(_answer_line)

Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri']

Q1 Short Answers:
1. Stemming vs Lemmatization:
   - Stemming cuts off prefixes/suffixes (e.g., 'running' -> 'run'), but might result in non-words (e.g., 'studies' -> 'studi').
   - Lemmatization uses vocabulary and rules to return the base form (e.g., 'running' -> 'run'), so it's more accurate.
2. Removing stopwords:
   - Useful to reduce noise and focus on important words (e.g., for classification).
   - Can be harmful when stopwords carry meaning (e.g., in sentiment analysis).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
