In [None]:
# NLTK: Natural Language Toolkit (NLTK) is a Python library used for working with human language data (text).
# It provides easy-to-use interfaces for various NLP tasks like tokenization, stemming, and removing stopwords.

import nltk
from nltk.corpus import stopwords  # Contains a list of common words (e.g., 'is', 'the') to be filtered out (stop words).
from nltk.tokenize import word_tokenize  # Tokenizes (splits) text into individual words (tokens).
from nltk.stem import PorterStemmer  # Provides stemming functionality to reduce words to their root form.

# Download necessary NLTK data (only needs to be done once):
# - 'stopwords': A list of common stop words in multiple languages provided by nltk.
# - 'punkt': Data for tokenizing words/sentences (used by older NLTK releases).
# - 'punkt_tab': Tokenizer data that recent NLTK releases load instead of 'punkt';
#   without it word_tokenize() raises LookupError on those versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text for preprocessing.
text = "This is a simple text document for testing text preprocessing. Programmers program with different programming languages."

# Step 1: Tokenize the text.
# - word_tokenize(): Splits the text into words or punctuation marks
#   (punctuation such as '.' becomes its own token).
words = word_tokenize(text)
print(f"Original Words:\n{words}")

# Step 2: Remove stop words.
# - Stop words are common words that do not carry significant meaning (e.g., 'is', 'and').
# - The stop word list is lowercase, so each token is lowercased for the comparison only;
#   the surviving tokens keep their original casing.
# - Punctuation tokens are not stop words, so they remain in the filtered list.
stop_words = set(stopwords.words('english'))  # Set gives O(1) membership tests.
filtered_words = [word for word in words if word.lower() not in stop_words]
print(f"\nAfter Removing Stop Words:\n{filtered_words}")

# Step 3: Apply stemming.
# - PorterStemmer: A stemming algorithm that strips suffixes to reduce words to a common stem
#   (e.g., 'programming' -> 'program'). Stems are lowercased and are not always dictionary
#   words (e.g., 'simple' -> 'simpl').
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in filtered_words]
print(f"\nAfter Stemming:\n{stemmed_words}")

# **Uses of These Methods**:
# - Tokenization (word_tokenize()): Splits text for further analysis (e.g., in text classification).
# - Stop word removal: Improves processing by filtering out unimportant words.
# - Stemming (PorterStemmer): Normalizes text by reducing inflected/derived words to a common stem, which is not always a dictionary word (e.g., 'simple' -> 'simpl'); useful in search engines and text analysis.


Original Words:
['This', 'is', 'a', 'simple', 'text', 'document', 'for', 'testing', 'text', 'preprocessing', '.', 'Programmers', 'program', 'with', 'different', 'programming', 'languages', '.']

After Removing Stop Words:
['simple', 'text', 'document', 'testing', 'text', 'preprocessing', '.', 'Programmers', 'program', 'different', 'programming', 'languages', '.']

After Stemming:
['simpl', 'text', 'document', 'test', 'text', 'preprocess', '.', 'programm', 'program', 'differ', 'program', 'languag', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
