In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# --- 1. SETUP & DATA ---
print("--- 0. Setup ---")
# Download necessary NLTK components (run once)
# NOTE: nltk.download() signals most failures (e.g. no network) by returning
# False rather than raising, and quiet=True suppresses its error output, so the
# return value of every call has to be checked explicitly.
try:
    failed_packages = [
        package
        for package in ('punkt', 'stopwords', 'wordnet')
        if not nltk.download(package, quiet=True)
    ]
    # Recent NLTK releases load the word tokenizer models from 'punkt_tab'
    # instead of 'punkt'. Fetch it best-effort; its result is deliberately not
    # treated as a failure so older setups keep working unchanged.
    nltk.download('punkt_tab', quiet=True)

    if failed_packages:
        print("NLTK download failed. Ensure you have network access. "
              f"Error: could not download {', '.join(failed_packages)}")
    else:
        print("NLTK components downloaded successfully (punkt, stopwords, wordnet).")
except Exception as e:
    print(f"NLTK download failed. Ensure you have network access. Error: {e}")

# Example Corpus (used for TF-IDF)
CORPUS = [
    "The quick brown fox jumps over the lazy dog.",
    "A dog is lazy and a fox is quick.",
    "Quick animals like foxes and dogs are often found in nature."
]

# Example Document (used for Tokenization, Normalization, Lemmatization demo)
DOCUMENT_TO_PROCESS = "The running foxes are quickly jumping over the dogs! Do they like running?"

print("-" * 50)


# --- 2. TOKENIZATION, NORMALIZATION, AND LEMMATIZATION ---

def preprocess_text(text: str) -> str:
    """
    Tokenize, normalize, and lemmatize a single piece of text.

    Each intermediate stage is printed so the function doubles as a
    step-by-step demo of the pipeline:

    1. Tokenization with ``word_tokenize``.
    2. Normalization: lowercasing, dropping punctuation tokens, dropping
       English stop words.
    3. Lemmatization with ``WordNetLemmatizer``, first treating each token as
       a verb and then as a noun.

    Args:
        text: The raw text to process.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    print(f"\n[INPUT TEXT]: {text}")

    # 2a. TOKENIZATION
    # Breaking the text into individual words or tokens
    tokens = word_tokenize(text)
    print("\n--- 2a. TOKENIZATION ---")
    print(f"Total tokens: {len(tokens)}")
    print(f"Tokens: {tokens}")

    # 2b. NORMALIZATION (Lowercasing and removing punctuation/stop words)
    print("\n--- 2b. NORMALIZATION ---")

    # 2b.i Lowercasing
    tokens_lower = [token.lower() for token in tokens]
    print(f"After Lowercasing: {tokens_lower}")

    # 2b.ii Removing punctuation
    # A token is punctuation when *every* character is a punctuation mark.
    # Testing `token in string.punctuation` would be a substring search on one
    # long string, which lets multi-character tokens such as '...', '``',
    # "''" or '--' slip through.
    punctuation_chars = set(string.punctuation)
    tokens_no_punct = [
        token for token in tokens_lower
        if not all(char in punctuation_chars for char in token)
    ]
    print(f"After Punctuation Removal: {tokens_no_punct}")

    # 2b.iii Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens_no_punct if token not in stop_words]
    print(f"After Stop Word Removal: {filtered_tokens}")

    # 2c. LEMMATIZATION
    # Reducing words to their base or root form (e.g., 'running' -> 'run', 'dogs' -> 'dog')
    print("\n--- 2c. STEMMING/LEMMATIZATION (Using Lemmatization) ---")
    lemmatizer = WordNetLemmatizer()
    # No POS tagging is done, so every token is lemmatized as a verb first
    # (pos='v') and the result is lemmatized again with the default noun POS.
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos='v'))
        for token in filtered_tokens
    ]

    print(f"Final Lemmatized Tokens: {lemmas}")

    return " ".join(lemmas)

# Demonstrate the full preprocessing pipeline on the sample sentence and keep
# the space-joined lemmas for later use.
print("--- 1. Preprocessing Demo (Tokenization, Normalization, Lemmatization) ---")
processed_document = preprocess_text(text=DOCUMENT_TO_PROCESS)
print(50 * "-")


# --- 3. TF-IDF CALCULATION ---

def calculate_tfidf(corpus):
    """
    Calculates the Term Frequency-Inverse Document Frequency (TF-IDF) scores.

    Prints the corpus, the resulting TF-IDF matrix (rounded to 4 decimals) and
    a short interpretation note, then returns the unrounded matrix.

    Args:
        corpus: An iterable of raw text documents (strings). It may be a
            one-shot iterable such as a generator; it is materialized once.

    Returns:
        pandas.DataFrame: One row per document (labelled 'Document 1',
        'Document 2', ...) and one column per vocabulary term.

    Raises:
        ValueError: If ``corpus`` is a single string instead of an iterable of
            documents.
    """
    # A bare string would otherwise be split into one "document" per character.
    if isinstance(corpus, str):
        raise ValueError("corpus must be an iterable of documents, not a single string.")

    # The corpus is walked more than once below (printing, fitting, labelling),
    # so materialize it up front to support generators and other one-shot iterables.
    documents = list(corpus)

    print("--- 3. TF-IDF CALCULATION ---")
    print("\n[CORPUS USED FOR TF-IDF]:")
    for number, doc in enumerate(documents, start=1):
        print(f"  Document {number}: {doc}")

    # Initialize the TfidfVectorizer (this handles tokenization, normalization, and smoothing internally)
    # We use our own preprocessing for the previous steps, but for a standard TF-IDF calculation,
    # TfidfVectorizer is the most efficient method.
    vectorizer = TfidfVectorizer(stop_words='english')

    # Fit the model to the corpus and transform the corpus into a matrix of TF-IDF scores
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Get feature names (the vocabulary)
    feature_names = vectorizer.get_feature_names_out()

    # Convert the sparse matrix to a dense array and then to a pandas DataFrame for clear viewing
    row_labels = [f'Document {number}' for number in range(1, len(documents) + 1)]
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=row_labels)

    # Print the resulting TF-IDF Matrix
    print("\n[TF-IDF MATRIX (Term Frequency-Inverse Document Frequency)]:")
    print("Scores indicate term importance relative to the entire corpus.")
    print(tfidf_df.round(4))

    # NOTE: this interpretation text is fixed and was written for the demo
    # corpus; it is not derived from the scores computed above.
    print("\n[INTERPRETATION EXAMPLE]:")
    print("The word 'quick' has a high score in Document 1 and 2 but a lower score in Document 3,")
    print("meaning it is slightly less unique to Document 3 compared to the other two.")

    # Hand the full-precision matrix back so callers can reuse it.
    return tfidf_df

# Compute and display TF-IDF scores for the example corpus.
calculate_tfidf(corpus=CORPUS)
print(50 * "-")

--- 0. Setup ---
NLTK components downloaded successfully (punkt, stopwords, wordnet).
--------------------------------------------------
--- 1. Preprocessing Demo (Tokenization, Normalization, Lemmatization) ---

[INPUT TEXT]: The running foxes are quickly jumping over the dogs! Do they like running?

--- 2a. TOKENIZATION ---
Total tokens: 15
Tokens: ['The', 'running', 'foxes', 'are', 'quickly', 'jumping', 'over', 'the', 'dogs', '!', 'Do', 'they', 'like', 'running', '?']

--- 2b. NORMALIZATION ---
After Lowercasing: ['the', 'running', 'foxes', 'are', 'quickly', 'jumping', 'over', 'the', 'dogs', '!', 'do', 'they', 'like', 'running', '?']
After Punctuation Removal: ['the', 'running', 'foxes', 'are', 'quickly', 'jumping', 'over', 'the', 'dogs', 'do', 'they', 'like', 'running']
After Stop Word Removal: ['running', 'foxes', 'quickly', 'jumping', 'dogs', 'like', 'running']

--- 2c. STEMMING/LEMMATIZATION (Using Lemmatization) ---
Final Lemmatized Tokens: ['run', 'fox', 'quickly', 'jump', 'do