<a href="https://colab.research.google.com/github/PavanGavit/NLP_LAB/blob/main/NLP_A3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# --- 1. SETUP & DOWNLOADS ---
# Check that each NLTK resource used below can be located, and download
# only the ones that nltk.data.find() fails to find.
resources = ['punkt_tab', 'wordnet', 'stopwords', 'omw-1.4']
for resource in resources:
    # Punkt data is looked up under 'tokenizers/'; everything else in the
    # list is looked up under 'corpora/'.
    if 'punkt' in resource:
        resource_path = f'tokenizers/{resource}'
    else:
        resource_path = f'corpora/{resource}'

    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource, quiet=True)

# --- 2. CREATE SAMPLE DATASET ---
# Five short example sentences, each paired (by position) with a topic label.
sample_texts = [
    "The quick brown fox jumps over the lazy dog!!!",
    "Data Science is... simply amazing & fun.",
    "I love machine learning and artificial intelligence.",
    "Python is a great programming language for AI.",
    "The dog is barking loudly at the fox.",
]
sample_categories = ['animals', 'tech', 'tech', 'tech', 'animals']

# Column name -> column values, in the order the columns should appear.
data = {'text': sample_texts, 'category': sample_categories}
df = pd.DataFrame(data)

print("--- Original Data ---")
print(df)

# --- 3. DEFINE CLEANING PIPELINE ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Apostrophes (straight and typographic) that join the parts of a contraction.
_APOSTROPHE_RE = re.compile(r"['\u2019]")
# Any character that is not a lowercase ASCII letter, a digit or whitespace.
# The text is lowercased before this runs, so uppercase needs no separate range.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')


def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    Steps, in order: lowercase, split contractions at apostrophes, delete the
    remaining punctuation/special characters, tokenize, drop English stop
    words, and lemmatize what is left.

    Args:
        text: The raw document as a string. ``None`` and float NaN (the usual
            representation of a missing cell in a pandas column) are treated
            as an empty document.

    Returns:
        The surviving lemmas joined by single spaces; ``""`` when the input
        is missing or nothing survives cleaning.
    """
    # Missing cells would otherwise fail on .lower(). NaN is the only float
    # that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # A. Lowercasing
    text = text.lower()

    # B. Text Cleaning
    # Apostrophes become spaces instead of vanishing: "don't" then yields the
    # tokens "don" and "t", which are both on the NLTK English stop list,
    # rather than "dont", which is not on it and would leak into the output.
    text = _APOSTROPHE_RE.sub(' ', text)
    # Every other special character is deleted outright, as before.
    text = _NON_ALNUM_RE.sub('', text)

    # C. Tokenization
    tokens = word_tokenize(text)

    # D. Stop Word Removal & Lemmatization, joined back into one string
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Run every raw sentence through the cleaning pipeline and store the result
# next to the original text.
raw_texts = df['text']
df['cleaned_text'] = raw_texts.map(preprocess_text)

# Show the before/after columns side by side.
comparison_columns = ['text', 'cleaned_text']
print("\n--- After Cleaning & Lemmatization ---")
print(df[comparison_columns])

# --- 4. LABEL ENCODING ---
# Replace each category name with an integer code, keeping the fitted
# encoder so it can be saved later in the script.
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(df['category'])
df['category_encoded'] = category_codes

# Show each label beside the code it was assigned.
label_columns = ['category', 'category_encoded']
print("\n--- After Label Encoding ---")
print(df[label_columns])

# --- 5. TF-IDF VECTORIZATION ---
# Fit the vectorizer on the cleaned sentences and transform them in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Dense, labelled copy of the matrix (for visualization only): one row per
# document, one column per vocabulary term.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

# Only the first 5 columns are printed to keep the output short.
preview = tfidf_df.iloc[:, :5]
print("\n--- TF-IDF Features (First 5 columns) ---")
print(preview)

# --- 6. SAVE OUTPUTS ---

# A. Write the processed dataset (all columns, no index column) to CSV.
df.to_csv("processed_dataset.csv", index=False)
print("\n[SUCCESS] Dataset saved to 'processed_dataset.csv'")

# B. Pickle the fitted artifacts so the same vectorizer and encoder can be
# loaded again later. Each entry pairs an output file name with the object
# written to it.
artifacts = (
    ("tfidf_model.pkl", tfidf_vectorizer),
    ("label_encoder.pkl", label_encoder),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("[SUCCESS] TF-IDF Model saved to 'tfidf_model.pkl'")
print("[SUCCESS] Label Encoder saved to 'label_encoder.pkl'")

--- Original Data ---
                                                text category
0     The quick brown fox jumps over the lazy dog!!!  animals
1           Data Science is... simply amazing & fun.     tech
2  I love machine learning and artificial intelli...     tech
3     Python is a great programming language for AI.     tech
4              The dog is barking loudly at the fox.  animals

--- After Cleaning & Lemmatization ---
                                                text  \
0     The quick brown fox jumps over the lazy dog!!!   
1           Data Science is... simply amazing & fun.   
2  I love machine learning and artificial intelli...   
3     Python is a great programming language for AI.   
4              The dog is barking loudly at the fox.   

                                    cleaned_text  
0                  quick brown fox jump lazy dog  
1                data science simply amazing fun  
2  love machine learning artificial intelligence  
3           python great 