In [18]:
!pip install joblib
import pandas as pd
import joblib # For saving models
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Fetch the NLTK data packages used further down in this cell
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Toy corpus: every document is written next to its category label
samples = [
    ("The quick brown foxes are jumping over the lazy dogs!", 'nature'),
    ("I love coding in Python and building NLP models.", 'tech'),
    ("Dogs are running in the park happily.", 'nature'),
    ("Python is a great language for data science foxes.", 'tech'),
]

# Column-oriented view of the corpus, then the DataFrame the pipeline works on
data = {
    'text': [document for document, _ in samples],
    'category': [label for _, label in samples],
}
df = pd.DataFrame(data)

# --- STEP 1: Text Cleaning & Stopword Removal & Lemmatization ---
# Module-level helpers read by clean_text: one lemmatizer instance and the
# English stopwords held in a set so membership checks are cheap
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalize a raw document into a space-joined string of lemmatized tokens.

    The text is stripped of non-letter characters, lowercased, tokenized with
    ``word_tokenize``, filtered against the module-level ``stop_words`` set and
    finally passed token by token through the module-level ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Replace (rather than delete) every non-letter with a space. Deleting
    # fuses neighbours: "don't" became "dont", which is not a stopword and
    # leaked into the features, and "data-science" became "datascience".
    # With a space, "don't" splits into "don" + "t", which the NLTK English
    # stopword list covers, and hyphenated words stay as separate tokens.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Lowercase before tokenizing so stopword matching is case-insensitive
    tokens = word_tokenize(text.lower())
    # Drop stopwords, then lemmatize what is left (lemmatize() is called
    # without a POS tag, so the lemmatizer's default part of speech applies)
    cleaned_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(cleaned_tokens)

# Run every raw document through clean_text and store the result beside it
df['cleaned_text'] = df['text'].map(clean_text)

# --- STEP 2: Label Encoding (Target Variable) ---
# Fit the encoder on the category column, then translate each category
# into its integer code
le = LabelEncoder().fit(df['category'])
df['label_enc'] = le.transform(df['category'])

# --- STEP 3: TF-IDF Representation ---
# Fit the vectorizer on the cleaned text and transform it in one pass
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])

# Build a readable DataFrame: dense scores with one column per feature name
vocabulary_terms = tfidf.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=vocabulary_terms)

# --- STEP 4: Save Outputs ---
# Write each table to its own CSV file without the index column.
# NOTE(review): the fitted `tfidf` and `le` objects are not persisted here
# even though joblib is imported above — confirm whether that is intended.
for frame, csv_path in (
    (df, 'cleaned_data.csv'),          # processed text/labels
    (tfidf_df, 'tfidf_features.csv'),  # numerical features
):
    frame.to_csv(csv_path, index=False)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
