Perform text cleaning, lemmatization (any method), stop-word removal (any method), and
label encoding. Create representations using TF-IDF. Save the outputs.

1. Install & Import Libraries

In [None]:
!pip install nltk scikit-learn pandas


In [None]:
import nltk
import re
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Download the NLTK resources used by the later cells: tokenizer models,
# stop-word lists and WordNet data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on newer
# NLTK releases (3.9+); 'punkt' is kept so older NLTK versions still work.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)


2. Sample Dataset

In [None]:
# Toy corpus: three short sentences, every one of them labelled "tech".
data = dict(
    text=[
        "AI is transforming the world!",
        "Machine learning is a part of AI.",
        "Deep learning makes AI powerful.",
    ],
    label=["tech", "tech", "tech"],
)

# One row per document, with columns "text" and "label".
df = pd.DataFrame(data)
print(df)


3. Text Cleaning

In [None]:
# Convert to lowercase, then remove punctuation, numbers and any other
# character that is not a lowercase letter or whitespace.
def clean_text(text):
    """Return `text` lowercased, keeping only the letters a-z and whitespace.

    Removed characters are deleted outright (not replaced by a space).
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)

# Run the cleaner over every raw document and keep the result in a new
# "clean_text" column next to the original text.
cleaned_column = df["text"].apply(clean_text)
df["clean_text"] = cleaned_column
print(df["clean_text"])


4. Stop Word Removal

In [None]:
# NLTK's English stop-word list, held in a set for membership checks.
stop_words = set(stopwords.words("english"))


def remove_stopwords(text):
    """Tokenize `text` with word_tokenize and return the tokens not in `stop_words`.

    The result is a list of tokens, not a re-joined string.
    """
    return [token for token in word_tokenize(text) if token not in stop_words]


# Store the filtered token list for each cleaned document.
df["no_stopwords"] = df["clean_text"].apply(remove_stopwords)
print(df["no_stopwords"])


5. Lemmatization (WordNet Lemmatizer)

In [None]:
# Single WordNet lemmatizer instance shared by every call below.
lemmatizer = WordNetLemmatizer()


def lemmatize_words(tokens):
    """Return a new list with each token in `tokens` passed through the lemmatizer.

    NOTE: no part-of-speech tag is supplied, so the lemmatizer's default
    part of speech is used for every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize the stop-word-free tokens of each document.
df["lemmatized"] = df["no_stopwords"].apply(lemmatize_words)
print(df["lemmatized"])


In [None]:
# 6. Join Tokens Back to Text

In [None]:
# Re-assemble each lemmatized token list into a single space-separated
# string for the TF-IDF step.
df["final_text"] = df["lemmatized"].apply(" ".join)
print(df["final_text"])


In [None]:
# 7. Label Encoding

In [None]:
# Map each distinct string label to an integer code and store the codes
# in a new "label_encoded" column.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])
df["label_encoded"] = encoded_labels

# Show the original labels side by side with their integer codes.
label_columns = ["label", "label_encoded"]
print(df[label_columns])


8. TF-IDF Representation

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the preprocessed documents
# and transform them in the same step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["final_text"])

# Densify the matrix into a DataFrame: one row per document, one column per
# vocabulary term.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

print(tfidf_df)


9. Save Outputs (IMPORTANT)

In [None]:
# Save cleaned & processed data (every column of df, without the index).
# The file is written to the current working directory.
df.to_csv("processed_text_data.csv", index=False)

# Save TF-IDF features (one row per document, one column per term).
tfidf_df.to_csv("tfidf_features.csv", index=False)

# The check-mark was mojibake ("âœ…") in the original message; restored here.
print("✅ Files saved successfully!")
