Perform text cleaning, lemmatization (any method), stop-word removal (any method), and
label encoding. Create TF-IDF representations of the cleaned text. Save the outputs.

In [7]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# Fetch the NLTK resources the rest of the notebook relies on: the "punkt"
# tokenizer models (word_tokenize), the WordNet database (WordNetLemmatizer)
# and the stop-word lists (stopwords.words).
# NOTE(review): newer NLTK releases may additionally need "punkt_tab" for
# word_tokenize -- confirm against the installed version.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Vishal
[nltk_data]     Pattar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vishal
[nltk_data]     Pattar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Vishal
[nltk_data]     Pattar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# English stop words held in a set so the per-token membership test in
# clean_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))
# Single lemmatizer instance shared by every call to clean_text.
lemmatizer = WordNetLemmatizer()

In [9]:
# Toy corpus: five sentences, each paired by position with a category label.
data = {
    "text": [
        "Alpha, Beta, Gamma are the Greek Alphabets.",
        "New Delhi is the Captial City of India.",
        "Mumbai is the captial city of India.",
        "Apple, Mango, Banana are the some popular fruits.",
        "RCB is the Champion of the IPL Season 2025.",
    ],
    "labels": ["list", "city", "city", "list", "fact"],
}

In [10]:
# Turn the sample dict into a DataFrame with one column per key
# ("text" and "labels").
df = pd.DataFrame(data)
df.head()  # preview the first rows

Unnamed: 0,text,labels
0,"Alpha, Beta, Gamma are the Greek Alphabets.",list
1,New Delhi is the Captial City of India.,city
2,Mumbai is the captial city of India.,city
3,"Apple, Mango, Banana are the some popular fruits.",list
4,RCB is the Champion of the IPL Season 2025.,fact


In [11]:
def clean_text(text):
    """Normalize a sentence into a space-separated string of lemmas.

    Steps: lowercase the input, replace every character that is not
    ``a``-``z`` or whitespace with a space, tokenize, drop tokens found in
    the module-level ``stop_words`` set, and lemmatize what remains with
    the module-level ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: deletion glues
    # neighbouring words together ("alpha,beta" -> "alphabeta") and turns
    # contractions such as "don't" into "dont", which is not a stop word
    # and therefore slips past the filter below.
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(lemmas)

In [13]:
# Run clean_text over every raw sentence and keep the result in a new
# "clean_text" column next to the original text.
df['clean_text'] = df['text'].apply(clean_text)
df.head()  # preview the first rows

Unnamed: 0,text,labels,clean_text
0,"Alpha, Beta, Gamma are the Greek Alphabets.",list,alpha beta gamma greek alphabet
1,New Delhi is the Captial City of India.,city,new delhi captial city india
2,Mumbai is the captial city of India.,city,mumbai captial city india
3,"Apple, Mango, Banana are the some popular fruits.",list,apple mango banana popular fruit
4,RCB is the Champion of the IPL Season 2025.,fact,rcb champion ipl season


In [15]:
# Map each distinct label string to an integer code; for this data the
# resulting codes are city=0, fact=1, list=2.
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['labels'])
df.head()  # preview the first rows

Unnamed: 0,text,labels,clean_text,encoded_labels
0,"Alpha, Beta, Gamma are the Greek Alphabets.",list,alpha beta gamma greek alphabet,2
1,New Delhi is the Captial City of India.,city,new delhi captial city india,0
2,Mumbai is the captial city of India.,city,mumbai captial city india,0
3,"Apple, Mango, Banana are the some popular fruits.",list,apple mango banana popular fruit,2
4,RCB is the Champion of the IPL Season 2025.,fact,rcb champion ipl season,1


In [20]:
# Persist the preprocessed frame to disk; index=False leaves the row index
# out of the CSV.
df.to_csv("preprocessing.csv", index=False)

In [17]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned sentences and
# transform them in one step into a document-term weight matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

In [18]:
# Expand the TF-IDF matrix into a dense DataFrame: one row per sentence and
# one column per vocabulary term reported by the fitted vectorizer.
df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
df_tfidf.head()  # preview the first rows

Unnamed: 0,alpha,alphabet,apple,banana,beta,captial,champion,city,delhi,fruit,gamma,greek,india,ipl,mango,mumbai,new,popular,rcb,season
0,0.447214,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.405801,0.0,0.405801,0.50298,0.0,0.0,0.0,0.405801,0.0,0.0,0.0,0.50298,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.469515,0.0,0.469515,0.0,0.0,0.0,0.0,0.469515,0.0,0.0,0.581951,0.0,0.0,0.0,0.0
3,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.5


In [21]:
# Persist the TF-IDF feature table to disk; index=False leaves the row
# index out of the CSV.
df_tfidf.to_csv("tfidf_matrix.csv", index=False)