Perform text cleaning, perform lemmatization (any method), remove stop words (any method), 
label encoding. Create representations using TF-IDF. Save outputs 

1. Install required libraries

In [24]:
!pip install nltk scikit-learn pandas




[notice] A new release of pip is available: 25.2 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


2. Import libraries & download NLTK resources

In [25]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

3. Sample dataset

In [26]:
data = {
    "text": [
        "I love machine learning!",
        "NLP is a very interesting field.",
        "Deep learning improves AI systems.",
        "Text preprocessing is important in NLP.",
        "I hate bugs in my code.",
        "I don't like waiting for results."
    ],
    "label": [
        "positive",
        "positive",
        "positive",
        "neutral",
        "negative",
        "negative"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I love machine learning!,positive
1,NLP is a very interesting field.,positive
2,Deep learning improves AI systems.,positive
3,Text preprocessing is important in NLP.,neutral
4,I hate bugs in my code.,negative
5,I don't like waiting for results.,negative


4. Text cleaning + lemmatization + stop-word removal

In [27]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # lowercase
    text = text.lower()
    
    # remove special characters & numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # tokenize
    tokens = word_tokenize(text)
    
    # remove stopwords & lemmatize
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]
    
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(preprocess_text)
df


Unnamed: 0,text,label,clean_text
0,I love machine learning!,positive,love machine learning
1,NLP is a very interesting field.,positive,nlp interesting field
2,Deep learning improves AI systems.,positive,deep learning improves ai system
3,Text preprocessing is important in NLP.,neutral,text preprocessing important nlp
4,I hate bugs in my code.,negative,hate bug code
5,I don't like waiting for results.,negative,dont like waiting result


5. Label Encoding

In [28]:
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

df


Unnamed: 0,text,label,clean_text,label_encoded
0,I love machine learning!,positive,love machine learning,2
1,NLP is a very interesting field.,positive,nlp interesting field,2
2,Deep learning improves AI systems.,positive,deep learning improves ai system,2
3,Text preprocessing is important in NLP.,neutral,text preprocessing important nlp,1
4,I hate bugs in my code.,negative,hate bug code,0
5,I don't like waiting for results.,negative,dont like waiting result,0


6. TF-IDF Representation

In [29]:
tfidf = TfidfVectorizer()

X_tfidf = tfidf.fit_transform(df["clean_text"])

tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

tfidf_df


Unnamed: 0,ai,bug,code,deep,dont,field,hate,important,improves,interesting,learning,like,love,machine,nlp,preprocessing,result,system,text,waiting
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.501613,0.0,0.611713,0.611713,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.611713,0.0,0.0,0.0,0.611713,0.0,0.0,0.0,0.0,0.501613,0.0,0.0,0.0,0.0,0.0
2,0.462625,0.0,0.0,0.462625,0.0,0.0,0.0,0.0,0.462625,0.0,0.379359,0.0,0.0,0.0,0.0,0.0,0.0,0.462625,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.521823,0.0,0.0,0.0,0.0,0.0,0.0,0.427903,0.521823,0.0,0.0,0.521823,0.0
4,0.0,0.57735,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5


7. Save outputs

In [30]:
# Save cleaned dataset
df.to_csv("cleaned_text_data.csv", index=False)

# Save TF-IDF features
tfidf_df.to_csv("tfidf_features.csv", index=False)

print("Files saved successfully!")


Files saved successfully!
