In [1]:
!pip install nltk scikit-learn pandas




In [2]:
import re
import pandas as pd
import nltk

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [3]:
# Fetch the NLTK data packages this notebook relies on: the WordNet corpus
# (used by WordNetLemmatizer) and 'omw-1.4' (presumably the Open Multilingual
# WordNet data that NLTK expects alongside it; confirm against the NLTK docs).
nltk.download('wordnet')
# As the last expression in the cell, this call's return value (True) is displayed.
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# Toy corpus: each record pairs a sentence with its category label.
samples = [
    ("NLTK is a powerful NLP library!", "library"),
    ("Text cleaning is an important step in NLP.", "process"),
    ("TF-IDF and Word2Vec are widely used techniques.", "technique"),
]

# Unzip the records into the column-oriented mapping the DataFrame is built from.
texts, labels = zip(*samples)
data = {"text": list(texts), "label": list(labels)}

df = pd.DataFrame(data)
print(df)


                                              text      label
0                  NLTK is a powerful NLP library!    library
1       Text cleaning is an important step in NLP.    process
2  TF-IDF and Word2Vec are widely used techniques.  technique


In [5]:
def clean_text(text):
    """Lower-case ``text`` and strip everything except ``a``-``z`` and whitespace.

    Characters are removed, not replaced by a space, so hyphenated or
    alphanumeric tokens are fused: ``"TF-IDF"`` becomes ``"tfidf"`` and
    ``"Word2Vec"`` becomes ``"wordvec"``. Whitespace is left untouched.

    Args:
        text: The raw string to clean. Missing values (``None``, a float NaN,
            or ``pd.NA``) are accepted and treated as empty text.

    Returns:
        The cleaned string, or ``""`` for a missing value.

    Raises:
        AttributeError: If ``text`` is a non-missing value that is not a
            string (it has no ``lower`` method).
    """
    # pandas represents absent text as None / NaN / pd.NA; calling .lower() on
    # those would raise, so map them to an empty document instead.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ""
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)


In [6]:
# One shared lemmatizer instance, reused by every preprocess_text call.
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Turn a raw string into a space-separated sequence of lemmatized tokens.

    The input is passed through ``clean_text``, split on whitespace, filtered
    against scikit-learn's ``ENGLISH_STOP_WORDS``, and each surviving token is
    lemmatized with the module-level ``lemmatizer`` (called with its default
    arguments).

    Args:
        text: The raw text to preprocess.

    Returns:
        The remaining lemmas joined by single spaces; an empty string when no
        token survives the stop-word filter.
    """
    kept = []
    # Plain whitespace splitting: no NLTK tokenizer is involved here.
    for token in clean_text(text).split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


In [7]:
# Run the preprocessing pipeline over every document and keep the result
# next to the raw text so the two can be compared.
cleaned_series = df["text"].map(preprocess_text)
df["cleaned_text"] = cleaned_series
print(df.loc[:, ["text", "cleaned_text"]])


                                              text  \
0                  NLTK is a powerful NLP library!   
1       Text cleaning is an important step in NLP.   
2  TF-IDF and Word2Vec are widely used techniques.   

                          cleaned_text  
0            nltk powerful nlp library  
1     text cleaning important step nlp  
2  tfidf wordvec widely used technique  


In [8]:
# Learn the label -> integer mapping first, then apply it; the fitted encoder
# is kept so the integers can be mapped back to label strings later.
label_encoder = LabelEncoder().fit(df["label"])
df["label_encoded"] = label_encoder.transform(df["label"])

print(df.loc[:, ["label", "label_encoded"]])


       label  label_encoded
0    library              0
1    process              1
2  technique              2


In [9]:
# Fit a TF-IDF model on the cleaned documents and vectorize them in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["cleaned_text"])

# Densify the matrix and label each column with its vocabulary term so the
# weights can be read as a table (one row per document).
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_weights = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_weights, columns=feature_names)

print(tfidf_df)


   cleaning  important   library       nlp      nltk  powerful      step  \
0  0.000000   0.000000  0.528635  0.402040  0.528635  0.528635  0.000000   
1  0.467351   0.467351  0.000000  0.355432  0.000000  0.000000  0.467351   
2  0.000000   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

   technique      text     tfidf      used    widely   wordvec  
0   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
1   0.000000  0.467351  0.000000  0.000000  0.000000  0.000000  
2   0.447214  0.000000  0.447214  0.447214  0.447214  0.447214  


In [10]:
# Persist both tables as CSV in the working directory, without the index column.
outputs = {
    "cleaned_text_and_labels.csv": df,
    "tfidf_representation.csv": tfidf_df,
}
for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)

print("Files saved successfully!")


Files saved successfully!
