In [1]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Fetch the NLTK data packages this notebook relies on: the stopword lists read
# via `stopwords.words('english')` and the WordNet corpus used by `WordNetLemmatizer`.
# `nltk.download` logs its progress; the value displayed under the cell is the
# return value of the last call.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Toy corpus: each record pairs a sentence with its sentiment label, so the
# two parallel lists built below always stay aligned.
labeled_samples = [
    ("I love Natural Language Processing", "Positive"),
    ("NLP is very interesting", "Positive"),
    ("I am learning machine learning", "Neutral"),
]

texts = [sentence for sentence, _ in labeled_samples]
labels = [sentiment for _, sentiment in labeled_samples]


In [5]:
# Text cleaning
def clean_text(text: str) -> str:
    """Normalize raw text to lower-case ASCII letters and whitespace.

    The input is lower-cased first; every character that is then neither
    ``a``-``z`` nor whitespace (digits, punctuation, accented letters, ...)
    is deleted outright rather than replaced, so existing whitespace is
    left exactly as it was.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)   # drop punctuation & numbers


In [6]:
# Stopword removal and lemmatization

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for fast membership checks

def preprocess(text: str) -> str:
    """Drop English stopwords from ``text`` and lemmatize what remains.

    The text is split on whitespace. Tokens found in ``stop_words`` are
    skipped; every other token is passed to ``lemmatizer.lemmatize`` without
    a ``pos`` argument, so the lemmatizer's default part of speech applies.

    Args:
        text: Whitespace-separated text. The stopword check is an exact
            string match, so the caller is expected to have normalized
            case beforehand.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


In [7]:
# Run every raw sentence through both stages: clean first, then remove
# stopwords and lemmatize.
cleaned_texts = [preprocess(clean_text(raw_text)) for raw_text in texts]

print(cleaned_texts)


['love natural language processing', 'nlp interesting', 'learning machine learning']


In [8]:
# Label encoding: turn the string class names in `labels` into integer ids.
# `le` stays fitted after this cell, so it holds the name <-> id mapping.

le = LabelEncoder()
encoded_labels = le.fit_transform(labels)  # fit the mapping and encode in one call

print(encoded_labels)


[1 1 0]


In [9]:
# TF-IDF vectorization

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(cleaned_texts)

# Build a labelled table: one row per document, one column per vocabulary
# term, holding that term's TF-IDF weight in the document.
dense_weights = tfidf_matrix.toarray()
feature_names = tfidf.get_feature_names_out()
df_tfidf = pd.DataFrame(dense_weights, columns=feature_names)

print(df_tfidf)


   interesting  language  learning  love   machine  natural       nlp  \
0     0.000000       0.5  0.000000   0.5  0.000000      0.5  0.000000   
1     0.707107       0.0  0.000000   0.0  0.000000      0.0  0.707107   
2     0.000000       0.0  0.894427   0.0  0.447214      0.0  0.000000   

   processing  
0         0.5  
1         0.0  
2         0.0  


In [11]:
# Save the cleaned text next to its encoded label
processed_df = pd.DataFrame({
    "Cleaned_Text": cleaned_texts,
    "Encoded_Label": encoded_labels,
})
processed_df.to_csv("processed_text.csv", index=False)


In [12]:
# Save the TF-IDF output
tfidf_output_path = "tfidf_output.csv"
df_tfidf.to_csv(tfidf_output_path, index=False)
