In [2]:
!pip install nltk

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Toy corpus: four short sentences, each paired with a sentiment label.
# Kept as a plain column-name -> values mapping so it can feed DataFrame directly.
data = dict(
    text=[
        "I love Machine Learning!",
        "Machine learning is very powerful.",
        "Deep learning is a branch of ML.",
        "I enjoy learning new AI techniques.",
    ],
    label=["positive", "positive", "neutral", "positive"],
)

# One row per sentence; columns are "text" and "label".
df = pd.DataFrame(data)

# Show the raw data before any preprocessing is applied.
print("ORIGINAL DATASET", df, sep="\n")


# Shared resources used by clean_text().
# The lemmatizer is built once and reused for every token.
lemmatizer = WordNetLemmatizer()
# English stop-word list held as a set so membership tests are O(1).
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize a raw sentence into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the input;
      2. delete every character that is not ``a-z`` or whitespace
         (digits and punctuation are removed, not replaced by a space, so
         ``"state-of-the-art"`` becomes ``"stateoftheart"``);
      3. split on whitespace;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. lemmatize each remaining token with the module-level ``lemmatizer``;
      6. join the result with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the ``NaN``
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text, or ``""`` when nothing survives cleaning or the
        input is not a string.
    """
    # Missing cells reach us as None/float('nan') when this is used with
    # Series.apply; calling .lower() on those would raise AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(words)


# Run every raw sentence through clean_text() and store the result
# next to the original so the two can be compared row by row.
raw_sentences = df['text']
df['cleaned_text'] = raw_sentences.map(clean_text)

# Side-by-side view of the text before and after cleaning.
print("\nCLEANED TEXT", df[['text', 'cleaned_text']], sep="\n")


# Convert the string labels to integer codes. The fitted encoder is kept
# in `label_encoder` so it stays available after this cell runs.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
df['label_encoded'] = encoded_labels

# Show each original label beside its integer code.
print("\nLABEL ENCODING", df[['label', 'label_encoded']], sep="\n")


# Fit a TF-IDF vectorizer (default settings) on the cleaned sentences and
# transform them in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Densify the matrix and label each column with its vocabulary term so the
# features are readable as a regular DataFrame.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

print("\nTF-IDF MATRIX", tfidf_df, sep="\n")


# Persist both tables as CSV files in the current working directory.
# The row index is omitted from both files.
for frame, filename in (
    (df, "cleaned_and_encoded_data.csv"),
    (tfidf_df, "tfidf_features.csv"),
):
    frame.to_csv(filename, index=False)

# Tell the user which files were written.
print(
    "\nFILES SAVED SUCCESSFULLY:",
    "1. cleaned_and_encoded_data.csv",
    "2. tfidf_features.csv",
    sep="\n",
)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


ORIGINAL DATASET
                                  text     label
0             I love Machine Learning!  positive
1   Machine learning is very powerful.  positive
2     Deep learning is a branch of ML.   neutral
3  I enjoy learning new AI techniques.  positive

CLEANED TEXT
                                  text                     cleaned_text
0             I love Machine Learning!            love machine learning
1   Machine learning is very powerful.        machine learning powerful
2     Deep learning is a branch of ML.          deep learning branch ml
3  I enjoy learning new AI techniques.  enjoy learning new ai technique

LABEL ENCODING
      label  label_encoded
0  positive              1
1  positive              1
2   neutral              0
3  positive              1

TF-IDF MATRIX
         ai    branch      deep     enjoy  learning      love   machine  \
0  0.000000  0.000000  0.000000  0.000000  0.379192  0.726641  0.572892   
1  0.000000  0.000000  0.000000  0.000000  0.379

[nltk_data]   Package omw-1.4 is already up-to-date!
