In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# the stop-word corpus and WordNet), one download call per package.
for resource in ("punkt", "stopwords", "wordnet", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample dataset: three example sentences, each paired with one label.
sample_texts = [
    "This is an example sentence for text processing!",
    "Another sentence with numbers 123 and symbols #@$.",
    "Lemmatization helps in text normalization.",
]
sample_labels = ["Positive", "Negative", "Neutral"]

# Column-oriented mapping: key -> list of values, one entry per row.
data = {"text": sample_texts, "label": sample_labels}

df = pd.DataFrame(data)

In [None]:
# Text Cleaning Function
def clean_text(text):
    """Normalize raw text to lowercase words separated by single spaces.

    The text is lowercased, runs of digits are deleted, every run of
    characters that is not a letter or digit (punctuation, symbols,
    whitespace and underscores) is replaced by one space, and leading and
    trailing spaces are trimmed.

    Args:
        text: The raw string to clean. Any non-string value (for example
            ``None`` or a float NaN from a missing cell) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned string, or ``""`` when the input is not a string or
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # "_" counts as a word character, so r'\W+' alone would leave it in place;
    # the class below treats it like any other special character. Whitespace
    # is also matched here, so no separate space-collapsing pass is needed.
    text = re.sub(r'[\W_]+', ' ', text)
    return text.strip()


In [None]:
# Shared helpers for preprocessing: the English stop-word list held as a set,
# and a single WordNet lemmatizer instance.
stop_words = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

In [None]:
# Preprocessing Function
def preprocess_text(text):
    """Clean, tokenize, filter and lemmatize one piece of text.

    The input is passed through ``clean_text`` and split with
    ``word_tokenize``. Tokens found in ``stop_words`` are skipped; every
    other token is lemmatized with ``lemmatizer``.

    Args:
        text: The raw text to process.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(clean_text(text))

    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [None]:
# Run preprocess_text on every value of the "text" column and keep the
# results in a new "processed_text" column.
df["processed_text"] = df["text"].map(preprocess_text)



In [None]:
# TF-IDF Representation
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the processed text and transform it in one step.
tfidf_matrix = vectorizer.fit_transform(df["processed_text"])

# Convert the matrix to an array and label each column with the feature name
# reported by the fitted vectorizer.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)


In [None]:
# Save outputs: write each frame to its CSV file without the index column.
csv_outputs = (
    ("processed_text.csv", df),
    ("tfidf_representation.csv", tfidf_df),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)

print("Processing complete! Files saved: processed_text.csv, tfidf_representation.csv")

Processing complete! Files saved: processed_text.csv, tfidf_representation.csv
