In [3]:
!pip install nltk scikit-learn pandas




In [4]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
# Fetch the NLTK resources one after another. The final download is left as
# the cell's last expression so that its return value is still shown as the
# cell output.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [6]:
# Ask the user for the sentence to process and for its class label.
# Both calls block until a value has been typed in.
text_prompt = "Enter text for NLP processing:\n"
label_prompt = "Enter label (example: positive / negative / neutral): "
text = input(text_prompt)
label = input(label_prompt)


Enter text for NLP processing:
Natural Language Processing is very useful and interesting!
Enter label (example: positive / negative / neutral): positive


In [7]:
def clean_text(text):
    """Normalise a raw string for the downstream NLP steps.

    The text is lowercased, runs of digits are deleted, punctuation
    (including underscores) is deleted, and every run of whitespace is
    collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text; an empty string if nothing survives cleaning.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # \w also matches "_", so the underscore has to be listed explicitly to be
    # removed together with the rest of the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Deleting digits and punctuation can leave gaps of several spaces inside
    # the string; split/join collapses those and trims both ends in one go.
    return " ".join(text.split())

# Run the cleaning step on the user's text and show the result under a heading.
cleaned_text = clean_text(text)
print("\nCleaned Text:", cleaned_text, sep="\n")



Cleaned Text:
natural language processing is very useful and interesting


In [8]:
# Whitespace tokenisation: split() without arguments breaks the string on any
# run of whitespace and never yields empty tokens.
tokens = cleaned_text.split()
print("\nTokens:", tokens, sep="\n")



Tokens:
['natural', 'language', 'processing', 'is', 'very', 'useful', 'and', 'interesting']


In [9]:
# Load the English stopword list into a set for fast membership checks, then
# keep only the tokens that are not in it (original order preserved).
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda token: token not in stop_words, tokens))
print("\nAfter Stopword Removal:", filtered_tokens, sep="\n")



After Stopword Removal:
['natural', 'language', 'processing', 'useful', 'interesting']


In [10]:
# Lemmatize every remaining token. Only the word itself is passed, so the
# lemmatizer's default part-of-speech setting applies to each call.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

# Re-assemble the tokens into one space-separated string for vectorisation.
final_text = " ".join(lemmatized_tokens)
print("\nAfter Lemmatization:", final_text, sep="\n")



After Lemmatization:
natural language processing useful interesting


In [11]:
# Encode the user's label as an integer. The encoder is fitted on a
# one-element list holding just that label.
label_encoder = LabelEncoder()
label_batch = [label]
encoded_label = label_encoder.fit_transform(label_batch)

# Report the label before and after encoding.
for caption, value in (("\nOriginal Label:", label), ("Encoded Label:", encoded_label)):
    print(caption, value)



Original Label: positive
Encoded Label: [0]


In [12]:
# Fit a TF-IDF vectorizer on the single processed document.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([final_text])

# Turn the result into a dense table with one column per vocabulary term.
dense_weights = tfidf_matrix.toarray()
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_weights, columns=vocabulary_terms)

print("\nTF-IDF Representation:")
# Last expression of the cell: rendered as the cell's output.
tfidf_df



TF-IDF Representation:


Unnamed: 0,interesting,language,natural,processing,useful
0,0.447214,0.447214,0.447214,0.447214,0.447214


In [13]:
# Gather every stage of the pipeline into a one-row summary table.
processed_columns = {
    "Original Text": [text],
    "Cleaned Text": [cleaned_text],
    "Final Processed Text": [final_text],
    "Label": [label],
    "Encoded Label": encoded_label,
}
processed_df = pd.DataFrame(processed_columns)

# Write both tables to CSV files, leaving out the index column.
processed_df.to_csv("processed_text_output.csv", index=False)
tfidf_df.to_csv("tfidf_output.csv", index=False)

print(
    "\nFiles saved successfully!",
    "1. processed_text_output.csv",
    "2. tfidf_output.csv",
    sep="\n",
)



Files saved successfully!
1. processed_text_output.csv
2. tfidf_output.csv


In [14]:
# google.colab can only be imported inside Google Colab. Guard the import so
# that running the notebook elsewhere does not end in an exception on this
# final, purely convenience step.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping the browser download.")
else:
    # Hand each generated CSV to the browser as a download.
    for csv_name in ("processed_text_output.csv", "tfidf_output.csv"):
        files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>