In [None]:
# Toy corpus. Each entry pairs a document with its class label so the two
# stay aligned by position when they are split into separate lists below.
labeled_documents = [
    (
        "Natural Language Processing helps computers understand human language.",
        "NLP",
    ),
    (
        "Text cleaning and preprocessing are important steps in NLP!",
        "Preprocessing",
    ),
    (
        "TF-IDF is widely used for document representation.",
        "Representation",
    ),
]

# Parallel lists consumed by the rest of the notebook.
documents = [text for text, _ in labeled_documents]
labels = [label for _, label in labeled_documents]


In [None]:
import re
import unicodedata

def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Decompose accented characters (Unicode NFD) so the base letter
         survives the filter in step 3, e.g. "é" ends up as "e" instead of
         being deleted together with its accent. Pure-ASCII input is
         unaffected by this step.
      2. Lowercase the text.
      3. Delete every character that is not ``a``-``z`` or whitespace. The
         character is removed, not replaced by a space, so "TF-IDF" becomes
         "tfidf"; digits and punctuation are dropped the same way.
      4. Collapse each run of whitespace to one space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if no letters remain.
    """
    # NFD splits "é" into "e" + a combining mark; the mark is not in a-z,
    # so the next substitution strips it and keeps the base letter.
    text = unicodedata.normalize("NFD", text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run every raw document through clean_text, preserving document order.
cleaned_docs = list(map(clean_text, documents))
print(cleaned_docs)


['natural language processing helps computers understand human language', 'text cleaning and preprocessing are important steps in nlp', 'tfidf is widely used for document representation']


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the NLTK packages this cell fetches before lemmatizing
# (presumably the data WordNetLemmatizer reads; confirm against NLTK docs).
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

# Lemmatize word by word (lemmatize() is called with the word only, so it
# uses its default settings) and re-join each document with single spaces.
lemmatized_docs = []
for cleaned_doc in cleaned_docs:
    lemmas = map(lemmatizer.lemmatize, cleaned_doc.split())
    lemmatized_docs.append(" ".join(lemmas))

print(lemmatized_docs)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


['natural language processing help computer understand human language', 'text cleaning and preprocessing are important step in nlp', 'tfidf is widely used for document representation']


In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Drop every word found in scikit-learn's ENGLISH_STOP_WORDS set from each
# lemmatized document, keeping the remaining words in their original order.
final_docs = []
for lemmatized_doc in lemmatized_docs:
    kept_words = [
        token
        for token in lemmatized_doc.split()
        if token not in ENGLISH_STOP_WORDS
    ]
    final_docs.append(" ".join(kept_words))

print(final_docs)


['natural language processing help computer understand human language', 'text cleaning preprocessing important step nlp', 'tfidf widely used document representation']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels to integer codes. The fitted encoder is kept in
# `encoder` so the codes can be translated back to the original label strings
# later (LabelEncoder exposes the learned classes; see scikit-learn docs).
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

# Show the integer code assigned to each entry of `labels`, in input order.
print(encoded_labels)


[0 1 2]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary from the preprocessed documents and compute their
# TF-IDF matrix in one step, using the vectorizer's default settings.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(final_docs)

# Column order of the matrix: one vocabulary term per column.
feature_names = tfidf.get_feature_names_out()
print(feature_names)

# One row per document; converted to a plain array for printing.
dense_tfidf = tfidf_matrix.toarray()
print(dense_tfidf)


['cleaning' 'computer' 'document' 'help' 'human' 'important' 'language'
 'natural' 'nlp' 'preprocessing' 'processing' 'representation' 'step'
 'text' 'tfidf' 'understand' 'used' 'widely']
[[0.         0.31622777 0.         0.31622777 0.31622777 0.
  0.63245553 0.31622777 0.         0.         0.31622777 0.
  0.         0.         0.         0.31622777 0.         0.        ]
 [0.40824829 0.         0.         0.         0.         0.40824829
  0.         0.         0.40824829 0.40824829 0.         0.
  0.40824829 0.40824829 0.         0.         0.         0.        ]
 [0.         0.         0.4472136  0.         0.         0.
  0.         0.         0.         0.         0.         0.4472136
  0.         0.         0.4472136  0.         0.4472136  0.4472136 ]]


In [None]:
import pandas as pd

# Write the preprocessed documents, one per row, under a "cleaned_text" column.
cleaned_text_df = pd.DataFrame({"cleaned_text": final_docs})
cleaned_text_df.to_csv("cleaned_text.csv", index=False)

# Write the TF-IDF matrix with one column per vocabulary term.
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df.to_csv("tfidf_features.csv", index=False)

# Write the integer-encoded labels under a "labels" column.
labels_df = pd.DataFrame({"labels": encoded_labels})
labels_df.to_csv("labels.csv", index=False)
