In [None]:
# Assignment 3

# Text cleaning
# ✔ Lemmatization (WordNet)
# ✔ Stop-word removal
# ✔ Label Encoding
# ✔ TF-IDF representation
# ✔ Saving outputs
# ✔ Dependencies clearly listed

# Import Libraries

In [2]:
import re
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


##  Download NLTK Resources

In [3]:
# NLTK data packages fetched for this notebook:
#   - 'stopwords': backs stopwords.words('english')
#   - 'wordnet':   backs WordNetLemmatizer
#   - 'omw-1.4':   presumably the multilingual WordNet data some NLTK versions
#                  expect alongside 'wordnet' -- TODO confirm it is still needed
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4')

# nltk.download reports success as a boolean per package. Collect every flag
# (instead of displaying only the last call's result) so a failed download of
# any resource is visible in the cell output.
download_status = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}
download_status


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Sample Dataset

In [4]:
# Toy corpus: each record pairs one sentence with its sentiment label.
samples = [
    ("I love Natural Language Processing!", "positive"),
    ("NLP is fun and very powerful.", "positive"),
    ("I hate boring lectures.", "negative"),
]

# Split the records back into the column-oriented layout pandas expects.
texts, labels = zip(*samples)
data = {"text": list(texts), "label": list(labels)}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,I love Natural Language Processing!,positive
1,NLP is fun and very powerful.,positive
2,I hate boring lectures.,negative


# Text Cleaning Function

In [5]:
def clean_text(text):
    """Normalize raw text to lowercase a-z words separated by single spaces.

    Parameters
    ----------
    text : str
        Raw input text. Any non-string value (e.g. ``None`` or the float
        ``NaN`` that pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The lowercased text with every character outside ``a-z`` turned into
        a word boundary, whitespace runs collapsed to one space, and no
        leading/trailing whitespace. Note that contractions are split at the
        apostrophe (``"don't"`` -> ``"don t"``).
    """
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Replace (rather than delete) non-letters so that words joined only by
    # punctuation, e.g. "fun,and" or "well-known", remain separate tokens.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse the whitespace introduced above and trim both ends.
    return " ".join(text.split())


#  Lemmatization + Stopword Removal

In [6]:
# Shared helpers for preprocess_text: one lemmatizer instance and the English
# stop-word list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Clean a sentence, drop English stop words, and lemmatize what remains.

    The text is first normalized with ``clean_text`` and split on whitespace.
    Tokens found in ``stop_words`` are discarded; every other token is passed
    to ``lemmatizer.lemmatize`` without a part-of-speech argument, so the
    lemmatizer's default applies (see the NLTK documentation).

    Returns the surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in clean_text(text).split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


# Add the preprocessed version of every sentence as a new column.
df["clean_text"] = df["text"].apply(preprocess_text)
df


Unnamed: 0,text,label,clean_text
0,I love Natural Language Processing!,positive,love natural language processing
1,NLP is fun and very powerful.,positive,nlp fun powerful
2,I hate boring lectures.,negative,hate boring lecture


#  Label Encoding

In [7]:
# Fit the encoder on the string labels and obtain their integer codes in one
# pass; in the displayed output "negative" maps to 0 and "positive" to 1.
# The fitted encoder is kept so the codes can be mapped back later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["label"])

df["label_encoded"] = encoded_labels
df


Unnamed: 0,text,label,clean_text,label_encoded
0,I love Natural Language Processing!,positive,love natural language processing,1
1,NLP is fun and very powerful.,positive,nlp fun powerful,1
2,I hate boring lectures.,negative,hate boring lecture,0


# TF-IDF Representation

In [10]:
# Learn the vocabulary and TF-IDF weights from the preprocessed sentences.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["clean_text"])

# Densify into a labelled frame: one row per document, one column per term.
# Reusing df's index keeps each feature row aligned with its source row even
# if df no longer has the default 0..n-1 index (e.g. after filtering rows).
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df


Unnamed: 0,boring,fun,hate,language,lecture,love,natural,nlp,powerful,processing
0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.0,0.0,0.5
1,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0
2,0.57735,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0


# Save Outputs to Files

In [11]:
# Output file name -> frame to persist. Both files are written to the current
# working directory without the index column.
outputs = {
    "cleaned_text_data.csv": df,
    "tfidf_features.csv": tfidf_df,
}

for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)

print("Files saved successfully!")


Files saved successfully!
