In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK resources this notebook relies on (stop-word list and WordNet data).
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Read the labeled dataset from CSV into the working frame
df = pd.read_csv("labeled_dataset_4_categories.csv")

# Define cleaning function
_STOPWORD_CACHE = {}  # language name -> frozenset of stop words, filled on first use


def clean_text(text, stop_words=None):
    """Normalize a raw document into a list of lowercase, stop-word-free tokens.

    The input is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (so digits, punctuation and non-ASCII letters are
    dropped), the result is split on whitespace, and stop words are removed.

    Args:
        text: The document to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as "no text".
        stop_words: Optional iterable of lowercase words to drop. When omitted,
            NLTK's English stop-word list is used (loaded once and cached).

    Returns:
        list[str]: The remaining tokens in their original order. Empty when the
        input is missing or nothing survives cleaning.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and leak into the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if stop_words is None:
        # Build the stop-word set once; calling stopwords.words() per token
        # re-creates the whole list and scans it linearly for every word.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
    return [word for word in text.split() if word not in stop_words]

# Run the cleaner over every entry of the "Text" column and keep the token lists
raw_text = df["Text"]
df["clean_tokens"] = raw_text.apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jeetshah/nltk_data...


In [2]:
stemmer = PorterStemmer()


def _stem_and_join(tokens):
    """Stem each token with the Porter stemmer and join the stems with single spaces."""
    return ' '.join(map(stemmer.stem, tokens))


df["stemmed"] = df["clean_tokens"].apply(_stem_and_join)

# Export only the stemmed text together with its category label
stemmed_columns = ["stemmed", "Semantic_Label_4"]
df_stemmed = df[stemmed_columns]
df_stemmed.to_csv("cleaned_stemmed.csv", index=False)


In [3]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_and_join(tokens):
    """Lemmatize each token (default part of speech) and join the results with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, tokens))


df["lemmatized"] = df["clean_tokens"].apply(_lemmatize_and_join)

# Export only the lemmatized text together with its category label
lemmatized_columns = ["lemmatized", "Semantic_Label_4"]
df_lemmatized = df[lemmatized_columns]
df_lemmatized.to_csv("cleaned_lemmatized.csv", index=False)


In [4]:
# Bag-of-words counts over the lemmatized text. Terms found in more than 90% of
# documents or in fewer than 10 documents are ignored, and at most 1000 terms are kept.
vectorizer = CountVectorizer(max_df=0.9, min_df=10, max_features=1000)
X_count = vectorizer.fit_transform(df["lemmatized"])

# If the vocabulary contains the word "label", assigning the target column below
# would silently overwrite that feature. Move such a term to "term:label", a name
# the default tokenizer cannot emit (":" is not a word character).
count_columns = [
    "term:label" if term == "label" else term
    for term in vectorizer.get_feature_names_out()
]
count_df = pd.DataFrame(X_count.toarray(), columns=count_columns)

# .values drops the index so labels are attached by row position
count_df["label"] = df["Semantic_Label_4"].values
count_df.to_csv("cleaned_countvectorizer.csv", index=False)

In [5]:
# TF-IDF weights over the lemmatized text. Terms found in more than 90% of
# documents or in fewer than 10 documents are ignored, and at most 1000 terms are kept.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=10, max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df["lemmatized"])

# If the vocabulary contains the word "label", assigning the target column below
# would silently overwrite that feature. Move such a term to "term:label", a name
# the default tokenizer cannot emit (":" is not a word character).
tfidf_columns = [
    "term:label" if term == "label" else term
    for term in tfidf_vectorizer.get_feature_names_out()
]
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_columns)

# .values drops the index so labels are attached by row position
tfidf_df["label"] = df["Semantic_Label_4"].values
tfidf_df.to_csv("cleaned_tfidf.csv", index=False)