In [None]:
import pandas as pd
import re
import string
import emoji
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
import nltk

# Download NLTK resources
# "punkt" is the tokenizer model used by older NLTK releases; newer releases
# (3.8.2 / 3.9 and later) look up "punkt_tab" instead when word_tokenize runs,
# so both are fetched to keep the notebook working on a fresh kernel.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
# 1. Load dataset
data_path = "data/En-Ba-Dataset(20k_4)/dataset.csv"
df = pd.read_csv(data_path)

# The rest of the notebook addresses the columns by name, so the columns are
# relabelled positionally as "Sentence" and "Label". This only makes sense for
# a two-column file, so fail early with a descriptive error otherwise instead
# of pandas' generic "Length mismatch" message.
expected_columns = ["Sentence", "Label"]
if df.shape[1] != len(expected_columns):
    raise ValueError(
        f"Expected {len(expected_columns)} columns in {data_path!r}, "
        f"found {df.shape[1]}: {list(df.columns)}"
    )
df.columns = expected_columns

In [None]:
# 2. Define preprocessing functions


# Lowercasing
def to_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered


# Remove HTML Tags
def remove_html(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


# Remove URLs
def remove_urls(text):
    """Delete URLs from ``text`` and return the result.

    A URL is a run of non-whitespace characters that starts, at a word
    boundary, with ``http:``, ``https:`` or ``www.``. Anchoring on the word
    boundary and on the ``:`` / ``.`` that follows the prefix keeps ordinary
    words such as "awwww" or "httpd" intact. Matching is case-insensitive.
    Surrounding whitespace is left untouched, so a removed URL may leave a
    double space behind.
    """
    return re.sub(r"\b(?:https?:|www\.)\S+", "", text, flags=re.IGNORECASE)


# Remove punctuation
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


# Chat word treatment (custom shortforms → full words)
chat_dict = {
    "u": "you",
    "ur": "your",
    "r": "are",
    "pls": "please",
    "plz": "please",
    "thx": "thanks",
    "tnx": "thanks",
    "luv": "love",
    "gr8": "great",
    "btw": "by the way",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "idk": "i do not know",
}


def chat_word_treatment(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; each token that is a key of
    ``chat_dict`` (exact, case-sensitive match) is replaced by its expansion
    and every other token is kept as is. Tokens are re-joined with single
    spaces.
    """
    expanded = []
    for word in text.split():
        expanded.append(chat_dict.get(word, word))
    return " ".join(expanded)


# Spelling correction
spell = Speller(lang="en")


def correct_spelling(text):
    """Run each whitespace-separated token of ``text`` through ``spell``.

    The results are re-joined with single spaces.
    """
    return " ".join(map(spell, text.split()))


# Handling emojis → convert to text
def handle_emojis(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized


# Tokenization
def tokenize(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Tokenize ``text`` and lemmatize each token.

    Tokens come from ``tokenize``; each one is passed to
    ``lemmatizer.lemmatize`` with its default arguments, and the results are
    joined back into a single space-separated string.
    """
    return " ".join(map(lemmatizer.lemmatize, tokenize(text)))

In [None]:
# 3. Full pipeline
def preprocess(text):
    """Run the full cleaning pipeline on one sentence and return the result.

    Steps, in order: lowercase, strip HTML, strip URLs, strip punctuation,
    expand chat abbreviations, convert emojis to text, lemmatize.
    """
    lowered = to_lower(text)
    without_html = remove_html(lowered)
    without_urls = remove_urls(without_html)
    without_punct = remove_punct(without_urls)
    expanded = chat_word_treatment(without_punct)
    demojized = handle_emojis(expanded)
    return lemmatize(demojized)


# Cast to str first so every value handed to preprocess is a string.
df["Sentence"] = (
    df["Sentence"]
    .astype(str)
    .apply(preprocess)
)


In [None]:
# 4. Save processed dataset
output_path = "data/En-Ba-Dataset(20k_4)/dataset_cleaned.csv"
# index=False keeps the DataFrame index out of the written CSV.
df.to_csv(path_or_buf=output_path, index=False)

# Show sample (first five rows)
print(df.head(5))