#  Data Cleaning for train & test

## Import Libraries

In [1]:
import pandas as pd
import string
import re 

## Load Dataset

In [2]:
# Read the raw train and test splits (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

## Cleaning function

In [3]:

# Built once at definition time instead of on every call, since clean_text
# is applied to every row of the train and test frames.
_HTML_TAG_RE = re.compile(r"<.*?>")
# Require "://" after the scheme and a dot after "www" so that ordinary
# words such as "awwww" or "httpd" are not mistaken for links.
_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: object) -> str:
    """Normalize one raw comment into lowercase, punctuation-free text.

    Steps, in order: lowercase, strip HTML tags, strip URLs, strip digit
    runs, strip ASCII punctuation (``string.punctuation``), then collapse
    every whitespace run (including newlines and tabs) to a single space
    and trim both ends.

    Args:
        text: The raw comment. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text. Missing input yields ``""`` rather than the
        literal tokens ``"none"`` / ``"nan"``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Tags go first: a URL inside an attribute (<a href="http://...">) would
    # otherwise swallow the closing ">" and leave a broken tag behind.
    # Replace with a space so "one<br>two" does not become "onetwo".
    text = _HTML_TAG_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)
    text = _DIGITS_RE.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)
    # Newlines need no separate pass; they are whitespace and collapse here.
    return _WHITESPACE_RE.sub(" ", text).strip()

## Apply cleaning

In [4]:
# Add a cleaned copy of the comment text to both splits; the raw
# "comment_text" column is left untouched.
for frame in (train_df, test_df):
    frame["clean_comment"] = frame["comment_text"].apply(clean_text)

## Drop Id Column

In [5]:
# Rebind both names to copies of the frames without the "id" column.
train_df, test_df = (
    frame.drop(columns=["id"]) for frame in (train_df, test_df)
)

## Save Datasets

In [6]:
# Write both cleaned splits to the processed-data folder, omitting the
# DataFrame index from the output.
for frame, out_path in (
    (train_df, "../data/processed/cleaned_train_data.csv"),
    (test_df, "../data/processed/cleaned_test_data.csv"),
):
    frame.to_csv(out_path, index=False)

# For BERT / Transformer models

- We don’t need heavy cleaning.

- BERT tokenizer already handles casing, punctuation, and subword tokenization.

- Usually, you only do:

    - Lowercasing only for uncased models (the `bert-base-uncased` tokenizer already lowercases for you); keep the original casing for cased models

    - Remove URLs, HTML tags, and line breaks

    - Maybe strip extra spaces.