In [None]:
%pip install --upgrade pandas

In [None]:
import pandas as pd

# Load the dataset index: a CSV whose 'Path' column points at each dataset file,
# relative to the ./Dataset directory.
dataset_list_df = pd.read_csv('./Dataset/dataset_list.csv')
# A bare expression in the middle of a cell is not rendered, so print the preview.
print(dataset_list_df.head(10))

# Maps "dataset_<n>" -> DataFrame holding only the 'Label' and 'String' columns.
datasets = {}

# Load every listed dataset; a dataset that cannot be read or lacks the required
# columns is reported and skipped so the remaining ones still load.
for idx, row in dataset_list_df.iterrows():
    dataset_name = f"dataset_{idx+1}"
    try:
        # Remove only a literal leading "./". str.strip("./") would strip ANY
        # run of '.' and '/' characters from both ends and corrupt paths such as
        # "../x.csv" or ".hidden/x.csv".
        relative_path = str(row['Path']).strip()
        if relative_path.startswith('./'):
            relative_path = relative_path[2:]
        dataset_path = './Dataset/' + relative_path.lstrip('/')
        df = pd.read_csv(dataset_path)
    except Exception as e:
        print(f"Failed to load {dataset_name}: {e}")
        continue

    # Keep only the 'Label' and 'String' columns if they exist
    if {'Label', 'String'}.issubset(df.columns):
        # .copy() so later column assignments never target a slice of the raw frame.
        datasets[dataset_name] = df[['Label', 'String']].copy()
        print(f"Loaded {dataset_name} with selected columns from {dataset_path}")
    else:
        print(f"{dataset_name} missing required columns, skipping.")


In [None]:
import csv
import re

# Load stopwords: one word per line; blank lines are ignored.
# The encoding is explicit so decoding does not depend on the platform default;
# "utf-8-sig" also drops a leading byte-order mark if the file has one.
with open('./Dataset/stopwordbahasa.csv', 'r', encoding='utf-8-sig') as f:
    stopwords = {line.strip() for line in f if line.strip()}

# Load word shortening dictionary: column 1 holds the abbreviation and column 2
# its replacement (column 0 is not used).
# newline='' is what the csv module requires for correct handling of quoted newlines.
with open('./Dataset/kamus_singkatan.csv', 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file
    word_map = {
        row[1].strip(): row[2].strip()
        for row in reader
        if len(row) >= 3  # blank or truncated rows would otherwise raise IndexError
    }


In [None]:
def preprocess_text(text):
    """Normalize a raw text string for modelling.

    Steps: lowercase, drop @mentions, split into word tokens, remove tokens
    found in the module-level ``stopwords`` set, and replace the remaining
    tokens through the module-level ``word_map`` dictionary.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing remains).
    """
    lowered = text.lower()

    # Mentions must be removed BEFORE tokenizing: the \w+ pattern below never
    # yields a token containing '@', so a per-token startswith('@') check can
    # never match and "@user" would survive as the plain word "user".
    # The lookbehind leaves an '@' inside a word (e.g. an e-mail address) alone.
    lowered = re.sub(r'(?<!\w)@\w+', ' ', lowered)

    tokens = re.findall(r'\b\w+\b', lowered)

    # Remove stopwords, then replace shortened words with their mapped form
    processed = [
        word_map.get(token, token)
        for token in tokens
        if token not in stopwords
    ]

    return " ".join(processed)

# Encode label to standardize it between datasets
def label_encoding(label):
    """Map a dataset-specific label onto a shared binary encoding.

    Negative labels ("negative", "Bullying", "negatif", 0) become 0 and
    positive labels ("positive", "Non-bullying", "positif", 1) become 1.
    Matching ignores case and surrounding whitespace, and accepts the label
    as a string or a number, including the "0.0"/"1.0" form a float renders to.

    Args:
        label: The raw label value (string or number).

    Returns:
        0 for a negative label, 1 for a positive label, -1 when unrecognized.
    """
    # Normalize first so that 0, "0", " Negative " and "0.0" are all recognized
    # instead of silently falling through to -1.
    key = str(label).strip().lower()

    if key in ('negative', 'bullying', 'negatif', '0', '0.0'):
        return 0
    if key in ('positive', 'non-bullying', 'positif', '1', '1.0'):
        return 1
    return -1

# Run the cleaning pipeline over every loaded dataset, replacing each entry of
# `datasets` with its processed version.
for name, df in datasets.items():
    # One row per distinct raw string (first occurrence wins), index renumbered.
    unique_rows = df.drop_duplicates(subset="String", keep="first").reset_index(drop=True)

    # Derive the cleaned text and the standardized label as new columns.
    df = unique_rows.assign(
        clean_text=unique_rows['String'].astype(str).apply(preprocess_text),
        encoded_label=unique_rows['Label'].astype(str).apply(label_encoding),
    )

    # Rows whose label could not be recognized are encoded as -1; discard them.
    df = df.loc[df['encoded_label'] != -1]

    datasets[name] = df
    print(f'Done Pre-Processing "{name}".')

In [None]:
# Stack every per-source dataset into one frame.
# A "combined_dataset" entry left behind by a previous run of this cell is
# skipped; otherwise re-running would concatenate the old combined frame with
# its own sources and duplicate every row.
frames_to_combine = [
    frame for key, frame in datasets.items() if key != "combined_dataset"
]

# A single concat call avoids re-copying the growing frame on every iteration and
# avoids concatenating onto an empty seed frame (pandas warns about empty entries
# when determining result dtypes). pd.concat raises on an empty list, so guard it.
if frames_to_combine:
    combined_dataset = pd.concat(frames_to_combine, ignore_index=True)
else:
    combined_dataset = pd.DataFrame()

# Keep "Label" and "clean_text" as the leading columns, followed by the remaining
# columns in their existing order; reindex also creates them if they are absent.
leading_columns = ["Label", "clean_text"]
combined_dataset = combined_dataset.reindex(
    columns=leading_columns
    + [col for col in combined_dataset.columns if col not in leading_columns]
)

print("Combined dataset shape:", combined_dataset.shape)

datasets["combined_dataset"] = combined_dataset

# Report how many rows carry each encoded label, per dataset (the combined one
# included). NaN labels are counted too because dropna=False.
for name, df in datasets.items():
    print(f"\n{name} Label Distribution:")
    if 'encoded_label' not in df.columns:
        print("No 'encoded_label' column found.")
        continue
    label_counts = df['encoded_label'].value_counts(dropna=False)
    print(label_counts)

In [None]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there).
export_dir = Path('./Dataset/Pre-Processed Dataset')
export_dir.mkdir(parents=True, exist_ok=True)

# Write every dataset to "<name>.csv" in the output folder. The DataFrame index
# is written as the first, unnamed column (pandas' default for to_csv).
for name, df in datasets.items():
    df.to_csv(export_dir / f'{name}.csv')
    print(f'Dataset {name} exported')