In [38]:
# %pip install --upgrade pandas

In [1]:
import pandas as pd

dataset_path = "Dataset\dataset_list.csv"

# Load dataset list
dataset_list_df  = pd.read_csv(dataset_path, encoding_errors='replace')
dataset_list_df.head(10)


Unnamed: 0,No,Dataset,Path,URL
0,1,Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiar...,./Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTi...,Cyberbullying Bahasa Indonesia
1,2,cyberbullying_dataset-Huggingface-aditdwi123,./cyberbullying_dataset-Huggingface-aditdwi123...,aditdwi123/cyber-bullying-dataset at main
2,3,dataset_komentar_instagram_cyberbullying-githu...,./dataset_komentar_instagram_cyberbullying-git...,Dataset-Sentimen-Analisis-Bahasa-Indonesia/dat...
3,4,dataset_luqyana,./Luqyana-Dataset/dataset_1.4k_clean_luqyana.csv,https://j-ptiik.ub.ac.id/index.php/j-ptiik/art...


In [2]:
# Dictionary to store datasets
datasets = {}

# Iterate over the dataset list and load each dataset
for idx, row in dataset_list_df.iterrows():
    dataset_name = f"dataset_{idx+1}_{row['Dataset'].replace(' ', '_')}"
    dataset_path = './Dataset/' + row['Path'].strip("./")
    try:
        df = pd.read_csv(dataset_path)
        # Keep only the necessary columns used for training, 'Label' and 'String'
        if {'Label', 'String'}.issubset(df.columns):
            datasets[dataset_name] = df[['Label', 'String']]
            print(f"Loaded {dataset_name} with selected columns from {dataset_path}")
        else:
            print(f"{dataset_name} missing required columns, skipping.")
    except Exception as e:
        print(f"Failed to load {dataset_name}: {e}")

Loaded dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni with selected columns from ./Dataset/Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni/DATASET CYBERBULLYING INSTAGRAM - FINAL.csv
Loaded dataset_2_cyberbullying_dataset-Huggingface-aditdwi123 with selected columns from ./Dataset/cyberbullying_dataset-Huggingface-aditdwi123/sentiment_data.csv
Loaded dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe with selected columns from ./Dataset/dataset_komentar_instagram_cyberbullying-github-rizalespe/dataset_komentar_instagram_cyberbullying.csv
Loaded dataset_4_dataset_luqyana with selected columns from ./Dataset/Luqyana-Dataset/dataset_1.4k_clean_luqyana.csv


In [3]:
import csv
import re

# Load stopwords (common words to ignore, as they do not contribute to the meaning of the text)
with open('./Dataset/stopwordbahasa.csv', 'r') as f:
    stopwords = set([line.strip() for line in f])

# Load word shortening dictionary (abbreviations can be harmful for understanding and model accuracy)
word_map = {}
with open('./Dataset/kamus_singkatan.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)
    word_map = {row[1].strip(): row[2].strip() for row in reader}


In [None]:
for name, df in datasets.items():
    print(f"\n{name} Label Distribution Before Pre-Processing:")
    if 'Label' in df.columns:
        print(df['Label'].value_counts(dropna=False))
    else:
        print("No 'Label' column found.")


dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni Label Distribution:
Label
Non-bullying    325
Bullying        325
Name: count, dtype: int64

dataset_2_cyberbullying_dataset-Huggingface-aditdwi123 Label Distribution:
Label
negatif    627
positif    476
Name: count, dtype: int64

dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe Label Distribution:
Label
negative    200
positive    200
Name: count, dtype: int64

dataset_4_dataset_luqyana Label Distribution:
Label
bullying       700
tidak bully    699
Name: count, dtype: int64


In [None]:
def preprocess_text(text):
    # Lowercase and tokenize for more reliable encoding. 
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Replace abbreviations and remove stopwords
    processed = [
        word_map.get(token, token)
        for token in tokens
        if token not in stopwords and not token.startswith('@') and not token == "<USERNAME>"
    ]

    return " ".join(processed)

# Encode label to standardize it between datasets
def label_encoding(label):
    # Negative Labels in the datasets include: "negative", "Bullying", "negatif", 0. To be converted to 0
    # Positive Labels in the datasets include: "positive", "Non-bullying", "positif", 1. To be converted to 1
    if label.lower() in ['negative', 'bullying', 'negatif', '0']:
        return 0
    elif label.lower() in ['positive', 'non-bullying', 'tidak bully', 'positif', '1']:
        return 1
    else:
        return -1

for name, df in datasets.items():
    df = df.drop_duplicates(subset="String", keep="first").reset_index(drop=True)
    df['clean_text'] = df['String'].astype(str).apply(preprocess_text)
    df['encoded_label'] = df['Label'].astype(str).apply(label_encoding)
    df = df[df['encoded_label'] != -1]
    datasets[name] = df
    print(f'Done Pre-Processing "{name}".')

Done Pre-Processing "dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni".
Done Pre-Processing "dataset_2_cyberbullying_dataset-Huggingface-aditdwi123".
Done Pre-Processing "dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe".
Done Pre-Processing "dataset_4_dataset_luqyana".


In [43]:
combined_dataset = pd.DataFrame(columns=["Label", "clean_text"])

for name, df in datasets.items():
    combined_dataset = pd.concat([combined_dataset, df], ignore_index=True)

print("Combined dataset shape:", combined_dataset.shape)

datasets["combined_dataset"] = combined_dataset

for name, df in datasets.items():
    print(f"\n{name} Label Distribution:")
    if 'encoded_label' in df.columns:
        print(df['encoded_label'].value_counts(dropna=False))
    else:
        print("No 'encoded_label' column found.")

Combined dataset shape: (3435, 4)

dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni Label Distribution:
encoded_label
1    325
0    325
Name: count, dtype: int64

dataset_2_cyberbullying_dataset-Huggingface-aditdwi123 Label Distribution:
encoded_label
0    614
1    405
Name: count, dtype: int64

dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe Label Distribution:
encoded_label
1    200
0    197
Name: count, dtype: int64

dataset_4_dataset_luqyana Label Distribution:
encoded_label
0    694
1    675
Name: count, dtype: int64

combined_dataset Label Distribution:
encoded_label
0.0    1830
1.0    1605
Name: count, dtype: int64


In [44]:
for name, df in datasets.items():
    datasets[name].to_csv(f'./Dataset/Pre-Processed Dataset/{name}.csv')
    print(f'Dataset {name} exported')

Dataset dataset_1_Cyberbullying_Bahasa_Indonesia-Kaggle-CitaTiaraHanni exported
Dataset dataset_2_cyberbullying_dataset-Huggingface-aditdwi123 exported
Dataset dataset_3_dataset_komentar_instagram_cyberbullying-github-rizalespe exported
Dataset dataset_4_dataset_luqyana exported
Dataset combined_dataset exported
