### Import Library dan CSV

In [2]:
import pandas as pd
import re
import string

from indoNLP.preprocessing import replace_word_elongation, replace_slang

In [None]:
angerData = pd.read_csv('../data/raw_data/AngerData.csv', delimiter='\t')
fearData = pd.read_csv('../data/raw_data/FearData.csv', delimiter='\t')
joyData = pd.read_csv('../data/raw_data/JoyData.csv', delimiter='\t')
loveData = pd.read_csv('../data/raw_data/LoveData.csv', delimiter='\t')
sadData = pd.read_csv('../data/raw_data/SadData.csv', delimiter='\t')
    
#gabungkan data
df1 = pd.concat([angerData, fearData, joyData, loveData, sadData], ignore_index=True)
df1 = df1.sample(frac=1).reset_index(drop=True) #shuffle data

df1.head()

Unnamed: 0,Tweet,Label
0,hbis nntn layangan putus bareng murid2nya ya b...,Anger
1,pls ini tuh tipe yoongi banget tiap nge prod l...,Love
2,"langsung aku unfoll loh, ternyata alasan orang...",Fear
3,ape la mama ni. sory sayang. mama tak jaga dhy...,Sad
4,gaada yang bikin gw seneng selain kabarnya jim...,Sad


In [None]:
df2 = pd.read_csv('../data/raw_data/emotion_dataset.csv')
#change column name
df2 = df2.rename(columns={'label':'Label', 'tweet':'Tweet'})

df2.head()

Unnamed: 0,Label,Tweet
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata..."


In [5]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True) #shuffle data

df.head()

Unnamed: 0,Tweet,Label
0,"terlepas kdg alurnya bikin kesel, tp terima ka...",Love
1,"Bagian terseram, caraka malam ditengah hutan b...",fear
2,"Sayaa sadar saya ini siapa, km siapa Dan takka...",sadness
3,"Kalau memang tak ada niatan lebih, tolong jang...",sadness
4,"sekuat-kuatnya orang yg selalu riang, tertawa ...",Sad


In [6]:
print(df['Label'].value_counts())

Label
Joy        1271
anger      1101
Anger      1049
happy      1017
Sad        1003
sadness     997
Fear        911
Love        760
fear        649
love        637
Name: count, dtype: int64


In [7]:
# Mengubag nama label menjadi bahasa Indonesia
label_mapping = {
    'Anger': 'Marah',
    'anger': 'Marah',
    'Fear': 'Takut',
    'fear': 'Takut',
    'happy': 'Gembira',
    'Joy': 'Gembira',
    'love': 'Cinta',
    'Love': 'Cinta',
    'sadness': 'Sedih',
    'Sad': 'Sedih'
}
df['Label'] = df['Label'].map(label_mapping)
print(df['Label'].value_counts())

Label
Gembira    2288
Marah      2150
Sedih      2000
Takut      1560
Cinta      1397
Name: count, dtype: int64


In [8]:
with pd.option_context('display.max_colwidth', None):
    print(df.iloc[4])

Tweet    sekuat-kuatnya orang yg selalu riang, tertawa dan bahagia setiap harinya, kalau udah sendiri kadang suka sedih dan netesin air mata. :')
Label                                                                                                                                       Sedih
Name: 4, dtype: object


### Cleaning Data

In [None]:
def cleanDataframe(df):
    print('Missing Values:', df.isnull().sum())
    print('Duplicates:', df.duplicated().sum())

    df_cleaned = df.dropna(subset='Tweet')
    df_cleaned = df_cleaned.drop_duplicates(keep='first')

    print('\nMissing values after cleaning:', df_cleaned.isnull().sum())
    print('Duplicates after cleaning:', df_cleaned.duplicated().sum())
    return df_cleaned




def cleaningText(text):
    # Membersihkan tanda tanda sisa medsos
    text = re.sub(r'\[USERNAME\]', '', text)
    text = re.sub(r'\[URL\]', '', text)
    text = re.sub(r'\[SENSITIVE-NO\]', '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Menghapus Mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Menghapus Hashtag
    text = re.sub(r'RT[\s]', '', text)  # menghapus RT
    text = re.sub(r"http\S+", '', text)  # menghapus link

    # Pembersihan Karakter
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))# Mengganti tanda baca dengan spasi (alih-alih menghapus)
    text = re.sub(r'\d+', '', text)  # Menghapus angka
    text = text.replace('\n', ' ')  # Mengganti garis baru dengan spasi
    return text


kamusCleaning = pd.read_csv('../data/kamus.csv', encoding='utf-8',
                            delimiter=',', header=None, names=['slang', 'formal'])

cleaningDict = {}
for slang, formal in zip(kamusCleaning.slang, kamusCleaning.formal):
    if pd.notna(slang) and pd.notna(formal):
        slang_lower = str(slang).lower().strip()
        formal_clean = str(formal).strip()
        cleaningDict[slang_lower] = formal_clean
        
print(f"✅ Loaded {len(cleaningDict)} entries from kamus.csv")

def applyCleaningDict(text):
    if not isinstance(text, str) or not text.strip():
        return text

    # Pattern untuk match whole words (case insensitive)
    words = text.split()
    cleaned_words = []

    for word in words:
        # Clean word (lowercase untuk matching)
        clean_word = word.lower().strip()

        # Check jika word ada di dictionary
        if clean_word in cleaningDict:
            cleaned_words.append(cleaningDict[clean_word])
        else:
            cleaned_words.append(word)

    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

def normalizeText(text):
    if not isinstance(text, str):
        return ""

    if not text.strip():
        return text.strip()

    # 1. Lowercase pertama
    text = text.lower()

    # 2. Handle character elongation
    text = re.sub(r'(.)\1+', r'\1\1', text)
    if 'replace_word_elongation' in globals():
        text = replace_word_elongation(text)

    # 3. Replace slang words (dari multiple sources)
    if 'replace_slang' in globals():
        text = replace_slang(text)
    text = applyCleaningDict(text)

    # 4. Final cleanup
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

✅ Loaded 1305 entries from kamus.csv


In [10]:
print(df.iloc[69])

Tweet    semangat kementerian bumn pak mentri bumn eric...
Label                                              Gembira
Name: 69, dtype: object


In [11]:
df_cleaned = cleanDataframe(df)

Missing Values: Tweet    0
Label    0
dtype: int64
Duplicates: 53

Missing values after cleaning: Tweet    0
Label    0
dtype: int64
Duplicates after cleaning: 0


In [12]:
df_cleaned['Tweet'] = df_cleaned['Tweet'].apply(cleaningText)
# Print 1 sample not truncated
with pd.option_context('display.max_colwidth', None):
    print(df_cleaned.iloc[61])


Tweet    lho kirain fest hahahha    
Label                          Takut
Name: 61, dtype: object


In [13]:
all_text = "".join(df_cleaned["Tweet"].astype(str))
text_length = all_text.split()
unique_text = set(text_length)
print('jumlah kata unik sebelum normalisasi:', len(unique_text))

jumlah kata unik sebelum normalisasi: 32020


In [14]:
df_normalized = df_cleaned.copy()
df_normalized['Tweet'] = df_normalized['Tweet'].apply(normalizeText)
# Print 1 sample not truncated
with pd.option_context('display.max_colwidth', None):
    print(df_normalized.iloc[61])

Tweet    lho mengira fest hahahha
Label                       Takut
Name: 61, dtype: object


In [15]:
all_text_normalized = "".join(df_normalized["Tweet"].astype(str))
text_length_normalized = all_text_normalized.split()
unique_text_normalized = set(text_length_normalized)
print('jumlah kata unik setelah normalisasi:', len(unique_text_normalized))

jumlah kata unik setelah normalisasi: 29242


In [16]:
print(df_cleaned['Label'].value_counts())

Label
Gembira    2266
Marah      2143
Sedih      1991
Takut      1552
Cinta      1390
Name: count, dtype: int64


### Export Menjadi CSV Baru

In [None]:
# Export
df_normalized.to_csv('../data/data_cleaned.csv', index=False)