### Import Library dan CSV

In [18]:
import pandas as pd
import re
import string

from indoNLP.preprocessing import replace_word_elongation, replace_slang

In [19]:
angerData = pd.read_csv('../data/raw_data/AngerData.csv', delimiter='\t')
fearData = pd.read_csv('../data/raw_data/FearData.csv', delimiter='\t')
joyData = pd.read_csv('../data/raw_data/JoyData.csv', delimiter='\t')
loveData = pd.read_csv('../data/raw_data/LoveData.csv', delimiter='\t')
sadData = pd.read_csv('../data/raw_data/SadData.csv', delimiter='\t')
    
#gabungkan data
df1 = pd.concat([angerData, fearData, joyData, loveData, sadData], ignore_index=True)
df1 = df1.sample(frac=1).reset_index(drop=True) #shuffle data

df1.head()

Unnamed: 0,Tweet,Label
0,bisa-bisanya mark gak calling gue /marah,Anger
1,"ingett wkwk. iyaa dog lalu kita bertiga , sama...",Joy
2,bersyukur sekali karena berkatmu aku juga baha...,Joy
3,kalian ada yang pernah pengen banget kiss pipi...,Love
4,"kok bisa bisanya kaget, bukanya ente yg ngawas...",Anger


In [20]:
# Mengubag nama label menjadi bahasa Indonesia
label_mapping = {
    'Anger': 'Marah',
    'anger': 'Marah',
    'Fear': 'Takut',
    'fear': 'Takut',
    'happy': 'Gembira',
    'Joy': 'Gembira',
    'love': 'Cinta',
    'Love': 'Cinta',
    'sadness': 'Sedih',
    'Sad': 'Sedih'
}
df1['Label'] = df1['Label'].map(label_mapping)
print(df1['Label'].value_counts())

Label
Gembira    1271
Marah      1049
Sedih      1003
Takut       911
Cinta       760
Name: count, dtype: int64


In [21]:
angerData = pd.read_csv('../data/new_data/marah.csv')
fearData = pd.read_csv('../data/new_data/takut.csv')
joyData = pd.read_csv('../data/new_data/gembira.csv')
loveData = pd.read_csv('../data/new_data/cinta.csv')
sadData = pd.read_csv('../data/new_data/sedih.csv')
#gabungkan data
df2 = pd.concat([angerData, fearData, joyData, loveData, sadData], ignore_index=True)
df2 = df2.sample(frac=1).reset_index(drop=True) #shuffle data

In [22]:
# df2 = pd.read_csv('../data/raw_data/emotion_dataset.csv')
# #change column name
# df2 = df2.rename(columns={'label':'Label', 'tweet':'Tweet'})

# df2.head()

In [23]:
df = pd.concat([df1, df2], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True) #shuffle data

df.head()

Unnamed: 0,Tweet,Label
0,gue takut kehilangan hal kecil,Takut
1,ga ada cahaya buat dituju,Sedih
2,rasanya like semuanya sinkron,Gembira
3,yaallah mika angkasa w kaget gue kira cm ciuma...,Takut
4,gue cemas sejak tadi pagi,Takut


In [24]:
print(df['Label'].value_counts())

Label
Gembira    2423
Marah      2055
Sedih      2037
Takut      1977
Cinta      1685
Name: count, dtype: int64


In [25]:
with pd.option_context('display.max_colwidth', None):
    print(df.iloc[4])

Tweet    gue cemas sejak tadi pagi
Label                        Takut
Name: 4, dtype: object


### Cleaning Data

In [26]:
def cleanDataframe(df):
    print('Missing Values:', df.isnull().sum())
    print('Duplicates:', df.duplicated().sum())

    df_cleaned = df.dropna(subset='Tweet')
    df_cleaned = df_cleaned.drop_duplicates(keep='first')

    print('\nMissing values after cleaning:', df_cleaned.isnull().sum())
    print('Duplicates after cleaning:', df_cleaned.duplicated().sum())
    return df_cleaned




def cleaningText(text):
    # Membersihkan tanda tanda sisa medsos
    text = re.sub(r'\[USERNAME\]', '', text)
    text = re.sub(r'\[URL\]', '', text)
    text = re.sub(r'\[SENSITIVE-NO\]', '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Menghapus Mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Menghapus Hashtag
    text = re.sub(r'RT[\s]', '', text)  # menghapus RT
    text = re.sub(r"http\S+", '', text)  # menghapus link

    # Pembersihan Karakter
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))# Mengganti tanda baca dengan spasi (alih-alih menghapus)
    text = re.sub(r'\d+', '', text)  # Menghapus angka
    text = text.replace('\n', ' ')  # Mengganti garis baru dengan spasi
    return text


kamusCleaning = pd.read_csv('../data/kamus.csv', encoding='utf-8',
                            delimiter=',', header=None, names=['slang', 'formal'])

cleaningDict = {}
for slang, formal in zip(kamusCleaning.slang, kamusCleaning.formal):
    if pd.notna(slang) and pd.notna(formal):
        slang_lower = str(slang).lower().strip()
        formal_clean = str(formal).strip()
        cleaningDict[slang_lower] = formal_clean
        
print(f"✅ Loaded {len(cleaningDict)} entries from kamus.csv")

def applyCleaningDict(text):
    if not isinstance(text, str) or not text.strip():
        return text

    # Pattern untuk match whole words (case insensitive)
    words = text.split()
    cleaned_words = []

    for word in words:
        # Clean word (lowercase untuk matching)
        clean_word = word.lower().strip()

        # Check jika word ada di dictionary
        if clean_word in cleaningDict:
            cleaned_words.append(cleaningDict[clean_word])
        else:
            cleaned_words.append(word)

    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

def normalizeText(text):
    if not isinstance(text, str):
        return ""

    if not text.strip():
        return text.strip()

    # 1. Lowercase pertama
    text = text.lower()

    # 2. Handle character elongation
    text = re.sub(r'(.)\1+', r'\1\1', text)
    if 'replace_word_elongation' in globals():
        text = replace_word_elongation(text)

    # 3. Replace slang words (dari multiple sources)
    if 'replace_slang' in globals():
        text = replace_slang(text)
    text = applyCleaningDict(text)

    # 4. Final cleanup
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

✅ Loaded 1305 entries from kamus.csv


In [27]:
print(df.iloc[69])

Tweet    benar guys, nasi goreng.,simple, cepat, dan en...
Label                                              Gembira
Name: 69, dtype: object


In [28]:
df_cleaned = cleanDataframe(df)

Missing Values: Tweet    0
Label    0
dtype: int64
Duplicates: 102

Missing values after cleaning: Tweet    0
Label    0
dtype: int64
Duplicates after cleaning: 0


In [29]:
df_cleaned['Tweet'] = df_cleaned['Tweet'].apply(cleaningText)
# Print 1 sample not truncated
with pd.option_context('display.max_colwidth', None):
    print(df_cleaned.iloc[61])


Tweet    sedih banget temen gue 
Label                      Sedih
Name: 61, dtype: object


In [30]:
all_text = "".join(df_cleaned["Tweet"].astype(str))
text_length = all_text.split()
unique_text = set(text_length)
print('jumlah kata unik sebelum normalisasi:', len(unique_text))

jumlah kata unik sebelum normalisasi: 19170


In [31]:
df_normalized = df_cleaned.copy()
df_normalized['Tweet'] = df_normalized['Tweet'].apply(normalizeText)
# Print 1 sample not truncated
with pd.option_context('display.max_colwidth', None):
    print(df_normalized.iloc[61])

Tweet    sedih sekali teman saya
Label                      Sedih
Name: 61, dtype: object


In [32]:
all_text_normalized = "".join(df_normalized["Tweet"].astype(str))
text_length_normalized = all_text_normalized.split()
unique_text_normalized = set(text_length_normalized)
print('jumlah kata unik setelah normalisasi:', len(unique_text_normalized))

jumlah kata unik setelah normalisasi: 18107


In [33]:
print(df_cleaned['Label'].value_counts())

Label
Gembira    2402
Marah      2046
Sedih      2007
Takut      1953
Cinta      1667
Name: count, dtype: int64


### Export Menjadi CSV Baru

In [34]:
# Export
df_normalized.to_csv('../data/data_cleaned.csv', index=False)