In [None]:
import csv
import pandas as pd

DATA_PATH = "../data/raw_data/test_dataset.csv"

data = []
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    reader = csv.reader(file, quotechar='"', escapechar='\\')
    for i, row in enumerate(reader):
        if len(row) == 2:
            data.append(row)
        elif len(row) > 2:
            # Gabungkan field tambahan ke tweet
            label = row[0]
            tweet = ','.join(row[1:])
            data.append([label, tweet])
        else:
            print(f"Skipping line {i}: {row}")

# Convert to DataFrame
if data:
    df = pd.DataFrame(data[1:], columns=data[0])
    print(f"Shape: {df.shape}")
    df.head()
else:
    print("No data found")

Shape: (4941, 2)


In [2]:
df = df.rename(columns={'label': 'Label', 'teks':'Tweet'})

In [3]:
df

Unnamed: 0,Label,Tweet
0,Marah,Gue benar-benar jengkel sama orang-orang yang ...
1,Marah,"Dasar pemerintah bangsat, urusan rakyat kecil ..."
2,Marah,Aku benci banget sama diriku sendiri yang sela...
3,Marah,"Anjir, nyebelin banget dah aplikasi gojek lagi..."
4,Marah,Gue muak sama berita-berita hoax yang bikin re...
...,...,...
4936,Cinta,Cinta kita kuat
4937,Cinta,Kamu adalah damba
4938,Cinta,Aku pengen rawat
4939,Cinta,Perasaan ini beri


In [4]:
# Mengubag nama label menjadi bahasa Indonesia
label_mapping = {
    'Senang': 'Gembira',
    'Cinta': 'Cinta',
    'Takut': 'Takut',
    'Sedih': 'Sedih',
    'Marah': 'Marah'
}
df['Label'] = df['Label'].map(label_mapping)
print(df['Label'].value_counts())

Label
Cinta      2557
Gembira    1095
Marah       470
Takut       424
Sedih       395
Name: count, dtype: int64


In [5]:
print(df['Label'].value_counts())

Label
Cinta      2557
Gembira    1095
Marah       470
Takut       424
Sedih       395
Name: count, dtype: int64


In [6]:
df = df.loc[:, ~df.columns.str.contains('final_text')]

In [7]:
#remove surprise label from dataset
df = df[df['Label'] != 'surprise'].copy()
print(df['Label'].value_counts())

Label
Cinta      2557
Gembira    1095
Marah       470
Takut       424
Sedih       395
Name: count, dtype: int64


In [8]:
#sort df by label
df = df.sort_values(by='Label').reset_index(drop=True)
df.head()

Unnamed: 0,Label,Tweet
0,Cinta,Cinta kita tuh berarti lebih dari apapun
1,Cinta,Aku nggak sembunyi
2,Cinta,Kamu adalah ganti
3,Cinta,Setiap sama anugerah
4,Cinta,Aku sayang hormat


In [9]:
# df.to_csv('./data/external_test.csv', index=False)

In [10]:
import numpy as np

N_SAMPLES = 473
LABEL = 'Label'

df_downsampled = (
    df.groupby(LABEL)
    .apply(lambda x: x.sample(
        n=min(len(x), N_SAMPLES),
        random_state=42
    ))
    .reset_index(drop=True)
)

print("Distribusi Label Setelah Downsampling:")
print(df_downsampled[LABEL].value_counts())

Distribusi Label Setelah Downsampling:
Label
Cinta      473
Gembira    473
Marah      470
Takut      424
Sedih      395
Name: count, dtype: int64


  .apply(lambda x: x.sample(


In [11]:
# drop final_text column
df = df_downsampled.loc[:, ~df.columns.str.contains('final_text')]

In [12]:
import pandas as pd
import re
import string

from indoNLP.preprocessing import replace_word_elongation, replace_slang

In [13]:
def cleanDataframe(df):
    print('Missing Values:', df.isnull().sum())
    print('Duplicates:', df.duplicated().sum())

    df_cleaned = df.dropna(subset='Tweet')
    df_cleaned = df_cleaned.drop_duplicates(keep='first')

    print('\nMissing values after cleaning:', df_cleaned.isnull().sum())
    print('Duplicates after cleaning:', df_cleaned.duplicated().sum())
    return df_cleaned


def cleaningText(text):
    # Membersihkan tanda tanda sisa medsos
    text = re.sub(r'\[USERNAME\]', '', text)
    text = re.sub(r'\[URL\]', '', text)
    text = re.sub(r'\[SENSITIVE-NO\]', '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Menghapus Mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Menghapus Hashtag
    text = re.sub(r'RT[\s]', '', text)  # menghapus RT
    text = re.sub(r"http\S+", '', text)  # menghapus link

    # Pembersihan Karakter
    # Mengganti tanda baca dengan spasi (alih-alih menghapus)
    text = text.translate(str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)))
    text = re.sub(r'\d+', '', text)  # Menghapus angka
    text = text.replace('\n', ' ')  # Mengganti garis baru dengan spasi
    return text


kamusCleaning = pd.read_csv('kamus.csv', encoding='utf-8',
                            delimiter=',', header=None, names=['slang', 'formal'])

cleaningDict = {}
for slang, formal in zip(kamusCleaning.slang, kamusCleaning.formal):
    if pd.notna(slang) and pd.notna(formal):
        slang_lower = str(slang).lower().strip()
        formal_clean = str(formal).strip()
        cleaningDict[slang_lower] = formal_clean

print(f"✅ Loaded {len(cleaningDict)} entries from kamus.csv")


def applyCleaningDict(text):
    if not isinstance(text, str) or not text.strip():
        return text

    # Pattern untuk match whole words (case insensitive)
    words = text.split()
    cleaned_words = []

    for word in words:
        # Clean word (lowercase untuk matching)
        clean_word = word.lower().strip()

        # Check jika word ada di dictionary
        if clean_word in cleaningDict:
            cleaned_words.append(cleaningDict[clean_word])
        else:
            cleaned_words.append(word)

    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text


def normalizeText(text):
    if not isinstance(text, str):
        return ""

    if not text.strip():
        return text.strip()

    # 1. Lowercase pertama
    text = text.lower()

    # 2. Handle character elongation
    text = re.sub(r'(.)\1+', r'\1\1', text)
    if 'replace_word_elongation' in globals():
        text = replace_word_elongation(text)

    # 3. Replace slang words (dari multiple sources)
    if 'replace_slang' in globals():
        text = replace_slang(text)
    text = applyCleaningDict(text)

    # 4. Final cleanup
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

✅ Loaded 1305 entries from kamus.csv


In [14]:
df_cleaned = cleanDataframe(df)

Missing Values: Label    0
Tweet    0
dtype: int64
Duplicates: 492

Missing values after cleaning: Label    0
Tweet    0
dtype: int64
Duplicates after cleaning: 0


In [None]:
df['Tweet'] = df['Tweet'].apply(cleaningText)
df_normalized = df.copy()
df_normalized['Tweet'] = df_normalized['Tweet'].apply(normalizeText)

In [18]:
df_normalized['Label'].value_counts()

Label
Cinta      473
Gembira    473
Marah      470
Takut      424
Sedih      395
Name: count, dtype: int64

In [None]:
df_normalized.to_csv('../data/external_test.csv', index=False)