In [30]:
import pandas as pd
import torch
from transformers import pipeline
import pandas as pd



In [31]:
# Location of the test split; each line is parsed as "<text>;<label>".
DATA_PATH = "../data/dataset/test.txt"

# The file has no header row, so the column names are supplied explicitly.
# The delimiter is given as a regular expression (an escaped semicolon),
# which is why the python parsing engine is requested.
df = pd.read_csv(
    DATA_PATH,
    delimiter='\\;',
    header=None,
    names=['Text', 'Label'],
    engine='python',
)
df

Unnamed: 0,Text,Label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness
...,...,...
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy


In [32]:
# Show how many rows carry each emotion label.
print(df['Label'].value_counts())

Label
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64


In [33]:
# Rename the English emotion labels to Indonesian.
# 'surprise' has no entry in the mapping, so rows with that label end up as NaN.
# NOTE(review): confirm that dropping 'surprise' is intended.
label_mapping = {
    'joy': 'Gembira',
    'sadness': 'Sedih',
    'fear': 'Takut',
    'anger': 'Marah',
    'love': 'Cinta',
}
# Labels that are already Indonesian are kept as they are, so re-running this
# cell does not turn every label into NaN; all other values go through the map.
already_mapped = df['Label'].isin(list(label_mapping.values()))
df['Label'] = df['Label'].where(already_mapped, df['Label'].map(label_mapping))
print(df['Label'].value_counts())

Label
Gembira    695
Sedih      581
Marah      275
Takut      224
Cinta      159
Name: count, dtype: int64


In [34]:
# Discard rows that have no text, then collapse fully identical rows,
# keeping the first occurrence of each.
df_cleaned = (
    df
    .dropna(subset=['Text'])
    .drop_duplicates(keep='first')
)

In [35]:
# English -> Indonesian translation pipeline backed by the distilled
# NLLB-200 600M checkpoint. Uses device 0 when CUDA is available and
# device -1 otherwise.
translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="ind_Latn",
    device=-1 if not torch.cuda.is_available() else 0,
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 9a0d4d81-45b2-4a8e-ac2b-7232a71302ca)')' thrown while requesting HEAD https://huggingface.co/facebook/nllb-200-distilled-600M/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Device set to use cuda:0


In [36]:
def simple_translate(text):
    """Translate one value with the module-level ``translator`` pipeline.

    Missing values (as judged by ``pd.isna``) and values whose string form is
    empty or whitespace-only are returned unchanged without calling the
    translator. Anything else is converted to ``str`` and translated with
    ``max_length=400``.

    Returns:
        The ``'translation_text'`` field of the first translator result, or
        the original ``text`` when it was skipped.
    """
    nothing_to_translate = pd.isna(text) or not str(text).strip()
    if nothing_to_translate:
        return text

    translations = translator(str(text), max_length=400)
    first_result = translations[0]
    return first_result['translation_text']

In [37]:
print("Memulai terjemahan...")
# Build a new frame instead of aliasing df_cleaned: .assign returns a copy,
# so df_cleaned is left untouched and no column is written onto a frame that
# was produced by filtering another one.
df_translated = df_cleaned.assign(
    Text_ID=df_cleaned['Text'].apply(simple_translate)
)

Memulai terjemahan...


In [38]:
# Display the frame, including the new Text_ID column, for a visual check.
df_translated

Unnamed: 0,Text,Label,Text_ID
0,im feeling rather rotten so im not very ambiti...,Sedih,Aku merasa agak busuk jadi aku tidak terlalu a...
1,im updating my blog because i feel shitty,Sedih,Aku memperbarui blogku karena aku merasa buruk
2,i never make her separate from me because i do...,Sedih,Aku tak pernah memisahkannya dariku karena aku...
3,i left with my bouquet of red and yellow tulip...,Gembira,Saya pergi dengan buket tulip merah dan kuning...
4,i was feeling a little vain when i did this one,Sedih,Aku merasa sedikit sia-sia ketika aku melakuka...
...,...,...,...
1995,i just keep feeling like someone is being unki...,Marah,Aku hanya terus merasa seperti seseorang tidak...
1996,im feeling a little cranky negative after this...,Marah,Aku merasa sedikit cranky negatif setelah janj...
1997,i feel that i am useful to my people and that ...,Gembira,Saya merasa bahwa saya berguna bagi orang-oran...
1998,im feeling more comfortable with derby i feel ...,Gembira,Aku merasa lebih nyaman dengan Derby Aku meras...


In [39]:
import pandas as pd
import re
import string

from indoNLP.preprocessing import replace_word_elongation, replace_slang

In [40]:
def cleanDataframe(df):
    """Drop rows with missing values and duplicate rows, reporting both.

    Prints the per-column missing-value counts and the number of duplicated
    rows before and after cleaning.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without rows containing any missing value and without
        duplicate rows (the first occurrence of each duplicate is kept).
    """
    missing_before = df.isnull().sum()
    duplicates_before = df.duplicated().sum()
    print('Missing Values:', missing_before)
    print('Duplicates:', duplicates_before)

    result = df.dropna().drop_duplicates(keep='first')

    missing_after = result.isnull().sum()
    duplicates_after = result.duplicated().sum()
    print('\nMissing values after cleaning:', missing_after)
    print('Duplicates after cleaning:', duplicates_after)
    return result


def cleaningText(text):
    """Strip social-media artefacts, punctuation and digits from ``text``.

    Steps, in order:
      1. Remove the ``[USERNAME]``, ``[URL]`` and ``[SENSITIVE-NO]`` placeholders.
      2. Remove @mentions and #hashtags (letters, digits and underscores).
      3. Remove a standalone ``RT`` marker followed by whitespace.
      4. Remove links starting with ``http``.
      5. Replace every ASCII punctuation character with a space.
      6. Remove digit runs and replace newlines with spaces.

    Whitespace is not collapsed here, so the result may contain runs of spaces.

    Args:
        text: The string to clean. Non-string values (e.g. NaN) are returned
            unchanged instead of raising inside ``re.sub``.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Leftover social-media placeholders and markers.
    text = re.sub(r'\[USERNAME\]', '', text)
    text = re.sub(r'\[URL\]', '', text)
    text = re.sub(r'\[SENSITIVE-NO\]', '', text)
    # Underscores are part of the pattern so that "@user_name" is removed as a
    # whole rather than leaving "_name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # hashtags
    # The word boundary keeps words that merely end in "RT" (e.g. "SMART ")
    # from being truncated.
    text = re.sub(r'\bRT\s', '', text)  # retweet marker
    text = re.sub(r"http\S+", '', text)  # links

    # Character-level cleanup: punctuation becomes a space (rather than being
    # deleted) so that the words on either side stay separated.
    text = text.translate(str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)))
    text = re.sub(r'\d+', '', text)  # digits
    text = text.replace('\n', ' ')  # newlines
    return text


# Slang -> formal lookup table. The CSV is read without a header row and its
# two columns are named 'slang' and 'formal'.
kamusCleaning = pd.read_csv(
    '../data/kamus.csv',
    encoding='utf-8',
    delimiter=',',
    header=None,
    names=['slang', 'formal'],
)

# Rows with a missing value in either column are skipped. Keys are lowercased
# and stripped, values are only stripped. A later row overwrites an earlier
# one that has the same key.
cleaningDict = {
    str(slang).lower().strip(): str(formal).strip()
    for slang, formal in zip(kamusCleaning.slang, kamusCleaning.formal)
    if pd.notna(slang) and pd.notna(formal)
}

print(f"✅ Loaded {len(cleaningDict)} entries from kamus.csv")


def applyCleaningDict(text):
    """Replace whole words found in ``cleaningDict`` with their mapped form.

    The text is split on whitespace and each word is looked up in lowercase
    form. Words without an entry are kept exactly as they were. The words are
    joined back with single spaces.

    Args:
        text: The string to process. Non-string values and empty or
            whitespace-only strings are returned unchanged.

    Returns:
        The text with dictionary replacements applied.
    """
    if not isinstance(text, str):
        return text
    if not text.strip():
        return text

    # Lookup is case-insensitive; a miss falls back to the original token.
    replaced = [
        cleaningDict.get(token.lower().strip(), token)
        for token in text.split()
    ]
    return ' '.join(replaced)


def normalizeText(text):
    """Lowercase ``text`` and normalize elongation, slang and whitespace.

    Steps, in order:
      1. Lowercase the text.
      2. Shorten any run of a repeated character to two occurrences, then
         call ``replace_word_elongation`` if that name exists in the module
         globals.
      3. Call ``replace_slang`` if that name exists in the module globals,
         then apply ``applyCleaningDict``.
      4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: The value to normalize.

    Returns:
        The normalized string. Non-string input and empty or whitespace-only
        strings yield an empty string.
    """
    if not isinstance(text, str):
        return ""

    trimmed = text.strip()
    if not trimmed:
        return trimmed

    normalized = text.lower()

    # Character elongation: "aaaa" -> "aa".
    normalized = re.sub(r'(.)\1+', r'\1\1', normalized)
    if 'replace_word_elongation' in globals():
        normalized = replace_word_elongation(normalized)

    # Slang replacement: the optional helper first, then the local dictionary.
    if 'replace_slang' in globals():
        normalized = replace_slang(normalized)
    normalized = applyCleaningDict(normalized)

    # Final whitespace cleanup.
    return re.sub(r'\s+', ' ', normalized).strip()

✅ Loaded 1305 entries from kamus.csv


In [50]:
# Keep only the label and the translated text, then drop rows with missing
# values and duplicate rows.
df_final = cleanDataframe(df_translated[['Label', 'Text_ID']])
# .assign returns a new frame each time, so no column is written onto a frame
# that was derived by filtering another one (chained-assignment hazard).
df_final = df_final.assign(Text_ID=df_final['Text_ID'].apply(cleaningText))
df_normalized = df_final.assign(
    Text_ID=df_final['Text_ID'].apply(normalizeText)
)

Missing Values: Label      66
Text_ID     0
dtype: int64
Duplicates: 2

Missing values after cleaning: Label      0
Text_ID    0
dtype: int64
Duplicates after cleaning: 0


In [51]:
# Per-label row counts after cleaning and normalization.
df_normalized['Label'].value_counts()

Label
Gembira    695
Sedih      580
Marah      275
Takut      223
Cinta      159
Name: count, dtype: int64

In [52]:
# Order the rows by label and renumber the index from zero.
df_normalized = df_normalized.sort_values('Label', ignore_index=True)

In [None]:
# Write the result to CSV; index=False leaves the row index out of the file.
df_normalized.to_csv('../data/external_test.csv', index=False)