In [8]:
import pandas as pd

# Load indo_raw_TEST_emoji.csv; the preview below shows two processed-sentence
# columns plus a Sentiment label.
df = pd.read_csv("indo_raw_TEST_emoji.csv")
# Bare last expression: renders the first five rows as the cell output.
df.head()

Unnamed: 0,Processed_Sentence_1,Processed_Sentence_2,Sentiment
0,"Gk muluk muluk, 100,000 lot saham BBCA aja","Gk muluk muluk, 100,000 lot saham BBCA aja",Positive
1,BCA Expoversary 2024 menawarkan promo suku bun...,BCA Expoversary 2024 menawarkan promo suku bun...,Neutral
2,saham bca nya menyusul ya 🙂,saham bca nya menyusul ya 🙂,Positive
3,PT Bank BCA Syariah (BCA Syariah) turut memeri...,PT Bank BCA Syariah (BCA Syariah) turut memeri...,Neutral
4,Begitu byk saham kamu memilih saham itu kalau ...,Begitu byk saham kamu memilih saham itu kalau ...,Positive


In [None]:
# Rows whose two processed variants disagree.
# A plain `!=` treats NaN as unequal to NaN, so a row where BOTH columns are
# missing (e.g. an empty processed string read back from CSV) would be
# reported as a difference. Mask those rows out explicitly.
sentence_1 = df["Processed_Sentence_1"]
sentence_2 = df["Processed_Sentence_2"]
both_missing = sentence_1.isna() & sentence_2.isna()
diff_rows = df[sentence_1.ne(sentence_2) & ~both_missing]

diff_rows

Unnamed: 0,Processed_Sentence_1,Processed_Sentence_2,Sentiment


In [None]:
import pandas as pd
import re
import os

# Main corpus; the generation cell reads its 'Sentence', 'English Translation'
# and 'Sentiment' columns.
main_df = pd.read_csv("IDSMSA.csv")

# Lookup table: 'kata_tidak_baku' column -> 'kata_baku' column.
slang_df = pd.read_csv("kamus_kata_baku.csv")
slang_dict = dict(zip(slang_df['kata_tidak_baku'], slang_df['kata_baku']))

# Lookup table: 'emoji' column -> 'kata_emoji' column (first emoji dictionary).
emoji_df_indo = pd.read_csv("kamus_emoji_indo.csv")
emoji_dict_indo = dict(zip(emoji_df_indo['emoji'], emoji_df_indo['kata_emoji']))
emoji_list_indo = emoji_df_indo['emoji'].tolist()

# Lookup table: 'emoji' column -> 'kata_emoji' column (second emoji dictionary).
emoji_df_inggris = pd.read_csv("kamus_emoji_inggris.csv")
emoji_dict_inggris = dict(zip(emoji_df_inggris['emoji'], emoji_df_inggris['kata_emoji']))

# Union of both emoji vocabularies.
# - Non-string entries (a blank CSV cell is read as float NaN) are dropped,
#   because str.replace() cannot take them.
# - The result is sorted longest-first with a lexical tie-break. list(set(...))
#   would give an order that changes between kernel sessions, and removal order
#   matters when one emoji is a substring of another.
emoji_candidates = set(emoji_list_indo) | set(emoji_df_inggris['emoji'].tolist())
emoji_list_combined = sorted(
    (entry for entry in emoji_candidates if isinstance(entry, str)),
    key=lambda entry: (-len(entry), entry),
)

In [None]:
def handle_reduplication(text):
    """Expand the superscript-two reduplication shorthand into a hyphenated repeat.

    A run of word characters followed by the superscript-two character is
    written out twice, joined by a hyphen; any word characters that directly
    followed the superscript are appended after the repeat. The input is
    converted with ``str()`` before matching.
    """
    shorthand = r'(\w+)[\u00B2](\w*)'
    expansion = r'\1-\1\2'
    source = str(text)
    return re.sub(shorthand, expansion, source)

def separate_punctuation(text):
    """Surround every ``.``, ``,``, ``!``, ``?``, ``;`` or ``:`` with single spaces.

    The input is converted with ``str()`` first. Existing whitespace is left
    alone, so the result may contain runs of spaces.
    """
    def pad(match):
        # One space on each side of the matched punctuation mark.
        return ' ' + match.group(1) + ' '

    return re.sub(r'([.,!?;:])', pad, str(text))

def remove_placeholders(text):
    """Delete URLs, @mentions, #hashtags and bracketed placeholder tokens.

    The rules run in order on ``str(text)``:
      1. anything starting with ``http`` or ``www.`` up to the next whitespace,
      2. ``@`` followed by word characters,
      3. ``#`` followed by word characters,
      4. the literal tokens ``[USERNAME]``, ``[URL]``, ``[HASHTAG]`` (any case).

    Only leading/trailing whitespace is stripped; interior gaps left by the
    removed tokens are kept.
    """
    rules = (
        (r'http\S+|www\.\S+', 0),
        (r'@\w+', 0),
        (r'#\w+', 0),
        (r'\[USERNAME\]|\[URL\]|\[HASHTAG\]', re.IGNORECASE),
    )
    cleaned = str(text)
    for pattern, flags in rules:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned.strip()

def normalize_slang(text, slang_dict):
    """Replace whitespace-separated words using a lowercase-keyed lookup table.

    Args:
        text: Input text; converted with ``str()`` like the other cleaning helpers.
        slang_dict: Mapping looked up with the lowercased word as key.

    Returns:
        The words joined by single spaces. A word is replaced only when the
        lookup yields a string; otherwise the original word is kept as-is.
    """
    normalized_words = []
    for word in str(text).split():
        replacement = slang_dict.get(word.lower(), word)
        # A blank cell in the source CSV reaches the dict as float NaN; joining
        # it would raise TypeError, so fall back to the original word.
        normalized_words.append(replacement if isinstance(replacement, str) else word)
    return ' '.join(normalized_words)

def clean_final_text(text):
    """Collapse each whitespace run to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    single_spaced = whitespace_run.sub(' ', text)
    return single_spaced.strip()

def translate_emojis(text, emoji_dict):
    """Replace each emoji found in the text with its mapped word, space-padded.

    Args:
        text: Input text; converted with ``str()``.
        emoji_dict: Mapping of emoji string -> replacement word.

    Returns:
        The text with every known emoji replaced by ``' <word> '``.

    Keys are applied longest-first. Otherwise a short emoji that is a prefix of
    a longer sequence (skin-tone or joined variants) would be replaced first,
    leaving stray modifier characters and the wrong word. Keys that are not
    non-empty strings are skipped: an empty key would insert the word between
    every character, and a NaN key would make ``str.replace`` raise.
    """
    text = str(text)
    usable_pairs = (
        pair for pair in emoji_dict.items()
        if isinstance(pair[0], str) and pair[0]
    )
    # sorted() is stable (also with reverse=True), so equal-length keys keep
    # the dictionary's own order.
    ordered_pairs = sorted(usable_pairs, key=lambda pair: len(pair[0]), reverse=True)
    for emoji, word in ordered_pairs:
        text = text.replace(emoji, f' {word} ')  # Add spaces for separation
    return text

def remove_all_emojis(text, emoji_list):
    """Delete every occurrence of the listed emojis from the text.

    Args:
        text: Input text; converted with ``str()``.
        emoji_list: Iterable of emoji strings to strip.

    Returns:
        The text with all listed emojis removed.

    Entries are removed longest-first, with a lexical tie-break for
    determinism. Otherwise a base emoji could be removed out of a longer
    sequence first, leaving its modifier characters behind. Non-string entries
    (e.g. NaN from a blank CSV cell) are ignored instead of crashing
    ``str.replace``.
    """
    text = str(text)
    removable = sorted(
        (entry for entry in emoji_list if isinstance(entry, str)),
        key=lambda entry: (-len(entry), entry),
    )
    for emoji in removable:
        text = text.replace(emoji, '')
    return text

In [None]:
def create_cleaned_indo_pipeline(text, slang_dict, emoji_dict, emoji_list, emoji_option='keep'):
    """Run the full cleaning sequence for one Indonesian sentence.

    Steps, in order: reduplication expansion, emoji handling, placeholder
    removal, punctuation spacing, slang normalisation, whitespace clean-up.

    Args:
        text: Raw sentence.
        slang_dict: Lookup passed to ``normalize_slang``.
        emoji_dict: Lookup passed to ``translate_emojis`` when translating.
        emoji_list: Emojis passed to ``remove_all_emojis`` when removing.
        emoji_option: ``'translate'`` or ``'remove'``; any other value
            (including the default ``'keep'``) leaves emojis untouched.

    Returns:
        The cleaned sentence as a string.
    """
    stage = handle_reduplication(text)

    if emoji_option == 'remove':
        stage = remove_all_emojis(stage, emoji_list)
    elif emoji_option == 'translate':
        stage = translate_emojis(stage, emoji_dict)

    stage = separate_punctuation(remove_placeholders(stage))
    return clean_final_text(normalize_slang(stage, slang_dict))

def create_cleaned_eng_pipeline(text, emoji_dict, emoji_list, emoji_option='keep'):
    """Run the cleaning sequence for one English sentence.

    Steps, in order: emoji handling, placeholder removal, whitespace clean-up.

    Args:
        text: Raw sentence.
        emoji_dict: Lookup passed to ``translate_emojis`` when translating.
        emoji_list: Emojis passed to ``remove_all_emojis`` when removing.
        emoji_option: ``'translate'`` or ``'remove'``; any other value
            (including the default ``'keep'``) leaves emojis untouched.

    Returns:
        The cleaned sentence as a string.
    """
    if emoji_option == 'remove':
        text = remove_all_emojis(text, emoji_list)
    elif emoji_option == 'translate':
        text = translate_emojis(text, emoji_dict)

    without_placeholders = remove_placeholders(text)
    return clean_final_text(without_placeholders)

In [None]:
output_dir = "other output"
os.makedirs(output_dir, exist_ok=True)

print(f"\nStarting dataset generation... Output will be saved in '{output_dir}/'")


def _export_variant(step, source_column, transform, filename):
    """Build one processed variant of ``main_df`` and write it to ``output_dir``.

    Args:
        step: Sequence number shown in the progress message.
        source_column: Column of ``main_df`` holding the text to process.
        transform: Callable applied to each value of ``source_column``.
        filename: CSV file name created inside ``output_dir``.

    Returns:
        ``(working, final)``: the copy of ``main_df`` with the added
        'Processed_Sentence' column, and the three-column frame that was saved
        (source column renamed to 'Original_Sentence').
    """
    working = main_df.copy()
    working['Processed_Sentence'] = working[source_column].apply(transform)
    final = working[[source_column, 'Processed_Sentence', 'Sentiment']].rename(
        columns={source_column: 'Original_Sentence'}
    )
    final.to_csv(os.path.join(output_dir, filename), index=False)
    print(f"{step}. Generated: {filename}")
    return working, final


# --- INDONESIAN DATASETS ---
# 1. Cleaned (with emoji)
df1, df1_final = _export_variant(
    1, 'Sentence',
    lambda t: create_cleaned_indo_pipeline(t, slang_dict, emoji_dict_indo, emoji_list_combined, emoji_option='keep'),
    "indo_cleaned_with_emoji.csv",
)

# 2. Cleaned (no emoji)
df2, df2_final = _export_variant(
    2, 'Sentence',
    lambda t: create_cleaned_indo_pipeline(t, slang_dict, emoji_dict_indo, emoji_list_combined, emoji_option='remove'),
    "indo_cleaned_no_emoji.csv",
)

# 3. Cleaned (kata_emoji)
df3, df3_final = _export_variant(
    3, 'Sentence',
    lambda t: create_cleaned_indo_pipeline(t, slang_dict, emoji_dict_indo, emoji_list_combined, emoji_option='translate'),
    "indo_cleaned_kata_emoji.csv",
)

# 4. Raw (with emoji): only placeholders are stripped.
df4, df4_final = _export_variant(
    4, 'Sentence',
    remove_placeholders,
    "indo_raw_with_emoji.csv",
)

# 5. Raw (no emoji): strip placeholders, then delete emojis.
df5, df5_final = _export_variant(
    5, 'Sentence',
    lambda t: remove_all_emojis(remove_placeholders(t), emoji_list_combined),
    "indo_raw_no_emoji.csv",
)

# 6. Raw (kata_emoji): strip placeholders, then translate emojis.
df6, df6_final = _export_variant(
    6, 'Sentence',
    lambda t: translate_emojis(remove_placeholders(t), emoji_dict_indo),
    "indo_raw_kata_emoji.csv",
)

# --- ENGLISH DATASETS ---
# 7. Cleaned English (with emoji)
df7, df7_final = _export_variant(
    7, 'English Translation',
    lambda t: create_cleaned_eng_pipeline(t, emoji_dict_inggris, emoji_list_combined, emoji_option='keep'),
    "eng_cleaned_with_emoji.csv",
)

# 8. Cleaned English (no emoji)
df8, df8_final = _export_variant(
    8, 'English Translation',
    lambda t: create_cleaned_eng_pipeline(t, emoji_dict_inggris, emoji_list_combined, emoji_option='remove'),
    "eng_cleaned_no_emoji.csv",
)

# 9. Cleaned English (kata_emoji)
df9, df9_final = _export_variant(
    9, 'English Translation',
    lambda t: create_cleaned_eng_pipeline(t, emoji_dict_inggris, emoji_list_combined, emoji_option='translate'),
    "eng_cleaned_kata_emoji.csv",
)

In [None]:
# NOTE(review): this reads from "Data Output/", whereas the generation cell
# writes to output_dir = "other output" — confirm this is the intended copy.
p = pd.read_csv("Data Output/indo_raw_no_emoji.csv")
# Disable column-width truncation so full sentences are shown.
pd.set_option('display.max_colwidth', None)
# Show the row at position 457; the double brackets keep the result a DataFrame.
p.iloc[[457]]

Unnamed: 0,Original_Sentence,Processed_Sentence,Sentiment
457,Apakah memang sudah tidak tertolong 🤔 shm! $UNVR ??? [URL],Apakah memang sudah tidak tertolong shm! $UNVR ???,Negative
