# Data Cleaning

## Import Libraries

In [241]:
import re
import emoji
import nltk
import pandas as pd
import contractions
from tqdm import tqdm
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from collections import Counter, defaultdict
tqdm.pandas()

# nltk.download("all")

## Import Data

In [242]:
# Load the raw dataset from CSV; its 'text' column is cleaned further down.
data = pd.read_csv('Data/RawData.csv')

## Drop Duplicates

In [243]:
# Remove fully duplicated rows and renumber the index from 0.
data = data.drop_duplicates(ignore_index=True)

## Stopwords

In [244]:
# Base stopwords: every whitespace-separated entry of the stopword file.
with open("AddData/IndoStopwords.txt", encoding="utf-8") as f:
    raw_stopwords = f.read()
BASE_STOPWORDS = set(raw_stopwords.split())

# Hand-picked additions on top of the base list.
# NOTE(review): "orang tua" contains a space, so it can only match a token that
# is itself a multi-word string (presumably a slang replacement) — confirm.
EXTRA_STOPWORDS = {
    "eh", "dn", "an", "yang", "suami", "anak", "rumah", "ya", "ga", "aja",
    "gak", "orang", "ku", "bun", "kerja", "istri", "uang", "kasih", "kalau",
    "orang tua",
}

# Union of both sources; this is the set consulted when filtering tokens.
ALL_STOPWORDS = BASE_STOPWORDS.union(EXTRA_STOPWORDS)

def stop_word(word: str):
    """Filter a single token against the stopword set.

    Args:
        word: Token to check.

    Returns:
        The token unchanged when it is not in ``ALL_STOPWORDS``; ``None`` when
        it is a stopword.
    """
    return None if word in ALL_STOPWORDS else word

## Stemming

In [245]:
# Load the dictionary: the first whitespace-separated field of every line,
# lowercased, is kept as a known word. The encoding is given explicitly so the
# result does not depend on the platform default (the stopword file is read
# the same way).
with open('AddData/IndoDict.txt', encoding="utf-8") as f:
    kamus_clean = [
        fields[0]
        for fields in (line.lower().split() for line in f)
        if fields  # a blank line has no fields; indexing it would raise IndexError
    ]

# Every dictionary word seen by kamus_word() is appended here, in call order.
akar_kata = []


def kamus_word(word):
    """Check a token against the dictionary and record it when found.

    Args:
        word: Token to look up in ``kamus_clean``.

    Returns:
        ``None`` when the token is a dictionary word (it is appended to
        ``akar_kata`` as a side effect); otherwise the token unchanged.
    """
    if word not in kamus_clean:
        return word

    akar_kata.append(word)
    return None

In [246]:
def hapus_infleksional_suffiks(word):
    """Strip one inflectional suffix from a token.

    The particles ``lah``, ``kah``, ``nya``, ``tah``, ``pun`` are tried first,
    then the possessives ``ku`` and ``mu``. Only the first matching suffix is
    removed.

    Args:
        word: Token to process.

    Returns:
        The token without its suffix, or the token unchanged when no suffix
        matches. The result can be an empty string when the token consists of
        the suffix only.

    Side effects:
        A stripped stem found in ``kamus_clean`` is appended to ``akar_kata``.
    """
    # (suffixes, number of trailing characters to cut)
    rules = (
        (('lah', 'kah', 'nya', 'tah', 'pun'), 3),
        (('ku', 'mu'), 2),
    )

    for suffixes, cut_len in rules:
        if word.endswith(suffixes):
            stem = word[:-cut_len]
            if stem in kamus_clean:
                akar_kata.append(stem)
            # Return the stem even when it is a dictionary word: returning
            # None for that case makes callers that skip falsy results throw
            # away valid root words (e.g. "bukunya" -> "buku").
            return stem

    return word

In [247]:
def hapus_derivation_suffiks(word):
    """Strip a derivational suffix when that yields a dictionary word.

    Args:
        word: Token to process.

    Returns:
        The stem, if removing ``kan``, ``an`` or ``i`` leaves a stem of at
        least three characters that is in ``kamus_clean``; otherwise the
        token unchanged. Dictionary words are returned untouched.
    """
    if word not in kamus_clean:
        # Order matters: try 'kan' first, then 'an', then 'i'.
        for suffix in ('kan', 'an', 'i'):
            if not word.endswith(suffix):
                continue

            candidate = word[:len(word) - len(suffix)]

            # Reject stems that are too short, and accept a stem only when
            # the dictionary knows it.
            if len(candidate) >= 3 and candidate in kamus_clean:
                return candidate

    # No rule produced a dictionary word: give back the original token.
    return word

In [248]:
def hapus_derivation_prefiks(word):
    """Strip a derivational prefix when that yields a dictionary word.

    For each prefix rule that matches, the prefix is cut, the remainder is
    passed through ``hapus_derivation_suffiks``, and the result is accepted if
    it is in ``kamus_clean`` — either as-is or with the rule's consonant put
    back in front of it.

    Args:
        word: Token to process.

    Returns:
        The dictionary word found by the first successful rule, or the token
        unchanged when it is already a dictionary word or no rule succeeds.
    """
    # A token that is already a dictionary word must not lose its prefix.
    if word in kamus_clean:
        return word

    # (prefixes, characters to cut, consonant to restore or None).
    # Rules are tried in this order; longer prefixes are listed first.
    prefix_table = (
        (('mempel',),                  6, None),
        (('memper',),                  6, None),
        (('diper', 'keber', 'keter'),  5, None),
        (('meng', 'peng'),             4, 'k'),  # meng-/peng- -> restore k
        (('meny', 'peny'),             4, 's'),  # meny-/peny- -> restore s
        (('mel', 'mer', 'pel', 'per'), 3, None),
        (('men', 'pen'),               3, 't'),  # men-/pen- -> restore t
        (('mem', 'pem'),               3, 'p'),  # mem-/pem- -> restore p
        (('bel', 'ber', 'tel', 'ter'), 3, None),
        (('di', 'ke', 'se'),           2, None),
        (('be', 'te'),                 2, None),
        (('me', 'pe'),                 2, None),
    )

    def lookup(remainder, consonant):
        # Remove a derivational suffix first, then look the result up directly
        # and, if requested, with the restored consonant in front.
        base = hapus_derivation_suffiks(remainder)
        if base in kamus_clean:
            return base
        if consonant is not None and consonant + base in kamus_clean:
            return consonant + base
        return None

    for prefixes, cut_len, consonant in prefix_table:
        # The token must be longer than the prefix being removed.
        if len(word) <= cut_len or not word.startswith(prefixes):
            continue
        root = lookup(word[cut_len:], consonant)
        if root is not None:
            return root

    # No rule produced a dictionary word: give back the original token.
    return word


## Cleaning Pipeline

In [249]:
# Set copy of the dictionary word list, for membership tests.
kamus_set = set(kamus_clean)

# final token -> Counter of the original tokens that ended up as that token;
# filled by clean() for final tokens that are not dictionary words.
UNKNOWN_MAP = defaultdict(Counter)

# Replacement table read from CSV: "original_word" -> "replace".
# Missing replacements become empty strings.
key_df = pd.read_csv("AddData/IndoSlangMap.csv")
replacements = key_df["replace"].fillna("")
key_df["replace"] = replacements.astype(str)
REPLACE_MAP = {
    original: replacement
    for original, replacement in zip(key_df["original_word"], key_df["replace"])
}

def clean(text: str) -> str:
    """Normalize a raw text and reduce its tokens towards dictionary words.

    The text is lowercased, hyphenated reduplication is collapsed
    ("gara-gara" -> "gara"), a "2" directly after a letter is dropped
    ("gara2" -> "gara"), every run of characters other than a-z, 0-9 and
    whitespace becomes a space, and digits are separated from letters.
    Each token is then mapped through ``REPLACE_MAP`` and filtered by
    ``stop_word``. Dictionary words are kept as they are; other tokens go
    through inflectional-suffix, derivational-suffix and derivational-prefix
    removal, in that order.

    Args:
        text: Raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty input.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when nothing
        is left.

    Side effects:
        Every final token that is not in ``kamus_set`` is counted in
        ``UNKNOWN_MAP[final_token][original_token]``.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and leak into the cleaned text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # Collapse reduplication written as "gara-gara" or "gara2".
    text = re.sub(r"\b([a-z]+)-\1\b", r"\1", text)
    text = re.sub(r"([a-zA-Z])2", r"\1", text)

    # Replace non-alphanumeric characters with a space.
    text = re.sub(r"[^a-z0-9\s]+", " ", text)

    # Separate digits from letters.
    text = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", text)
    text = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", text)

    # Normalize whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return ""

    tokens = []

    for w in text.split():
        w = REPLACE_MAP.get(w, w)

        # 1) Drop stopwords (and empty replacements).
        w = stop_word(w)
        if not w:
            continue

        original = w

        # 2) Dictionary words are used as they are. The lookup uses the set,
        #    like the unknown-word check below, instead of scanning the list.
        if w in kamus_set:
            tokens.append(w)
            continue

        # 3) Remove an inflectional suffix.
        w = hapus_infleksional_suffiks(w)
        if not w:
            continue

        # 4) Remove a derivational suffix.
        w = hapus_derivation_suffiks(w)
        if not w:
            continue

        # 5) Remove a derivational prefix.
        w = hapus_derivation_prefiks(w)
        if not w:
            continue

        # Log tokens that still are not dictionary words.
        if w not in kamus_set:
            UNKNOWN_MAP[w][original] += 1

        tokens.append(w)

    return " ".join(tokens)

In [250]:
# Run clean() on every row's text; progress_apply is the tqdm variant of
# apply and prints a progress bar.
data['text'] = data['text'].progress_apply(clean)
# Drop rows that contain missing values and renumber the index.
# NOTE(review): clean() returns "" (not NaN) when no token survives, so
# dropna() keeps rows whose cleaned text is empty — confirm that is intended.
data = data.dropna().reset_index(drop=True)
# Write the cleaned dataset without the index column.
data.to_csv("Data/CleanData.csv", index=False)

100%|██████████| 2163/2163 [01:00<00:00, 35.90it/s]


In [251]:
# Flatten UNKNOWN_MAP into one row per (original token, final token) pair.
# "pair_count" is how often that pair occurred; "final_total" is the total
# count over all original tokens that ended up as the same final token.
UNKNOWN_COLUMNS = ["original_word", "final_word", "pair_count", "final_total"]

rows = []
for final_word, counter in UNKNOWN_MAP.items():
    final_total = sum(counter.values())
    rows.extend(
        {
            "original_word": original_word,
            "final_word": final_word,
            "pair_count": count,
            "final_total": final_total,
        }
        for original_word, count in counter.most_common()
    )

# Naming the columns explicitly keeps the frame sortable when there are no
# rows at all; a column-less empty frame would make sort_values raise KeyError.
df_unknown_map = (
    pd.DataFrame(rows, columns=UNKNOWN_COLUMNS)
    .sort_values(["final_total", "pair_count"], ascending=False)
    .reset_index(drop=True)
)

# Most frequent unresolved final tokens come first in the exported file.
df_unknown_map.to_csv("random_words_mapping.csv", index=False)