In [47]:
# path: scripts/clean_csv.py
import os
import re
from pathlib import Path
from typing import Optional, List

import pandas as pd


# ===================== Konfigurasi =====================
# Gunakan forward slashes agar portable lintas OS
INPUT_CSV_PATH = "rawdata.csv"
OUTPUT_SUFFIX = "_clean"
DELIMITER_OUT = ","
ENCODING_OUT = "utf-8"
PRIMARY_KEY_EXACT = ["Kalimat", "Judul", "Penulis"]


In [48]:
# ===================== Util =====================
def fix_mojibake(text: Optional[str]) -> Optional[str]:
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return None
    s = str(text)
    repl = {
        "â€œ": '"', "â€": '"', "â€˜": "'", "â€™": "'",
        "â€“": "-", "â€”": "-", "â€¢": "*", "â€¦": "...",
        "Â ": " ", "Â": "", "Ã—": "×",
    }
    for bad, good in repl.items():
        s = s.replace(bad, good)
    s = s.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
    s = re.sub(r"\s+", " ", s).strip()
    return s

_PUNCT_RE  = re.compile(r"""[!"#$%&()*+,\-./:;<=>?@\[\]\\\]^_`{|}~]""")
_SPACES_RE = re.compile(r"\s+")

def kalimat_key(text: Optional[str]) -> Optional[str]:
    if text is None or pd.isna(text):
        return None
    s = fix_mojibake(text).lower()
    s = _PUNCT_RE.sub(" ", s)
    s = _SPACES_RE.sub(" ", s).strip()
    return s


In [49]:
# ===================== Path Resolver =====================
def resolve_input_path(rel_path: str, depth: int = 5) -> Path:
    """
    Cari file berdasarkan relative path dari CWD dan beberapa parentnya.
    Fallback: glob mencari path yang berakhir dengan rel_path.
    """
    tried: List[Path] = []
    norm_rel = Path(os.path.normpath(rel_path))

    # 1) Coba apa adanya relatif ke masing-masing parent CWD
    cwd = Path.cwd()
    for base in [cwd, *list(cwd.parents)[:depth]]:
        cand = (base / norm_rel).resolve()
        tried.append(cand)
        if cand.exists():
            return cand

    # 2) Jika masih gagal, glob di beberapa parent (dibatasi agar aman)
    pattern_name = norm_rel.name  # e.g., rawdata.csv
    for base in [cwd, *list(cwd.parents)[:depth]]:
        for hit in base.rglob(pattern_name):
            try_path = hit.resolve()
            # Kenapa: pastikan struktur foldernya cocok di akhir
            if tuple(try_path.parts[-len(norm_rel.parts):]) == tuple(norm_rel.parts):
                return try_path

    # 3) Gagal total -> error dengan info debug
    msg = [
        f"Tidak menemukan file: '{rel_path}'",
        f"CWD: {cwd}",
        "Jalur yang dicoba:"
    ] + [f" - {p}" for p in tried]
    raise FileNotFoundError("\n".join(msg))

In [50]:
# ===================== Parser PRODI/FAKULTAS =====================
def parse_prodi_fakultas(value: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """
    Ekstrak (jurusan, fakultas) dari pola:
      'PRODI_<jurusan>FAKULTAS_<fakultas>'  (underscore sebagai spasi)
    Toleran terhadap spasi/underscore/case.
    """
    if value is None or pd.isna(value):
        return (pd.NA, pd.NA)
    v = fix_mojibake(str(value))
    if not v:
        return (pd.NA, pd.NA)

    # Normalisasi: hilangkan koma di ujung, spasi → underscore untuk memudahkan regex
    v = v.strip().rstrip(",")
    v_us = re.sub(r"\s+", "_", v)

    # Ambil bagian setelah PRODI_ dan sebelum FAKULTAS (jika ada)
    m = re.search(r"(?i)prodi[_:\-\s]*([A-Z0-9_ ]+?)(?:_?fakultas\b|$)", v_us)
    jur = m.group(1) if m else None

    # Ambil fakultas jika ada
    m2 = re.search(r"(?i)fakultas[_:\-\s]*([A-Z0-9_ ]+)$", v_us)
    fak = m2.group(1) if m2 else None

    def clean(tok: Optional[str]) -> Optional[str]:
        if not tok:
            return pd.NA
        t = tok.strip(" _-")
        t = re.sub(r"_+", " ", t)
        t = re.sub(r"\s+", " ", t).strip()
        return t if t else pd.NA

    return (clean(jur), clean(fak))

# ===================== Proper-Case (ID) =====================
SMALL_WORDS = {
    "dan","di","ke","yang","dari","pada","untuk","oleh","atau","dengan",
    "serta","dalam","tentang","terhadap","para","para","antara","sebagai",
    "pada","yang","dengan","atau","tiap","setiap","dalam"
}
AKRONIM_KEEP = {"FKIP","FK","FEB","FISIP","FIB","FMIPA","FT","FIP","FIK","S1","S2","S3","SD","SMP","SMA","UMKM","AI","IT","UN","UK","USA"}
ROMAN = {"I","II","III","IV","V","VI","VII","VIII","IX","X"}

def proper_case_id(text: Optional[str]) -> Optional[str]:
    """Title Case ala ID; jaga akronim/roman numerals. Simpel tapi cukup untuk kasus ini."""
    if text is None or pd.isna(text):
        return text
    s = fix_mojibake(str(text))
    if not s:
        return s

    # ganti underscore -> spasi
    s = re.sub(r"_+", " ", s)
    words = re.split(r"(\s+)", s)  # keep separators
    out = []
    pos = 0
    for w in words:
        pos += 1
        if w.isspace():
            out.append(w)
            continue
        raw = w

        # jika semua huruf & panjang>1 dan ada di whitelist akronim/roman
        u = raw.upper()
        if u in AKRONIM_KEEP or u in ROMAN:
            out.append(u)
            continue

        # Jika 2+ huruf kapital berturut (mungkin akronim tak dikenal) -> biarkan
        if re.fullmatch(r"[A-Z]{2,}\.?$", raw):
            out.append(raw)
            continue

        # Default title-case
        lower = raw.lower()
        if lower in SMALL_WORDS and len(out) > 0:
            out.append(lower)
        else:
            out.append(lower.capitalize())

    # Post-fix beberapa tanda baca khas ID
    res = "".join(out)
    res = re.sub(r"\s+([,:;.?!)])", r"\1", res)
    res = re.sub(r"([(])\s+", r"\1", res)
    return res.strip()

In [51]:
# ===================== Main =====================
def main() -> None:
    src = Path(INPUT_CSV_PATH)
    if not src.exists():
        raise FileNotFoundError(f"File input tidak ditemukan: {src}")

    # Load
    df = pd.read_csv(src, encoding=ENCODING_IN)

    # Header → Title Case konsisten
    df.columns = [str(c).strip().title() for c in df.columns]

    # Pastikan kolom utama
    for c in ("No", "Kalimat", "Judul", "Penulis"):
        if c not in df.columns:
            df[c] = pd.NA

    # ========== Derivasi Jurusan ==========
    # Sumber utama: kolom Prodi jika terisi; jika kosong, parse dari Fakultas (gabungan)
    jur_from_prodi = df["Prodi"].map(lambda x: fix_mojibake(x) if "Prodi" in df.columns else pd.NA) if "Prodi" in df.columns else pd.Series(pd.NA, index=df.index)
    jur_from_prodi = jur_from_prodi.where(jur_from_prodi.fillna("").str.strip().ne(""), other=pd.NA)

    if "Fakultas" in df.columns:
        parsed = df["Fakultas"].apply(parse_prodi_fakultas)
        jur_parsed = parsed.map(lambda t: t[0])
        fak_parsed = parsed.map(lambda t: t[1])
    else:
        jur_parsed = pd.Series(pd.NA, index=df.index)
        fak_parsed = pd.Series(pd.NA, index=df.index)

    df["Jurusan"] = jur_from_prodi.combine_first(jur_parsed)

    # Optional: jika ingin simpan fakultas hasil parse (tak dipakai di output)
    if "Fakultas" in df.columns:
        df["FakultasNama"] = fak_parsed

    # ========== Bersih teks & Proper Case ==========
    for c in ["Kalimat", "Judul", "Penulis", "Jurusan"]:
        df[c] = df[c].map(fix_mojibake)

    # proper-case; untuk Kalimat biarkan apa adanya (jangan title-case)
    df["Judul"] = df["Judul"].map(proper_case_id)
    df["Penulis"] = df["Penulis"].map(proper_case_id)
    df["Jurusan"] = df["Jurusan"].map(proper_case_id)

    # Buang baris tanpa Kalimat
    df = df[df["Kalimat"].fillna("").astype(str).str.strip().ne("")].copy()

    # Dedup 1: exact
    before = len(df)
    df = df.drop_duplicates(subset=PRIMARY_KEY_EXACT, keep="first")
    d1 = before - len(df)

    # Dedup 2: normalized Kalimat
    df["Kalimatkey"] = df["Kalimat"].map(kalimat_key)
    before2 = len(df)
    df = df.drop_duplicates(subset=["Kalimatkey", "Judul", "Penulis"], keep="first")
    d2 = before2 - len(df)

    # Simpan
    final_cols = ["Kalimat", "Judul", "Penulis", "Jurusan"]
    df_final = df[final_cols].reset_index(drop=True)

    out_path = src.with_name(src.stem + OUTPUT_SUFFIX + ".csv")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_final.to_csv(
        out_path, index=False, encoding=ENCODING_OUT,
        sep=DELIMITER_OUT, lineterminator="\n", na_rep=""
    )

    print(f"Input : {src}")
    print(f"Output: {out_path}")
    print(f"Dropped exact dupes     : {d1}")
    print(f"Dropped kalimat-key dupes: {d2}")
    print("\nPreview:")
    try:
        from IPython.display import display
        display(df_final.head(12))
    except Exception:
        print(df_final.head(12).to_string(index=False))

if __name__ == "__main__":
    main()

Input : rawdata.csv
Output: rawdata_clean.csv
Dropped exact dupes     : 419
Dropped kalimat-key dupes: 20

Preview:


Unnamed: 0,Kalimat,Judul,Penulis,Jurusan
0,"ABSTRAK Putri, Prasasti Lishernanda.",SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
1,2023.,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
2,"""Variasi Bahasa pada Lanskap di Sekitar Candi ...",SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
3,Program Studi Pendidikan Bahasa dan Sastra Ind...,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
4,Fakultas Keguruan dan Ilmu Pendidikan.,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
5,Universitas Tidar.,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
6,Pembimbing I Dr.,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
7,"Astuty, M.Pd., Pembimbing II Herpindo, S.Pd., ...",SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
8,"Kata Kunci: Candi Mendut, fungsi lanskap lingu...",SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
9,Adanya variasi bahasa tidak menghilangkan fung...,SKRIPSI VARIASI BAHASA PADA LANSKAP DI SEKITAR...,PRASASTI LISHERNANDA PUTRI,S1 PENDIDIKAN BAHASA DAN SASTRA INDONESIAFAKUL...
