In [17]:
# fast_near_duplicate_simhash_langdetect_clean.py

from simhash import Simhash, SimhashIndex
from collections import Counter
from langdetect import detect, DetectorFactory

# langdetect is non-deterministic by default; per its README, fixing the seed
# before the first detection makes detect() return reproducible results.
DetectorFactory.seed = 0

# Input corpus; main() reads it one line at a time.
INPUT_FILE = "D:/Document/AI Talent Factory/TIM 1/Data-All/Gabungan-All.txt"
# Output for lines detected as "id" that are not near-duplicates of an earlier line.
UNIQUE_FILE = "D:/Document/AI Talent Factory/TIM 1/Data-All/unique_Gabungan-All.txt"
# Output for rejected lines: near-duplicates and lines whose detected language is not "id".
DUP_FILE = "D:/Document/AI Talent Factory/TIM 1/Data-All/duplicate-Gabungan-All.txt"

# Passed as `k` to SimhashIndex in main(). Per the simhash package docs this is
# the tolerance, i.e. how many fingerprint bits may differ between near-duplicates.
SIMHASH_K = 3
# Upper bound applied to each token's frequency weight in build_simhash().
MAX_TOKEN_WEIGHT = 10


# --------------------------------------------------------
# CLEAN-UP: strip article-source bylines such as "Liputan6.com, Jakarta"
# --------------------------------------------------------
def clean_text(text: str) -> str:
    """Remove known news-source bylines from ``text`` and trim whitespace.

    Matching is exact and case-sensitive. Every occurrence of each marker is
    removed, wherever it appears in the string, and the markers are applied
    one after another in the order listed.

    Args:
        text: A single line of article text.

    Returns:
        The text without the listed markers, with leading and trailing
        whitespace stripped. May be the empty string.
    """
    source_markers = (
        "Liputan6.com, Jakarta",
        "Liputan6.com , Jakarta",
        "LIPUTAN6.COM, JAKARTA",
        "liputan6.com, jakarta",
        "liputan6.com , jakarta",
        "REPUBLIKA.CO.ID",
        "JAKARTA, KOMPAS.com-",
        "Bisnis.com, JAKARTA",
    )

    cleaned = text
    for marker in source_markers:
        cleaned = cleaned.replace(marker, "")

    return cleaned.strip()


def detect_language_safe(text: str) -> str:
    """Return the language code that ``detect`` reports for ``text``.

    Detection is best-effort: if ``detect`` raises any ordinary exception,
    the string ``"unknown"`` is returned instead of propagating the error.

    Args:
        text: The text to classify.

    Returns:
        The value returned by ``detect(text)``, or ``"unknown"`` on failure.
    """
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate; otherwise Ctrl+C
        # during a long run would be silently turned into "unknown".
        return "unknown"


def build_simhash(text: str) -> Simhash:
    """Build a Simhash from the weighted, lower-cased tokens of ``text``.

    The text is lower-cased and split on whitespace. Each distinct token
    becomes one ``(token, weight)`` feature, where the weight is the token's
    frequency capped at ``MAX_TOKEN_WEIGHT``.

    Args:
        text: The text to fingerprint.

    Returns:
        A ``Simhash`` constructed from the weighted token features.
    """
    frequencies = Counter(text.lower().split())
    weighted_features = [
        (token, min(count, MAX_TOKEN_WEIGHT))
        for token, count in frequencies.items()
    ]
    return Simhash(weighted_features)


def _route_line(text, lineno, index, fout_u, fout_d):
    """Write one cleaned, non-empty line to the proper output file.

    Args:
        text: The cleaned, non-empty line.
        lineno: 1-based line number in the input, used to build the index key.
        index: The SimhashIndex holding fingerprints of lines kept so far.
        fout_u: Open text file that receives unique lines.
        fout_d: Open text file that receives rejected lines.

    Returns:
        ``"foreign"`` if the detected language is not ``"id"``, ``"dup"`` if
        the index reports a near-duplicate, otherwise ``"unique"``.
    """
    # STEP 2 - language detection: anything not detected as "id" is
    # rejected and tagged with the detected language code.
    lang = detect_language_safe(text)
    if lang != "id":
        fout_d.write(f"[LANG={lang}] " + text + "\n\n")
        return "foreign"

    # STEP 3 - simhash: compare against every line kept so far.
    sh = build_simhash(text)
    if index.get_near_dups(sh):
        fout_d.write(text + "\n\n")
        return "dup"

    # Only lines that are kept are added to the index, so later lines are
    # compared against the retained lines only.
    index.add(f"l{lineno}", sh)
    fout_u.write(text + "\n\n")
    return "unique"


def main():
    """Split INPUT_FILE into unique lines and rejected lines.

    Each input line is stripped and cleaned with ``clean_text``; lines that
    are empty afterwards are ignored. Lines whose detected language is not
    ``"id"`` are written to DUP_FILE with a ``[LANG=..]`` prefix. Remaining
    lines go to DUP_FILE if the simhash index reports a near-duplicate,
    otherwise to UNIQUE_FILE. Every written entry is followed by a blank line.

    The DUPLICATES total counts every line written to DUP_FILE, so it
    includes the foreign-language lines that are also reported separately.
    """
    index = SimhashIndex([], k=SIMHASH_K)

    unique = 0
    dups = 0
    foreign = 0

    with open(INPUT_FILE, "r", encoding="utf-8") as fin, \
         open(UNIQUE_FILE, "w", encoding="utf-8") as fout_u, \
         open(DUP_FILE, "w", encoding="utf-8") as fout_d:

        for lineno, line in enumerate(fin, start=1):
            # STEP 1 - strip whitespace and article-source bylines.
            # clean_text("") returns "", so a single emptiness check covers
            # both blank lines and lines that consisted only of a byline.
            text = clean_text(line.strip())

            if text:
                outcome = _route_line(text, lineno, index, fout_u, fout_d)
                if outcome == "unique":
                    unique += 1
                else:
                    dups += 1
                    if outcome == "foreign":
                        foreign += 1

            # Evaluated for every input line, including blank and rejected
            # ones, so a progress milestone is never skipped.
            if lineno % 100000 == 0:
                print(f"Processed {lineno} lines... (unique={unique}, dups={dups}, foreign={foreign})")

    print("\n=== DONE ===")
    print("UNIQUE:", unique)
    print("DUPLICATES:", dups)
    print("FOREIGN LANGUAGE DETECTED:", foreign)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



=== DONE ===
UNIQUE: 4684
DUPLICATES: 166
FOREIGN LANGUAGE DETECTED: 33
