In [None]:
# Install rapidfuzz into the running kernel's environment (%pip, not !pip) and
# pin the version so a fresh kernel reproduces the same dependency.
%pip install -q rapidfuzz==3.14.3


Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m2.1/3.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m47.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Remote CSV holding the corpus; only its 'tweet' and 'label' columns are used.
DATA_URL = "https://raw.githubusercontent.com/9uz/IDNHoaxCorpus/main/dataset/datasetUMPOHoax.csv"

print("Downloading dataset...")
raw_df = pd.read_csv(DATA_URL)
print("Dataset loaded:", raw_df.shape)

# Keep the two columns used downstream, drop rows missing either of them and
# rename 'tweet' -> 'text'. A chained expression builds a new frame instead of
# mutating one in place.
df = (
    raw_df[['tweet', 'label']]
    .dropna()
    .rename(columns={'tweet': 'text'})
)
print(f"Rows dropped for missing tweet/label: {len(raw_df) - len(df)}")

# Normalise label spelling (case / surrounding whitespace) before mapping.
df['label'] = df['label'].astype(str).str.lower().str.strip()

# Accepted label spellings -> binary target (1 = hoax, 0 = valid).
mapping = {
    "hoax": 1,
    "valid": 0,
    "0": 0,
    "1": 1,
}

mapped_labels = df['label'].map(mapping)

# Labels that are not keys of `mapping` become NaN and are discarded below.
# Report them so the loss of rows is visible instead of silent.
unmapped_labels = df.loc[mapped_labels.isna(), 'label']
if not unmapped_labels.empty:
    print(f"Rows dropped for unrecognised labels: {len(unmapped_labels)}")
    print(unmapped_labels.value_counts())

# After this step every remaining label is exactly 0 or 1, so no further
# filtering on the label values is required.
df = df.assign(label=mapped_labels).dropna(subset=['label'])
df['label'] = df['label'].astype(int)

print(df['label'].value_counts())

# Indonesian stopword list from NLTK; read by clean_text() as `stopword_set`.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('indonesian'))

def clean_text(text):
    """Normalise a piece of text for vectorisation and fuzzy matching.

    Steps, in order: lowercase the text, remove every ``http...`` token,
    delete ASCII punctuation characters, split on whitespace, discard tokens
    found in the module-level ``stopword_set`` and re-join the rest with
    single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (may be empty if no token survives).
    """
    without_links = re.sub(r"http\S+", "", text.lower())
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    without_punct = without_links.translate(
        str.maketrans("", "", string.punctuation)
    )
    return " ".join(
        token for token in without_punct.split() if token not in stopword_set
    )

# Experiment settings, named once so the split and the model share one seed
# and every tunable value is visible in a single place.
SEED = 42
TEST_SIZE = 0.20          # 80% train / 20% test
MAX_FEATURES = 50000      # cap on the TF-IDF vocabulary size
NGRAM_RANGE = (1, 2)      # unigrams and bigrams
N_TREES = 300             # number of trees in the random forest

print("Cleaning text...")
df['clean_text'] = df['text'].apply(clean_text)

# Features are the cleaned text, target is the binary label.
X = df['clean_text']
y = df['label']

# Stratify on the label so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)
assert len(X_train) + len(X_test) == len(df), "split lost or duplicated rows"

print("Vectorizing text...")
# The vocabulary and IDF weights are fitted on the training split only; the
# test split is transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the model.
print("Training Random Forest...")
model = RandomForestClassifier(
    n_estimators=N_TREES,
    class_weight="balanced",
    n_jobs=-1,
    random_state=SEED
)
model.fit(X_train_vec, y_train)

# Evaluate on the held-out test split.
print("\nEvaluating model...")
pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))



from rapidfuzz import fuzz, process
# Minimum fuzzy-match score at which an input headline is treated as matching
# an entry of the dataset; below it the trained model decides.
FUZZY_THRESHOLD = 70

def classify_input(input_judul):
    """Classify a news headline as HOAX or VALID.

    The headline is cleaned with ``clean_text`` and compared against every
    cleaned text in ``df`` using ``fuzz.partial_ratio``. If the best match
    scores at least ``FUZZY_THRESHOLD``, the label stored for that dataset row
    is returned; otherwise the headline is vectorised with ``vectorizer`` and
    classified by ``model``.

    Args:
        input_judul: The raw headline string entered by the user.

    Returns:
        A human-readable result string:
        ``"HOAX (mirip dataset, similarity NN.NN%)"`` or
        ``"VALID (mirip dataset, similarity NN.NN%)"`` for a dataset match,
        ``"HOAX (prediksi model)"`` or ``"VALID (prediksi model)"`` for a model
        prediction, and
        ``"BELUM TERDAFTAR (tidak mirip & model tidak yakin)"`` when nothing is
        left of the input after cleaning.
    """
    cleaned = clean_text(input_judul)

    # An input made only of stopwords/punctuation/links cleans to "". There is
    # nothing to match or vectorise, so neither path below can give a
    # meaningful answer.
    if not cleaned:
        return "BELUM TERDAFTAR (tidak mirip & model tidak yakin)"

    # A plain list makes the returned key a positional index, which is what
    # the .iloc lookup below needs.
    choices = df['clean_text'].tolist()

    # score_cutoff makes extractOne return None when no choice reaches the
    # threshold (or when there are no choices), so the result is never
    # unpacked blindly.
    # NOTE(review): partial_ratio scores substring-style matches, so a very
    # short query may score highly against many rows - confirm this is the
    # intended scorer.
    match = process.extractOne(
        cleaned,
        choices,
        scorer=fuzz.partial_ratio,
        score_cutoff=FUZZY_THRESHOLD,
    )

    # === Similar to a dataset entry: reuse that entry's label ===
    if match is not None:
        _, score, index = match
        verdict = "HOAX" if int(df['label'].iloc[index]) == 1 else "VALID"
        return f"{verdict} (mirip dataset, similarity {score:.2f}%)"

    # === Not similar to anything in the dataset: use the model ===
    vec = vectorizer.transform([cleaned])
    verdict = "HOAX" if int(model.predict(vec)[0]) == 1 else "VALID"
    return f"{verdict} (prediksi model)"


# Interactive prompt: classify headlines typed by the user until 'exit'.
print("\n=== MODE CEK JUDUL BERITA ===")
print("Ketik 'exit' untuk keluar.\n")

while True:
    try:
        # Strip so that " exit " and accidental surrounding spaces are handled.
        user_judul = input("Masukkan judul berita: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: leave the loop cleanly
        # instead of ending with a traceback.
        print("\nProgram selesai.")
        break

    if user_judul.lower() == "exit":
        print("Program selesai.")
        break

    # Nothing typed: ask again rather than classifying an empty headline.
    if not user_judul:
        continue

    hasil = classify_input(user_judul)
    print("Hasil:", hasil, "\n")


Downloading dataset...
Dataset loaded: (4617, 6)
label
1    3041
0     716
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Cleaning text...
Vectorizing text...
Training Random Forest...

Evaluating model...
Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.22      0.33       143
           1       0.84      0.98      0.90       609

    accuracy                           0.83       752
   macro avg       0.77      0.60      0.62       752
weighted avg       0.81      0.83      0.80       752


Confusion Matrix:
 [[ 31 112]
 [ 14 595]]

=== MODE CEK JUDUL BERITA ===
Ketik 'exit' untuk keluar.

