In [None]:
# === 1. Import Library ===
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import json
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# === 2. Load Dataset ===
file_path = '/content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/putusan_ma_ekstraksi.csv'
text_column = 'amar_lainnya'
label_column = 'klasifikasi'

# Read the CSV and keep only rows where both the text and the label are present.
df = pd.read_csv(file_path).dropna(subset=[text_column, label_column])

# Map the label values to integer ids; the fitted encoder is reused later to
# translate predictions back into label values.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df[label_column])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# === 3. TF-IDF Vectorizer ===
# Fit the vocabulary on the training texts only; the test texts are merely
# transformed so that no test information reaches the vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# === 4. Train TF-IDF Models ===
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Seed the solver with the same value used for the train/test split so that
# re-running the notebook reproduces the same SVM.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_vec, y_train)

# === 5. Evaluate TF-IDF Models ===
# Report per-class metrics and overall accuracy (in percent) on the hold-out
# set for each TF-IDF classifier.
tfidf_models = {'Naive Bayes': nb_model, 'SVM': svm_model}
for name, model in tfidf_models.items():
    y_pred = model.predict(X_test_vec)
    acc = 100 * accuracy_score(y_test, y_pred)

    print(f"\n=== Evaluasi {name} + TF-IDF ===")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Akurasi: {acc:.2f}%")

# === 6. Retrieval TF-IDF ===
def retrieve_tfidf(query: str, model, k=5):
    """Return the best-scoring class labels for ``query`` using TF-IDF features.

    Args:
        query: Raw query text; vectorised with the module-level ``vectorizer``.
        model: Fitted classifier exposing ``classes_`` and either
            ``predict_proba`` or ``decision_function``.
        k: Maximum number of labels to return in the multi-class case.

    Returns:
        A list of original label values (decoded with ``label_encoder``),
        best match first. When ``decision_function`` yields a single score
        per sample (binary model) exactly one label is returned, regardless
        of ``k``.
    """
    query_vec = vectorizer.transform([query])
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(query_vec)[0]
        top_k_idx = probs.argsort()[::-1][:k]
    else:
        scores = model.decision_function(query_vec)
        if len(scores.shape) == 1:
            # Binary model: a positive score selects the second class.
            top_k_idx = [int(scores[0] > 0)]
        else:
            top_k_idx = scores[0].argsort()[::-1][:k]

    # Score columns are ordered like ``model.classes_``; they are positions,
    # not encoded labels. The two only coincide when every encoded label was
    # present in the training split, so translate through ``classes_`` first.
    return [
        label_encoder.inverse_transform([model.classes_[i]])[0]
        for i in top_k_idx
    ]

# === 7. Load IndoBERT ===
# Tokenizer and encoder come from the same checkpoint; the encoder is moved to
# the GPU when one is available, otherwise it stays on the CPU.
BERT_CHECKPOINT = "indobenchmark/indobert-base-p1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model_bert = AutoModel.from_pretrained(BERT_CHECKPOINT).to(device)

# === 8. Embedding BERT ===
def get_bert_embedding(text_list):
    """Embed each text with ``model_bert`` and return the stacked vectors.

    Every text is tokenised on its own (truncated to 512 tokens), run through
    the encoder without gradient tracking, and represented by the hidden state
    at sequence position 0.

    Args:
        text_list: Iterable of strings to embed.

    Returns:
        ``np.ndarray`` built from one embedding vector per input text.
    """
    def _embed_one(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden_states = model_bert(**encoded).last_hidden_state
        # Take the first token's hidden state and drop the batch axis.
        return hidden_states[:, 0, :].squeeze().cpu().numpy()

    with torch.no_grad():
        vectors = [_embed_one(text) for text in text_list]
    return np.array(vectors)

# Embed both splits once; the resulting arrays feed the SVM below.
X_train_embed = get_bert_embedding(X_train.tolist())
X_test_embed = get_bert_embedding(X_test.tolist())

# Train BERT + SVM
# Standardise the embedding dimensions before the linear SVM. The SVM is
# seeded with the same value used for the train/test split so that re-running
# the notebook reproduces the same model.
svm_bert_model = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
svm_bert_model.fit(X_train_embed, y_train)

# Evaluate BERT
y_pred_bert = svm_bert_model.predict(X_test_embed)
acc_bert = accuracy_score(y_test, y_pred_bert) * 100

print("\n=== Evaluasi SVM + BERT ===")
print(classification_report(y_test, y_pred_bert, zero_division=0))
print(f"Akurasi: {acc_bert:.2f}%")

# === 9. Retrieval BERT ===
def retrieve_bert(query: str, model, k=5):
    """Return the best-scoring class labels for ``query`` using BERT features.

    Args:
        query: Raw query text; embedded with ``get_bert_embedding``.
        model: Fitted classifier (or pipeline) exposing ``classes_`` and
            ``decision_function``, trained on BERT embeddings.
        k: Maximum number of labels to return in the multi-class case.

    Returns:
        A list of original label values (decoded with ``label_encoder``),
        best match first. When ``decision_function`` yields a single score
        per sample (binary model) exactly one label is returned, regardless
        of ``k``.
    """
    query_vec = get_bert_embedding([query])
    scores = model.decision_function(query_vec)
    if len(scores.shape) == 1:
        # Binary model: a positive score selects the second class.
        top_k_idx = [int(scores[0] > 0)]
    else:
        top_k_idx = scores[0].argsort()[::-1][:k]

    # Score columns are ordered like ``model.classes_``; they are positions,
    # not encoded labels. The two only coincide when every encoded label was
    # present in the training split, so translate through ``classes_`` first.
    return [
        label_encoder.inverse_transform([model.classes_[i]])[0]
        for i in top_k_idx
    ]

# === 10. Evaluate Queries.json ===
# Read the evaluation queries. The encoding is stated explicitly so the file
# is decoded the same way regardless of the platform's default locale.
with open('/content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/queries.json', 'r', encoding='utf-8') as f:
    queries = json.load(f)

def evaluate_queries(queries, model, k=5, metode='Model'):
    """Print a per-query retrieval report and the aggregate score for ``model``.

    A query counts as a hit when its ``ground_truth`` label appears among the
    ``k`` labels returned by the retriever. The printed "Precision@k" is the
    fraction of queries that were hits.

    Args:
        queries: Sequence of dicts with the keys ``'query'`` and
            ``'ground_truth'``.
        model: Fitted classifier handed to the retrieval function.
        k: Number of labels requested per query.
        metode: Display name of the method. The exact value ``'BERT'`` routes
            queries through ``retrieve_bert``; any other value uses
            ``retrieve_tfidf``.

    Returns:
        None. Results are written to stdout.
    """
    total = len(queries)
    correct = 0
    print(f"\nEvaluasi: {metode}")
    for i, q in enumerate(queries):
        query_text = q['query']
        ground_truth = q['ground_truth']
        if metode == 'BERT':
            top_k_pred = retrieve_bert(query_text, model, k)
        else:
            top_k_pred = retrieve_tfidf(query_text, model, k)

        is_correct = ground_truth in top_k_pred
        print(f"\n[{i+1}] Query: {query_text}")
        print(f"Ground Truth: {ground_truth}")
        print(f"Top-{k} Prediction: {top_k_pred}")
        print("✅ MATCH" if is_correct else "❌ MISMATCH")
        if is_correct:
            correct += 1

    # An empty query list would otherwise raise ZeroDivisionError; report 0.
    precision_at_k = correct / total if total else 0.0
    print(f"\n🎯 Precision@{k}: {precision_at_k:.2f}")

# Run the same query evaluation for every trained model, in a fixed order.
for _clf, _metode in (
    (nb_model, 'TF-IDF + Naive Bayes'),
    (svm_model, 'TF-IDF + SVM'),
    (svm_bert_model, 'BERT'),
):
    evaluate_queries(queries, _clf, k=5, metode=_metode)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== Evaluasi Naive Bayes + TF-IDF ===
              precision    recall  f1-score   support

           3       1.00      1.00      1.00         7

    accuracy                           1.00         7
   macro avg       1.00      1.00      1.00         7
weighted avg       1.00      1.00      1.00         7

Akurasi: 100.00%

=== Evaluasi SVM + TF-IDF ===
              precision    recall  f1-score   support

           3       1.00      1.00      1.00         7

    accuracy                           1.00         7
   macro avg       1.00      1.00      1.00         7
weighted avg       1.00      1.00      1.00         7

Akurasi: 100.00%

=== Evaluasi SVM + BERT ===
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           3       1.00      0.86      0.92         7

    accuracy                           0.86         7
   macro avg       0.50      0.43      0.46         7
weighted avg       1.00      0.86      0.92      

In [None]:
import joblib
import os

# === 5a. Save TF-IDF Models ===
save_dir = '/content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/models'
os.makedirs(save_dir, exist_ok=True)

# File name -> fitted object; written in this order.
tfidf_artifacts = {
    'vectorizer_tfidf.pkl': vectorizer,
    'naive_bayes_tfidf.pkl': nb_model,
    'svm_tfidf.pkl': svm_model,
    'label_encoder.pkl': label_encoder,
}
for filename, artifact in tfidf_artifacts.items():
    joblib.dump(artifact, f'{save_dir}/{filename}')

print(f"✅ TF-IDF models saved to {save_dir}")

# === After training BERT + SVM ===
joblib.dump(svm_bert_model, f'{save_dir}/svm_bert.pkl')
print(f"✅ BERT SVM model saved to {save_dir}")

✅ TF-IDF models saved to /content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/models
✅ BERT SVM model saved to /content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/models


### TAHAP 4

In [None]:
# Lookup from label value to decision text. A dict holds one value per key, so
# when a label occurs on several rows the text of the last such row wins.
case_solutions = {
    label: text
    for label, text in zip(df[label_column], df[text_column])
}

def majority_vote(solutions):
    """Return the most frequent item in ``solutions``.

    Ties are resolved in favour of the item that appears first.

    Args:
        solutions: Non-empty iterable of hashable items.

    Returns:
        The item with the highest count.

    Raises:
        IndexError: If ``solutions`` is empty.
    """
    # Imported locally: the notebook's import cell does not provide Counter,
    # so relying on a global name fails on a fresh kernel.
    from collections import Counter

    count = Counter(solutions)
    return count.most_common(1)[0][0]

def predict_outcome(query: str, k=5):
    """Predict a solution text for ``query`` from its top-``k`` retrieved labels.

    The query is ranked with the BERT-based retriever, each retrieved label is
    looked up in ``case_solutions``, and the most common solution is chosen.

    Args:
        query: Raw query text.
        k: Number of labels to retrieve.

    Returns:
        A ``(predicted_solution, top_k_labels)`` tuple. When none of the
        retrieved labels has an entry in ``case_solutions`` the first element
        is the placeholder message ``"Tidak ada solusi ditemukan"``.
    """
    # retrieve_bert feeds BERT embeddings to the classifier, so it must be
    # given the model trained on embeddings, not the TF-IDF-trained SVM.
    top_k = retrieve_bert(query, svm_bert_model, k)
    solutions = [case_solutions[c] for c in top_k if c in case_solutions]

    if not solutions:
        # Keep the two-element shape so callers can always unpack the result.
        return "Tidak ada solusi ditemukan", top_k

    predicted = majority_vote(solutions)
    return predicted, top_k

In [None]:
# Predict a solution for every query and collect one record per query.
results = []
for q in queries:
    query_text = q['query']
    pred_solusi, top_k_cases = predict_outcome(query_text, k=5)
    results.append(dict(
        # Fall back to the query text when no explicit id is given.
        query_id=q.get('query_id', query_text),
        predicted_solution=pred_solusi,
        top_5_case_ids=";".join(top_k_cases),
    ))

# Write the records to predictions.csv, creating the folder if needed.
output_dir = '/content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/results'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'predictions.csv')

results_df = pd.DataFrame(results)
results_df.to_csv(output_path, index=False)

print(f"\n✅ Hasil prediksi disimpan di: {output_path}")
results_df.head()


✅ Hasil prediksi disimpan di: /content/drive/MyDrive/Tugas/Penalaran Komputer/Tugas UAS/results/predictions.csv


Unnamed: 0,query_id,predicted_solution,top_5_case_ids
0,pembunuhan berencana oleh suami terhadap istri,PIDANA PENJARA WAKTU TERTENTU,Pidana Umum \n Pidana Umum Pembunuhan;Perdata...
1,penipuan jual beli tanah di Jakarta,PIDANA PENJARA WAKTU TERTENTU,Pidana Umum \n Pidana Umum Pembunuhan;Pidana ...
2,tindak pidana korupsi dana desa,PIDANA PENJARA WAKTU TERTENTU,Pidana Umum \n Pidana Umum Pembunuhan;Pidana ...
3,kasus pencurian dengan kekerasan di rumah toko,PIDANA PENJARA WAKTU TERTENTU,Pidana Umum \n Pidana Umum Pembunuhan;Pidana ...
4,pemalsuan dokumen untuk pinjaman bank,PIDANA PENJARA WAKTU TERTENTU,Pidana Umum \n Pidana Umum Pembunuhan;Pidana ...
