# Spellcheck LLM dengan Ollama di Google Colab

Notebook ini melakukan koreksi ejaan pada kalimat Bahasa Indonesia menggunakan LLM (Ollama) dan menghitung akurasi hasil koreksi.

In [None]:
# --- SETUP: Install dependencies dan Ollama ---
!pip install requests Levenshtein matplotlib numpy
!curl -fsSL https://ollama.com/install.sh | sh
!ollama serve &
!ollama pull llama3

## 1. Dataset & Utilitas Data

In [None]:
# SpellingDataset dan contoh dataset
class SpellingDataset:
    """Two parallel lists: misspelled sentences and their corrected forms."""

    def __init__(self, error_sentences=None, correct_sentences=None):
        """Keep the given lists; a missing or empty argument becomes a new list."""
        self.error_sentences = error_sentences if error_sentences else []
        self.correct_sentences = correct_sentences if correct_sentences else []

    def add_sample(self, error_sentence, correct_sentence):
        """Append one (misspelled, corrected) pair to the dataset."""
        self.correct_sentences.append(correct_sentence)
        self.error_sentences.append(error_sentence)

    def get_all_samples(self):
        """Return the samples as a list of (misspelled, corrected) tuples."""
        pairs = zip(self.error_sentences, self.correct_sentences)
        return [*pairs]

    def __len__(self):
        """Return the number of misspelled sentences stored."""
        stored = self.error_sentences
        return len(stored)

def get_sample_indonesian_dataset():
    """Build the demo dataset of ten Indonesian (misspelled, corrected) pairs."""
    samples = (
        ("Saya Belajaaar Apa", "Saya Belajar Apa"),
        ("Dia mkan nasi goreng kemren", "Dia makan nasi goreng kemarin"),
        ("Aku prgi ke sekola tiap pagi", "Aku pergi ke sekolah tiap pagi"),
        ("Ibuku maseh memaska di dpur", "Ibuku masih memasak di dapur"),
        ("Kmaren sya bermian bola dngan teman", "Kemarin saya bermain bola dengan teman"),
        ("Bapak sdeng membersiihkan moblnya", "Bapak sedang membersihkan mobilnya"),
        ("Adik sya menangiss karna jatuh", "Adik saya menangis karena jatuh"),
        ("Kuching itu berlrai cepet sekali", "Kucing itu berlari cepat sekali"),
        ("Kakak membelii bukuu itu kemarn", "Kakak membeli buku itu kemarin"),
        ("Kami perrgi ke pantay waktu liburran", "Kami pergi ke pantai waktu liburan"),
    )
    result = SpellingDataset()
    # Insert in the listed order so indices line up between the two lists.
    for misspelled, corrected in samples:
        result.add_sample(misspelled, corrected)
    return result

# Build the sample dataset and expose its two parallel sentence lists
dataset = get_sample_indonesian_dataset()
kalimat_salah, kalimat_benar = dataset.error_sentences, dataset.correct_sentences
print(f"Jumlah data: {len(dataset)}")

## 2. Client Ollama untuk Koreksi Ejaan

In [None]:
import requests

class OllamaClient:
    """Small client for an Ollama server's ``/api/generate`` endpoint."""

    # Default instruction sent as the system prompt (Indonesian, runtime text).
    _DEFAULT_SYSTEM_PROMPT = (
        "Anda adalah asisten yang membantu memperbaiki ejaan kalimat Bahasa Indonesia. "
        "Perbaiki hanya ejaan yang salah, jangan tambahkan penjelasan atau kata lain. "
        "Kembalikan hanya kalimat hasil koreksi."
    )

    def __init__(self, base_url="http://localhost:11434"):
        """Remember the server address and derive the generate endpoint.

        Args:
            base_url: Root URL of the Ollama server. A trailing slash is
                dropped so the endpoint never contains a double slash.
        """
        self.base_url = base_url.rstrip("/")
        self.api_endpoint = f"{self.base_url}/api/generate"

    def _build_payload(self, text, model, system_prompt, temperature):
        """Return the JSON body for a non-streaming generate request."""
        return {
            "model": model,
            "prompt": text,
            "system": system_prompt,
            "stream": False,
            # Sampling parameters must be nested under "options"; a top-level
            # "temperature" key is not part of the generate request schema.
            "options": {"temperature": temperature},
        }

    def correct_spelling(self, text, model="llama3", system_prompt=None,
                         temperature=0.1, timeout=600):
        """Ask the model to correct the spelling of ``text``.

        Args:
            text: Sentence to correct; sent as the prompt.
            model: Name of the Ollama model to use.
            system_prompt: System instruction; the built-in Indonesian
                spelling-correction prompt is used when ``None``.
            temperature: Sampling temperature passed in the request options.
            timeout: Seconds to wait for the server before giving up, so a
                stalled server cannot block the caller forever.

        Returns:
            The stripped model response, or ``""`` when the request fails or
            the reply has an unexpected shape (the failure is printed).
        """
        if system_prompt is None:
            system_prompt = self._DEFAULT_SYSTEM_PROMPT
        payload = self._build_payload(text, model, system_prompt, temperature)
        try:
            response = requests.post(self.api_endpoint, json=payload, timeout=timeout)
            response.raise_for_status()
            return response.json()["response"].strip()
        except (requests.RequestException, ValueError, KeyError,
                TypeError, AttributeError) as e:
            # Best-effort: report the problem and let the caller continue.
            print(f"Gagal koreksi: {e}")
            return ""

## 3. Fungsi Evaluasi Akurasi

In [None]:
import Levenshtein
import numpy as np

def exact_match_accuracy(predicted_texts, ground_truth_texts):
    """Return the share of predictions that equal their ground truth exactly.

    Pairs are compared position by position; the count of equal pairs is
    divided by the number of predictions. Returns 0 when there are none.
    """
    total = len(predicted_texts)
    if total <= 0:
        return 0
    matches = 0
    for predicted, expected in zip(predicted_texts, ground_truth_texts):
        if predicted == expected:
            matches += 1
    return matches / total

def character_level_accuracy(predicted_texts, ground_truth_texts):
    """Return the mean character-level similarity over all text pairs.

    Each pair scores ``1 - distance / longest``, where ``distance`` is the
    Levenshtein distance and ``longest`` is the longer of the two lengths.
    Two empty strings score 1. Returns 0 when there are no pairs.
    """
    scores = []
    for predicted, expected in zip(predicted_texts, ground_truth_texts):
        edit_distance = Levenshtein.distance(predicted, expected)
        longest = max(len(predicted), len(expected))
        if longest > 0:
            scores.append(1 - (edit_distance / longest))
        else:
            scores.append(1)
    if not scores:
        return 0
    return np.mean(scores)

def word_level_accuracy(predicted_texts, ground_truth_texts):
    """Return the mean word-level accuracy over all text pairs.

    Each text is split on whitespace and words are compared position by
    position. A pair scores the number of equal words divided by the larger
    word count; two empty texts score 1. Returns 0 when there are no pairs.
    """
    scores = []
    for predicted, expected in zip(predicted_texts, ground_truth_texts):
        predicted_tokens = predicted.split()
        expected_tokens = expected.split()
        matching = 0
        for left, right in zip(predicted_tokens, expected_tokens):
            if left == right:
                matching += 1
        denominator = max(len(predicted_tokens), len(expected_tokens))
        if denominator > 0:
            scores.append(matching / denominator)
        else:
            scores.append(1)
    if not scores:
        return 0
    return np.mean(scores)

## 4. Proses Koreksi Ejaan dengan LLM

In [None]:
import time

# Send every misspelled sentence to the model, keeping the output and the
# time each request took.
client = OllamaClient()
hasil_llm = []
processing_times = []

for i, kalimat in enumerate(kalimat_salah):
    print(f"Input   : {kalimat}")
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments the way time.time() differences can be.
    start = time.perf_counter()
    hasil = client.correct_spelling(kalimat)
    elapsed = time.perf_counter() - start
    hasil_llm.append(hasil)
    processing_times.append(elapsed)
    print(f"Target  : {kalimat_benar[i]}")
    print(f"LLM     : {hasil}")
    print(f"Waktu   : {elapsed:.2f}s\n")

## 5. Evaluasi & Visualisasi Akurasi

In [None]:
import matplotlib.pyplot as plt

# Score the model output against the reference sentences.
exact_acc = exact_match_accuracy(hasil_llm, kalimat_benar)
char_acc = character_level_accuracy(hasil_llm, kalimat_benar)
word_acc = word_level_accuracy(hasil_llm, kalimat_benar)

print(f"Exact Match Accuracy     : {exact_acc:.4f}")
print(f"Character-level Accuracy : {char_acc:.4f}")
print(f"Word-level Accuracy      : {word_acc:.4f}")
print(f"Rata-rata waktu proses   : {np.mean(processing_times):.2f}s")

# Visualization: one bar per metric, each labelled with its value.
metrics = ['Exact Match', 'Character-level', 'Word-level']
values = [exact_acc, char_acc, word_acc]

plt.figure(figsize=(8,5))
plt.bar(metrics, values, color=['blue', 'green', 'orange'])
# Leave headroom above 1.0: the value labels sit 0.02 above each bar, so a
# score near 1.0 would otherwise be drawn outside the axes, over the title.
plt.ylim(0, 1.1)
plt.title('LLM Spelling Correction Accuracy')
plt.ylabel('Accuracy')
plt.grid(axis='y', linestyle='--', alpha=0.7)
for i, v in enumerate(values):
    plt.text(i, v + 0.02, f'{v:.2f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()