In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. INITIAL SETUP
# Only the tokenizer (the "dictionary") is loaded here, not the model itself.
checkpoint = "prajjwal1/bert-tiny" 
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the three "indonlp/indonlu" configurations used below:
# `emot` and `smsa` go through the classification preprocessing,
# `nerp` goes through the NER preprocessing.
emot = load_dataset("indonlp/indonlu", "emot")
smsa = load_dataset("indonlp/indonlu", "smsa")
nerp = load_dataset("indonlp/indonlu", "nerp")

# ======================================================
# FUNGSI 1: Untuk Klasifikasi (SMSA & EMOT)
# ======================================================
def process_classification(examples):
    """Tokenize a batch of classification examples and attach their labels.

    Args:
        examples: Batch mapping as passed by ``map(..., batched=True)``. The
            text is read from the ``"tweet"`` column when that key is present,
            otherwise from ``"text"``; the class ids are read from ``"label"``.

    Returns:
        The tokenizer output (truncated and padded to 128 tokens) with an
        additional ``"labels"`` entry holding ``examples["label"]`` unchanged.
    """
    # The datasets do not agree on the name of the text column, so probe for it.
    if "tweet" in examples:
        source_column = "tweet"
    else:
        source_column = "text"

    # Text -> input ids; every sequence ends up with the same fixed length.
    encoding = tokenizer(
        examples[source_column],
        padding="max_length",
        max_length=128,
        truncation=True,
    )

    # Labels are passed through as-is (they must stay numeric, not strings).
    encoding["labels"] = examples["label"]
    return encoding

# ======================================================
# FUNGSI 2: Untuk NER (NERP) - Lebih Rumit
# ======================================================
def process_ner(examples):
    """Tokenize pre-split sentences and align word-level NER tags to tokens.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            sentence) and ``"ner_tags"`` (one list of per-word tags per
            sentence).

    Returns:
        The tokenizer output (truncated and padded to 128 tokens) with an
        additional ``"labels"`` entry. For each token the label is the tag of
        its word when the token is the first piece of that word, and ``-100``
        for special/padding tokens and for the remaining sub-word pieces
        (``-100`` is the value the original notes mark as "ignored").
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,  # the input is already a list of words
        padding="max_length",
        max_length=128,
        truncation=True,
    )

    aligned_labels = []
    for sentence_idx, word_tags in enumerate(examples["ner_tags"]):
        # Word index for every token of this sentence (None for special tokens).
        token_to_word = list(encoded.word_ids(batch_index=sentence_idx))
        # Same sequence shifted right by one, so each token sees its predecessor.
        preceding = [None] + token_to_word[:-1]

        aligned_labels.append([
            -100 if (current is None or current == before) else word_tags[current]
            for current, before in zip(token_to_word, preceding)
        ])

    encoded["labels"] = aligned_labels
    return encoded

# ======================================================
# EKSEKUSI (Jalankan Preprocessing)
# ======================================================

# Run the preprocessing functions over each loaded dataset in batched mode.
# The columns present in the "train" split are removed, so only what the
# preprocessing function returns (tokenizer fields + `labels`) is kept.
print("Sedang memproses SMSA...")
smsa_ready = smsa.map(
    process_classification, 
    batched=True, 
    remove_columns=smsa["train"].column_names # Drop the raw columns
)

print("Sedang memproses EMOT...")
emot_ready = emot.map(
    process_classification, 
    batched=True, 
    remove_columns=emot["train"].column_names
)

print("Sedang memproses NERP (NER)...")
nerp_ready = nerp.map(
    process_ner, 
    batched=True, 
    remove_columns=nerp["train"].column_names
)

# ======================================================
# FINAL RESULT
# ======================================================
print("\nSukses! Data sudah bersih.")
print("Contoh data SMSA (siap training):", smsa_ready["train"][0].keys())
# Expected output keys: ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

In [None]:
# Inspect one processed sample from SMSA
sample = smsa_ready["train"][0]
print("=== CEK SMSA ===")
print("Input IDs:", sample["input_ids"][:10]) # Show the first 10 ids
print("Label ID :", sample["labels"])

# Decode the ids back to text to make sure the content is not garbage
decoded_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
print("Decoded Text:", decoded_text)


# Inspect one processed sample from NER (important: look for the -100 values)
sample_ner = nerp_ready["train"][0]
print("\n=== CEK NER ===")
print("Input IDs:", sample_ner["input_ids"][:10])
print("Labels   :", sample_ner["labels"][:10]) 
# -100 should appear among the other labels wherever a word was split into sub-words
# (the first position is a special token, so it is -100 as well).

Save

In [None]:
import os

# Create the output folder (no error if it already exists)
os.makedirs("processed_data", exist_ok=True)

# Save each processed dataset into its own sub-folder
smsa_ready.save_to_disk("processed_data/smsa_processed")
emot_ready.save_to_disk("processed_data/emot_processed")
nerp_ready.save_to_disk("processed_data/nerp_processed")

print("\nSemua data berhasil disimpan ke folder 'processed_data/'!")

Preprocessing QA (SQuAD)

In [None]:
import json
from datasets import Dataset

# Location of the SQuAD-style JSON file (Kaggle input directory)
file_path = "/kaggle/input/uad-id/train-SQuAD-id.json"

def load_and_flatten_squad(file_path):
    """Read a SQuAD-layout JSON file and flatten it to one row per question.

    Args:
        file_path: Path to a UTF-8 JSON file with the nested layout
            ``data -> paragraphs -> qas``.

    Returns:
        A ``Dataset`` built from a list of rows with the keys ``id``,
        ``context``, ``question`` and ``answers``. ``answers`` is converted
        from a list of ``{'text': ..., 'answer_start': ...}`` dicts into a
        dict of parallel lists ``{'text': [...], 'answer_start': [...]}``.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    def iter_rows():
        # Walk data -> paragraphs -> qas and emit one flat record per question.
        for article in payload["data"]:
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    question_text, qa_id, raw_answers = (
                        qa["question"],
                        qa["id"],
                        qa["answers"],
                    )
                    yield {
                        "id": qa_id,
                        "context": passage,
                        "question": question_text,
                        # list-of-dicts -> dict-of-lists
                        "answers": {
                            "text": [entry["text"] for entry in raw_answers],
                            "answer_start": [
                                entry["answer_start"] for entry in raw_answers
                            ],
                        },
                    }

    # Materialise the records and turn them into a Hugging Face Dataset.
    return Dataset.from_list(list(iter_rows()))

# --- RUN ---
# Flatten the nested SQuAD JSON into one row per question.
print("Sedang meratakan struktur JSON SQuAD...")
dataset_flat = load_and_flatten_squad(file_path)

print("Sukses!")
print("Contoh data:", dataset_flat[0])
# The printed row should now have the keys: ['id', 'context', 'question', 'answers']

In [None]:
from transformers import AutoTokenizer

# 1. Make sure the tokenizer is loaded (same checkpoint as in the first cell)
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 2. DEFINISI FUNGSI PREPROCESS_QA (VERSI ANTI-CRASH)
def preprocess_qa(examples):
    """Tokenize question/context pairs and compute answer token positions.

    The context is the only part that gets truncated; overflowing context is
    returned as additional features (window of 384 tokens, stride of 128), so
    one example can produce several features.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` (each answer a dict with ``"text"`` and
            ``"answer_start"`` lists).

    Returns:
        The tokenizer output (without ``offset_mapping`` and
        ``overflow_to_sample_mapping``) plus ``"start_positions"`` and
        ``"end_positions"``. Both are ``0`` when the example has no answer or
        when the answer is not fully inside the feature's context window
        (index 0 being the [CLS] position per the original notes).
    """
    max_length = 384
    doc_stride = 128

    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        padding="max_length",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Each feature points back to the example it was cut from.
        answer = answers[sample_map[feature_idx]]

        # Safety belt: no answer at all -> point both positions at index 0.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])

        seq_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token that belong to the context (sequence id 1).
        ctx_first = 0
        while seq_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while seq_ids[ctx_last] == 1:
            ctx_last += 1
        ctx_last -= 1

        # The answer must lie completely inside this context window.
        answer_inside = (
            offsets[ctx_first][0] <= start_char
            and offsets[ctx_last][1] >= end_char
        )
        if not answer_inside:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        cursor = ctx_first
        while cursor <= ctx_last and offsets[cursor][0] <= start_char:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk backward to the first token ending at or after the answer end.
        cursor = ctx_last
        while cursor >= ctx_first and offsets[cursor][1] >= end_char:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# 3. RE-RUN
# Apply the QA preprocessing in batched mode. The original columns are removed;
# preprocess_qa can return more features than input rows (overflowing windows).
print("Mencoba Preprocessing lagi (Versi Aman)...")
qa_processed = dataset_flat.map(
    preprocess_qa,
    batched=True,
    remove_columns=dataset_flat.column_names
)

print("✅ SUKSES! Preprocessing QA Selesai.")

In [None]:
import os
# Create the output folder (no error if it already exists)
os.makedirs("processed_data", exist_ok=True)

# Save the processed SQuAD features
qa_processed.save_to_disk("processed_data/squad_processed")

print("Data SQuAD aman tersimpan! ✅")

Preprocessing WikiLingua (Google mT5, menggantikan IndoT5)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# --- CHANGE HERE ---
# Switched from 'indobenchmark/indot5-base' to 'google/mt5-small'.
# mt5-small is the lightweight variant; with a stronger GPU 'google/mt5-base' can be used.
checkpoint_t5 = "google/mt5-small" 

tokenizer_t5 = AutoTokenizer.from_pretrained(checkpoint_t5)

# Load the data (no change needed here)
# NOTE(review): `ignore_verifications` is deprecated in newer `datasets` releases
# in favour of `verification_mode` — confirm against the installed version.
wiki_lingua_id = load_dataset("wiki_lingua", "indonesian", ignore_verifications=True)

# Configuration read by preprocess_summarization_t5:
# task prefix prepended to every document, and the token limits for input and target.
prefix = "ringkas: " 
max_input_length = 512
max_target_length = 150

def preprocess_summarization_t5(examples):
    """Build summarisation features (inputs + labels) from a WikiLingua batch.

    Args:
        examples: Batch mapping with an ``"article"`` column; every entry is a
            dict whose ``"document"`` and ``"summary"`` values are lists of
            strings. Each list is joined with single spaces.

    Returns:
        The tokenizer output for ``prefix + document`` (truncated/padded to
        ``max_input_length``) with an additional ``"labels"`` entry: the
        token ids of the summary (truncated/padded to ``max_target_length``)
        where every padding id is replaced by ``-100``.
    """
    articles = examples["article"]
    inputs = [prefix + " ".join(article["document"]) for article in articles]
    targets = [" ".join(article["summary"]) for article in articles]

    model_inputs = tokenizer_t5(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager; called with only `text_target`
    # the tokenizer returns the target encodings directly.
    labels = tokenizer_t5(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Replace padding ids with -100 in the labels (done once per batch, so the
    # pad id is looked up outside the nested loop).
    pad_id = tokenizer_t5.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# RUN
# Only the "train" split is processed; its original columns are removed so the
# result holds just what preprocess_summarization_t5 returns.
print("Sedang memproses WikiLingua dengan Google mT5...")
wiki_processed = wiki_lingua_id["train"].map(
    preprocess_summarization_t5, 
    batched=True, 
    remove_columns=wiki_lingua_id["train"].column_names
)

print("\nSukses!")
print("Fitur:", wiki_processed.features.keys())

In [None]:
import os
# Create the output folder (no error if it already exists)
os.makedirs("processed_data", exist_ok=True)

# The folder name carries 'mt5' so it matches the tokenizer that was used
wiki_processed.save_to_disk("processed_data/wikilingua_mt5_processed")

print("Data Summary (mT5) aman!")