<a href="https://colab.research.google.com/github/Vins-novaldi/kelasnet-krs/blob/main/uas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Instal library yang diperlukan
!pip install transformers datasets seqeval torch pandas numpy scikit-learn -q

# 2. Impor library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import classification_report
import torch
import os
os.environ["WANDB_DISABLED"] = "true"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/nlp/minang_ner_1000.csv')

df.head()

Mounted at /content/drive


Unnamed: 0,sentence_id,token,label
0,0,Siti,B-PERSON
1,0,naik,O
2,0,ka,O
3,0,Lubuk,B-LOCATION
4,0,Baso,I-LOCATION


In [3]:
# 5. Validasi struktur dataset
required_cols = {"sentence_id", "token", "label"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"Dataset harus memiliki kolom: {required_cols}")

In [4]:
# 6. Konversi ke format Hugging Face Dataset
def agg_func(x):
    return {
        "tokens": x["token"].tolist(),
        "ner_tags_str": x["label"].tolist()
    }

grouped = df.groupby("sentence_id").apply(agg_func).tolist()
tokens = [x["tokens"] for x in grouped]
ner_tags_str = [x["ner_tags_str"] for x in grouped]

  grouped = df.groupby("sentence_id").apply(agg_func).tolist()


In [5]:
# 7. Tentukan label (sesuaikan jika berbeda)
unique_labels = sorted(set(label for tags in ner_tags_str for label in tags))
print(f"\nLabel unik dalam dataset: {unique_labels}")

label_list = ["O", "B-PERSON", "I-PERSON", "B-LOCATION", "I-LOCATION"]
label_to_id = {label: i for i, label in enumerate(label_list)}

# Konversi string ke ID
ner_tags = []
for tags in ner_tags_str:
    try:
        ner_tags.append([label_to_id[label] for label in tags])
    except KeyError as e:
        raise ValueError(f"Label tidak dikenali: {e}. Pastikan dataset hanya berisi label: {label_list}")


Label unik dalam dataset: ['B-LOCATION', 'B-PERSON', 'I-LOCATION', 'O']


In [6]:
# 8. Buat Dataset Hugging Face
dataset = Dataset.from_dict({
    "tokens": tokens,
    "ner_tags": ner_tags
})


In [7]:
# 9. Tokenisasi dengan mBERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=True,
        is_split_into_words=True,
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

In [8]:
# 10. Bagi data: 70% train, 15% validation, 15% test
train_test = tokenized_dataset.train_test_split(test_size=0.3, seed=42)
test_val = train_test["test"].train_test_split(test_size=0.5, seed=42)
final_dataset = {
    "train": train_test["train"],
    "validation": test_val["train"],
    "test": test_val["test"]
}

In [9]:
# 11. Siapkan model mBERT untuk Token Classification
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label_to_id
)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 12. Atur pelatihan
training_args = TrainingArguments(
    output_dir="./minang_ner_model",
    eval_strategy="epoch",                 # <-- diubah dari evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
    seed=42,
    fp16=torch.cuda.is_available()
)

data_collator = DataCollatorForTokenClassification(tokenizer)



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
# 13. Fungsi evaluasi dengan seqeval
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": classification_report(true_labels, true_predictions, output_dict=True)["macro avg"]["precision"],
        "recall": classification_report(true_labels, true_predictions, output_dict=True)["macro avg"]["recall"],
        "f1": classification_report(true_labels, true_predictions, output_dict=True)["macro avg"]["f1-score"]
    }


In [12]:
# 14. Latih model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("🚀 Mulai fine-tuning Multilingual BERT untuk Bahasa Minang...")
trainer.train()

  trainer = Trainer(


🚀 Mulai fine-tuning Multilingual BERT untuk Bahasa Minang...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.000956,1.0,1.0,1.0
2,No log,0.000341,1.0,1.0,1.0
3,No log,0.000233,1.0,1.0,1.0
4,No log,0.000183,1.0,1.0,1.0
5,No log,0.000145,1.0,1.0,1.0
6,No log,0.000123,1.0,1.0,1.0
7,No log,0.000107,1.0,1.0,1.0
8,No log,9.6e-05,1.0,1.0,1.0
9,No log,8.8e-05,1.0,1.0,1.0
10,No log,8.1e-05,1.0,1.0,1.0


TrainOutput(global_step=900, training_loss=0.01654845023320781, metrics={'train_runtime': 1888.207, 'train_samples_per_second': 7.584, 'train_steps_per_second': 0.477, 'total_flos': 131550153883200.0, 'train_loss': 0.01654845023320781, 'epoch': 20.0})

In [13]:
# 15. Evaluasi pada test set
print("\n🔍 Evaluasi pada test set...")
test_results = trainer.evaluate(eval_dataset=final_dataset["test"])
print(f"Hasil Test: {test_results}")



🔍 Evaluasi pada test set...


Hasil Test: {'eval_loss': 5.78033686906565e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.187, 'eval_samples_per_second': 823.612, 'eval_steps_per_second': 53.481, 'epoch': 20.0}


In [14]:
import os
MODEL_PATH = '/content/drive/MyDrive/nlp/data/minang_ner_model'
os.makedirs(MODEL_PATH, exist_ok=True)
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

print(f"✅ Model disimpan di: {MODEL_PATH}")

✅ Model disimpan di: /content/drive/MyDrive/nlp/data/minang_ner_model


In [24]:
# 16. Contoh prediksi manual
def predict_sentence(sentence):
    tokens = sentence.split()

    # Tokenisasi dan dapatkan word_ids SEBELUM konversi ke dict
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    )

    # Simpan word_ids sekarang (sebelum modifikasi inputs)
    word_ids = inputs.word_ids()

    # Pindahkan tensor ke device model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Gunakan word_ids yang sudah disimpan
    predicted_labels = []
    seen_words = set()
    for i, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx not in seen_words:
            predicted_labels.append(label_list[predictions[0][i].item()])
            seen_words.add(word_idx)

    return list(zip(tokens, predicted_labels))

# Uji prediksi
test_sentence = "Ambo tingga di Payakumbuh"
print(f"\n🧪 Contoh Prediksi:\nKalimat: {test_sentence}")
for token, pred in predict_sentence(test_sentence):
    print(f"  {token} → {pred}")


🧪 Contoh Prediksi:
Kalimat: Ambo tingga di Payakumbuh
  Ambo → B-PERSON
  tingga → O
  di → O
  Payakumbuh → B-LOCATION
