In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [17]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np
import collections
from tqdm.auto import tqdm

# Load dataset IndoQA
datasets = load_dataset("jakartaresearch/indoqa")


In [18]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answer', 'category', 'span_start', 'span_end'],
        num_rows: 3309
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answer', 'category', 'span_start', 'span_end'],
        num_rows: 1104
    })
})

In [19]:
model_checkpoint = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_length = 384
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = [c.strip() for c in examples["context"]]
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        start_char = examples["span_start"][sample_idx]
        end_char = examples["span_end"][sample_idx]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

train_dataset = datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = [c.strip() for c in examples["context"]]
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

validation_dataset = datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=datasets["validation"].column_names,
)


In [20]:
pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [23]:
import collections
from tqdm.auto import tqdm
import numpy as np
import evaluate

metric = evaluate.load("squad_v2")  # Menggunakan metrik SQuAD v2 karena IndoQA tidak menyediakan metrik tersendiri

n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": [ex["answer"]]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)


In [28]:
from transformers import BertPreTrainedModel, BertModel
import torch.nn as nn
import torch


class CustomBertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits


In [29]:
model = CustomBertForQuestionAnswering.from_pretrained(model_checkpoint)

# Definisi TrainingArguments
args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)


Some weights of CustomBertForQuestionAnswering were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Definisi Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [31]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,2.465500,No log




TrainOutput(global_step=621, training_loss=2.296236931989734, metrics={'train_runtime': 512.1881, 'train_samples_per_second': 19.382, 'train_steps_per_second': 1.212, 'total_flos': 1958927584919040.0, 'train_loss': 2.296236931989734, 'epoch': 3.0})

In [34]:
from transformers import pipeline

# Example of inference
nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)

QA_input = {
    'question': 'Apa ibu kota Indonesia?',
    'context': 'Jakarta adalah ibu kota Indonesia.'
}
res = nlp(QA_input)
print(res)

The model 'CustomBertForQuestionAnswering' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswerin

{'score': 0.9322106242179871, 'start': 0, 'end': 7, 'answer': 'Jakarta'}


In [35]:
QA_input1 = {
    'question': 'Siapa yang menemukan lampu pijar?',
    'context': 'Thomas Edison adalah penemu lampu pijar yang sangat terkenal.'
}
res_1 = nlp(QA_input1)
print(res_1)

{'score': 0.6799118518829346, 'start': 0, 'end': 13, 'answer': 'Thomas Edison'}


In [36]:
QA_input2 = {
    'question': 'Apa vitamin yang baik untuk kesehatan mata?',
    'context': 'Vitamin A adalah vitamin yang baik untuk kesehatan mata.'
}
res_2 = nlp(QA_input2)
print(res_2)

{'score': 0.43218639492988586, 'start': 0, 'end': 9, 'answer': 'Vitamin A'}


In [39]:
QA_input3 = {
    'question': 'Apa mata uang resmi Indonesia?',
    'context': 'Rupiah adalah mata uang resmi Indonesia.'
}
res_3 = nlp(QA_input3)
print(res_3)

{'score': 0.8426904082298279, 'start': 0, 'end': 6, 'answer': 'Rupiah'}


In [41]:
QA_input4 = {
    'question': 'Kapan Perang Dunia II?',
    'context': 'Perang Dunia II adalah konflik global yang berlangsung dari tahun 1939 hingga 1945, melibatkan sebagian besar negara di dunia, termasuk semua kekuatan besar, yang akhirnya membentuk dua aliansi militer yang bertentangan: Sekutu dan Blok Poros. Perang ini adalah perang yang paling luas dalam sejarah dan melibatkan lebih dari 100 juta orang dari lebih dari 30 negara. Perang ini ditandai dengan peristiwa penting seperti invasi Polandia oleh Jerman, serangan Pearl Harbor oleh Jepang, dan pembebasan Eropa Barat oleh Sekutu.'
}
res_4 = nlp(QA_input4)
print(res_4)

{'score': 0.29554319381713867, 'start': 66, 'end': 82, 'answer': '1939 hingga 1945'}


In [42]:
QA_input5 = {
    'question': 'Berapa orang yang terlibat dalam perang dunia II?',
    'context': 'Perang Dunia II adalah konflik global yang berlangsung dari tahun 1939 hingga 1945, melibatkan sebagian besar negara di dunia, termasuk semua kekuatan besar, yang akhirnya membentuk dua aliansi militer yang bertentangan: Sekutu dan Blok Poros. Perang ini adalah perang yang paling luas dalam sejarah dan melibatkan lebih dari 100 juta orang dari lebih dari 30 negara. Perang ini ditandai dengan peristiwa penting seperti invasi Polandia oleh Jerman, serangan Pearl Harbor oleh Jepang, dan pembebasan Eropa Barat oleh Sekutu.'
}
res_5 = nlp(QA_input5)
print(res_5)

{'score': 0.46020767092704773, 'start': 326, 'end': 334, 'answer': '100 juta'}
