In [1]:
import pandas as pd
import json
import datasets

In [2]:
# Name of the pretrained checkpoint; used further down to load the tokenizer.
model_checkpoint = "bert-base-multilingual-cased"
# Per-device batch size, used for both training and evaluation.
batch_size = 16

In [3]:
# Read the training JSON; the file only needs to stay open while it is parsed.
with open('data/zac2022_train_merged_final.json', encoding='utf-8') as f:
    data = json.load(f)
# Flatten the records stored under the top-level 'data' key into a table.
df = pd.json_normalize(data, 'data')

# Keep the two categories in which the output either has an answer or has none.
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[df['category'].isin(['FULL_ANNOTATION', 'FALSE_LONG_ANSWER'])].copy()
# Length of the short answer; non-string values (missing answers) count as 0.
df['short_candidate_length'] = df['short_candidate'].apply(
    lambda x: len(x) if isinstance(x, str) else 0
)

In [4]:
# Reshape the table into the layout read by the preprocessing function below:
# an 'answers' dict with 'answer_start'/'text' lists plus a 'context' column.
def _to_squad_answers(example):
    """Build the ``answers`` field for one row.

    Both lists are empty when the corresponding short-candidate value is
    missing, which is how a question without an answer is represented.
    """
    start = example['short_candidate_start']
    text = example['short_candidate']
    return {'answers': {'answer_start': [start] if start is not None else [],
                        'text': [text] if text is not None else []}}


dataset = datasets.Dataset.from_pandas(df)
dataset = dataset.map(_to_squad_answers)
# Copy 'text' into a new 'context' column and drop the original.
dataset = dataset.map(lambda example: {'context': example['text']}, batched=True, remove_columns='text')
# Drop columns that are not needed.
dataset = dataset.remove_columns(['answer','short_candidate_start', 'short_candidate', 'short_candidate_length','is_long_answer','category'])
# Hold out 10% of the rows as the 'test' split; the seed keeps the split fixed.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

  0%|          | 0/17359 [00:00<?, ?ex/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

In [5]:
# Show one formatted training example.
dataset['train'][0]

{'id': '35f5cea2c4d48fd056bfffe7d02db260',
 'question': 'Các nước Đông Nam Á gồm bao nhiêu quốc gia',
 'title': 'Âm nhạc tại Đông Nam Á',
 '__index_level_0__': 5643,
 'answers': {'answer_start': [], 'text': []},
 'context': 'Âm nhạc Đông Nam Á gồm có các truyền thống âm nhạc tại phân miền châu Á này. Phân miền địa lý này bao gồm cả quốc gia Brunei, Campuchia,'}

In [6]:
import transformers
from transformers import AutoTokenizer

In [7]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# Passed to the tokenizer as max_length for every feature.
max_length = 384
# Passed to the tokenizer as stride when a long context overflows into several features.
doc_stride = 128
# True when the tokenizer pads on the right; preprocessing then puts the question
# first and the context second (and the reverse otherwise).
pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and label each feature's answer span.

    Uses the notebook-level ``tokenizer``, ``max_length``, ``doc_stride`` and
    ``pad_on_right``. A long context overflows into several overlapping
    features, so the output can hold more features than the batch has examples.

    Args:
        examples: batch with "question", "context" and "answers" columns; every
            "answers" entry holds an "answer_start" list and a "text" list.

    Returns:
        The tokenizer output without the overflow and offset mappings, extended
        with "start_positions" and "end_positions" (one entry per feature).
        When an example has no answer, or the feature's context window does not
        fully contain it, both positions are the index of the CLS token.
    """
    # The question goes first when padding is on the right, otherwise the context does.
    if pad_on_right:
        first_column, second_column, truncate = "question", "context", "only_second"
    else:
        first_column, second_column, truncate = "context", "question", "only_first"
    # Sequence id carried by context tokens inside each feature.
    context_id = 1 if pad_on_right else 0

    encoded = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncate,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Per-feature (start, end) character offsets of every token.
    all_offsets = encoded.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(all_offsets):
        token_ids = encoded["input_ids"][feature_idx]
        # Impossible answers are labelled with the position of the CLS token.
        cls_position = token_ids.index(tokenizer.cls_token_id)
        seq_ids = encoded.sequence_ids(feature_idx)
        answers = examples["answers"][feature_to_example[feature_idx]]

        # No annotated answer: point both labels at CLS.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_position)
            end_positions.append(cls_position)
            continue

        # Character span of the answer inside the context.
        answer_begin = answers["answer_start"][0]
        answer_end = answer_begin + len(answers["text"][0])

        # First context token of this feature.
        first_ctx = 0
        while seq_ids[first_ctx] != context_id:
            first_ctx += 1

        # Last context token of this feature.
        last_ctx = len(token_ids) - 1
        while seq_ids[last_ctx] != context_id:
            last_ctx -= 1

        # The answer is not fully inside this feature's window: label with CLS.
        if offsets[first_ctx][0] > answer_begin or offsets[last_ctx][1] < answer_end:
            start_positions.append(cls_position)
            end_positions.append(cls_position)
            continue

        # Walk inwards to the tokens at the two ends of the answer. The bound
        # check covers the edge case where the answer is the very last word.
        while first_ctx < len(offsets) and offsets[first_ctx][0] <= answer_begin:
            first_ctx += 1
        start_positions.append(first_ctx - 1)
        while offsets[last_ctx][1] >= answer_end:
            last_ctx -= 1
        end_positions.append(last_ctx + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

In [9]:
# Quick check of the preprocessing on the first five training examples.
features = prepare_train_features(dataset["train"][:5])

In [10]:
# Apply the preprocessing to every split in batches. All original columns are
# removed, so only the columns returned by prepare_train_features remain.
tokenized_datasets = dataset.map(
    prepare_train_features, batched=True, remove_columns=dataset["train"].column_names
)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the weights from the checkpoint directory written by a previous training run.
# A forward slash is used so the path has no backslash escape sequence and
# resolves on every operating system, Windows included.
# NOTE(review): this directory only exists after training has already been run
# once; on a fresh machine load `model_checkpoint` instead.
model = AutoModelForQuestionAnswering.from_pretrained("model_2/checkpoint-1000")

In [14]:
# Last path component of the checkpoint name (the text after the final "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration: checkpoints go to "model_2", evaluation runs once per
# epoch, nothing is pushed to the Hub and no external logger is used.
args = TrainingArguments(
    output_dir="model_2",
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    push_to_hub=False,
    report_to="none",
)

In [15]:
from transformers import default_data_collator

# Use the stock collator from transformers to assemble batches.
data_collator = default_data_collator

In [16]:
# Wire the model, training arguments, tokenized splits and collator together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

In [17]:
# Run fine-tuning. The recorded run below was stopped by hand (KeyboardInterrupt).
trainer.train()

***** Running training *****
  Num examples = 15650
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 19580
  Number of trainable parameters = 177264386


  0%|          | 0/19580 [00:00<?, ?it/s]

Saving model checkpoint to model_2\checkpoint-500
Configuration saved in model_2\checkpoint-500\config.json


{'loss': 0.2299, 'learning_rate': 1.9489274770173647e-05, 'epoch': 0.51}


Model weights saved in model_2\checkpoint-500\pytorch_model.bin
tokenizer config file saved in model_2\checkpoint-500\tokenizer_config.json
Special tokens file saved in model_2\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1741
  Batch size = 16


  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.6111985445022583, 'eval_runtime': 26.1982, 'eval_samples_per_second': 66.455, 'eval_steps_per_second': 4.161, 'epoch': 1.0}


Saving model checkpoint to model_2\checkpoint-1000
Configuration saved in model_2\checkpoint-1000\config.json


{'loss': 0.1813, 'learning_rate': 1.8978549540347296e-05, 'epoch': 1.02}


Model weights saved in model_2\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in model_2\checkpoint-1000\tokenizer_config.json
Special tokens file saved in model_2\checkpoint-1000\special_tokens_map.json


KeyboardInterrupt: 

In [None]:
# Save the trainer's current model under model_2/saved.
trainer.save_model("model_2/saved")

In [None]:
# test model
from transformers import pipeline


# Build a question-answering pipeline from the saved checkpoint directory.
# A forward slash is used so the path has no backslash escape sequence and
# resolves on every operating system, Windows included.
# NOTE(review): device=0 is expected to select the first GPU; confirm one is available.
qa = pipeline('question-answering', model='model_2/checkpoint-1000', device=0)

In [None]:
# Reload the complete table (all categories) for manual inspection.
with open('data/zac2022_train_merged_final.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
df = pd.json_normalize(data, 'data')

In [None]:
# For each category, draw one random row and print the model's answer for it.
for category in ('PARTIAL_ANNOTATION', 'FULL_ANNOTATION', 'FALSE_LONG_ANSWER'):
    print(category)
    picked = df.loc[df['category'] == category].sample(1).iloc[0]
    question, context = picked['question'], picked['text']
    print('QUESTION:', question)
    print('CONTEXT:', context)
    prediction = qa(question=question, context=context)
    print(prediction)
    print()

In [None]:
# Manual spot check with a hand-written question/context pair.
question = 'nhà chiêm tinh thiên văn học nổi tiếng tycho brahe là người nước nào'
context = 'B. - Tycho Brahe ( 1546-1601 ) , nhà thiên văn học người Đan Mạch . - Brahmagupta ( 598-668 ) , nhà thiên văn học người Ấn Độ .'
qa(question=question, context=context)