In [None]:
# Mount Google Drive at /content/gdrive; the dataset and model paths used by
# later cells all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install transformers
!pip install datasets

# 1. Import libraries

In [2]:
import os, sys, argparse, gc
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, DefaultDataCollator
import datasets
from typing import Any

# 2. Configure, define helpers, and train

In [None]:
# Print the TensorFlow version and, when GPUs are visible, switch every GPU to
# on-demand memory growth before reporting physical/logical device counts.
print("TF Version: ", tf.__version__)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

#########################################################
#MODEL_PHOBERT_BASE = 'vinai/phobert-base'
MODEL_PHOBERT_LARGE = 'vinai/phobert-large'

#########################################################
def _str2bool(value):
    """Convert a command-line token such as "true"/"false", "yes"/"no" or "1"/"0" to bool.

    `type=bool` cannot be used with argparse for this: `bool("False")` is True
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


def get_arguments(argv=None):
    """Build the run configuration from command-line style options.

    Args:
        argv: Optional list of argument strings to parse. When omitted, an
            empty list is parsed so that every option takes its default; this
            keeps the function usable inside a notebook, where `sys.argv`
            belongs to the kernel launcher.

    Returns:
        argparse.Namespace with `model`, `lr`, `bs`, `epochs`, `maxlen`,
        `stride` and `use_fast`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default=MODEL_PHOBERT_LARGE, help="Pretrained model bert")
    parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate")
    parser.add_argument("--bs", type=int, default=16, help="Batch size")
    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
    parser.add_argument("--maxlen", type=int, default=512, help="Max sentence length")
    parser.add_argument("--stride", type=int, default=128, help="Stride value for window slide")
    # Parsed with _str2bool so that "--use_fast False" really yields False.
    parser.add_argument("--use_fast", type=_str2bool, default=True, help="Tokenize sentence with fast bpe")

    return parser.parse_args(args=[] if argv is None else argv)

#########################################################
def _special_token_id(tokenizer: Any, attr: str, fallback: int) -> int:
    """Return `tokenizer.<attr>` when it is set, otherwise `fallback`."""
    token_id = getattr(tokenizer, attr, None)
    return fallback if token_id is None else token_id


def _find_context_span(input_ids, sep_id: int, pad_id: int):
    """Return (first, last) token index of the context inside one encoded pair.

    The scan finds the first separator, skips it and the token right after it
    (presumably the doubled separator that RoBERTa-style tokenizers put between
    the two segments -- TODO confirm for other tokenizers), then walks to the
    first padding token or the end of the sequence. A trailing separator is
    excluded from the span.
    """
    idx = 0
    while input_ids[idx] != sep_id:
        idx += 1
    idx += 2
    context_start = idx
    while idx < len(input_ids) and input_ids[idx] != pad_id:
        idx += 1
    context_end = idx - 1
    if input_ids[context_end] == sep_id:
        context_end -= 1
    return context_start, context_end


def preprocess_dataset(ds: pd.DataFrame, tokenizer: Any, maxlen: int):
    """Tokenize question/context pairs and attach answer token positions.

    Args:
        ds: Column-indexable batch providing "question", "context",
            "answer_start", "answer_end" and "answer"; rows are accessed by
            integer position. In this notebook it is the batch passed by
            `Dataset.map(..., batched=True)`.
        tokenizer: Callable tokenizer that also offers `encode`. Its
            `sep_token_id` / `pad_token_id` are used when available; otherwise
            the ids 2 and 1 are assumed.
        maxlen: Length every encoded pair is padded / truncated to (only the
            context is truncated).

    Returns:
        The tokenizer output with two extra int32 arrays, "start_positions"
        and "end_positions". A row is labelled (0, 0) when both character
        offsets are 0 or when the answer does not fit inside the context
        tokens that survived truncation.
    """
    questions = [q.strip() for q in ds["question"]]
    contexts = [str(t) for t in ds["context"]]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=maxlen,
        truncation="only_second",
        return_token_type_ids=True,
        padding="max_length",
    )

    answer_starts = ds["answer_start"]
    answer_ends = ds["answer_end"]
    answers = ds["answer"]
    assert len(answer_starts) == len(answer_ends)

    # Read the special-token ids from the tokenizer instead of hard-coding
    # them; the fallbacks are the values this function used previously.
    sep_id = _special_token_id(tokenizer, "sep_token_id", 2)
    pad_id = _special_token_id(tokenizer, "pad_token_id", 1)

    start_positions = []
    end_positions = []
    for i in range(len(answer_starts)):
        start_char = answer_starts[i]
        end_char = answer_ends[i]
        if start_char == 0 and end_char == 0:
            # Both offsets at zero mark a row without an answer span.
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start, context_end = _find_context_span(inputs["input_ids"][i], sep_id, pad_id)

        # Token positions are estimated by tokenizing the context prefix and
        # the answer text separately and counting the resulting tokens.
        pre_ans = tokenizer.encode(contexts[i][:start_char], add_special_tokens=False)
        ans_ids = tokenizer.encode(answers[i], add_special_tokens=False)

        start_position = context_start + len(pre_ans)
        end_position = start_position + len(ans_ids) - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if start_position < context_start or end_position > context_end:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            start_positions.append(start_position)
            end_positions.append(end_position)

    inputs["start_positions"] = np.array(start_positions, dtype=np.int32)
    inputs["end_positions"] = np.array(end_positions, dtype=np.int32)
    return inputs

def generate_dataset(file_name: str, tokenizer: Any, data_collator: Any, maxlen: int, stride: int, batch_size: int, model_name: str):
    """Turn a question-answering CSV file into a shuffled, batched TensorFlow dataset.

    The "title" column is dropped, missing answers become empty strings, and
    every row is encoded with `preprocess_dataset`. `stride` and `model_name`
    are accepted but not used here.

    Returns:
        The result of `Dataset.to_tf_dataset` over the encoded columns, built
        with `shuffle=True` and the given `batch_size` / `data_collator`.
    """
    frame = pd.read_csv(file_name).drop(columns="title").fillna({"answer": ""})
    raw_ds = datasets.Dataset.from_dict(frame)

    def _encode(batch):
        # Batched mapper: one call handles a whole slice of rows.
        return preprocess_dataset(batch, tokenizer, maxlen)

    encoded = raw_ds.map(_encode, batched=True, remove_columns=raw_ds.column_names)

    feature_columns = [
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
        "token_type_ids",
    ]
    return encoded.to_tf_dataset(
        columns=feature_columns,
        collate_fn=data_collator,
        shuffle=True,
        batch_size=batch_size,
    )

#########################################################
class myCallback(tf.keras.callbacks.Callback):
    """Save the model whenever an epoch matches or beats the best losses so far.

    A save happens when the training loss is no worse than the best recorded
    training loss and, if a validation loss was reported, it is no worse than
    the best recorded validation loss.
    """

    def __init__(self, saved_model_name: str):
        super().__init__()

        # Best losses seen so far; start at the largest float so the first
        # epoch always counts as an improvement.
        self.min_loss = sys.float_info.max
        self.min_val_loss = sys.float_info.max

        # Stored for reference only: the save location below is a fixed path.
        self.saved_model_name = saved_model_name

    def on_epoch_end(self, epoch, logs=None):
        """Compare this epoch's losses with the best so far and save on improvement.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric dict for the epoch; may be None or lack "val_loss"
                (e.g. when fitting without validation data).
        """
        logs = logs or {}
        loss = logs.get('loss')
        val_loss = logs.get('val_loss')

        # Without a training loss there is nothing to compare against.
        if loss is None:
            return
        if loss > self.min_loss:
            return
        # Only enforce the validation criterion when a validation loss exists;
        # comparing None with a float would raise TypeError.
        if val_loss is not None and val_loss > self.min_val_loss:
            return

        self.min_loss = loss
        if val_loss is not None:
            self.min_val_loss = val_loss

        print("\nsave model at epoch {}".format(epoch+1))
        self.model.save("/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/model/vinai-phobert-large", save_format='tf')
            


In [None]:
# Unpack the parsed options into notebook-level names used by the cells below.
# Batch size and epoch count are hard-coded here and ignore the parsed values.
if __name__ == "__main__":
    args = get_arguments()
    model_name = args.model
    lr = args.lr
    batch_size = 2 # hard-coded; args.bs is ignored
    epochs = 10 # hard-coded; args.epochs is ignored
    maxlen = args.maxlen
    stride = args.stride
    use_fast = args.use_fast

In [None]:
#########################################################
# separate so that later test runs, we don't execute (train) again

if __name__ == "__main__":
    print("##############################")
    print("Model :", model_name)
    print("Learning Rate :", lr)
    print("Batch Size :", batch_size)
    print("Epochs :", epochs)
    print("Max Token Length :", maxlen)
    print("Stride :", stride)
    print("Use Fast :", use_fast)
    print("##############################")

    # Train in mixed-precision float16.
    # The global policy is set BEFORE the model is created and compiled: Keras
    # layers take their dtype policy when they are constructed, so setting it
    # after `from_pretrained` / `compile` would leave this model in float32.
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
    data_collator = DefaultDataCollator(return_tensors="tf")
    dataset_tr = generate_dataset(
        "/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/datasets/ViWikiQA1.0/ws_train.csv",
        tokenizer, data_collator, maxlen,
        stride, batch_size, model_name
    )
    dataset_val = generate_dataset(
        "/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/datasets/ViWikiQA1.0/ws_dev.csv",
        tokenizer, data_collator, maxlen, stride,
        batch_size, model_name
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
    # No loss is passed to compile(), exactly as before.
    model.compile(optimizer=optimizer)

    # Checkpoint on improvement and stop once val_loss has not improved for 3 epochs.
    cb = myCallback(model_name.replace("/", "-"))
    earlyStopCB = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)

    history = model.fit(
        dataset_tr,
        validation_data=dataset_val,
        epochs=epochs,
        callbacks=[cb, earlyStopCB],
    )

    # Persist the per-epoch metrics next to the notebook.
    hist = pd.DataFrame(history.history)
    hist.to_csv("{}_bs{}_lr{}.csv".format(model_name.replace("/", "-"), batch_size, lr))

# 3. Load Test Data

In [None]:
# Read the test split and drop its "title" column in one non-mutating chain.
df_test = pd.read_csv(
    "/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/datasets/ViWikiQA1.0/ws_test.csv"
).drop(columns="title")
df_test.info()

In [None]:
# Add the whitespace-delimited word count of each context as "context_len"
# and plot its distribution.
df_test["context_len"] = df_test["context"].apply(lambda x: len(x.split()))
df_test["context_len"].hist()

In [None]:
# Rebuild the tokenizer and encode the test frame with the same
# preprocess_dataset function used for training. Relies on `model_name` and
# `maxlen` from the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
ds_test = datasets.Dataset.from_dict(df_test)

dataset_test = ds_test.map(
    lambda ds: preprocess_dataset(ds, tokenizer, maxlen),
    batched=True,
    remove_columns=ds_test.column_names,
)
# Row counts before and after encoding.
len(ds_test), len(dataset_test)
#dataset_test
#dataset_test[0]['end_positions']

# 4. Load model

In [None]:
# Instantiate the pretrained architecture, then load weights from the same
# path that myCallback passes to model.save during training.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
model.load_weights(f"/content/gdrive/MyDrive/Research/NLP-Labs/Lab7.1_BERT-Question-and-Answering-Vietnamese/model/vinai-phobert-large")
model.summary()

# 5. Predict

In [None]:
def tokenize_question_context(question, context, tokenizer, maxlen, stride):
    """Encode a single (question, context) pair as TensorFlow tensors.

    Both strings are stripped of surrounding whitespace. The pair is padded to
    `maxlen`, and only the context is truncated when it is too long. `stride`
    is forwarded to the tokenizer unchanged.

    Returns:
        Whatever the tokenizer returns for the pair.
    """
    encode_options = dict(
        max_length=maxlen,
        truncation="only_second",
        stride=stride,
        padding="max_length",
        return_tensors="tf",
    )
    return tokenizer(question.strip(), context.strip(), **encode_options)
  
def predict(model, question, context, tokenizer, maxlen, stride):
    """Run the QA model on one question/context pair and decode candidate answers.

    For every encoded row the highest-scoring start and end positions are
    taken independently. A row is skipped when both positions are 0 or when
    the end precedes the start.

    Returns:
        List of `(answer_text, score)` tuples, where the score is the sum of
        the maximum start logit and the maximum end logit of that row.
    """
    inputs = tokenize_question_context(question, context, tokenizer, maxlen, stride)
    outputs = model(**inputs)

    start_logits = outputs["start_logits"].numpy()
    end_logits = outputs["end_logits"].numpy()

    best_starts = np.argmax(start_logits, axis=1)
    best_ends = np.argmax(end_logits, axis=1)
    span_scores = np.max(start_logits, axis=1) + np.max(end_logits, axis=1)

    answers = []
    for row, first in enumerate(best_starts):
        last = best_ends[row]
        no_answer = first == 0 and last == 0
        if no_answer or last < first:
            continue
        span_ids = inputs["input_ids"][row][first:last+1]
        text = tokenizer.decode(span_ids, skip_special_tokens=True)
        answers.append((text, span_scores[row]))
    return answers


# Test rows whose context has more than 500 whitespace-delimited words
# (uses the "context_len" column added in an earlier cell).
df = df_test[df_test["context_len"] > 500]
df.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Try the model on a single test example, selected by index label.
idx = 4
# question = df.loc[idx, "question"]
# context = df.loc[idx, "context"]
question = df_test.loc[idx, "question"]
print("Question:", question)
context = df_test.loc[idx, "context"]
print("Context:", context)

# `answers` is a list of (text, score) tuples; it is empty when predict()
# filters out every candidate span.
answers = predict(model, question, context, tokenizer, maxlen, stride)
print("Answer:",answers)