We want to solve an open-domain QA task.

My process is as follows:

#### 1. [Tokenization](https://www.kaggle.com/adldotori/tokenizing-hindi-and-tamil-language-nlp-step-1)
#### 2. [Demo](https://www.kaggle.com/adldotori/demo-training-nlp-step-2/)
* ver 1 : init (2021/10/03)
* ver 2 : update validation (2021/10/05)
* ver 3 : validation score 0.45 (2021/10/11)

#### 3. Research QA Model
#### 4. Training
#### 5. Inference

In [None]:
!pip3 install transformers==4.11.2

In [None]:
import os
import os.path as osp

import pandas as pd

import torch

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
# Directory that holds the competition CSV files.
INPUT_PATH = '../input/chaii-hindi-and-tamil-question-answering/'

# Load the training set, the test set and the submission template, in that order.
train, test, sub = (
    pd.read_csv(osp.join(INPUT_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Keep the first 80% of the rows for training and the remaining rows for
# validation; the split is positional, the rows are not shuffled.
n_train_rows = round(len(train) * 0.8)
train_df = train.iloc[:n_train_rows]
val_df = train.iloc[n_train_rows:]

# Inference Example

In [None]:
# https://huggingface.co/transformers/usage.html#extractive-question-answering

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load a BERT checkpoint that is already fine-tuned for extractive QA on SQuAD.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Passage in which the answers are searched.
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    # Encode the (question, passage) pair as one sequence of token ids.
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(**inputs)
    answer_start_scores = result['start_logits']
    answer_end_scores = result['end_logits']

    # Most likely first token of the answer.
    answer_start = torch.argmax(answer_start_scores)
    # Most likely last token of the answer; +1 turns it into an exclusive slice bound.
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

# Fine-tuning

In [None]:
# https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0

def read_dataset(df: pd.DataFrame):
    """Turn a QA DataFrame into parallel lists of contexts, questions and answers.

    Args:
        df: frame with the columns ``context``, ``question``, ``answer_text``
            and ``answer_start``.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        Every answer is a dict with the keys ``text``, ``answer_start`` and
        ``answer_end``, where ``answer_end`` is ``answer_start`` plus the
        length of the answer text (an exclusive end offset).
    """
    contexts, questions, answers = [], [], []

    for _, row in df.iterrows():
        contexts.append(row['context'])
        questions.append(row['question'])

        answer_text = row['answer_text']
        begin = row['answer_start']
        answers.append({
            'text': answer_text,
            'answer_start': begin,
            'answer_end': begin + len(answer_text),
        })

    return contexts, questions, answers

In [None]:
# Convert both splits into parallel lists of contexts, questions and answer dicts.
train_contexts, train_questions, train_answers = read_dataset(train_df)
val_contexts, val_questions, val_answers = read_dataset(val_df)

In [None]:
from transformers import AutoTokenizer

# Rebinds `tokenizer` (previously the BERT tokenizer of the inference example)
# to the tokenizer published with deepset/xlm-roberta-large-squad2.
tokenizer = AutoTokenizer.from_pretrained('deepset/xlm-roberta-large-squad2')

In [None]:
from transformers import AutoTokenizer

# Tokenize (context, question) pairs; the context is passed as the first
# sequence of every pair. Truncation and padding are both enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers):
    """Add token-level answer labels to ``encodings`` in place.

    For sample ``i`` the character offsets ``answers[i]['answer_start']`` and
    ``answers[i]['answer_end'] - 1`` (the last answer character) are mapped to
    token indices with ``encodings.char_to_token``. The results are stored
    under the keys ``start_positions`` and ``end_positions``.

    Args:
        encodings: tokenizer output exposing ``char_to_token`` and ``update``.
        answers: list of dicts with ``answer_start`` and ``answer_end``.
    """
    starts, ends = [], []

    for sample_idx, answer in enumerate(answers):
        first_token = encodings.char_to_token(sample_idx, answer['answer_start'])
        last_token = encodings.char_to_token(sample_idx, answer['answer_end'] - 1)

        # A position of None means the answer passage has been truncated;
        # such samples are labelled with the tokenizer's maximum length.
        starts.append(tokenizer.model_max_length if first_token is None else first_token)
        ends.append(tokenizer.model_max_length if last_token is None else last_token)

    encodings.update({'start_positions': starts, 'end_positions': ends})

# Attach start/end token labels to both encodings (they are updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
# https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0

import torch

class ChaiiDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized question-answering encodings.

    ``encodings`` must offer ``items()`` yielding ``(name, per-sample values)``
    pairs and an ``input_ids`` attribute whose length is the sample count.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One entry in input_ids per sample.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Pick sample `idx` from every field and convert it to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap the labelled encodings so samples can be fetched by index.
train_dataset = ChaiiDataset(train_encodings)
val_dataset = ChaiiDataset(val_encodings)

In [None]:
# model = AutoModelForQuestionAnswering.from_pretrained("deepset/xlm-roberta-base-squad2")

In [None]:
# # https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0

# from torch.utils.data import DataLoader
# from transformers import AdamW

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# optim = AdamW(model.parameters(), lr=5e-5)

# Training on Your Own GPU
Training needs 24GB of memory, so it can't be done on Kaggle.

In [None]:
# from tqdm import tqdm

# for epoch in range(30):
#     pbar = tqdm(train_loader)
#     total_loss = 0
#     for batch in pbar:
#         optim.zero_grad()
#         input_ids = batch['input_ids'][:,:512].to(device)
#         attention_mask = batch['attention_mask'][:,:512].to(device)
#         start_positions = batch['start_positions'].to(device)
#         end_positions = batch['end_positions'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
#         loss = outputs[0]
#         total_loss += loss
#         loss.backward()
#         optim.step()
        
#         pbar.set_description(f"Loss : {round(loss.item(), 3)}")
#     print(f"[{epoch+1} EPOCH] Total Loss : {round((total_loss / len(pbar)).item(), 4)}\n")


# Validation

This checkpoint is based on the deepset/xlm-roberta-base-squad2 model.

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the question-answering checkpoint from the input directory and move it to the device.
model = AutoModelForQuestionAnswering.from_pretrained("/kaggle/input/chaiick/checkpoint_all")
model.to(device)

Jaccard Similarity.

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    appearing in either string.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a word the
        two (empty) word sets are identical, so ``1.0`` is returned instead
        of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if not union_size:
        # Both inputs are empty or whitespace-only.
        return 1.0
    return len(a & b) / union_size

In [None]:
def find_index(target, base):
    """Locate the first occurrence of the sequence ``target`` inside ``base``.

    Returns:
        ``(start, end)`` such that ``base[start:end] == target`` for the
        smallest matching ``start``, or ``(-1, -1)`` when no start position
        in ``base`` matches.
    """
    width = len(target)
    matches = (
        pos for pos in range(len(base))
        if target == base[pos:pos + width]
    )
    first = next(matches, None)
    if first is None:
        return -1, -1
    return first, first + width

In [None]:
# Put the model into evaluation mode before predicting.
model.eval()


# Punctuation trimmed from the edges of a predicted answer.
_BAD_STARTS = (".", ",", "(", ")", "-", "–", ";")
_BAD_ENDINGS = ("...", "-", "(", ")", "–", ",", ";")


def _strip_bad_tokens(answer: str) -> str:
    """Remove at most one leading and one trailing bad token from ``answer``."""
    for token in _BAD_STARTS:
        if answer.startswith(token):
            answer = answer[len(token):]
            break
    for token in _BAD_ENDINGS:
        if answer.endswith(token):
            # Cut the whole token: "..." is three characters long, not one.
            answer = answer[:-len(token)]
            break
    return answer


def test_idx(
    df: pd.DataFrame,
    idx: int,
    is_valid: bool = False,
    log: bool = False
):
    """Predict the answer for row ``idx`` of ``df`` and store it in ``df``.

    Uses the module-level ``tokenizer``, ``model`` and ``device`` as well as
    the helpers ``find_index`` and ``jaccard``.

    Args:
        df: frame with the columns ``question`` and ``context``; when
            ``is_valid`` is true it must also hold ``answer_text`` (and
            ``answer_start`` if ``log`` is true). Modified in place.
        idx: index label of the row to process.
        is_valid: whether the row carries a ground-truth answer to score
            the prediction against.
        log: whether to print the prediction (and, for validation rows, the
            ground truth and the score).

    Side effects:
        Writes ``predict`` and ``pred_start`` for the row. For validation
        rows it also writes ``ans_start``, ``ans_end`` and ``score``.
        The ``*_start`` / ``*_end`` values are token positions inside the
        tokenized context (``-1`` when the span is not found).
    """
    row = df.loc[idx]
    question = row['question']
    text = row['context']

    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    # Only the first 512 token positions are fed to the model.
    input_ids = inputs["input_ids"][:, :512].to(device)
    attention_mask = inputs["attention_mask"][:, :512].to(device)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(input_ids, attention_mask=attention_mask)

    # Most likely first token of the answer.
    answer_start = torch.argmax(result['start_logits'])
    # Most likely last token of the answer; +1 makes it an exclusive slice bound.
    answer_end = torch.argmax(result['end_logits']) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(
            input_ids[0][answer_start:answer_end]
        )
    )
    answer = _strip_bad_tokens(answer)

    # Token ids of the context alone, used to locate answer spans below.
    context_ids = tokenizer(text)["input_ids"]

    df.loc[idx, 'predict'] = answer
    if is_valid:
        truth = row['answer_text']
        # [1:-1] drops the first and last id of the tokenized answer
        # (presumably the special tokens the tokenizer adds).
        start, end = find_index(tokenizer(truth)["input_ids"][1:-1], context_ids)
        score = jaccard(truth, answer)
        df.loc[idx, 'ans_start'] = start
        df.loc[idx, 'ans_end'] = end
        df.loc[idx, 'score'] = score
    df.loc[idx, 'pred_start'] = find_index(tokenizer(answer)["input_ids"][1:-1], context_ids)[0]

    if log:
        # Ground truth and score exist for validation rows only.
        if is_valid:
            char_start = row["answer_start"]
            print(f'Answer[{char_start} - {char_start + len(truth)}] : {truth}')
        print(f'Prediction[{int(answer_start)} - {int(answer_end)}] : {answer}\n')
        if is_valid:
            print(f'Score : {score}\n\n')

In [None]:
# Renumber the rows 0..n-1 so that the index labels used below start at 0.
val_df = val_df.reset_index(drop=True)

# Create the result columns with the dtype of the values they will receive
# (text, integer positions, float score) so that filling them row by row
# does not rely on implicit dtype upcasting.
val_df['predict'] = ''
val_df[['ans_start', 'ans_end', 'pred_start']] = 0
val_df['score'] = 0.0

# Predict and score every validation row.
for i in val_df.index:
    test_idx(val_df, i, is_valid=True)

In [None]:
# Display the 15 validation rows with the highest score.
val_df.sort_values('score', ascending=False).head(15)

In [None]:
# Report the mean of the per-row scores, rounded to two decimals.
print(f'Validation Score : {round(val_df.score.mean(), 2)}')

# Submission

In [None]:
test

In [None]:
# Renumber the rows 0..n-1 so that the index labels used below start at 0.
test = test.reset_index(drop=True)

# Create the result columns with the dtype of the values they will receive
# (text prediction, integer token position).
test['predict'] = ''
test['pred_start'] = 0

# Predict every test row; there is no ground truth, so nothing is scored.
for i in test.index:
    test_idx(test, i, is_valid=False)

In [None]:
test

In [None]:
# Copy each prediction into the submission rows that share its id.
for _, row in test.iterrows():
    same_id = sub['id'] == row['id']
    sub.loc[same_id, 'PredictionString'] = row['predict']

In [None]:
# Write only the data columns: without index=False pandas would add the row
# index as an extra unnamed first column, which the loaded sample
# submission does not have.
sub.to_csv('submission.csv', index=False)