## Import packages

In [None]:
!pip install torch
!pip install pandas nltk
!pip install transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import math
import numpy as np
import time
import torch, pandas as pd
import nltk
import re
nltk.download('punkt')

from transformers import set_seed
set_seed(123)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Training data file
# Base folder on the mounted Google Drive; later cells also read the test
# file from it and write the model checkpoint and submission file to it.
directory="/content/drive/MyDrive/Colab Notebooks/AIcup2022/"


# Read the training set; the CSV is decoded as ISO-8859-1 (Latin-1).
file=directory+"train.csv"
df=pd.read_csv(file, encoding = "ISO-8859-1")

In [None]:
file=directory+"test.csv"
df_test=pd.read_csv(file, encoding = "ISO-8859-1")

In [None]:
df = df.drop(columns=['Unnamed: 6', 'total no.: 7987'])

In [None]:
df_test

In [None]:
df

## Data processing

In [None]:
# train
# Remove leading/trailing double-quote characters from the four text columns
# (q, r and their annotated sub-spans q', r').
df[['q','r',"q'","r'"]] = df[['q','r',"q'","r'"]].apply(lambda x: x.str.strip('\"'))

In [None]:
# test
# Same quote stripping as for the training frame; the test frame only has
# the q and r text columns to clean here.
df_test[['q','r']] = df_test[['q','r']].apply(lambda x: x.str.strip('\"'))

In [None]:
# train
# Prepend the value of column `s` and a colon to every r text, so r becomes
# "<s>:<r>". Character offsets computed on r later include this prefix.
df['r'] = df['s'] + ':' + df['r']

In [None]:
# test
# Apply the same "<s>:<r>" prefixing to the test frame so both frames feed
# the tokenizer text in the same format.
df_test['r'] = df_test['s'] + ':' + df_test['r']

In [None]:
# train: flag rows whose annotated spans (q', r') occur verbatim in q / r
df['sub_q_true'] = [int(span in text) for span, text in zip(df["q'"], df["q"])]
df['sub_r_true'] = [int(span in text) for span, text in zip(df["r'"], df["r"])]
# 1 only when both spans were found, 0 otherwise
df['sub_both'] = df['sub_q_true'] * df['sub_r_true']

In [None]:
df

In [None]:
# train: keep only the rows where both annotated spans were found.
# .copy() makes `data` an independent frame, so the columns assigned to it
# afterwards do not raise pandas' SettingWithCopyWarning.
data = df.loc[df['sub_both'] == 1].copy()

In [None]:
# train: inclusive character offsets of each annotated span inside its text
q_starts = [text.index(span) for span, text in zip(data["q'"], data["q"])]
r_starts = [text.index(span) for span, text in zip(data["r'"], data["r"])]
data['q_start'] = q_starts
data['r_start'] = r_starts
# end offset = start + span length - 1 (index of the span's last character)
data['q_end'] = [start + len(span) - 1 for start, span in zip(q_starts, data["q'"])]
data['r_end'] = [start + len(span) - 1 for start, span in zip(r_starts, data["r'"])]

In [None]:
data

In [None]:
# test
df_test

In [None]:
train = data

In [None]:
# test
test = df_test

## Tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
train_data_q = train['q'].tolist()
test_data_q = test['q'].tolist()

train_data_r = train['r'].tolist()
test_data_r = test['r'].tolist()

In [None]:
# Encode every (q, r) pair as one two-sequence input: q is passed as the
# first text and r as the second. Truncation and padding are both enabled
# with the tokenizer's default strategies; train and test are encoded in
# separate calls (presumably each is padded to its own longest example --
# confirm against the tokenizer documentation).
train_encodings = tokenizer(train_data_q, train_data_r, truncation=True, padding=True)
test_encodings = tokenizer(test_data_q, test_data_r, truncation=True, padding=True)

In [None]:
tokenizer.decode(train_encodings['input_ids'][0])

In [None]:
tokenizer.decode(test_encodings['input_ids'][0])

In [None]:
# train
train_answer = train[['q_start', 'r_start',	'q_end', 'r_end']].to_dict('records')

In [None]:
def _end_token_position(encodings, batch_index, char_index, sequence_index, start_token):
    """Return the token index of an end character, walking left when needed.

    If ``char_index`` itself maps to no token, earlier characters of the SAME
    sequence are tried one by one. When nothing is found before the beginning
    of the text, ``start_token`` is returned so the span is never inverted.
    """
    while char_index >= 0:
        token = encodings.char_to_token(batch_index, char_index, sequence_index)
        if token is not None:
            return token
        char_index -= 1
    return start_token


def add_token_positions(encodings, answers):
    """Convert character-level span offsets into token-level positions.

    The results are stored on ``encodings`` (via ``update``) under the keys
    ``q_start``, ``r_start``, ``q_end`` and ``r_end``, one entry per example.

    Args:
        encodings: tokenizer output for (q, r) pairs. Must provide
            ``char_to_token(batch_index, char_index, sequence_index)`` and
            ``update``. q is looked up as sequence 0 and r as sequence 1.
        answers: list of dicts holding the integer character offsets
            ``q_start``, ``q_end``, ``r_start`` and ``r_end`` for each example.

    Fallbacks:
        * If a start character maps to no token, both the start and the end
          of that span are set to position 0.
        * If only the end character maps to no token, the nearest earlier
          character of the same sequence that has a token is used instead.
    """
    q_start, r_start, q_end, r_end = [], [], [], []

    for i, answer in enumerate(answers):
        # (sequence index, start key, end key, start list, end list)
        for sequence_index, start_key, end_key, starts, ends in (
            (0, 'q_start', 'q_end', q_start, q_end),
            (1, 'r_start', 'r_end', r_start, r_end),
        ):
            start_token = encodings.char_to_token(i, answer[start_key], sequence_index)
            if start_token is None:
                starts.append(0)
                ends.append(0)
                continue

            starts.append(start_token)
            # The sequence index must be passed here as well: without it the
            # fallback for r would search q's characters instead of r's.
            ends.append(
                _end_token_position(encodings, i, answer[end_key], sequence_index, start_token)
            )

    encodings.update({'q_start': q_start, 'r_start': r_start, 'q_end': q_end, 'r_end': r_end})

In [None]:
# Convert character-based positions to token-based positions:
# find the corresponding token index for each span boundary after tokenization
add_token_positions(train_encodings, train_answer)

In [None]:
train_encodings.keys()

In [None]:
test_encodings.keys()

In [None]:
# print(train_encodings['q_start'][0])
# print(train_encodings['r_start'][0])
# print(train_encodings['q_end'][0])
# print(train_encodings['r_end'][0])

## Dataset

In [None]:

class qrDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded example per index.

    ``encodings`` is a mapping from feature name to a per-example sequence
    and must contain the key ``input_ids``. Both a tokenizer output and a
    plain ``dict`` work, because only mapping access is used.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{feature name: tensor}`` for example ``idx``."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, taken from ``input_ids``."""
        # Item access (not attribute access) keeps this consistent with
        # __getitem__ and works for any mapping.
        return len(self.encodings['input_ids'])

In [None]:
train_dataset = qrDataset(train_encodings)
test_dataset = qrDataset(test_encodings)

In [None]:
next(iter(train_dataset))

In [None]:
next(iter(test_dataset))

## Model

In [None]:
from transformers import BertModel

class myModel(torch.nn.Module):
    """BERT encoder with a per-token linear head that emits four scores.

    The training and prediction code splits the last dimension of the output,
    in order, into q_start, r_start, q_end and r_end logits.

    Args:
        model_name: name or path of the pretrained BERT checkpoint to load.
            Defaults to ``'bert-base-cased'``.
    """

    def __init__(self, model_name='bert-base-cased'):
        super().__init__()

        # Attribute names `bert` and `fc` are part of the saved state_dict
        # keys, so they must stay as they are.
        self.bert = BertModel.from_pretrained(model_name)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so checkpoints with a different hidden size also work.
        self.fc = nn.Linear(self.bert.config.hidden_size, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the four per-token scores for a batch of encoded pairs."""
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        # First element of the encoder output: the per-token hidden states.
        sequence_output = output[0]

        return self.fc(sequence_output)



## Training

In [None]:
from tqdm import tqdm

# Set GPU / CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Put model on device
model = myModel().to(device)

# Use PyTorch's own AdamW. transformers.AdamW was deprecated and is no longer
# shipped in recent transformers releases, so importing it fails on a fresh
# install. eps and weight_decay are pinned to the old transformers defaults
# (1e-6 and 0.0) because torch's defaults differ (1e-8 and 0.01).
optim = torch.optim.AdamW(model.parameters(), lr=1e-6, eps=1e-6, weight_decay=0.0)

In [None]:
# Pack data into dataloader by batch
batch_size = 8
# Training batches are reshuffled; the test loader keeps the original order
# (shuffle=False) so predictions stay aligned with the rows of the test frame
# when the submission is assembled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
training_epoch = 4

In [None]:
loss_fct = CrossEntropyLoss()

In [None]:
# Fine-tune for `training_epoch` passes over the training set.
log_every = 200  # print the mean loss once per this many batches

for epoch in range(training_epoch):
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)
    for batch_id, batch in enumerate(loop):
        # clear gradients left over from the previous step
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        q_start = batch['q_start'].to(device)
        r_start = batch['r_start'].to(device)
        q_end = batch['q_end'].to(device)
        r_end = batch['r_end'].to(device)

        # model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # One slice of the last dimension per target, in this fixed order.
        q_start_logits, r_start_logits, q_end_logits, r_end_logits = torch.split(outputs, 1, 2)

        q_start_logits = q_start_logits.squeeze(-1).contiguous()
        r_start_logits = r_start_logits.squeeze(-1).contiguous()
        q_end_logits = q_end_logits.squeeze(-1).contiguous()
        r_end_logits = r_end_logits.squeeze(-1).contiguous()

        # Each boundary is a classification over token positions.
        q_start_loss = loss_fct(q_start_logits, q_start)
        r_start_loss = loss_fct(r_start_logits, r_start)
        q_end_loss = loss_fct(q_end_logits, q_end)
        r_end_loss = loss_fct(r_end_logits, r_end)

        loss = q_start_loss + r_start_loss + q_end_loss + r_end_loss

        # back-propagate
        loss.backward()
        # update parameters
        optim.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        # Report after every `log_every` batches, so each average covers
        # exactly that many batches. The first slot is the epoch (same
        # 0-based value as the progress-bar label), not the batch index.
        if (batch_id + 1) % log_every == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(
                epoch, batch_id + 1, running_loss / log_every))
            running_loss = 0.0

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=batch_loss)

In [None]:
torch.save(model.state_dict(), directory + 'aicup_model_4eplr16.pt')

In [None]:
# Rebuild the model and restore the fine-tuned weights. map_location remaps
# the stored tensors onto the current device, so a checkpoint saved on a GPU
# can also be loaded on a CPU-only runtime.
model = myModel().to(device)
model.load_state_dict(torch.load(directory + 'aicup_model_4eplr16.pt', map_location=device))

## Predict

In [None]:
def predict(test_loader):
    """Run the model over ``test_loader`` and decode the predicted spans.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        test_loader: iterable of batches with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors.

    Returns:
        A tuple ``(q_sub_output, r_sub_output, predict_pos)``:
        the decoded q spans, the decoded r spans, and for every example the
        predicted token positions ``(q_start, r_start, q_end, r_end)``.
        The decoded strings may still contain special tokens.
    """
    predict_pos = []
    q_sub_output, r_sub_output = [], []

    model.eval()

    # Inference only: no_grad avoids building autograd graphs, which saves
    # memory and time.
    with torch.no_grad():
        for batch in tqdm(test_loader, leave=True):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            # model output
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            # Best token position per output channel; the channels are, in
            # order, q_start, r_start, q_end, r_end.
            positions = torch.argmax(outputs, dim=1).cpu().numpy().tolist()

            for ids, (q_start_pos, r_start_pos, q_end_pos, r_end_pos) in zip(input_ids, positions):
                predict_pos.append((q_start_pos, r_start_pos, q_end_pos, r_end_pos))

                # The end position is inclusive, hence the +1 in the slice.
                q_sub_output.append(tokenizer.decode(ids[q_start_pos:q_end_pos + 1]))
                r_sub_output.append(tokenizer.decode(ids[r_start_pos:r_end_pos + 1]))

    return q_sub_output, r_sub_output, predict_pos



In [None]:
q_sub_output, r_sub_output, predict_pos = predict(test_loader)

In [None]:
def get_output_post_fn(df_test, q_sub_output, r_sub_output):
    """Clean special tokens out of the decoded span predictions.

    Args:
        df_test: the test set; only its length is used, to decide how many
            predictions to process.
        q_sub_output: decoded q spans, one string per test row.
        r_sub_output: decoded r spans, one string per test row.

    Returns:
        ``(q_sub, r_sub)``: two lists of cleaned strings.
        A q span is cut at its first ``[SEP]`` token (that token and
        everything after it are dropped). An r span has every ``[SEP]`` and
        ``[PAD]`` token removed. Tokens are re-joined with single spaces.
    """
    q_sub, r_sub = [], []
    # Use the df_test argument rather than a notebook-level variable, so the
    # function works on whatever test set it is given.
    for i in range(len(df_test)):
        q_tokens = q_sub_output[i].split()
        if '[SEP]' in q_tokens:
            q_tokens = q_tokens[:q_tokens.index('[SEP]')]

        r_tokens = [
            token for token in r_sub_output[i].split()
            if token not in ('[SEP]', '[PAD]')
        ]

        q_sub.append(" ".join(q_tokens))
        r_sub.append(" ".join(r_tokens))

    return q_sub, r_sub

In [None]:
q_sub, r_sub = get_output_post_fn(df_test, q_sub_output, r_sub_output)

In [None]:
# Wrap every cleaned q span in literal double quotes.
q = ["\"" + span + "\"" for span in q_sub]

In [None]:
# Wrap every cleaned r span in literal double quotes.
r = ["\"" + span + "\"" for span in r_sub]

In [None]:
# Assemble the submission table: the test ids next to the quoted q / r
# predictions, which are in the same row order as df_test.
# NOTE: this rebinds the name `data`, which earlier in the notebook held the
# filtered training frame.
data = {
    "id": df_test["id"],
    "q": q,
    "r": r
}
df_submit = pd.DataFrame(data)

In [None]:
df_submit

In [None]:
# Write the submission as UTF-8 CSV without the index column.
# newline="" is what pandas asks for when to_csv receives an open text
# handle: it stops the platform from translating the line endings pandas
# writes (which would otherwise produce blank rows on Windows).
with open(directory + "predict_test.4eplr16.csv", "w", encoding="utf-8", newline="") as f:
  df_submit.to_csv(f, index=False)