## Setup

In [None]:
import os, sys
from google.colab import drive

# Project folder on the mounted Drive. It becomes the working directory (so the
# relative data/ and output/ paths used later resolve) and is appended to
# sys.path (presumably so project-local modules such as `transforms` import).
PROJECT_DIR = '/content/drive/MyDrive/NYCU NLP Final/'

drive.mount('/content/drive')
os.chdir(PROJECT_DIR)
sys.path.append(PROJECT_DIR)

In [None]:
# Install the Hugging Face libraries used below; pip's stdout is sent to /dev/null.
# NOTE(review): no versions are pinned. Later cells rely on `datasets.load_metric`
# and the `evaluation_strategy` argument, which newer releases may no longer
# provide -- consider pinning known-good versions.
!pip install transformers datasets > /dev/null

In [None]:
from typing import List, Dict, Union, Callable, Optional

import numpy as np
import pandas as pd
import torch

In [None]:
# ---- Experiment parameters ----

# Seed shared by the RNG helper and the Trainer arguments.
SEED = 42

# Backbone checkpoint and dropout rates (forwarded to the model config as
# hidden_dropout_prob and classifier_dropout respectively).
MODEL_NAME = 'distilroberta-base'
HIDDEN_DROPOUT = 0.1
DROPOUT = 0.2

# Training length and per-device batch sizes.
EPOCHS = 10
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 64

# Directory the Trainer writes checkpoints to.
MODEL_SAVE_DIR = '0610_base_test'
# Uncomment to point at one specific saved checkpoint inside MODEL_SAVE_DIR:
# CHECKPOINT = 'checkpoint-4884'

In [None]:
import random

def set_seed(seed: Optional[int] = None) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value used for all three generators. When omitted, the
            notebook-level ``SEED`` constant is used, so a bare ``set_seed()``
            call behaves exactly as it did before the parameter existed.
    """
    if seed is None:
        seed = SEED
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

# Seed the random / numpy / torch generators once, before any data or model cell runs.
set_seed()

## Read Data

In [None]:
# Load the train / valid / test splits; paths are relative to the working directory.
traindf, validdf, testdf = (
    pd.read_csv(csv_path)
    for csv_path in ('data/new_train.csv', 'data/new_valid.csv', 'data/new_test.csv')
)

In [None]:
# Report the number of rows in each split, one line per split.
print(
    f'# train: {len(traindf)}',
    f'# valid: {len(validdf)}',
    f'# test: {len(testdf)}',
    sep='\n',
)

In [None]:
# Distinct values of the training 'label' column, and how many there are.
classes = pd.unique(traindf['label'])
n_labels = len(classes)

# Emotion names listed in id order: a name's position in this tuple is its class id.
_EMOTION_NAMES = (
    'sad', 'trusting', 'terrified', 'caring', 'disappointed',
    'faithful', 'joyful', 'jealous', 'disgusted', 'surprised',
    'ashamed', 'afraid', 'impressed', 'sentimental', 'devastated',
    'excited', 'anticipating', 'annoyed', 'anxious', 'furious',
    'content', 'lonely', 'angry', 'confident', 'apprehensive',
    'guilty', 'embarrassed', 'grateful', 'hopeful', 'proud',
    'prepared', 'nostalgic',
)

# name -> id
sent_id = {name: idx for idx, name in enumerate(_EMOTION_NAMES)}

# id -> name (inverse of sent_id)
id_sent = dict(enumerate(_EMOTION_NAMES))

## Build Dataset

In [None]:
# class PromptConvDataset(torch.utils.data.Dataset):
#     def __init__(self, df, tokenizer: PreTrainedTokenizerBase):
#         self.size = len(df)
#         self.encoded_dict = tokenizer(df[['prompt', 'conv']].values.tolist(), 
#                                       add_special_tokens=True, 
#                                       padding=True, 
#                                       truncation=True)
        
#         if 'label' in df.columns:
#             self.labels = df['label'].values.tolist()

#     def __getitem__(self, idx):
#         item = {k: torch.tensor(v[idx]) for k, v in self.encoded_dict.items()}

#         if self.labels:
#             item['labels'] = torch.tensor(self.labels[idx])
        
#         return item

#     def __len__(self):
#         return self.size

class PromptConvDataset(torch.utils.data.Dataset):
    """Dataset serving (prompt, conv) text pairs from a DataFrame.

    Args:
        df: Frame with ``prompt`` and ``conv`` columns and, optionally, a
            ``label`` column.
        transform: Optional callable that receives ``[prompt, conv]`` and
            returns a dict-like of encoded features; every value is converted
            to a ``torch.Tensor``.
    """

    def __init__(self, df, transform=None):
        self.size = len(df)
        self.df = df
        self.transform = transform

    def __getitem__(self, idx):
        """Return the example at positional index ``idx``.

        Returns:
            * with a transform: dict of tensors, plus ``'labels'`` when the
              frame has a ``label`` column;
            * without a transform and without labels: the raw
              ``[prompt, conv]`` list;
            * without a transform but with labels: a dict with ``'prompt'``,
              ``'conv'`` and ``'labels'`` (this combination used to raise
              ``TypeError`` because a label was assigned into a list).
        """
        # One positional lookup, reused for both the text pair and the label.
        row = self.df.iloc[idx]
        item = row[['prompt', 'conv']].values.tolist()

        if self.transform:
            encoding = self.transform(item)  # encoded dict
            item = {k: torch.tensor(v) for k, v in encoding.items()}

        if 'label' in self.df.columns:
            label = torch.tensor(row['label'])
            if isinstance(item, dict):
                item['labels'] = label
            else:
                # No transform ran, so `item` is still the raw text pair.
                prompt, conv = item
                item = {'prompt': prompt, 'conv': conv, 'labels': label}

        return item

    def __len__(self):
        """Number of rows in the frame at construction time."""
        return self.size

In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the chosen backbone.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register the two speaker markers as additional special tokens.
special_tokens_dict = dict(additional_special_tokens=['[SPEAKER_A]', '[SPEAKER_B]'])
tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
from transforms import (
    Tokenization,
    RandomDeletion,
    RandomSwap,
    RandomMask,
    Encoding
)

# Plain pipeline: tokenization followed by encoding (Encoding gets max_length=512).
encoding = torch.nn.Sequential(
    Tokenization(tokenizer),
    Encoding(tokenizer, max_length=512),
)

# Augmenting pipeline: the same two ends, with random deletion, swap and mask
# steps inserted between them.
augmentation = torch.nn.Sequential(
    Tokenization(tokenizer),
    RandomDeletion(tokenizer, rate=0.1),
    RandomSwap(tokenizer, n_swap=1),
    RandomMask(tokenizer, rate=0.1),
    Encoding(tokenizer, max_length=512),
)

# All three splits use the plain pipeline. To train on augmented data, build
# the training dataset with `augmentation` instead of `encoding`.
train_dataset, valid_dataset, test_dataset = (
    PromptConvDataset(split_df, encoding)
    for split_df in (traindf, validdf, testdf)
)

In [None]:
from datasets import load_metric

# One metric object per score, loaded by name.
metric_precision, metric_recall, metric_f1 = (
    load_metric(metric_name) for metric_name in ('precision', 'recall', 'f1')
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``'Precision'``, ``'Recall'`` and ``'F1'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # (key in the returned dict, metric object, key in the metric's own result)
    scorers = (
        ('Precision', metric_precision, 'precision'),
        ('Recall', metric_recall, 'recall'),
        ('F1', metric_f1, 'f1'),
    )
    return {
        report_key: scorer.compute(predictions=predicted, references=labels, average='macro')[result_key]
        for report_key, scorer, result_key in scorers
    }

## Build Model

In [None]:
# load raw model
from transformers import AutoModelForSequenceClassification, AutoConfig

# Settings layered on top of the pretrained config: dropout rates, the size of
# the classification head and the label <-> id maps.
config_overrides = dict(
    hidden_dropout_prob=HIDDEN_DROPOUT,
    classifier_dropout=DROPOUT,
    num_labels=n_labels,
    id2label=id_sent,
    label2id=sent_id,
)
config = AutoConfig.from_pretrained(MODEL_NAME, **config_overrides)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Resize the token embeddings to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# load fine-tuned model
# from transformers import AutoModelForSequenceClassification, AutoConfig

# model = AutoModelForSequenceClassification.from_pretrained(os.path.join(MODEL_SAVE_DIR, CHECKPOINT))

## Build Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    # -- bookkeeping: log, evaluate and checkpoint once per epoch
    output_dir=MODEL_SAVE_DIR,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # -- model selection: 'F1' is the key produced by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model='F1',
    # -- run length and batch sizes
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    # -- learning-rate schedule and regularisation
    lr_scheduler_type='cosine',
    warmup_steps=1000,
    weight_decay=0.01,
    # -- reproducibility
    seed=SEED,
    data_seed=SEED,
)

# Wire the model, arguments, data splits and metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

## Training

In [None]:
# Run fine-tuning. The training arguments request logging, evaluation and a
# checkpoint at every epoch, and reloading of the best model at the end.
trainer.train()

## Prediction & Evaluation

In [None]:
def post_processing(logits, threshold: float = 1., steps: int = 0):
    """Turn raw model outputs into label predictions, optionally preferring the runner-up.

    Every row is softmax-normalised. The top-1 class is the default answer;
    when re-ranking is enabled, a row switches to its second-ranked class if
    ``p(top2) > p(top1) * threshold``.

    Setting threshold=1 and steps=0 (the defaults) always takes the top-1
    candidate as the answer.

    Args:
        logits (Union[List, np.ndarray, torch.Tensor]): 2-D scores with one
            row per example and one column per class.
        threshold (float): multiplier applied to the top-1 probability before
            it is compared with the top-2 probability.
        steps (int): how many extra candidates should be tested. Only the
            second candidate is ever examined; the value matters solely in
            that ``steps=0`` together with ``threshold=1`` selects the plain
            argmax fast path.

    Returns:
        torch.Tensor: 1-D tensor holding one predicted class index per row.

    Raises:
        ValueError: if ``logits`` is not 2-D.
    """
    # as_tensor accepts lists, arrays and tensors alike (the old `.shape`
    # access broke on plain lists) and does not re-wrap an existing tensor.
    scores = torch.as_tensor(logits)
    if not torch.is_floating_point(scores):
        scores = scores.float()  # softmax is only defined for floating dtypes
    if scores.dim() != 2:
        raise ValueError(f'logits must be 2-D (n_data, n_classes), got shape {tuple(scores.shape)}')

    probs = torch.softmax(scores, dim=-1)
    # Only the two best candidates per row are ever consulted.
    ranked = torch.argsort(probs, dim=-1, descending=True)[:, :2]
    result = ranked[:, 0].clone()  # label predictions

    # Default result is just argmax; a single-class input has no runner-up.
    if (threshold == 1 and steps == 0) or ranked.shape[1] < 2:
        return result

    # Compare all rows at once instead of looping over them in Python.
    top1_prob = probs.gather(1, ranked[:, :1]).squeeze(1)
    top2_prob = probs.gather(1, ranked[:, 1:2]).squeeze(1)
    take_second = top2_prob > top1_prob * threshold
    return torch.where(take_second, ranked[:, 1], result)

def evaluate_f1(preds, labels, average='macro'):
    """Compute precision, recall and F1 for already-decoded predictions.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids.
        average: Averaging mode forwarded unchanged to every metric.

    Returns:
        dict with the keys ``'Precision'``, ``'Recall'`` and ``'F1'``.
    """
    def _score(metric, result_key):
        # All three metrics are called with identical arguments.
        return metric.compute(predictions=preds, references=labels, average=average)[result_key]

    return {
        'Precision': _score(metric_precision, 'precision'),
        'Recall': _score(metric_recall, 'recall'),
        'F1': _score(metric_f1, 'f1'),
    }

In [None]:
# Run the trainer's model over the validation split; the result's
# `.predictions` and `.label_ids` are consumed in the next cell.
eval_preds = trainer.predict(valid_dataset)

In [None]:
# Default post-processing, i.e. the top-1 class per row.
preds = post_processing(eval_preds.predictions)
# average=None is forwarded to the metric -- presumably yielding one F1 value
# per class rather than a single macro score; confirm against the metric docs.
valid_f1 = evaluate_f1(preds, eval_preds.label_ids, average=None)['F1']

In [None]:
# Run the trainer's model over the test split.
test_preds = trainer.predict(test_dataset)

In [None]:
# Decode test predictions with the default (top-1) post-processing and store
# them on the test frame; this adds a 'pred' column to `testdf` in place.
test_ans = post_processing(test_preds.predictions)
testdf['pred'] = test_ans

In [None]:
# Copy each conversation's prediction onto the submission template, matching
# rows by conv_id. A keyed lookup replaces the previous iterrows() loop, which
# scanned the whole submission frame once per test row.
submission = pd.read_csv('data/fixed_test.csv')

# Build a conv_id -> pred lookup. Rows without a conv_id can never match, and
# when a conv_id occurs more than once the last row wins, exactly as repeated
# overwriting did.
pred_by_conv = (
    testdf.dropna(subset=['conv_id'])
          .drop_duplicates(subset='conv_id', keep='last')
          .set_index('conv_id')['pred']
)

# conv_ids with no prediction keep the 0 placeholder; the column stays float,
# as it was when initialised from np.zeros.
submission['pred'] = submission['conv_id'].map(pred_by_conv).fillna(0).astype(float)

In [None]:
# Display the filled submission frame for a visual check.
submission

In [None]:
# The file name carries the step number of the checkpoint that produced it.
# CHECKPOINT only exists when its line in the parameters cell is uncommented,
# so otherwise fall back to the best checkpoint recorded by the trainer.
try:
    checkpoint_name = CHECKPOINT
except NameError:
    checkpoint_name = trainer.state.best_model_checkpoint or 'checkpoint-unknown'

# Computed outside the f-string: reusing the f-string's own quote character
# inside the braces is a SyntaxError before Python 3.12.
checkpoint_step = checkpoint_name.split('-')[-1]
submission[['pred']].to_csv(f'output/20220526_ckpt{checkpoint_step}_submission.csv', encoding='utf8')