## Master Proposal
- Stack several fine-tuned models, each of which makes its own prediction
    - distilbert
    - distilroberta
    - distilroberta + augmentation
- Add two linear layers on top of the stacked predictions as the final classifier

## Setup

In [None]:
import os, sys
from google.colab import drive

# Mount Google Drive so the project folder becomes reachable from the Colab runtime.
drive.mount('/content/drive')
# Work from the project root so the relative paths used below ('data/...', 'models/...') resolve.
os.chdir('/content/drive/MyDrive/NYCU NLP Final/')
# Make modules stored in the project folder importable
# (presumably including the `transforms` module imported further down - confirm).
sys.path.append('/content/drive/MyDrive/NYCU NLP Final/')

In [None]:
# Install the Hugging Face libraries imported below; pip's stdout is discarded to keep the cell output short.
!pip install transformers datasets > /dev/null

In [None]:
from typing import Tuple, List, Dict, Union, Callable, Optional
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch

In [None]:
# parameters
# Seed shared by set_seed() and by the Trainer (seed / data_seed).
SEED = 42

# model
# Fine-tuned base classifiers, in ensemble slot order. The order must line up with
# the encodings produced by EnsembledDataset: slot 0 uses the DistilBERT tokenizer,
# slot 1 the DistilRoBERTa tokenizer, slot 2 the augmentation/encoding pipeline.
BASE_MODEL_CKPTS=[
    'models/both_distilbert',
    'models/both_distilroberta',
    'models/both_aug_distilroberta',
]

# Optimisation settings forwarded to TrainingArguments below.
EPOCHS=5
TRAIN_BATCH_SIZE=16
VALID_BATCH_SIZE=64
LEARNING_RATE=1e-4
WARMUP=400  # passed as warmup_steps

# Output directory for Trainer checkpoints.
MODEL_SAVE_DIR = '0610_master_test'
# NOTE(review): CHECKPOINT is referenced by the (commented-out) state-dict reload cell
# and by the submission export cell; it must be uncommented before those run.
# CHECKPOINT = 'checkpoint-4884'

In [None]:
import random

def set_seed(seed: Optional[int] = None):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Seed value to apply. When omitted, the notebook-level ``SEED``
            constant is used, so existing ``set_seed()`` calls keep working.
    """
    if seed is None:
        # Resolved at call time so a later change to SEED is picked up.
        seed = SEED
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed()

## Read Data

In [None]:
# Load the train / validation / test splits (in that order) from the data folder.
traindf, validdf, testdf = (
    pd.read_csv(csv_path)
    for csv_path in ('data/new_train.csv', 'data/new_valid.csv', 'data/new_test.csv')
)

In [None]:
# Report the number of rows in each split, one per line.
print(
    f'# train: {len(traindf)}',
    f'# valid: {len(validdf)}',
    f'# test: {len(testdf)}',
    sep='\n',
)

In [None]:
# Distinct label values that occur in the training split, and how many there are.
# NOTE(review): n_labels is derived from the data rather than from len(sent_id);
# the two agree only if every label occurs in the training split - confirm.
classes = traindf['label'].unique()
n_labels = len(classes)

# Emotion name -> integer id (32 entries, ids 0..31).
# NOTE(review): the 'label' column is fed straight into torch.tensor by the dataset,
# so it presumably already holds these integer ids - confirm.
sent_id = {
    'sad':      0,  'trusting':     1,  'terrified': 2,  'caring':      3,  'disappointed': 4, 
    'faithful': 5,  'joyful':       6,  'jealous':   7,  'disgusted':   8,  'surprised':    9, 
    'ashamed':  10, 'afraid':       11, 'impressed': 12, 'sentimental': 13, 'devastated':   14, 
    'excited':  15, 'anticipating': 16, 'annoyed':   17, 'anxious':     18, 'furious':      19, 
    'content':  20, 'lonely':       21, 'angry':     22, 'confident':   23, 'apprehensive': 24, 
    'guilty':   25, 'embarrassed':  26, 'grateful':  27, 'hopeful':     28, 'proud':        29, 
    'prepared': 30, 'nostalgic':    31
}

# Inverse lookup: integer id -> emotion name.
id_sent = {v: k for k, v in sent_id.items()}

## Build Dataset

In [None]:
from transformers import PreTrainedTokenizerBase, AutoTokenizer
from transforms import (
    Tokenization,
    Encoding,
    RandomDeletion,
    RandomSwap,
    RandomMask,
)

class EnsembledDataset(torch.utils.data.Dataset):
    """Dataset that yields one encoding per base model of the ensemble.

    Every item stacks three encodings of the same (prompt, conv) pair along a
    leading "model slot" axis:

    * slot 0 - encoded up front with ``distilbert_tokenizer``
    * slot 1 - encoded up front with ``distilroberta_tokenizer``
    * slot 2 - produced on every access by ``augmentation``

    Args:
        df: Frame with ``prompt`` and ``conv`` columns and, optionally, a
            ``label`` column.
        distilbert_tokenizer: Callable tokenizer used for slot 0.
        distilroberta_tokenizer: Callable tokenizer used for slot 1.
        augmentation: Callable that receives ``[prompt, conv]`` and returns a
            mapping with ``input_ids`` and ``attention_mask``. Its sequences
            must have the same length as the pre-computed encodings,
            otherwise the three slots cannot be stacked into one tensor.
        maxlen: ``max_length`` forwarded to both tokenizers (which are called
            with ``padding='max_length'`` and ``truncation=True``).
    """

    def __init__(self, df, distilbert_tokenizer, distilroberta_tokenizer, augmentation, maxlen: Optional[int] = None):
        self.size = len(df)
        self.df = df.copy()
        self.augmentation = augmentation

        # Build the [prompt, conv] pairs once: both tokenizers consume them and
        # __getitem__ reuses them instead of a per-item DataFrame lookup.
        self._text_pairs = df[['prompt', 'conv']].values.tolist()
        tokenizer_kwargs = dict(add_special_tokens=True, max_length=maxlen, padding='max_length', truncation=True)
        self.encodings = [
            distilbert_tokenizer(self._text_pairs, **tokenizer_kwargs),
            distilroberta_tokenizer(self._text_pairs, **tokenizer_kwargs),
        ]

        # Labels are optional so the same class serves unlabeled data.
        self.labels = None
        if 'label' in df.columns:
            self.labels = df['label'].values.tolist()

    def __getitem__(self, idx):
        """Return the stacked encodings (and the label, if any) for row ``idx``."""
        # Hand the augmentation its own list so in-place edits cannot leak
        # into the cached pairs.
        aug_enc = self.augmentation(list(self._text_pairs[idx]))

        item = {
            key: torch.tensor([enc[key][idx] for enc in self.encodings] + [aug_enc[key]])
            for key in ('input_ids', 'attention_mask')
        }

        # Explicit None check: label presence is decided by the column, not by
        # the truthiness of the label list.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return self.size

In [None]:
# Tokenizers for the two base architectures used in the ensemble.
distilbert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
distilroberta_tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

# Register the speaker markers as additional special tokens on both tokenizers.
# NOTE(review): this extends each vocabulary, so the checkpoints listed in
# BASE_MODEL_CKPTS presumably were fine-tuned with matching (resized) embeddings - confirm.
special_tokens_dict = {'additional_special_tokens': ['[SPEAKER_A]', '[SPEAKER_B]']}
distilbert_tokenizer.add_special_tokens(special_tokens_dict)
distilroberta_tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# Plain pipeline for the third ensemble slot (used for validation and test):
# tokenize with the DistilRoBERTa tokenizer, then encode with max_length=512.
# NOTE(review): Tokenization / Encoding come from the project's `transforms` module;
# their exact behaviour is not visible here - confirm.
encoding = torch.nn.Sequential(
    Tokenization(distilroberta_tokenizer),
    Encoding(distilroberta_tokenizer, max_length=512)
)

# Training-time pipeline for the third slot: the same tokenize/encode steps with
# random deletion, swap and masking applied in between.
# NOTE(review): the meaning of `rate` / `n_swap` is defined in `transforms` - confirm.
augmentation = torch.nn.Sequential(
    Tokenization(distilroberta_tokenizer),
    RandomDeletion(distilroberta_tokenizer, rate=0.1),
    RandomSwap(distilroberta_tokenizer, n_swap=1),
    RandomMask(distilroberta_tokenizer, rate=0.1),
    Encoding(distilroberta_tokenizer, max_length=512)
)

# Only the training split gets the random augmentation; validation and test use the
# plain encoding pipeline. maxlen matches the pipelines' max_length so the three
# per-model encodings of an item can be stacked into one tensor.
train_dataset = EnsembledDataset(traindf, distilbert_tokenizer, distilroberta_tokenizer, augmentation, maxlen=512)
valid_dataset = EnsembledDataset(validdf, distilbert_tokenizer, distilroberta_tokenizer, encoding, maxlen=512)
test_dataset = EnsembledDataset(testdf, distilbert_tokenizer, distilroberta_tokenizer, encoding, maxlen=512)

In [None]:
from datasets import load_metric

# Metric objects shared by compute_metrics() and evaluate_f1().
# NOTE(review): `datasets.load_metric` may be unavailable in recent `datasets`
# releases - confirm the installed version still provides it.
metric_precision = load_metric('precision')
metric_recall = load_metric('recall')
metric_f1 = load_metric('f1')

def compute_metrics(eval_pred):
    """Compute macro-averaged precision, recall and F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict: Scores under the keys ``'Precision'``, ``'Recall'`` and ``'F1'``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    # All three metrics are scored on the same inputs with the same averaging.
    shared_kwargs = dict(predictions=predicted_ids, references=labels, average='macro')
    return {
        'Precision': metric_precision.compute(**shared_kwargs)['precision'],
        'Recall': metric_recall.compute(**shared_kwargs)['recall'],
        'F1': metric_f1.compute(**shared_kwargs)['f1'],
    }

## Build Model

In [None]:
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput

class StackedEnsembleModel(torch.nn.Module):
    """Stacking head that combines the logits of several base classifiers.

    Each base model scores its own slot of the input. The per-model logits
    are laid side by side and passed through linear -> dropout -> linear to
    produce the final ``n_labels`` logits.

    Note: ``base_models`` is stored as given. When it is a plain Python list
    (as the annotation suggests) the base models are not registered as
    submodules, so they do not show up in ``parameters()`` or ``state_dict()``.

    Args:
        base_models: Models that accept ``input_ids`` / ``attention_mask``
            keyword arguments and return an object with a ``logits`` attribute.
        n_labels: Number of target classes.
    """

    def __init__(self, base_models: List, n_labels: int):
        super().__init__()
        self.n_models = len(base_models)
        self.n_labels = n_labels

        # One logit per label from every base model.
        self.n_hidden_state = self.n_models * self.n_labels

        self.base_models = base_models

        width = self.n_hidden_state
        head_layers = OrderedDict()
        head_layers['pre-classifier'] = torch.nn.Linear(in_features=width, out_features=width, bias=True)
        head_layers['dropout'] = torch.nn.Dropout(p=0.2)
        head_layers['classifier'] = torch.nn.Linear(in_features=width, out_features=n_labels, bias=True)
        self.classifier = torch.nn.Sequential(head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Score a batch and, when ``labels`` is given, compute the cross-entropy loss.

        ``input_ids`` and ``attention_mask`` are indexed as ``[:, slot]`` to
        pick the encoding that belongs to each base model.
        """
        per_model_logits = []
        for slot in range(self.n_models):
            base_output = self.base_models[slot](
                input_ids=input_ids[:, slot],
                attention_mask=attention_mask[:, slot],
            )
            per_model_logits.append(base_output.logits)

        # Concatenate the per-model logits column-wise and feed the stacking head.
        logits = self.classifier(torch.hstack(per_model_logits))

        if labels is None:
            loss = None
        else:
            loss = CrossEntropyLoss()(logits.view(-1, self.n_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [None]:
# load fine-tuned model
from transformers import AutoModelForSequenceClassification, AutoConfig

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on an unconditional move to 'cuda'.
base_model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One fine-tuned classifier per checkpoint, in BASE_MODEL_CKPTS order.
base_models = [AutoModelForSequenceClassification.from_pretrained(ckpt_path) for ckpt_path in BASE_MODEL_CKPTS]
# The loop variable is deliberately not called `model`, which names the ensemble below.
for base_model in base_models:
    # Freeze every parameter of the base model so it receives no gradient updates.
    for p in base_model.parameters():
        p.requires_grad = False
    # The base models are moved explicitly here rather than left to the Trainer.
    base_model.to(base_model_device)

In [None]:
# Wrap the frozen base models in the stacking head that is trained below.
model = StackedEnsembleModel(base_models, n_labels)

In [None]:
# model.load_state_dict(torch.load(os.path.join(MODEL_SAVE_DIR, CHECKPOINT, 'pytorch_model.bin')))

## Build Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Log, evaluate and checkpoint once per epoch; after training the checkpoint with the
# best 'F1' (the key returned by compute_metrics) is reloaded.
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_DIR,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    lr_scheduler_type='cosine',
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="F1",
    seed=SEED,
    data_seed=SEED
)

# Train on the augmented training set and evaluate on the non-augmented validation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

## Training

In [None]:
# Run the training loop configured above.
trainer.train()

## Prediction & Evaluation

In [None]:
def post_processing(logits, threshold: float = 1., steps: int = 0):
    """Pick one label per row, optionally promoting the runner-up class.

    Args:
        logits (Union[List, np.array]): the output hypothesis of the model,
            shaped (n_data, n_classes); nested lists, arrays and tensors are accepted
        threshold (float): top2 > top1 * threshold then top2 will be the result
            (compared on softmax probabilities)
        steps (int): how many candidates should be tested; currently it only
            takes part in the default short-circuit, checking further
            candidates is not implemented

    Returns:
        torch.Tensor: 1-D integer tensor holding one class index per row.
    """
    # as_tensor accepts lists as well as arrays/tensors (a list has no .shape).
    scores = torch.as_tensor(logits)
    if not torch.is_floating_point(scores):
        # softmax is not defined for integer inputs
        scores = scores.float()
    probs = torch.softmax(scores, dim=-1)

    # Classes of every row ordered from most to least probable.
    sorted_probs, ranked = torch.sort(probs, dim=-1, descending=True)
    result = ranked[:, 0].clone()  # label predictions

    # default result is just argmax, no candidates will be checked
    if threshold == 1. and steps == 0:
        return result

    # With a single class there is no runner-up to promote.
    if probs.shape[-1] < 2:
        return result

    # check if the second ans satisfies the threshold (all rows at once)
    promote = sorted_probs[:, 1] > sorted_probs[:, 0] * threshold
    result = torch.where(promote, ranked[:, 1], result)

    # TODO: check the absent labels

    return result

def evaluate_f1(preds, labels, average='macro'):
    """Score predictions against reference labels.

    Args:
        preds: Predicted class ids.
        labels: Reference class ids.
        average: Averaging mode forwarded to each metric's ``compute``.

    Returns:
        dict: Scores under the keys ``'Precision'``, ``'Recall'`` and ``'F1'``.
    """
    scores = {}
    # (key in the returned dict, metric object, key in the metric's own result)
    for display_name, metric, result_key in (
        ('Precision', metric_precision, 'precision'),
        ('Recall', metric_recall, 'recall'),
        ('F1', metric_f1, 'f1'),
    ):
        outcome = metric.compute(predictions=preds, references=labels, average=average)
        scores[display_name] = outcome[result_key]
    return scores

In [None]:
# Run the trained ensemble on the test split; the raw scores are read from `.predictions` below.
test_preds = trainer.predict(test_dataset)

In [None]:
# Turn the raw scores into one predicted class id per test row (plain argmax by default).
test_ans = post_processing(test_preds.predictions)
# post_processing returns a torch tensor; convert it to a NumPy array explicitly so the
# column holds plain integers instead of relying on pandas' implicit tensor coercion.
testdf['pred'] = np.asarray(test_ans)

In [None]:
submission = pd.read_csv('data/fixed_test.csv')
# Look up every conversation's prediction in one keyed pass instead of scanning the
# whole submission frame once per test row.
# keep='last' mirrors the row-by-row overwrite: if a conv_id occurs more than once in
# testdf, its last prediction wins.
pred_by_conv = testdf.drop_duplicates(subset='conv_id', keep='last').set_index('conv_id')['pred']
# Conversations without a prediction keep the default class id 0.
submission['pred'] = submission['conv_id'].map(pred_by_conv).fillna(0).astype(int)

In [None]:
# Display the submission frame for a visual check before exporting it.
submission

In [None]:
# CHECKPOINT only exists when its assignment in the parameters cell is uncommented.
# Otherwise fall back to the best checkpoint tracked by the Trainer, or to a fixed tag,
# so the export no longer stops with a NameError.
checkpoint_name = globals().get('CHECKPOINT') or trainer.state.best_model_checkpoint or 'checkpoint-final'
# Keep only the trailing step tag, e.g. 'checkpoint-4884' -> '4884'; basename drops any
# directory part so a '-' in the save directory cannot interfere.
checkpoint_step = os.path.basename(os.path.normpath(checkpoint_name)).split('-')[-1]
# Make sure the target folder exists before writing.
os.makedirs('output', exist_ok=True)
submission[['pred']].to_csv(f'output/20220609_ckpt{checkpoint_step}_submission.csv', encoding='utf8')