## Baseline Proposal
- Vanilla BERT as our baseline
- only consider the conversations, exlude prompts
- Use Adam as our optimizer

## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

os.chdir('/content/drive/MyDrive/NYCU NLP Final/')

In [3]:
!pip install transformers datasets > /dev/null

In [4]:
import numpy as np
import pandas as pd
import torch

In [5]:
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

torch.cuda.is_available()

True

In [6]:
# device = torch.device(0)
# torch.cuda.set_device(device)
# print(f'{device} is now being set.')

In [7]:
# parameters
SEED = 42
# N_SAMPLES_PER_LABEL = 377  # the smallest label count
MODEL_NAME='roberta-base'
EPOCHS=50
TRAIN_BATCH_SIZE=16
VALID_BATCH_SIZE=64

In [8]:
import random

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fa58b3e0b50>

## Read data

In [9]:
traindf = pd.read_csv('data/new_train.csv')
validdf = pd.read_csv('data/new_valid.csv')
testdf = pd.read_csv('data/new_test.csv')

In [10]:
print(f'# train: {len(traindf)}')
print(f'# valid: {len(validdf)}')
print(f'# test: {len(testdf)}')

# train: 19533
# valid: 2770
# test: 2547


In [11]:
classes = traindf['label'].unique()
n_labels = len(classes)

## Tokenization & Dataset

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

special_tokens_dict = {'additional_special_tokens': ['[SPEAKER_A]', '[SPEAKER_B]']}
tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

2

In [13]:
class PromptDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.size = len(df)
        self.features = tokenizer(df['prompt'].values.tolist(), truncation=True, padding=True)
        self.labels = df['label'].values.tolist() if ('label' in df.columns) else None

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.features.items()}
        if self.labels:
          item['labels'] = torch.tensor(self.labels[idx])
        
        return item

    def __len__(self):
        return self.size

class PromptConvDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.size = len(df)
        self.features = tokenizer((df['prompt'] + ' [SEP] ' + df['conv']).values.tolist(), truncation=True, padding=True)
        self.labels = df['label'].values.tolist() if ('label' in df.columns) else None

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.features.items()}
        if self.labels:
          item['labels'] = torch.tensor(self.labels[idx])
        
        return item

    def __len__(self):
        return self.size

train_dataset = PromptConvDataset(traindf, tokenizer)
valid_dataset = PromptConvDataset(validdf, tokenizer)
test_dataset = PromptConvDataset(testdf, tokenizer)

## Model Training

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=n_labels)
model.resize_token_embeddings(len(tokenizer))

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Embedding(50267, 768)

In [15]:
from datasets import load_metric

metric_acc = load_metric('accuracy')
metric_f1 = load_metric('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)['accuracy']
    f1_score = metric_f1.compute(predictions=predictions, references=labels, average='macro')['f1']
    return {'accuracy': acc, 'F1': f1_score}

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='roberta_baseline',
    logging_dir='roberta_baseline_logs',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 19533
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 61050


Epoch,Training Loss,Validation Loss


## Prediction & Evaluation

In [40]:
from datasets import load_metric

metric_acc = load_metric('accuracy')
metric_f1 = load_metric('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)['accuracy']
    f1_score = metric_f1.compute(predictions=predictions, references=labels, average='macro')['f1']
    return {'accuracy': acc, 'F1': f1_score}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# load local model
model = AutoModelForSequenceClassification.from_pretrained('roberta_baseline/checkpoint-10000')

training_args = TrainingArguments(
    output_dir='roberta_baseline',
    logging_dir='roberta_baseline_logs',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

In [None]:
eval_preds = trainer.predict(valid_dataset)

***** Running Prediction *****
  Num examples = 2770
  Batch size = 64


In [None]:
compute_metrics((eval_pred.predictions, eval_pred.label_ids))

{'F1': 0.5827597979066158, 'accuracy': 0.5895306859205777}

In [45]:
test_preds = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 2547
  Batch size = 64


In [None]:
test_ans = np.argmax(test_preds.predictions, axis=-1)
testdf['pred'] = test_ans

In [None]:
submission = pd.read_csv('data/fixed_test.csv')
submission['pred'] = [-1]*len(submission)
for _, row in testdf.iterrows():
  submission.loc[(submission['conv_id'] == row['conv_id']), 'pred'] = row['pred']

In [None]:
submission

Unnamed: 0,conv_id,utterance_idx,prompt,utterance,pred
0,hit:0_conv:0,1,I felt guilty when I was driving home one nigh...,Yeah about 10 years ago I had a horrifying exp...,25
1,hit:0_conv:0,2,I felt guilty when I was driving home one nigh...,Did you suffer any injuries?,25
2,hit:0_conv:0,3,I felt guilty when I was driving home one nigh...,No I wasn't hit. It turned out they were drunk...,25
3,hit:0_conv:0,4,I felt guilty when I was driving home one nigh...,Why did you feel guilty? People really shouldn...,25
4,hit:0_conv:0,5,I felt guilty when I was driving home one nigh...,I don't know I was new to driving and hadn't e...,25
...,...,...,...,...,...
10968,hit:12416_conv:24832,4,I saw a huge cockroach outside my house today....,I live in Texas to so i know those feels,8
10969,hit:12423_conv:24847,1,I have a big test on Monday. I am so nervous_c...,I have a big test on Monday_comma_ I am so ner...,18
10970,hit:12423_conv:24847,2,I have a big test on Monday. I am so nervous_c...,What is the test on?,18
10971,hit:12423_conv:24847,3,I have a big test on Monday. I am so nervous_c...,It's for my Chemistry class. I haven't slept m...,18


In [None]:
submission[['pred']].to_csv('output/20220519_submission.csv', encoding='utf8')

## Master Proposal
- Use BERT to infer `prompt` & `utterance` representations，concatenate the two hypotheses.
- Add a `LayerNorm` layer to receive the concatenated result.
- Use `Linear` layer to do classification.
- Maybe we can use `SAM` to smooth the loss landscape