In [1]:
from datasets import load_dataset, load_metric

import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Swag Dataset

### Swag

In [None]:
swag = load_dataset("swag", "regular", cache_dir="./swag_cache")

In [None]:
swag

In [None]:
# swag["train"][0]

In [None]:
data_path = './swag/'
swag_train = pd.read_csv(os.path.join(data_path, 'train.csv'), index_col=0)
swag_val   = pd.read_csv(os.path.join(data_path, 'val.csv'), index_col=0)
swag_test  = pd.read_csv(os.path.join(data_path, 'test.csv'), index_col=0)

In [None]:
from datasets import Dataset, DatasetDict

train, val, test = Dataset.from_pandas(swag_train, preserve_index=False), Dataset.from_pandas(swag_val, preserve_index=False), Dataset.from_pandas(swag_test, preserve_index=False)
datasets = DatasetDict()
datasets['train'], datasets['val'], datasets['test'] = train, val, test
datasets

In [None]:
datasets["val"][-1]

In [None]:
data_files = {}
data_files["train"] = "train.json"
data_files["valid"] = "valid.json"
extension = data_files["train"].split('.')[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
raw_datasets

### Swag formatter

In [None]:
import json

def swag_formatter():
    folder = "dataset/"
    corpus = json.load(open(f"{folder}context.json"))
    train = json.load(open(f"{folder}train.json"))
    valid = json.load(open(f"{folder}valid.json"))
    test = json.load(open(f"{folder}test.json"))
    save_keys = ['id', 'question', 'paragraphs', 'relevant']
    ending_names = [f"ending{i}" for i in range(4)]
    
    for idx, data in enumerate(['train', 'valid', 'test']):
        results = []
        for element in eval(data):
            pairs = {}
            for key in save_keys:
                if key == 'relevant':
                    if idx != 2:
                        pairs['label'] = element['paragraphs'].index(element[key])
                    else:
                        pairs['label'] = 0
                elif key == 'paragraphs':
                    for i, num in enumerate(element[key]):
                        pairs[ending_names[i]] = corpus[num]
                elif key == 'question':
                    pairs['sent1'] = element[key]
                    pairs['sent2'] = ''
                else:
                    pairs['video-id'] = element[key]
            results.append(pairs)
        json_obj = json.dumps(results, indent=2, ensure_ascii=False)
        with open(f"{folder}swag_{data}.json", "w", encoding="utf-8") as file:
            file.write(json_obj)
    
swag_formatter()

## SQuAD Dataset

### SQuAD

In [None]:
squad_v2 = False
SQuAD = load_dataset("squad_v2" if squad_v2 else "squad")

In [None]:
SQuAD

In [None]:
SQuAD["train"][0]

In [None]:
SQuAD["train"][-1]

In [None]:
import json
data_path = './SQuAD/'
SQuAD_train = json.load(open(os.path.join(data_path, 'train-v1.1.json')))
SQuAD_valid = json.load(open(os.path.join(data_path, 'dev-v1.1.json')))

In [None]:
formatted = json.dumps(SQuAD_valid, indent=2)
print(formatted)

In [None]:
SQuAD_train["data"][0]["paragraphs"][0]

### SQuAD formatter

In [None]:
import json

folder = "dataset/"
corpus = json.load(open(f"{folder}context.json"))
train = json.load(open(f"{folder}train.json"))
valid = json.load(open(f"{folder}valid.json"))
test = json.load(open(f"{folder}test.json"))

def squad_formatter():
    save_keys = ['id', 'question', 'context', 'answers']
    for idx, data in enumerate(['train', 'valid', 'test']):
        print(data)
        results = []
        for element in eval(data):
            pairs = {}
            for key in save_keys:
                if key == 'answers':
                    if idx != 2:
                        new_dict = {}
                        for k, v in element[key[:-1]].items():
                            if k != "text":
                                new_dict["answer_" + k] = [v]
                            else:
                                new_dict[k] = [v]
                        pairs[key] = new_dict
                elif key == 'context':
                    if idx != 2:
                        pairs[key] = corpus[element['relevant']]
                    else:
                        pairs[key] = corpus[element['paragraphs'][-1]]
                else:
                    pairs[key] = element[key]
            results.append(pairs)
        json_obj = json.dumps(results, indent=2, ensure_ascii=False)
        with open(f"{folder}squad_{data}.json", "w", encoding="utf-8") as file:
            file.write(json_obj)
    
squad_formatter()

## Validate

In [None]:
# for swag evaluation
import json
valid = json.load(open("dataset/squad_valid.json")) # validation
test = json.load(open("format_test.json")) # prediction

total = len(valid)
correct = 0
for i, element in enumerate(test):
    correct += element['context'] == valid[i]['context']
print(f"accuracy: {(correct/total*100):.4f}")

In [None]:
# for squad evaluation
ans_dict = {}
for element in valid:
    ans_dict[element['id']] = element['answers']['text'][0]
# json_obj = json.dumps(ans_dict, ensure_ascii=False, indent=2)
# with open("ground_truths.json", "w", encoding="utf-8") as file:
#     file.write(json_obj)

correct = 0
total = len(ans_dict)
pred = json.load(open("output/valid_qa/predict_predictions.json"))
for key, val in pred.items():
    if ans_dict[key] == val:
        correct += 1
print(f"exact_match: {(correct/total*100):.4f}")

In [None]:
import pandas as pds

df_pred = pd.read_json("output/test_qa21/predict_predictions.json", typ="series").reset_index()
df_pred.columns = ['id', 'answer']
df_pred.to_csv("submit.csv", index=False)

## HW1

In [1]:
import numpy as np
import pandas as pd

from transformers import BertTokenizer
import torch 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import BertTokenizer
from tqdm import tqdm
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labels = {}
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['intent'].values]
        self.text = [tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors='pt') for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.text[idx]
        
    def __getitem__(self,idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        
        return batch_texts, batch_y

In [3]:
train = pd.read_json("data/intent/train.json")
# train.set_index("id")
valid = pd.read_json("data/intent/train.json")
# valid.set_index("id")

classes = len(set(train['intent'].values).union(set(valid['intent'].values)))
k = 0
for i in train['intent'].unique():
    labels[i] = k
    k += 1

In [4]:
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.drop = nn.Dropout(dropout)
        self.dense = nn.Linear(768,384)
        self.out = nn.Linear(384,classes)
        
    def forward(self, inp, msk):
        _, bert_out = self.bert(input_ids = inp, attention_mask = msk, return_dict = False)
        dropped_bert_out = self.drop(bert_out)
        activated_output = F.relu(self.dense(dropped_bert_out))
        fin_output = self.out(activated_output)
        return fin_output

In [6]:
def trainer(model, train, val, learning_rate, epochs, batch_size):
    train_loader = torch.utils.data.DataLoader(train,batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val,batch_size=batch_size, shuffle=True)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        
    train_loss_lst = []
    val_loss_lst = []
    train_acc_lst = []
    val_acc_lst = []
    for i in range(epochs):
        train_loss = 0.0
        val_loss = 0.0
        train_acc = 0.0
        val_acc = 0.0
        for train_input, train_label in tqdm(train_loader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            train_id = train_input['input_ids'].squeeze(1).to(device)
            
            output = model(train_id, mask)
            loss = criterion(output, train_label)
            train_loss += loss.item()
            train_acc += ((output.argmax(dim=1) == train_label).sum().item())
            
            model.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss_lst.append(train_loss/len(train))
        train_acc_lst.append(train_acc/len(train))
        torch.save(model.state_dict(), os.path.join('./', f"Bert+drop+tanh+relu-{i}.pth"))

        with torch.no_grad():
            for val_input, val_label in val_loader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                val_id = val_input['input_ids'].squeeze(1).to(device)
                
                output = model(val_id,mask)
                loss = criterion(output,val_label)
                val_loss+=loss.item()
                val_acc += ((output.argmax(dim=1) == val_label).sum().item())
            val_loss_lst.append(val_loss/len(val))
            val_acc_lst.append(val_acc/len(val))
        
        print(f'train loss: {sum(train_loss_lst)/len(train_loss_lst)}')
        print(f'train loss: {sum(val_loss_lst)/len(val_loss_lst)}')
        print(f'train loss: {sum(train_acc_lst)/len(train_acc_lst)}')
        print(f'train loss: {sum(val_acc_lst)/len(val_acc_lst)}')

In [7]:
EPOCHS = 5
BATCH_SIZE = 16
LR = 2e-5
model = BertClassifier()

trainer(model, Dataset(train), Dataset(valid), LR, EPOCHS, BATCH_SIZE)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 938/938 [07:30<00:00,  2.08it/s]


train loss: 0.2725789999802907
train loss: 0.20213762574195862
train loss: 0.22446666666666668
train loss: 0.6737333333333333


100%|██████████| 938/938 [07:31<00:00,  2.08it/s]


train loss: 0.20184959970712663
train loss: 0.13346486353874207
train loss: 0.5398999999999999
train loss: 0.8223


100%|██████████| 938/938 [07:31<00:00,  2.08it/s]


train loss: 0.1468596853852272
train loss: 0.09484770091209148
train loss: 0.6859333333333333
train loss: 0.8789555555555556


100%|██████████| 938/938 [07:31<00:00,  2.08it/s]


train loss: 0.1130140339265267
train loss: 0.07303900706730783
train loss: 0.7623333333333333
train loss: 0.9077


100%|██████████| 938/938 [07:31<00:00,  2.08it/s]


train loss: 0.09145686451156934
train loss: 0.05927341348374884
train loss: 0.8089333333333333
train loss: 0.9251733333333332


In [None]:
model = BertClassifier()
state_dict = torch.load('Bert+drop+tanh+relu-4.pth')
model = model.load_state_dict(state_dict)

val = Dataset(valid)
val_loader = torch.utils.data.DataLoader(val, batch_size=16, shuffle=False)
outputs = model(val_loader)
outputs