In [29]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup
from lion_pytorch import Lion
from torch.utils.data import DataLoader
import GLUEGPT2
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import numpy as np
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
import datasets
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pathlib import Path
import torch.nn.utils.rnn as rnn_utils

In [30]:
current_path = Path.cwd()
dataset_name = 'cola'
lr_list = [1e-5]
scheduler_list = ['no']
optimizer_list = ['Lion']
batch_size_list = [32]
steps = 100

In [34]:
train_dataset, test_dataset, _  = GLUEGPT2.get_torch_dataset('gpt2', "cola")

def pad_examples(batch):
    input_ids = rnn_utils.pad_sequence(batch['input_ids'], batch_first=True, padding_value=0)
    attention_mask = rnn_utils.pad_sequence(batch['attention_mask'], batch_first=True, padding_value=0)
    labels = batch['labels']
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

train_dataset = train_dataset.map(pad_examples, batched=True)
test_dataset = test_dataset.map(pad_examples, batched=True)

def constant_scheduler(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    def lambda_func(step:int):
        return 1.

    return LambdaLR(optimizer, lambda_func, last_epoch)

def prepare(sche, opt):
    if sche == 'no':
        sches = partial(constant_scheduler)
    if sche == 'linear':
        sches = partial(get_linear_schedule_with_warmup)
    if sche == 'ord10':
        sches = partial(get_polynomial_decay_schedule_with_warmup,power = 10.0)

    if opt == 'Lion':
        opts = partial(Lion)
    if opt == 'Adam':
        opts = partial(torch.optim.Adam)
    if opt == 'AdamW':
        opts = partial(torch.optim.AdamW)


    return sches, opts


Found cached dataset glue (/Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3bec0afcd36b72dc.arrow
Loading cached processed dataset at /Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a95ecb074f032c93.arrow
Loading cached processed dataset at /Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3e12d90a7c61c551.arrow
Loading cached processed dataset at /Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2257558fcd85bd6c.arrow
Loading cached processed dataset at /Users/xiongbowen/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a4d417244a0c9aad.arrow


In [35]:


test_loader = DataLoader(test_dataset, shuffle = False, batch_size = 32)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



In [33]:
# Training


report_step = 10 # evaluate test metric each step
# metric_mat = np.zeros((len(batch_size_list),len(scheduler_list), len(optimizer_list), len(lr_list),steps//report_step,2))
for i,this_batch_size in enumerate(batch_size_list):
    for j,this_scheduler in enumerate(scheduler_list):
        for k,this_optimizer in enumerate(optimizer_list):
            for m, this_lr in enumerate(lr_list):
                loss_list = []
                metric_list = []
                acc_list = []
                model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
                train_loader = DataLoader(train_dataset, batch_size=this_batch_size)
                sche, opt = prepare(this_scheduler, this_optimizer)
                optimizer = opt(model.parameters(), lr = this_lr)
                scheduler = sche(optimizer, num_warmup_steps=int(steps/100),num_training_steps=steps)
                step = 0
                print(f'Start training for: sche:{this_scheduler},opt:{this_optimizer},batchsize:{this_batch_size}, lr:{this_lr}')

                while True:

                    for X in train_loader:
                        model.train()
                        optimizer.zero_grad()
                        batch = {k: v.to(device) for k, v in X.items()}
                        loss = model(**batch).loss
                        print(f"step: {step+1}, loss:{loss.item():.8f}")

                        # loss_mat[i,j,k,m,step] = loss.item()
                        loss_list.append(loss.item())
                        loss.backward()
                        optimizer.step()
                        scheduler.step()
                        step += 1

                    # valid
                        if step % report_step == 0:
                            model.eval()
                            with torch.no_grad():
                                logits = []
                                labelss = []
                                for X in test_loader:
                                    batch = {k: v.to(device) for k, v in X.items()}
                                    logits.append(model(**batch).logits)
                                    labelss.append(batch['labels'])
                                total_test = torch.concatenate(logits, dim = 0)
                                _,predicted = torch.max(total_test,dim = 1)
                                real_label =torch.concatenate(labelss,dim=0).cpu().numpy()
                                predicted = predicted.cpu().numpy()
                                metric = matthews_corrcoef(real_label, predicted)
                                acc = np.mean(predicted==real_label)
                                # print(i,j,k,m,step//report_step)
                                metric_list.append(metric)
                                acc_list.append(acc)
                                # metric_mat[i,j,k,m,step//report_step - 1,0] = metric
                                # metric_mat[i,j,k,m,step//report_step - 1,1] = acc
                                print(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:4f}%")

                        if step == steps:
                            break
                    if step == steps:
                      break
                file_name = dataset_name+",batchsize"+str(this_batch_size)+",scheduler"+this_scheduler+",optimizer"+str(this_optimizer)+",LR"+str(this_lr)
                np.save(current_path/(file_name+'loss.npy'),np.array(loss_list))
                np.save(current_path/(file_name+'metric.npy'),np.array(metric_list))
                np.save(current_path/(file_name+'acc.npy'),np.array(acc_list))

                del model
                del optimizer
                del scheduler
                del train_loader
                torch.cuda.empty_cache()

Start training for: sche:no,opt:Lion,batchsize:32, lr:1e-05


ValueError: Expected input batch_size (928) to match target batch_size (31).