In [2]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup
import torch.nn as nn
import logging
from lion_pytorch import Lion
from torch.utils.data import DataLoader
import GLUE
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import numpy as np
from sklearn.metrics import matthews_corrcoef

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [4]:
train_dataset,_,test_dataset = GLUE.get_torch_dataset(tokenizer, "cola", padding="max_length", truncation=True, max_length=64)

Found cached dataset glue (C:/Users/Xiang/.cache/huggingface/datasets/mariosasko___glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-2bc0360bf4726f4f.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-31cb33c6889f365c.arrow


https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/optimizer_schedules#schedules
Reference of Scheduler

In [11]:

def constant_scheduler(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    def lambda_func(step:int):
        return 1.

    return LambdaLR(optimizer, lambda_func, last_epoch)

In [21]:
def prepare(sche, opt):
    if sche == 'no':
        sches = partial(constant_scheduler)
    if sche == 'linear':
        sches = partial(get_linear_schedule_with_warmup)
    if sche == 'ord10':
        sches = partial(get_polynomial_decay_schedule_with_warmup,power = 10.0)

    if opt == 'Lion':
        opts = partial(Lion)
    if opt == 'Adam':
        opts = partial(torch.optim.Adam)
    if opt == 'AdamW':
        opts = partial(torch.optim.AdamW)


    return sches, opts

In [28]:
print(len(train_dataset),len(test_dataset))

8551 1043


In [22]:
sche, opt = prepare('no','Adam')
optimizer = opt(model.parameters(),lr = 1e-4)
scheduler = sche(optimizer, num_warmup_steps=100,num_training_steps=100*10)

In [34]:
lr_list = [1e-5,1e-4,1e-3]
scheduler_list = ['no',  'ord10']
optimizer_list = ['Lion', 'AdamW']
batch_size_list = [32,64]
steps = 500




In [6]:
test_dataset['labels']

tensor([1, 1, 1,  ..., 0, 1, 1])

In [30]:
test_loader = DataLoader(test_dataset, shuffle = True, batch_size = 16)

In [31]:
next(iter(test_loader))

{'label': tensor([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]),
 'input_ids': tensor([[ 101, 1045, 2064,  ...,    0,    0,    0],
         [ 101, 3021, 2134,  ...,    0,    0,    0],
         [ 101, 2040, 2467,  ...,    0,    0,    0],
         ...,
         [ 101, 2673, 2017,  ...,    0,    0,    0],
         [ 101, 2008, 3520,  ...,    0,    0,    0],
         [ 101, 1045, 2741,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [12]:
test_input = ['HA HA HA.','you are pig right?','You are bad.']
test_input = tokenizer(test_input,return_tensors ='pt', padding = True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
with torch.no_grad():
    logits = model(**test_input).logits

predicted_class_id = logits.argmax().item()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
logits.shape

torch.Size([3, 2])

In [None]:
# Training

loss_mat = np.zeros((len(batch_size_list),len(scheduler_list), len(optimizer_list), len(lr_list),steps))

report_step = 10 # evaluate test metric each step
metric_mat = np.zeros((len(batch_size_list),len(scheduler_list), len(optimizer_list), len(lr_list),steps//report_step))
for i,this_batch_size in enumerate(batch_size_list):
    for j,this_scheduler in enumerate(scheduler_list):
        for k,this_optimizer in enumerate(optimizer_list):
            for m, this_lr in enumerate(lr_list):

                model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
                train_loader = DataLoader(train_dataset, batch_size=this_batch_size)
                optimizer = this_optimizer(model.parameters(), lr = this_lr)
                scheduler = this_scheduler(optimizer, num_warmup_steps=int(steps/100),num_training_steps=steps)
                step = 0

                while True:
                    model.train()
                    for X in train_loader:
                        optimizer.zero_grad()
                        batch = {k: v.to(device) for k, v in X.items()}
                        loss = model(**batch).loss
                        print(f"step: {step+1}, loss:{loss.item():.8f}")

                        loss_mat[i,j,k,m,step] = loss.item()

                        loss.backward()
                        optimizer.step()
                        scheduler.step()
                        step += 1

                    # valid
                    if (step+1)%report_step == 0:
                        model.eval()
                        with torch.no_grad():
                            logits = []
                            for X in test_loader:
                                batch = {k: v.to(device) for k, v in X.items()}
                                logits.append(model(**batch).logits)
                            total_test = torch.concatenate(logits, dim = 0)
                            _,predicted = torch.max(total_test,dim = 1)
                            real_label =test_dataset['labels'].numpy()
                            predicted = predicted.cpu().numpy()
                            metric = matthews_corrcoef(real_label, predicted)
                            print(f"step:{step}, matthews_corr:{metric:.6f}")








In [17]:
torch.max(torch.concatenate([torch.randn((3,2)) for i in range(3)],dim=0),dim = 1)

torch.return_types.max(
values=tensor([ 1.9424,  1.7879,  0.9814,  0.1394,  0.7443,  1.0075,  1.7859,  2.2063,
        -0.2133]),
indices=tensor([0, 1, 0, 0, 0, 0, 0, 0, 1]))