In [1]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup, Adafactor
import torch.nn as nn
import logging
from lion_pytorch import Lion
from torch.utils.data import DataLoader
import GLUE
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import numpy as np
from sklearn.metrics import matthews_corrcoef
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
# Experiment configuration.
# Working directory at the time this cell runs; the .npy result files are saved here.
current_path = Path.cwd()
# Dataset tag used as the prefix of the saved result file names.
dataset_name = 'cola'
# Full sweep settings, currently disabled in favour of the single quick run below:
# lr_list = [3e-5,3e-4,3e-3]
# scheduler_list = ['no', 'linear' ,'ord10']
# optimizer_list = ['Lion', 'AdamW','AdaFactor']
# batch_size_list = [32,64,128]
# steps = 50*1000
# Active settings: every combination of the four lists below is trained once.
# Learning rates to sweep.
lr_list = [1e-5]
# Scheduler names understood by `prepare` ('no', 'linear', 'ord10').
scheduler_list = ['no']
# Optimizer names understood by `prepare` ('Lion', 'AdaFactor', 'AdamW').
optimizer_list = ['Lion']
# Training batch sizes to sweep.
batch_size_list = [32]
# Number of optimizer steps to run for each configuration.
steps = 100

In [3]:
# Tokenizer matching the `bert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load the task named by `dataset_name` (set in the configuration cell) so the
# data being trained on always matches the prefix of the saved result files.
# The second returned split is not used in this notebook and is discarded.
# NOTE(review): padding/truncation/max_length are presumably forwarded to the
# tokenizer by GLUE.get_torch_dataset - confirm against that helper.
train_dataset, _, test_dataset = GLUE.get_torch_dataset(
    tokenizer, dataset_name, padding="max_length", truncation=True, max_length=64
)

def constant_scheduler(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    """Return a scheduler that never changes the optimizer's learning rate.

    Only ``optimizer`` and ``last_epoch`` are used. The remaining arguments
    (``num_warmup_steps``, ``num_training_steps``, ``lr_end``, ``power``) are
    accepted and ignored so this factory can be called with the same keyword
    arguments as the warmup schedulers selected in ``prepare``.

    Returns:
        A ``LambdaLR`` whose multiplier is 1.0 at every step.
    """
    def keep_lr(_step: int):
        # The multiplier is applied to the optimizer's base learning rate.
        return 1.

    return LambdaLR(optimizer, lr_lambda=keep_lr, last_epoch=last_epoch)

def prepare(sche, opt):
    """Translate scheduler/optimizer names into ready-to-call factories.

    Args:
        sche: Scheduler name: ``'no'`` (constant learning rate), ``'linear'``
            (linear schedule with warmup) or ``'ord10'`` (polynomial decay
            with warmup, ``power=10.0``).
        opt: Optimizer name: ``'Lion'``, ``'AdaFactor'`` or ``'AdamW'``.

    Returns:
        A ``(scheduler_factory, optimizer_factory)`` pair of ``functools.partial``
        objects. The optimizer factory is meant to be called as
        ``optimizer_factory(params, lr=...)`` and the scheduler factory as
        ``scheduler_factory(optimizer, num_warmup_steps=..., num_training_steps=...)``.

    Raises:
        ValueError: If ``sche`` or ``opt`` is not one of the names listed above.
    """
    known_schedulers = ('no', 'linear', 'ord10')
    known_optimizers = ('Lion', 'AdaFactor', 'AdamW')

    # Validate both names up front: a typo would otherwise leave `sches`/`opts`
    # unassigned and surface as an UnboundLocalError at the return statement.
    if sche not in known_schedulers:
        raise ValueError(f"Unknown scheduler {sche!r}; expected one of {known_schedulers}")
    if opt not in known_optimizers:
        raise ValueError(f"Unknown optimizer {opt!r}; expected one of {known_optimizers}")

    if sche == 'no':
        sches = partial(constant_scheduler)
    elif sche == 'linear':
        sches = partial(get_linear_schedule_with_warmup)
    else:  # 'ord10'
        sches = partial(get_polynomial_decay_schedule_with_warmup, power=10.0)

    if opt == 'Lion':
        opts = partial(Lion, betas=(0.95, 0.98), weight_decay=0.01)
    elif opt == 'AdaFactor':
        # The caller supplies an explicit `lr`. Adafactor refuses a manual lr
        # while `relative_step` is left at its default of True, so it must be
        # switched off for this factory to be usable.
        opts = partial(Adafactor, weight_decay=0.001, relative_step=False)
    else:  # 'AdamW'
        opts = partial(torch.optim.AdamW, betas=(0.9, 0.99), weight_decay=0.001)

    return sches, opts

Found cached dataset glue (C:/Users/Xiang/.cache/huggingface/datasets/mariosasko___glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-2bc0360bf4726f4f.arrow
Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-77227c16e4defc4f.arrow
Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-c61c89821cd722e6.arrow


In [4]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Evaluation batches: fixed order, 32 examples per batch.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [6]:
# Training
#
# Grid sweep: for every (batch size, scheduler, optimizer, learning rate)
# combination, fine-tune a fresh `bert-base-uncased` classifier for `steps`
# optimizer steps, evaluate on `test_loader` every `report_step` steps, and
# save the loss / Matthews-correlation / accuracy curves as .npy files under
# `current_path`.

report_step = 10  # run the test-set evaluation once every `report_step` optimizer steps
seed = 0  # re-applied before each configuration so all runs start from the same initialisation


def _cycle(loader):
    """Yield batches from `loader` indefinitely, restarting it after each full pass."""
    while True:
        produced_any = False
        for item in loader:
            produced_any = True
            yield item
        if not produced_any:
            # Without this guard an empty loader would spin forever.
            raise ValueError("the training DataLoader produced no batches")


def _evaluate(model, loader, device):
    """Return (Matthews correlation, accuracy) of `model` over every batch in `loader`."""
    model.eval()
    all_logits = []
    all_labels = []
    with torch.no_grad():
        for eval_inputs in loader:
            eval_batch = {name: tensor.to(device) for name, tensor in eval_inputs.items()}
            # Move results to the CPU right away so they do not accumulate on the GPU.
            all_logits.append(model(**eval_batch).logits.cpu())
            all_labels.append(eval_batch['labels'].cpu())
    predicted = torch.cat(all_logits, dim=0).argmax(dim=1).numpy()
    real_label = torch.cat(all_labels, dim=0).numpy()
    return matthews_corrcoef(real_label, predicted), np.mean(predicted == real_label)


for this_batch_size in batch_size_list:
    for this_scheduler in scheduler_list:
        for this_optimizer in optimizer_list:
            for this_lr in lr_list:
                loss_list = []
                metric_list = []
                acc_list = []

                # The classification head is randomly initialised; re-seeding here
                # gives every configuration the same starting weights so the
                # curves are comparable and the run is reproducible.
                torch.manual_seed(seed)
                model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
                # NOTE(review): training batches are served in dataset order
                # (no shuffle), exactly as before - confirm this is intended.
                train_loader = DataLoader(train_dataset, batch_size=this_batch_size)
                sche, opt = prepare(this_scheduler, this_optimizer)
                optimizer = opt(model.parameters(), lr=this_lr)
                scheduler = sche(optimizer, num_warmup_steps=steps // 100, num_training_steps=steps)
                print(f'Start training for: sche:{this_scheduler},opt:{this_optimizer},batchsize:{this_batch_size}, lr:{this_lr}')

                # Exactly `steps` optimizer updates; the loader is restarted
                # whenever it is exhausted. `step` counts from 1.
                for step, train_inputs in zip(range(1, steps + 1), _cycle(train_loader)):
                    model.train()  # evaluation below switches the model to eval mode
                    optimizer.zero_grad()
                    batch = {name: tensor.to(device) for name, tensor in train_inputs.items()}
                    loss = model(**batch).loss
                    print(f"step: {step}, loss:{loss.item():.8f}")

                    loss_list.append(loss.item())
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

                    # Periodic evaluation on the held-out loader.
                    if step % report_step == 0:
                        metric, acc = _evaluate(model, test_loader, device)
                        metric_list.append(metric)
                        acc_list.append(acc)
                        print(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:.4f}%")

                # One file per curve; the file name encodes the configuration.
                file_name = f"{dataset_name},batchsize{this_batch_size},scheduler{this_scheduler},optimizer{this_optimizer},LR{this_lr}"
                np.save(current_path/(file_name+'loss.npy'), np.array(loss_list))
                np.save(current_path/(file_name+'metric.npy'), np.array(metric_list))
                np.save(current_path/(file_name+'acc.npy'), np.array(acc_list))

                # Drop references and release cached GPU memory before the next configuration.
                del model
                del optimizer
                del scheduler
                del train_loader
                torch.cuda.empty_cache()








Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Start training for: sche:no,opt:Lion,batchsize:32, lr:1e-05
step: 1, loss:0.51871634
step: 2, loss:0.62306547
step: 3, loss:0.56145179
step: 4, loss:0.58063710
step: 5, loss:0.87819719
step: 6, loss:0.68156290
step: 7, loss:0.70783985
step: 8, loss:0.58677846
step: 9, loss:0.74820590
step: 10, loss:0.67673910
step:10, matthews_corr:0.079514, Acc:69.319271%
step: 11, loss:0.61580312
step: 12, loss:0.65886658
step: 13, loss:0.59500867
step: 14, loss:0.61484498
step: 15, loss:0.58851606
step: 16, loss:0.58159840
step: 17, loss:0.61258012
step: 18, loss:0.36540309
step: 19, loss:0.80590719
step: 20, loss:0.52038127
step:20, matthews_corr:0.041615, Acc:69.223394%
step: 21, loss:0.52342367
step: 22, loss:0.65004784
step: 23, loss:0.84120882
step: 24, loss:0.81308097
step: 25, loss:0.57725710
step: 26, loss:0.65588945
step: 27, loss:0.64778298
step: 28, loss:0.69552952
step: 29, loss:0.71658319
step: 30, loss:0.62821519
step:30, matthews_corr:0.288569, Acc:73.250240%
step: 31, loss:0.59506452

In [8]:
# Per-step training losses recorded by the training cell. The list is reset at
# the start of every configuration, so this shows only the last one that ran.
loss_list


[0.5187163352966309,
 0.6230654716491699,
 0.56145179271698,
 0.5806370973587036,
 0.878197193145752,
 0.6815629005432129,
 0.707839846611023,
 0.586778461933136,
 0.7482059001922607,
 0.6767390966415405,
 0.6158031225204468,
 0.6588665843009949,
 0.5950086712837219,
 0.6148449778556824,
 0.5885160565376282,
 0.5815984010696411,
 0.6125801205635071,
 0.36540308594703674,
 0.8059071898460388,
 0.5203812718391418,
 0.5234236717224121,
 0.6500478386878967,
 0.841208815574646,
 0.8130809664726257,
 0.5772570967674255,
 0.6558894515037537,
 0.6477829813957214,
 0.6955295205116272,
 0.7165831923484802,
 0.6282151937484741,
 0.5950645208358765,
 0.5821189880371094,
 0.578532874584198,
 0.5805097222328186,
 0.5756036639213562,
 0.6947827339172363,
 0.6989505290985107,
 0.6317431330680847,
 0.5658966302871704,
 0.5752441883087158,
 0.6382665634155273,
 0.6557499766349792,
 0.609664261341095,
 0.6560441255569458,
 0.6650992035865784,
 0.7146214246749878,
 0.49163541197776794,
 0.5888212323188782