In [1]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup, Adafactor
import torch.nn as nn
import logging
from lion_pytorch import Lion
from torch.utils.data import DataLoader
import GLUE
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import numpy as np
from sklearn.metrics import matthews_corrcoef
from pathlib import Path
import matplotlib.pyplot as plt

In [9]:

# Experiment configuration: dataset, output directory and hyper-parameter grid.
dataset_name = "cola"

# Results go to a directory named after the dataset, located next to the
# current working directory (i.e. inside the working directory's parent).
current_path = Path.cwd().parents[0] / dataset_name
current_path.mkdir(exist_ok=True)

# Full sweep, kept for reference (currently disabled):
#   lr_list = [3e-5, 3e-4, 3e-3]
#   scheduler_list = ['no', 'linear', 'ord10']
#   optimizer_list = ['Lion', 'AdamW', 'AdaFactor']
#   batch_size_list = [32, 64, 128]
#   steps = 50*1000

# Single configuration that is currently enabled.
steps = 50_000
batch_size_list = [32]
optimizer_list = ["AdamW"]
scheduler_list = ["no"]
lr_list = [1e-5]

In [7]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the training cell fine-tunes.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load the task selected by `dataset_name` rather than a second hard-coded
# "cola", so the data always matches the name used for the output directory
# and the saved result files. The middle element of the returned triple is
# discarded here.
train_dataset, _, test_dataset = GLUE.get_torch_dataset(
    tokenizer,
    dataset_name,
    padding="max_length",
    truncation=True,
    max_length=64,
)

def constant_scheduler(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    """Build a ``LambdaLR`` that leaves the optimizer's learning rate unchanged.

    The multiplier returned for every step is ``1.0``.

    Args:
        optimizer: Optimizer whose learning rate is wrapped.
        num_warmup_steps: Accepted but ignored.
        num_training_steps: Accepted but ignored.
        lr_end: Accepted but ignored.
        power: Accepted but ignored.
        last_epoch: Forwarded to ``LambdaLR``.

    Returns:
        The ``LambdaLR`` scheduler instance.
    """
    # The step index is irrelevant: the scaling factor is always one.
    return LambdaLR(optimizer, lambda _step: 1., last_epoch)

def prepare(sche, opt):
    """Map a scheduler name and an optimizer name to pre-configured factories.

    Args:
        sche: Scheduler name: ``'no'`` (constant), ``'linear'`` or ``'ord10'``
            (polynomial decay with ``power=10.0``).
        opt: Optimizer name: ``'Lion'``, ``'AdaFactor'`` or ``'AdamW'``.

    Returns:
        ``(sches, opts)``: two ``functools.partial`` objects. ``sches`` builds
        the scheduler and ``opts`` builds the optimizer, each with the fixed
        hyper-parameters below already bound.

    Raises:
        ValueError: If either name is not one of the supported values.
            Previously this surfaced as an ``UnboundLocalError`` at ``return``.
    """
    known_schedulers = ('no', 'linear', 'ord10')
    known_optimizers = ('Lion', 'AdaFactor', 'AdamW')

    # Validate both names before building anything so a typo fails fast and clearly.
    if sche not in known_schedulers:
        raise ValueError(
            f"Unknown scheduler {sche!r}; expected one of {known_schedulers}"
        )
    if opt not in known_optimizers:
        raise ValueError(
            f"Unknown optimizer {opt!r}; expected one of {known_optimizers}"
        )

    if sche == 'no':
        sches = partial(constant_scheduler)
    elif sche == 'linear':
        sches = partial(get_linear_schedule_with_warmup)
    else:  # 'ord10'
        sches = partial(get_polynomial_decay_schedule_with_warmup, power=10.0)

    if opt == 'Lion':
        opts = partial(Lion, betas=(0.95, 0.98), weight_decay=0.01)
    elif opt == 'AdaFactor':
        opts = partial(Adafactor, weight_decay=0.001, relative_step=False, scale_parameter=False)
    else:  # 'AdamW'
        opts = partial(torch.optim.AdamW, betas=(0.9, 0.99), weight_decay=0.001)

    return sches, opts

Found cached dataset glue (C:/Users/Xiang/.cache/huggingface/datasets/mariosasko___glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-2bc0360bf4726f4f.arrow
Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-77227c16e4defc4f.arrow
Loading cached processed dataset at C:\Users\Xiang\.cache\huggingface\datasets\mariosasko___glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-c61c89821cd722e6.arrow


In [None]:


def get_log(file_name):
    """Return the shared ``'train'`` logger, writing to the console and to ``file_name``.

    The logger is a process-wide singleton (``logging.getLogger('train')``), so
    calling this function again, for example by re-running the notebook cell,
    returns the same object. Any handlers attached by an earlier call are
    removed and closed first; otherwise every message would be emitted once
    per previous call and the old file handles would stay open.

    Args:
        file_name: Path of the log file. It is opened in append mode, so
            earlier content is preserved.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger('train')  # name of the shared logger
    logger.setLevel(logging.INFO)  # minimum level handled by the logger

    # Drop handlers left over from a previous call to avoid duplicated output.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Both handlers share one format: timestamp, level name, message.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    ch = logging.StreamHandler()  # console handler
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)

    fh = logging.FileHandler(file_name, mode='a')  # file handler, append mode
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)

    logger.addHandler(fh)  # attach both handlers to the logger
    logger.addHandler(ch)
    return logger

In [None]:
# Shared training logger: messages are echoed to the console and appended to log1.txt.
logger = get_log(file_name='log1.txt')

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Evaluation batches are served in a fixed order, 32 examples at a time.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [10]:
# Training

report_step = 100  # evaluate the test metrics every `report_step` optimizer steps


def _evaluate(model, loader, device):
    """Score `model` on every batch of `loader`.

    Puts the model in eval mode and runs without gradients; the caller is
    responsible for switching back to train mode afterwards.

    Returns:
        ``(matthews_corr, accuracy)`` computed over all batches of `loader`.
    """
    model.eval()
    batch_logits = []
    batch_labels = []
    with torch.no_grad():
        for raw_batch in loader:
            batch = {key: value.to(device) for key, value in raw_batch.items()}
            batch_logits.append(model(**batch).logits)
            batch_labels.append(batch['labels'])
    all_logits = torch.cat(batch_logits, dim=0)
    _, predicted = torch.max(all_logits, dim=1)  # index of the largest logit per row
    predicted = predicted.cpu().numpy()
    real_label = torch.cat(batch_labels, dim=0).cpu().numpy()
    return matthews_corrcoef(real_label, predicted), np.mean(predicted == real_label)


# One full training run per (batch size, scheduler, optimizer, learning rate) combination.
for this_batch_size in batch_size_list:
    for this_scheduler in scheduler_list:
        for this_optimizer in optimizer_list:
            for this_lr in lr_list:
                loss_list = []
                metric_list = []
                acc_list = []
                model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
                # NOTE(review): the training loader is not shuffled, so every epoch
                # replays the same batch order. Confirm this is intentional.
                train_loader = DataLoader(train_dataset, batch_size=this_batch_size)
                sche, opt = prepare(this_scheduler, this_optimizer)
                # Runs with a schedule are given twice the nominal learning rate as
                # their base LR; constant-LR runs use the nominal value as is.
                optimizer = opt(model.parameters(), lr=this_lr if this_scheduler == 'no' else this_lr*2)
                scheduler = sche(optimizer, num_warmup_steps=int(steps/10), num_training_steps=steps)
                step = 0

                logger.info(f'Start training for: sche:{this_scheduler},opt:{this_optimizer},batchsize:{this_batch_size}, lr:{this_lr}')

                # Baseline metrics before any optimizer step, logged like every later report.
                metric, acc = _evaluate(model, test_loader, device)
                metric_list.append(metric)
                acc_list.append(acc)
                logger.info(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:.4f}%")

                model.train()
                # Keep cycling over the training data until exactly `steps` updates are done.
                while step < steps:
                    step_at_epoch_start = step
                    for X in train_loader:
                        optimizer.zero_grad()
                        batch = {key: value.to(device) for key, value in X.items()}
                        loss = model(**batch).loss
                        print(f"step: {step+1}, loss:{loss.item():.8f}")

                        loss_list.append(loss.item())
                        loss.backward()
                        optimizer.step()
                        scheduler.step()
                        step += 1

                        # Periodic evaluation on the test loader.
                        if step % report_step == 0:
                            metric, acc = _evaluate(model, test_loader, device)
                            metric_list.append(metric)
                            acc_list.append(acc)
                            logger.info(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:.4f}%")
                            model.train()  # _evaluate left the model in eval mode

                        if step >= steps:
                            break

                    # An empty loader would otherwise spin in the while-loop forever.
                    if step == step_at_epoch_start:
                        raise RuntimeError("train_loader yielded no batches; cannot train")

                # Persist the curves of this configuration.
                file_name = dataset_name+",batchsize"+str(this_batch_size)+",scheduler"+this_scheduler+",optimizer"+str(this_optimizer)+",LR"+str(this_lr)
                np.save(current_path/(file_name+'loss.npy'), np.array(loss_list))
                np.save(current_path/(file_name+'metric.npy'), np.array(metric_list))
                np.save(current_path/(file_name+'acc.npy'), np.array(acc_list))

                # Release the references held by this run before the next configuration starts.
                del model
                del optimizer
                del scheduler
                del train_loader
                torch.cuda.empty_cache()








Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Start training for: sche:no,opt:AdamW,batchsize:32, lr:1e-05
step: 1, loss:0.81424779
step: 2, loss:0.73156959
step: 3, loss:0.76243651
step: 4, loss:0.73396164
step: 5, loss:0.69784808
step: 6, loss:0.67255771


KeyboardInterrupt: 

In [None]:
# Show only the most recent losses; the full list can hold one value per training step.
loss_list[-10:]
