In [5]:
import torch
from transformers import AutoTokenizer, GPT2ForSequenceClassification, get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, Adafactor
import torch.nn as nn
import logging
from lion_pytorch import Lion
from torch.utils.data import DataLoader
import GLUEGPT
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import numpy as np
from sklearn.metrics import matthews_corrcoef
from pathlib import Path
import matplotlib.pyplot as plt

dataset_name = 'cola'
current_path = Path.cwd().parents[0] / dataset_name
current_path.mkdir(exist_ok=True)

lr_list = [3e-4]
scheduler_list = ['no']
optimizer_list = ['AdamW']
batch_size_list = [32]
steps = 4 * 1000
report_step = 50

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token


train_dataset, _, test_dataset = GLUEGPT.get_torch_dataset(tokenizer, "cola", padding="max_length", truncation=True, max_length=64)




Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading and preparing dataset glue/cola to /home/featurize/.cache/huggingface/datasets/mariosasko___glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /home/featurize/.cache/huggingface/datasets/mariosasko___glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [6]:
def constant_scheduler(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    def lambda_func(step: int):
        return 1.

    return LambdaLR(optimizer, lambda_func, last_epoch)


def prepare(sche, opt):
    if sche == 'no':
        sches = partial(constant_scheduler)
    if sche == 'linear':
        sches = partial(get_linear_schedule_with_warmup)
    if sche == 'ord10':
        sches = partial(get_polynomial_decay_schedule_with_warmup, power=10.0)

    if opt == 'Lion':
        opts = partial(Lion, betas=(0.95, 0.98), weight_decay=0.01)
    if opt == 'AdaFactor':
        opts = partial(Adafactor, weight_decay=0.001, relative_step=False, scale_parameter=False)
    if opt == 'AdamW':
        opts = partial(torch.optim.AdamW, betas=(0.9, 0.99), weight_decay=0.001)

    return sches, opts


def get_log(file_name):
    logger = logging.getLogger('train')
    logger.setLevel(logging.INFO)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    fh = logging.FileHandler(file_name, mode='a')
    fh.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger


logger = get_log('log1.txt')

In [7]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def evaluate(model, dataset):
    model.eval()
    eval_loader = DataLoader(dataset, shuffle=False, batch_size=32)
    logits = []
    labelss = []
    with torch.no_grad():
        for X in eval_loader:
            batch = {k: v.to(device) for k, v in X.items()}
            logits.append(model(**batch).logits)
            labelss.append(batch['labels'])
        total_test = torch.concat(logits, dim=0)
        _, predicted = torch.max(total_test, dim=1)
        real_label = torch.concat(labelss, dim=0).cpu().numpy()
        predicted = predicted.cpu().numpy()
        metric = matthews_corrcoef(real_label, predicted)
        acc = np.mean(predicted == real_label)

    return metric, acc

In [None]:
for i, this_batch_size in enumerate(batch_size_list):
    for j, this_scheduler in enumerate(scheduler_list):
        for k, this_optimizer in enumerate(optimizer_list):
            for m, this_lr in enumerate(lr_list):
                loss_list = []
                metric_list = []
                acc_list = []
                train_metric_list = []
                train_acc_list = []
                model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
                model.config.pad_token_id = tokenizer.pad_token_id
                model.to(device)
                train_loader = DataLoader(train_dataset, batch_size=this_batch_size)
                sche, opt = prepare(this_scheduler, this_optimizer)
                optimizer = opt(model.parameters(), lr=this_lr if this_scheduler == 'no' else this_lr * 2)
                scheduler = sche(optimizer, num_warmup_steps=int(steps / 10), num_training_steps=steps)
                step = 0

                metric, acc = evaluate(model, test_dataset)
                metric_list.append(metric)
                acc_list.append(acc)
                tmetric, tacc = evaluate(model, train_dataset)
                train_metric_list.append(tmetric)
                train_acc_list.append(tacc)
                logger.info(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:4f}%, Train: matthews_corr:{tmetric:.6f}, Acc:{tacc*100:4f}%,")

                while True:
                    for X in train_loader:
                        model.train()
                        optimizer.zero_grad()
                        batch = {k: v.to(device) for k, v in X.items()}
                        loss = model(**batch).loss
                        logger.info(f"step: {step+1}, loss:{loss.item():.8f}")

                        loss_list.append(loss.item())
                        loss.backward()
                        optimizer.step()
                        scheduler.step()
                        step += 1

                        if step % report_step == 0:
                            metric, acc = evaluate(model, test_dataset)
                            metric_list.append(metric)
                            acc_list.append(acc)
                            tmetric, tacc = evaluate(model, train_dataset)
                            train_metric_list.append(tmetric)
                            train_acc_list.append(tacc)
                            logger.info(f"step:{step}, matthews_corr:{metric:.6f}, Acc:{acc*100:4f}%, Train: matthews_corr:{tmetric:.6f}, Acc:{tacc*100:4f}%,")

                        if step == steps:
                            break
                    if step == steps:
                        break

                file_name = dataset_name + ",batchsize" + str(this_batch_size) + ",scheduler" + this_scheduler + ",optimizer" + str(this_optimizer) + ",LR" + str(this_lr)
                np.save(current_path / (file_name + 'loss.npy'), np.array(loss_list))
                np.save(current_path / (file_name + 'metric.npy'), np.array(metric_list))
                np.save(current_path / (file_name + 'acc.npy'), np.array(acc_list))
                np.save(current_path / (file_name + 'trainmetric.npy'), np.array(train_metric_list))
                np.save(current_path / (file_name + 'trainacc.npy'), np.array(train_acc_list))

                del model
                del optimizer
                del scheduler
                del train_loader
                torch.cuda.empty_cache()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023-03-17 22:01:04,102 - INFO - step:0, matthews_corr:0.000000, Acc:30.872483%, Train: matthews_corr:0.000000, Acc:29.563794%,
INFO:train:step:0, matthews_corr:0.000000, Acc:30.872483%, Train: matthews_corr:0.000000, Acc:29.563794%,
2023-03-17 22:01:04,185 - INFO - step: 1, loss:3.04088902
INFO:train:step: 1, loss:3.04088902
2023-03-17 22:01:04,412 - INFO - step: 2, loss:0.64065135
INFO:train:step: 2, loss:0.64065135
2023-03-17 22:01:04,627 - INFO - step: 3, loss:0.76807135
INFO:train:step: 3, loss:0.76807135
2023-03-17 22:01:04,841 - INFO - step: 4, loss:0.63765377
INFO:train:step: 4, loss:0.63765377
2023-03-17 22:01:05,055 - INFO - step: 5, loss:0.74461001
INFO:train:step: 5, loss:0.74461001
2023-03-17 22:01:05,270 - INFO - step: 6