# Fine-tuning GPT-2 for text classification
Stefan/Yuzhao Heng
Since Wed. Feb. 9th, 2022


Reproduce the results in paper [Zero-shot Text Classification With Generative Language Models](https://arxiv.org/abs/1912.10165),
since the authors didn't release the code.

Serve as infrastructure and baseline for project on efficient and accurate encoder for text classification with many labels.


## Notebook Setup



In [10]:
%load_ext autoreload
%autoreload 2



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Colab Setup



In [11]:
import os
import sys


if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')

    ! pip3 install icecream datasets sentence-transformers

    # base_path = '/content/drive/My Drive//Research/'
    # os.chdir(os.path.join(base_path, 'Unified Encoder/Unified-Encoder'))

    sys.path.append(os.path.join('drive', 'My Drive', 'Research', 'Unified Encoder', 'Unified-Encoder'))


from unified_encoder.util import *
print(PATH_BASE)  # Sanity check, should be the path appended if Colab



/Users/stefanh/Documents/UMich/Research/Clarity Lab/Unified Encoder


## Setup



In [12]:
import random

import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import TrainingArguments, Trainer, SchedulerType
from datasets import load_dataset
from icecream import ic


rcParams['figure.dpi'] = 200
rcParams['font.size'] = 6



## Seed setup



In [13]:
if torch.cuda.is_available():
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'  # Required for some CuBLAS operations
    ! echo $CUBLAS_WORKSPACE_CONFIG


seed = config('random-seed')
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.use_deterministic_algorithms(True)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)
transformers.set_seed(seed)



## Prep Model & Dataset


In [14]:
dnm = 'ag_news'
dset = load_dataset(dnm)


# Models used in the paper, parameter sizes: 124M, 355M; Discrepancy with Open AI reported numbers?
MODEL_NMS = dict(small='gpt2', large='gpt2-medium')
model_nm = MODEL_NMS['small']
model = AutoModel.from_pretrained(model_nm)
tokenizer = AutoTokenizer.from_pretrained(model_nm)

def model_param_size(m: torch.nn.Module) -> str:
   return fmt_num(sum(p.numel() for p in m.parameters()))

ic(model_param_size(model))


SPEC_TOKS = ['<|question|>', '<|text|>', '<|answer|>']
tokenizer.add_special_tokens(dict(pad_token='[PAD]', additional_special_tokens=SPEC_TOKS))
model.resize_token_embeddings(len(tokenizer))

def tokenize_func(sample):
    return tokenizer(sample['text'], padding='max_length', truncation=True)

dset_tok = dset.map(tokenize_func, batched=True)
dset_tok = dset_tok.remove_columns('label')  # For autoregressive learning
ic(dset_tok)



Using custom data configuration default
Reusing dataset ag_news (/Users/stefanh/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /Users/stefanh/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformer

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'text'],
        num_rows: 7600
    })
})

## Prep Training


In [15]:
D_TRAIN_ARGS = {
    'gpt2': dict(
        learning_rate=3e-5,
        batch_size=32,
        weight_decay=1e-2
    ),
    'gpt2-medium': dict(
        learning_rate=4e-5,
        batch_size=128,
        weight_decay=1e-2
    )
}
lr, bsz, decay = (D_TRAIN_ARGS[model_nm][k] for k in ['learning_rate', 'batch_size', 'weight_decay'])

training_args = TrainingArguments(
    output_dir=os.path.join(PATH_BASE, DIR_PROJ, DIR_MDL, 'gpt2'),
    do_train=True, do_eval=True,
    evaluation_strategy='steps',
    per_device_train_batch_size=bsz,
    per_device_eval_batch_size=bsz,
    # TODO: Adam's beta1, beta2, epsilon, what values were used???
    max_grad_norm=1,
    num_train_epochs=1,
    lr_scheduler_type=SchedulerType.COSINE,
    warmup_ratio=1e-2,
    logging_strategy='steps',
    logging_steps=1,
    fp16=torch.cuda.is_available(),  # TODO: dynamic loss scaling??
)

n = 20
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=dset_tok['train'],
    # eval_dataset=dset_tok['test']
    train_dataset=dset_tok["train"].shuffle(seed=seed).select(range(n)),
    eval_dataset=dset_tok["test"].shuffle(seed=seed).select(range(n))
)


using `logging_steps` to initialize `eval_steps` to 1
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading cached shuffled indices for dataset at /Users/stefanh/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-488bdf8a38d0a560.arrow
Loading cached shuffled indices for dataset at /Users/stefanh/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-ed1bfd2deec94415.arrow


## Train


In [16]:
trainer.train()



The following columns in the training set  don't have a corresponding argument in `GPT2Model.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1


KeyboardInterrupt: 