In [1]:
import torch
import torch.nn as nn
from custom_dataloaders import construct_dataloaders
from gpt2_classification_model import Gpt2ClsModel
from hf_trainer import infer, train
from torch import cuda
from transformers import GPT2Config, GPT2ForSequenceClassification, GPT2Tokenizer

Set up the dataloaders

In [2]:
# Load the pretrained GPT-2 tokenizer and define PAD token = EOS token (id 50256),
# so that padded batches can be built with this tokenizer.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
# Read the id directly from the tokenizer instead of encoding the EOS token string
# and taking the first element of the resulting list.
pad_token_id = gpt2_tokenizer.eos_token_id

# Build the train/validation/test dataloaders for the AG News dataset.
# NOTE(review): train_split_ratio presumably sets the train/validation share of the
# training split -- confirm against custom_dataloaders.construct_dataloaders.
train_dataloader, val_dataloader, test_dataloader = construct_dataloaders(
    batch_size=4, train_split_ratio=0.8, tokenizer=gpt2_tokenizer, dataset_name="ag_news"
)

Found cached dataset ag_news (/h/demerson/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /h/demerson/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-f9d9def1ebac2526.arrow


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Training data example encoding: tensor([12468,   263, 29651,  ..., 50256, 50256, 50256])
Training data example decoding: Worker morale may take toll on airlines It is a management truism that low morale among workers inevitably results in low productivity, low quality, erosion of customer loyalty and, ultimately, lower profits.


Set up the different variables we'd like for training

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if cuda.is_available() else "cpu"
print(f"Detected Device {device}")

# Two interchangeable classifiers are offered:
#   * True  -> HuggingFace's GPT2ForSequenceClassification.
#   * False -> our own Gpt2ClsModel, built on top of the vanilla GPT-2 model, which
#              essentially does the same thing.
use_hf_sequence_classification = True

# GPT-2 configuration with a 4-way classification head.
gpt2_model_config = GPT2Config.from_pretrained("gpt2", num_labels=4)
# The pad_token_id is used to determine when a sequence of inputs ends.
gpt2_model_config.pad_token_id = pad_token_id

if use_hf_sequence_classification:
    gpt2_classifier_model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=gpt2_model_config)
else:
    gpt2_classifier_model = Gpt2ClsModel(pad_token_id=pad_token_id)

loss_function = nn.CrossEntropyLoss()

# Training budget handed to train().
# NOTE(review): n_training_steps presumably caps the number of batches processed per
# epoch -- confirm against hf_trainer.train.
n_training_epochs = 1
n_training_steps = 300

Detected Device cuda


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the model on the training dataset

In [4]:
# Fine-tune the classifier. Arguments are passed positionally, in the order expected
# by hf_trainer.train.
print("Begin Model Training...")
train(
    gpt2_classifier_model,
    train_dataloader,
    val_dataloader,
    loss_function,
    device,
    n_training_epochs,
    n_training_steps,
)
print("Training Complete")

Begin Model Training...
Starting Epoch 0
Completed batch number: 100 of 24000 in loader
Training Loss over last 100 steps: 1.6661197769641876
Training Accuracy over last 100 steps: 36.633663366336634%
Validation Loss: 1.1145606362352185
Validation Accuracy: 50.0%
Completed batch number: 200 of 24000 in loader
Training Loss over last 100 steps: 0.9291619142889976
Training Accuracy over last 100 steps: 61.5%
Validation Loss: 0.7292167333995595
Validation Accuracy: 71.07843137254902%
Completed batch number: 300 of 24000 in loader
Training Loss over last 100 steps: 0.5718618040159344
Training Accuracy over last 100 steps: 78.5%
Validation Loss: 0.474610919717188
Validation Accuracy: 85.7843137254902%
Training rounds complete. Validating on entire validation set.
Completed 300 of 6000...
Completed 600 of 6000...
Completed 900 of 6000...
Completed 1200 of 6000...
Completed 1500 of 6000...
Completed 1800 of 6000...
Completed 2100 of 6000...
Completed 2400 of 6000...
Completed 2700 of 6000...


Save the final model to disk

In [5]:
# Destination of the serialized model; the inference cell reads this same variable.
output_model_file = "./gpt2_ag_news.bin"

print("Saving model...")
# torch.save is given the model object itself, so the whole module is pickled
# (not just a state_dict).
torch.save(gpt2_classifier_model, output_model_file)
print("Model saved.")

Saving model...
Model saved.


Load the model back up and perform inference on the test set

In [6]:
print("Loading model...")
# The checkpoint is a fully pickled model object (see the torch.save call that wrote
# output_model_file), so:
#   * weights_only=False is required -- newer PyTorch releases default to
#     weights_only=True, which refuses to unpickle an arbitrary nn.Module.
#   * map_location=device lets a checkpoint saved from the GPU be restored on a
#     CPU-only machine instead of failing while deserializing CUDA tensors.
# SECURITY: unpickling executes arbitrary code. This is acceptable only because the
# file was produced by this notebook; never load an untrusted checkpoint this way.
gpt2_classifier_model = torch.load(output_model_file, map_location=device, weights_only=False)
print("Model loaded.")

# Evaluate the restored model on the held-out test set and report loss and accuracy.
print("Evaluating model on test set...")
test_accuracy, test_loss = infer(gpt2_classifier_model, loss_function, test_dataloader, device)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}%")
print("Model evaluated.")

Loading model...
Model loaded.
Evaluating model on test set...
Completed 300 of 1900...
Completed 600 of 1900...
Completed 900 of 1900...
Completed 1200 of 1900...
Completed 1500 of 1900...
Completed 1800 of 1900...
Test Loss: 0.5428967128377898
Test Accuracy: 80.67105263157895%
Model evaluated.
