## GPT-2

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd /content/drive/My Drive/MyColab/CS4248/
!ls

/content/drive/My Drive/MyColab
dev.jsonl  gpt2-2.py  gpt2_colab.ipynb	logs_v1  results_v1  test.jsonl  train.jsonl


In [3]:
import numpy as np
import pandas as pd
import json
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import label_binarize

In [4]:
# Read JSONL files
def read_jsonl(file_path):
    records = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Read datasets
train_df = read_jsonl('/content/drive/My Drive/MyColab/train.jsonl')
dev_df = read_jsonl('/content/drive/My Drive/MyColab/dev.jsonl')
test_df = read_jsonl('/content/drive/My Drive/MyColab/test.jsonl')

In [5]:
# Define the mapping of labels to integers
label_to_int = {
    'background': 0,
    'method': 1,
    'result': 2
}

# Convert labels to integers
train_df['label'] = train_df['label'].map(label_to_int)
dev_df['label'] = dev_df['label'].map(label_to_int)
test_df['label'] = test_df['label'].map(label_to_int)

In [6]:
# Instantiate the configuration for model
configuration = GPT2Config()

# Set up tokenizer and  pad token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Instantiate the model
model = GPT2ForSequenceClassification(configuration).from_pretrained('gpt2', num_labels=3)

# Set the pad token of the model's configuration
model.config.pad_token_id = model.config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Encode datasets
class CitationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
def encode_data(tokenizer, texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
    return CitationDataset(encodings, labels)

train_dataset = encode_data(tokenizer, train_df['string'].tolist(), train_df['label'].tolist())
dev_dataset = encode_data(tokenizer, dev_df['string'].tolist(), dev_df['label'].tolist())
test_dataset = encode_data(tokenizer, test_df['string'].tolist(), test_df['label'].tolist())

In [21]:
try:
    from accelerate import Accelerator
    print("Accelerate is installed and can be imported.")
except ImportError as e:
    print("Accelerate is not installed or there's an issue with importing it.")

Accelerate is installed and can be imported.


In [12]:
# Define training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
# Train
trainer.train()

Step,Training Loss
10,3.4155
20,3.6769
30,2.8359
40,2.7243
50,1.6978
60,1.224
70,0.8628
80,0.8421
90,0.7743
100,0.789


TrainOutput(global_step=1545, training_loss=0.473669177968911, metrics={'train_runtime': 2763.0923, 'train_samples_per_second': 8.95, 'train_steps_per_second': 0.559, 'total_flos': 6456701005332480.0, 'train_loss': 0.473669177968911, 'epoch': 3.0})

In [19]:
# Predict
predictions = trainer.predict(test_dataset)

# Extract predicted results and true labels
preds = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids
n_classes = np.max(true_labels) + 1
true_labels_binary = label_binarize(true_labels, classes=range(n_classes))

In [20]:
# Compute f1 score, accuracy, precision, recall and auc
f1 = f1_score(true_labels, preds, average='weighted')
accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds, average='weighted')
recall = recall_score(true_labels, preds, average='weighted')
auc = roc_auc_score(true_labels_binary, predictions.predictions, multi_class='ovr')

print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'AUC (One-vs-Rest): {auc}')

F1 Score: 0.8616884573074306
Accuracy: 0.8602901665771091
Precision: 0.8664778938639706
Recall: 0.8602901665771091
AUC (One-vs-Rest): 0.8569322462992547
