# GPT2 Model

## Install necessary libraries

In [1]:
!pip install transformers torch pandas scikit-learn



In [2]:
!pip install optuna



In [3]:
pip install transformers[torch]



In [4]:
pip install accelerate -U



In [5]:
pip install tqdm



## Load necessary libraries

In [6]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from transformers import get_scheduler, TrainerCallback
from torch.cuda.amp import GradScaler, autocast
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

## Load and Prepare the Data

In [7]:
class EssayDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with optional essay scores.

    ``encodings`` maps a feature name to an indexable collection holding one
    entry per example; it must contain ``'input_ids'``, which defines the
    dataset length. ``scores`` is an optional indexable collection of numeric
    targets aligned with the encodings. When it is omitted, the returned items
    carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, scores=None):
        self.scores = scores
        self.encodings = encodings

    def __len__(self):
        # The number of examples is read off the token-id sequences.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``'labels'`` if scores were given)."""
        example = {}
        for feature_name, column in self.encodings.items():
            example[feature_name] = torch.tensor(column[idx])

        if self.scores is None:
            return example

        # Targets are always emitted as float tensors.
        example['labels'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return example

### Train/validation split (20% held out for validation)

In [8]:
import pandas as pd
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split

# Load the essay dataset; the code below reads its 'full_text' and 'score' columns.
df = pd.read_csv('train.csv')

# Split the data into training and validation sets: 20% is held out for validation,
# stratified on 'score' and seeded (random_state=42) so the split is reproducible.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['score'])
train_scores = train_df['score'].values
valid_scores = valid_df['score'].values

# Initialize the tokenizer
# GPT-2 was not trained with a padding token, so the EOS (end-of-sequence) token is reused as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize essays: each essay is truncated or padded to exactly 512 tokens.
# NOTE: despite its name, `test_encodings` holds the encodings of the validation split (valid_df).
train_encodings = tokenizer(train_df['full_text'].tolist(), truncation=True, padding='max_length', max_length=512, pad_to_multiple_of=None)
test_encodings = tokenizer(valid_df['full_text'].tolist(), truncation=True, padding='max_length', max_length=512, pad_to_multiple_of=None)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Wrap the tokenized splits and their scores in EssayDataset objects.
# `test_encodings` are the encodings of the validation split (tokenized from valid_df).
train_dataset = EssayDataset(train_encodings, train_scores)
val_dataset = EssayDataset(test_encodings, valid_scores)

In [10]:
# Batch the datasets: training uses shuffled batches of 8, validation uses unshuffled batches of 16.
# Both loaders are configured with 4 workers and pinned memory.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

## Set up the Model

In [12]:
# Setup device: use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Model initialization
def model_init():
    """Build a GPT-2 sequence-classification model with a single output label.

    The weights are loaded from the pretrained 'gpt2' checkpoint, and the
    model config's ``pad_token_id`` is set to the tokenizer's EOS token id.

    Returns:
        The freshly loaded model (not yet moved to any device).
    """
    gpt2_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=1)
    gpt2_model.config.pad_token_id = tokenizer.eos_token_id
    return gpt2_model


# Instantiate the model and move it onto the selected device.
model = model_init().to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train

In [13]:
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn

# TensorBoard summary writer, created with 'runs/experiment_name' as its log directory.
writer = SummaryWriter('runs/experiment_name')

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.cuda.amp import GradScaler, autocast
from transformers import GPT2ForSequenceClassification
from tqdm import tqdm

# GradScaler for AMP (automatic mixed precision) loss scaling
scaler = GradScaler()

# Adam over all model parameters with a learning rate of 5e-5.
optimizer = Adam(model.parameters(), lr=5e-5)

from torch.optim.lr_scheduler import StepLR
# Multiply the learning rate by gamma=0.1 every 10 scheduler steps.
# NOTE(review): the training loop in this cell calls scheduler.step() once per epoch with
# num_epochs = 4, so this decay never triggers during that run — confirm this is intended.
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Training function
def train(model, train_loader, optimizer, scaler, device):
    """Run one mixed-precision training epoch and return the mean per-batch loss.

    Args:
        model: Callable invoked as ``model(**inputs, labels=labels)`` whose
            output exposes a ``.loss`` tensor; it is put in training mode.
        train_loader: Sized iterable of batches. Each batch is a dict of
            tensors that includes a ``'labels'`` entry.
        optimizer: Optimizer whose gradients are zeroed before every batch.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.
        device: Device every tensor of the batch is moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``train_loader`` contains no batches. Without this
            check the failure only surfaced as a bare ``ZeroDivisionError``
            when the average was computed.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail fast with an explicit message instead of dividing by zero later.
        raise ValueError("train_loader is empty; cannot compute an average training loss")

    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc="Training", leave=False):
        # Labels are passed separately, so keep them out of the keyword inputs.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass under autocast so eligible ops run in reduced precision.
        with autocast():
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

        # Scale the loss before backward; the scaler then drives the optimizer
        # step and updates its own scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    return total_loss / num_batches

# Validation function
def validate(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` and return the mean per-batch loss.

    Args:
        model: Callable invoked as ``model(**inputs, labels=labels)`` whose
            output exposes a ``.loss`` tensor; it is put in evaluation mode.
        val_loader: Sized iterable of batches. Each batch is a dict of
            tensors that includes a ``'labels'`` entry.
        device: Device every tensor of the batch is moved to.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``val_loader`` contains no batches. Without this
            check the failure only surfaced as a bare ``ZeroDivisionError``
            when the average was computed.
    """
    num_batches = len(val_loader)
    if num_batches == 0:
        # Fail fast with an explicit message instead of dividing by zero later.
        raise ValueError("val_loader is empty; cannot compute an average validation loss")

    model.eval()
    total_loss = 0.0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in val_loader:
            # Labels are passed separately, so keep them out of the keyword inputs.
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss

            total_loss += loss.item()

    return total_loss / num_batches

# Main training loop: for each epoch, train, validate, step the LR scheduler,
# and log both losses to TensorBoard and stdout.
num_epochs = 4
try:
    for epoch in range(num_epochs):
        train_loss = train(model, train_loader, optimizer, scaler, device)
        val_loss = validate(model, val_loader, device)

        scheduler.step()  # Update the learning rate (stepped once per epoch)
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/val', val_loss, epoch)

        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
finally:
    # Close the writer even if training raises or the cell is interrupted,
    # so scalars logged for completed epochs are flushed to disk.
    writer.close()

  self.pid = os.fork()
Training:   2%|▏         | 32/1731 [00:14<11:56,  2.37it/s]