In [16]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.tensorboard import SummaryWriter
import os

# Set a random seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Load pre-trained model and tokenizer
model = GPT2ForSequenceClassification.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define the training data and labels
train_texts = [
    "Example sentence 1 for class 0",
    "Example sentence 2 for class 1",
    # Add more sentences
]
train_labels = [0, 1]  # Replace with actual labels

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

# Preprocess the data and encode it into features
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Define tensors for input ids, attention masks, and labels
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

val_input_ids = torch.tensor(val_encodings['input_ids'])
val_attention_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(val_labels)

# Create DataLoader for training and validation
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up an optimizer and criterion
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()

# Set up a learning rate scheduler
total_steps = len(train_dataloader) * 5  # 5 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Set up Tensorboard logging
log_dir = "./logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
writer = SummaryWriter(log_dir=log_dir)

# Set up early stopping
early_stopping = {'patience': 3, 'counter': 0, 'best_val_loss': float('inf')}

# Set up model saving
best_model_path = "./best_model.pth"

# Set up a training loop with validation, early stopping, and model saving
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, device, num_epochs=5):
    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}/{num_epochs}")
        # Training
        model.train()
        total_loss = 0.0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, labels = batch

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            loss.backward()

            total_loss += loss.item()

        # Gradient accumulation (if needed)
        # optimizer.step()  # Move this line outside of the loop if not using gradient accumulation

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Train Loss: {avg_train_loss}")
        writer.add_scalar("Train Loss", avg_train_loss, epoch)

        # Perform optimization step after accumulating gradients (if needed)
        optimizer.step()

        # Learning rate scheduler step
        scheduler.step()

        # Validation
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in val_dataloader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_masks, labels = batch

                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()

                _, preds = torch.max(outputs.logits, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_dataloader)
        accuracy = accuracy_score(all_labels, all_preds)
        print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy * 100:.2f}%")
        writer.add_scalar("Validation Loss", avg_val_loss, epoch)
        writer.add_scalar("Accuracy", accuracy, epoch)

        # Early stopping
        if avg_val_loss < early_stopping['best_val_loss']:
            early_stopping['best_val_loss'] = avg_val_loss
            early_stopping['counter'] = 0
            # Save the best model
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stopping['counter'] += 1
            if early_stopping['counter'] >= early_stopping['patience']:
                print("Early stopping.")
                break

# Train the model
train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, device, num_epochs=4)

# Load the best model
best_model = GPT2ForSequenceClassification.from_pretrained('gpt2')
best_model.load_state_dict(torch.load(best_model_path))
best_model.to(device)

# Now you can use 'best_model' for inference.

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Train Loss: 0.0002840353990904987
Validation Loss: 9.389070510864258, Accuracy: 0.00%
Epoch 2/4
Train Loss: 0.0029093578923493624
Validation Loss: 9.83505630493164, Accuracy: 0.00%
Epoch 3/4
Train Loss: 6.317892984952778e-05
Validation Loss: 10.1011962890625, Accuracy: 0.00%
Epoch 4/4
Train Loss: 4.076874756719917e-05
Validation Loss: 10.249918937683105, Accuracy: 0.00%
Early stopping.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)