In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train_df = pd.read_csv('train.csv')  # Update the file path accordingly
test_df = pd.read_csv('test.csv')    # Update the file path accordingly

texts_train = train_df['review'].tolist()
labels_train = train_df['sentiment'].tolist()

texts_test = test_df['review'].tolist()
labels_test = test_df['sentiment'].tolist()

In [5]:
# Step 2: Tokenize the data
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
max_length = 512  # Define your desired maximum sequence length

tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 6.50kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.75MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.68MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:01<00:00, 1.10MB/s]
config.json: 100%|██████████| 665/665 [00:00<00:00, 111kB/s]


In [6]:
# Step 1: Prepare the IMDb dataset
class IMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
max_length = 512  # Define your desired maximum sequence length

# Step 3: Create DataLoader for training and testing sets
train_dataset = IMDBDataset(texts_train, labels_train, tokenizer, max_length)
test_dataset = IMDBDataset(texts_test, labels_test, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [8]:
# Step 4: Fine-tune the GPT-2 model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1  # Define number of training epochs

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

model.safetensors: 100%|██████████| 548M/548M [03:53<00:00, 2.35MB/s] 
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
    # Step 5: Evaluate the fine-tuned model
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)
    
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = val_correct / val_total
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")