Lab 42 Implement Fine-Tuning Pretrained Models (GPT-2)​

#### 1. Install Dependencies
First, ensure you have the necessary libraries installed






In [None]:
pip install torch transformers datasets

In [1]:
import os


In [3]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader

# Load the CSV file into a pandas dataframe
df = pd.read_csv('covid_data.csv')

# Define a custom dataset class
class CovidDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        question = row['question']
        answer = row['answer']
        input_text = f"Q: {question} A: {answer}"
        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }



#### 2. Load Pretrained  Model and Tokenizer
Import the required libraries and load the pretrained BERT model and tokenizer

In [12]:
# Constants
MODEL_NAME = "gpt2"
LEARNING_RATE = 5e-5
EPOCHS = 10
WARMUP_STEPS = 5
OUTPUT_DIR = "/content/fine_tuned_gpt2"
BATCH_SIZE = 4
MAX_LENGTH = 128

# Load pretrained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Create dataset and dataloader
dataset = CovidDataset(df, tokenizer, MAX_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Move model to the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = EPOCHS * len(dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=num_training_steps)





### Model training

I will train the model and save the model weights after each epoch and then I will try to generate jokes with each version of the weight to see which performs the best.

In [13]:
# Training loop
model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    epoch_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()

    print(f"Epoch loss: {epoch_loss / len(dataloader)}")

# Save the model
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")



Epoch 1/10
Epoch loss: 7.082181167602539
Epoch 2/10
Epoch loss: 1.651819372177124
Epoch 3/10
Epoch loss: 1.0214077711105347
Epoch 4/10
Epoch loss: 0.8959383606910706
Epoch 5/10
Epoch loss: 0.7755052328109742
Epoch 6/10
Epoch loss: 0.6961182832717896
Epoch 7/10
Epoch loss: 0.636821174621582
Epoch 8/10
Epoch loss: 0.6069189548492432
Epoch 9/10
Epoch loss: 0.5808164954185486
Epoch 10/10
Epoch loss: 0.5646387696266174
Model saved to /content/fine_tuned_gpt2


### Generating responses


In [15]:
def generate_text(prompt, model, tokenizer, max_length=150, temperature=0.7, top_k=50, top_p=0.9):
    input_text = f"Q: {prompt} A:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,  # Prevent repeating n-grams
            early_stopping=True
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example prompt
prompt = "What is Covid 19?"

# Generate a response
response = generate_text(prompt, model, tokenizer)
print(f"Generated response: {response}")


Generated response: Q: What is Covid 19? A: Covids are a virus that can spread through the body through contact with the virus.
