In [1]:
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

In [None]:
# Pretrained GPT-2 checkpoint shared by the tokenizer and the language model
model_name = "gpt2"

# Tokenizer first: reuse the end-of-sequence token as the padding token so the
# padded tokenization performed later in the notebook has a pad token to use.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Language-model weights for the same checkpoint
model = GPT2LMHeadModel.from_pretrained(model_name)


In [None]:
# Load the training split of the "pubmed" dataset and print its first two rows.
# NOTE(review): the tokenization cell reads a 'text' column — confirm this
# dataset actually exposes one before running the rest of the notebook.
dataset = load_dataset(path="pubmed", split="train")
first_rows = dataset[:2]
print(first_rows)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples.

    Reads the batch's 'text' entry and runs it through the notebook-level
    `tokenizer`, truncating and padding every sequence to exactly 512 tokens.

    Args:
        examples: A batch mapping that contains a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Apply the tokenizer to the whole dataset, one batch of examples per call
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
# Hold out 10% of the tokenized data for evaluation.
# Use the Dataset's own train_test_split: sklearn's train_test_split is meant for
# arrays / DataFrames and does not hand back `datasets.Dataset` objects, so the
# .shuffle() / .select() calls below would fail on its output.
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Work on a random subset to keep training time manageable; adjust these sizes to
# your compute budget. The min() guards against asking for more rows than exist.
train_dataset = train_dataset.shuffle(seed=42).select(range(min(5000, len(train_dataset))))
eval_dataset = eval_dataset.shuffle(seed=42).select(range(min(500, len(eval_dataset))))


In [None]:
# Training configuration.
# load_best_model_at_end=True requires checkpoints to be saved on the same schedule
# as evaluation, so save_strategy is set to "epoch" to match evaluation_strategy
# (the previous step-based saving made TrainingArguments raise a ValueError).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",  # Where checkpoints are written
    evaluation_strategy="epoch",  # Evaluate the model after each epoch
    save_strategy="epoch",  # Checkpoint after each epoch (must match evaluation_strategy)
    learning_rate=5e-5,  # Learning rate
    per_device_train_batch_size=4,  # Training batch size; lower it if GPU memory is tight
    per_device_eval_batch_size=4,  # Evaluation batch size
    num_train_epochs=3,  # Number of epochs
    logging_dir="./logs",  # Directory for logging
    logging_steps=50,  # Log every 50 steps
    weight_decay=0.01,  # Weight decay
    load_best_model_at_end=True,  # Reload the best checkpoint (by evaluation) when training ends
    push_to_hub=False  # Set to True if you want to upload the model to Hugging Face hub
)

In [None]:
# Causal-LM collator: the tokenized examples carry no `labels`, and without labels
# the model returns no loss for the Trainer to optimise. With mlm=False the collator
# builds labels from input_ids for every batch and excludes padding positions from
# the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,  # The pre-trained model
    args=training_args,  # The training arguments
    train_dataset=train_dataset,  # The training dataset
    eval_dataset=eval_dataset,  # The evaluation dataset
    tokenizer=tokenizer,  # The tokenizer
    data_collator=data_collator,  # Adds language-modeling labels to each batch
)

In [None]:
# Fine-tune the model: run the training loop configured on `trainer`.
# Left as the cell's last expression so the notebook displays the returned value.
trainer.train()

In [None]:
# Run a final evaluation pass with the trainer (presumably on the eval_dataset it
# was constructed with) and print the returned metrics.
results = trainer.evaluate()
print(f"Validation Results: {results}")

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in one directory
save_dir = "./fine_tuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)