<a href="https://colab.research.google.com/github/samir41939/Verisk-GenAI-Workshop/blob/main/gpt2_custom_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the dataset. With the 'text' builder each line of the file becomes one
# example by default.
# NOTE: 'train' and 'validation' point at the same file, so any evaluation
# loss measures fit to the training data, not generalization.
dataset = load_dataset('text', data_files={'train': 'custom_dataset.txt', 'validation': 'custom_dataset.txt'})

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token, but batching variable-length examples
# requires one (the data collator otherwise fails with "Asking to pad but the
# tokenizer does not have a padding token"). Reuse the end-of-sequence token;
# padded positions are excluded from the loss by the collator.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Keep the model config in sync so later generate() calls know the pad id.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' column of a batch of examples.

    Sequences longer than 512 tokens are truncated, and a special-tokens mask
    is requested alongside the token ids.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        truncation=True,
        return_special_tokens_mask=True,
    )
    return encoded

# Drop blank / whitespace-only lines before tokenizing: they would become
# zero-length examples, and a batch made only of those has no tokens to
# train on.
non_empty_dataset = dataset.filter(lambda example: bool(example['text'].strip()))

# Tokenize in batches and drop the raw text column, leaving only model inputs.
tokenized_datasets = non_empty_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Data collator (mlm=False selects causal language modeling: labels are a
# copy of the input ids rather than masked-token targets)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)

# Optional: uncomment to freeze the first n transformer blocks so that only
# the remaining layers are updated during fine-tuning.
# n_freeze = 6
# for param in model.transformer.h[:n_freeze].parameters():
#     param.requires_grad = False

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded later with from_pretrained('./gpt2-custom').
model.save_pretrained('./gpt2-custom')
tokenizer.save_pretrained('./gpt2-custom')

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Step 1: Load the Pre-trained Model and Tokenizer
# To generate with the fine-tuned checkpoint saved by the training cell,
# set model_name to './gpt2-custom' instead.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 2: Prepare Input Data
# Calling the tokenizer (rather than .encode) also returns the attention mask,
# which generate() needs to tell real tokens from padding.
input_text = "Once upon a time"
encoded_input = tokenizer(input_text, return_tensors='pt')
input_ids = encoded_input['input_ids']

# Step 3: Generate Text
output_ids = model.generate(
    input_ids,
    attention_mask=encoded_input['attention_mask'],  # Explicit mask instead of relying on inference
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; use EOS for padding
    max_length=50,            # Maximum total length (prompt + generated tokens)
    num_beams=5,              # Number of beams for beam search
    no_repeat_ngram_size=2,   # No repeating n-grams of this size
    early_stopping=True       # Stop as soon as num_beams finished candidates exist
)

# Step 4: Decode and Output Generated Text
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)