In [None]:
# 1. Enable GPU: Runtime -> Change runtime type -> GPU
# 2. Install Libraries
!pip install transformers datasets accelerate

In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

# --- Setup ---
# Hub identifiers used throughout the notebook.
dataset_name = "DialogueCharacter/english_soda_unfiltered"
model_name = "gpt2"

# Fetch the dialogue dataset.
dataset = load_dataset(dataset_name)

# Tokenizer that belongs to the pretrained GPT-2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# The end-of-sequence string serves two purposes:
#   * it marks where a dialogue stops (SEP_TOKEN), and
#   * it is reused as the padding token, because GPT-2 ships without one.
SEP_TOKEN = tokenizer.eos_token
tokenizer.pad_token = SEP_TOKEN

# NOTE(review): the preprocessing below assumes every record has a 'dialogue'
# field holding the whole conversation as a single string. Inspect
# dataset['train'][0] to confirm the schema before running a long job.

# --- Formatting Function ---

def format_and_tokenize(examples):
    """Tokenize a batch of dialogues for causal language modeling.

    Args:
        examples: Batch dict passed in by `dataset.map(batched=True)`. Its
            'dialogue' entry is assumed to be a list of strings, one whole
            conversation per string (TODO confirm against a sample record).

    Returns:
        The tokenizer output for the batch: one variable-length row per
        dialogue, truncated to at most 512 tokens and NOT padded.
    """
    # GPT-2 learns by next-token prediction; terminating each dialogue with the
    # EOS string gives it an explicit "conversation is over" signal.
    formatted_texts = [
        f"{dialogue_text.strip()}{SEP_TOKEN}"
        for dialogue_text in examples['dialogue']
    ]

    # No padding at this stage: the data collator pads each batch to the length
    # of its longest member at training time. Padding every row to 512 here
    # would store and process mostly padding for short dialogues.
    return tokenizer(
        formatted_texts,
        truncation=True,
        max_length=512,  # Adjust this based on your dialogue lengths and GPU memory
    )

# --- Apply Mapping ---

# Drop the raw text once it has been tokenized. Any other dataset columns are
# left in place here.
tokenized_datasets = dataset.map(
    format_and_tokenize,
    batched=True,
    remove_columns=['dialogue'],
)

# No 'labels' column is materialized: the data collator used for training
# derives the labels from 'input_ids' for every batch, so a stored copy would
# only duplicate the data and cost an extra pass over the dataset.

print("--- Tokenized Dataset Structure ---")
print(tokenized_datasets['train'])

In [None]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

import torch

# --- Load Model ---
model = GPT2LMHeadModel.from_pretrained(model_name)

# --- Define Training Arguments ---
output_dir = "./gpt2_soda_finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,                   # Recommended for fine-tuning
    per_device_train_batch_size=4,        # Adjust this (4, 8, 16) based on GPU memory
    save_steps=5000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=500,
    learning_rate=5e-5,                   # Standard learning rate for fine-tuning
    fp16=torch.cuda.is_available(),       # Mixed precision needs CUDA; fall back to fp32 otherwise
)

# --- Define Data Collator ---
def collate_for_causal_lm(features):
    """Pad a list of tokenized examples and build causal-LM labels.

    `DataCollatorForLanguageModeling(mlm=False)` is deliberately not used:
    it ignores every position whose token id equals `pad_token_id`, and in this
    notebook the pad token IS the EOS token. That would also hide the EOS that
    terminates each dialogue, so the model would never be trained to stop.
    Masking by `attention_mask` ignores only the real padding.

    Args:
        features: List of examples, each a dict with at least 'input_ids' and
            'attention_mask'. Any other keys (e.g. a stored 'labels' column)
            are ignored.

    Returns:
        A batch with 'input_ids', 'attention_mask' and 'labels' tensors, where
        'labels' equals 'input_ids' except for -100 at padded positions.
    """
    batch = tokenizer.pad(
        [
            {"input_ids": f["input_ids"], "attention_mask": f["attention_mask"]}
            for f in features
        ],
        padding=True,          # pad to the longest example in this batch
        return_tensors="pt",
    )
    labels = batch["input_ids"].clone()
    labels[batch["attention_mask"] == 0] = -100  # -100 is skipped by the loss
    batch["labels"] = labels
    return batch


data_collator = collate_for_causal_lm

# --- Initialize and Train ---
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)

print("Starting training...")
trainer.train()

# --- Save Model ---
# The tokenizer is saved next to the weights so the directory is self-contained.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

In [None]:
from transformers import pipeline

import torch

# Load the fine-tuned model AND the tokenizer that was saved alongside it, so
# inference always uses the same vocabulary/special tokens as training.
generator = pipeline(
    'text-generation',
    model=output_dir,
    tokenizer=output_dir,
    device=0 if torch.cuda.is_available() else -1,  # first GPU if present, else CPU
)

# Create a prompt that looks like the start of a dialogue from your dataset
prompt = "User: Hey, how was your weekend? Any good plans for next week? Bot: My weekend was great! I finally finished reading the new sci-fi book. Next week I'm planning to"

# Generate the continuation of the dialogue.
# `max_length` counts the prompt tokens as well as the generated ones.
result = generator(
    prompt,
    max_length=150,
    num_return_sequences=1,
    # Take the id from the pipeline's own tokenizer rather than from a variable
    # defined in an earlier cell.
    pad_token_id=generator.tokenizer.eos_token_id,
)

print("\n--- Generated Dialogue ---")
print(result[0]['generated_text'])