In [1]:
! pip install transformers datasets

In [2]:
from datasets import load_dataset

dataset_name = "google/Synthetic-Persona-Chat"
dataset = load_dataset(dataset_name)

# The dataset is often split into train, validation, and test
train_dataset = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/Synthetic-Persona-Chat_train.csv:   0%|          | 0.00/15.9M [00:00<?, ?B/s]

Synthetic-Persona-Chat_valid.csv: 0.00B [00:00, ?B/s]

Synthetic-Persona-Chat_test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8938 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/968 [00:00<?, ? examples/s]

In [7]:
# Load the tokenizer
from transformers import AutoTokenizer

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 tokenizer doesn't have a default pad token, so we set it to the EOS token
# It's also beneficial to add special tokens for dialogue formatting if you choose to use them.
tokenizer.pad_token = tokenizer.eos_token

# A function to format each example
def format_conversation(example):
    # Combine personas into a single string
    persona_1 = " ".join(example["user 1 personas"])
    persona_2 = " ".join(example["user 2 personas"])
    personas = f"P1: {persona_1} P2: {persona_2}"

    # Concatenate the conversation turns, adding a separator/EOS token after each turn
    # This structure trains the model to generate the next response after reading the last one.
    conversation = tokenizer.eos_token.join(example["Best Generated Conversation"])

    # Combine the personas and the conversation
    full_text = f"{personas} <|startofchat|> {conversation} {tokenizer.eos_token}"
    return {"text": full_text}

# Apply the formatting function
processed_dataset = train_dataset.map(format_conversation, remove_columns=train_dataset.column_names)

Map:   0%|          | 0/8938 [00:00<?, ? examples/s]

In [8]:
block_size = 128 # A common choice, you can adjust this

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=block_size)

tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4, # Use multiple processes for faster tokenization
    remove_columns=["text"]
)

# Use the DataCollatorForLanguageModeling to handle chunking and Masked Language Modeling (MLM)
# or just Language Modeling (LM) task (GPT-2 is a causal LM, so we use mlm=False)
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # Causal Language Modeling for GPT-2
)

Map (num_proc=4):   0%|          | 0/8938 [00:00<?, ? examples/s]

In [11]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# Load the model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-persona-chat-finetuned", # Directory for output and checkpoints
    overwrite_output_dir=True,
    num_train_epochs=3, # Number of training epochs
    per_device_train_batch_size=4, # Batch size per GPU/TPU core
    save_steps=10_000, # Save checkpoint every X steps
    save_total_limit=2, # Only keep the last 2 checkpoints
    logging_steps=500,
    prediction_loss_only=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Start fine-tuning
trainer.train()

# Save the final model
trainer.save_model("./final_gpt2_persona_chat")

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 fcb0b3520578aebb0e27defaea37d6fb02abb489


[34m[1mwandb[0m: Enter your choice:

 fcb0b3520578aebb0e27defaea37d6fb02abb489


[34m[1mwandb[0m: Enter your choice:

 ehabbellkasyalx


[34m[1mwandb[0m: Enter your choice:

 ehabbellkasyalx


[34m[1mwandb[0m: Enter your choice:

 0


[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mehabbellkasyalx[0m ([33mehabbellkasyalx-depi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

# Load the fine-tuned model into a text generation pipeline
generator = pipeline(
    'text-generation',
    model='./final_gpt2_persona_chat',
    tokenizer=tokenizer,
)

# Example prompt based on the format used during training
prompt = "P1: I love hiking and the outdoors. P2: I collect stamps. <|startofchat|> P1: Hello! How are you today?"

# Generate text
generated_text = generator(
    prompt,
    max_length=50,
    num_return_sequences=1,
    do_sample=True, # Enable sampling for creative generation
    temperature=0.7,
)

print(generated_text[0]['generated_text'])