In [None]:
# =========================================================================
# --- STEP 1: Setup and Installs ---
# =========================================================================
# Install the necessary libraries
!pip install transformers datasets torch huggingface_hub

In [None]:
# =========================================================================
# --- STEP 2: Imports, Configuration, and Login ---
# =========================================================================
import inspect
import os

import torch
from datasets import load_dataset
from google.colab import drive
from huggingface_hub import notebook_login, HfFolder
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [None]:
# --- Configuration ---
# SOURCE: the already-trained recipe model. This notebook only loads from it.
HF_REPO_ID_SOURCE = "EhabBelllkasy01/gpt2-all-recipes"

# TARGET: the separate repository that receives the persona fine-tune.
HF_REPO_ID_TARGET = "EhabBelllkasy01/gpt2-recipe-persona"

# Directory on Google Drive used as the Trainer output directory.
DRIVE_PATH = "/content/drive/MyDrive/GPT2_Persona_Temp_Files"

# Mount Google Drive so DRIVE_PATH resolves to persistent storage.
drive.mount('/content/drive')

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(DRIVE_PATH, exist_ok=True)

# Interactive Hugging Face Hub login (prompts for a token in the notebook).
notebook_login()

In [None]:
# =========================================================================
# --- STEP 3: Load Tokenizer and Add Custom Tokens ---
# =========================================================================
# Load the tokenizer from the SOURCE model so that any tokens added during the
# recipe fine-tune keep the ids the SOURCE weights were trained with.
tokenizer = GPT2Tokenizer.from_pretrained(HF_REPO_ID_SOURCE)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Markers used by the conversation formatting function in this notebook.
PERSONA_SPECIAL_TOKENS = [
    '<|persona|>',         # Opens the model's persona description block
    '<|user|>',            # Opens a turn by the interlocutor (User 1)
    '<|assistant|>',       # Opens a turn by the model (User 2)
    '<|endofmessage|>'     # Appended once at the end of the formatted conversation
]

# `add_special_tokens` replaces the tokenizer's current
# `additional_special_tokens` list by default, which would strip the "special"
# flag from any tokens the SOURCE tokenizer already registered. Merge the old
# list with the new markers so nothing is dropped. Tokens already in the
# vocabulary are not added again, so the new markers receive the same ids as
# they would if passed on their own.
existing_special_tokens = list(getattr(tokenizer, "additional_special_tokens", None) or [])
merged_special_tokens = existing_special_tokens + [
    token for token in PERSONA_SPECIAL_TOKENS if token not in existing_special_tokens
]
tokenizer.add_special_tokens({'additional_special_tokens': merged_special_tokens})
print(f"New vocabulary size after adding persona tokens: {len(tokenizer)}")

# =========================================================================
# --- STEP 4: Load Model and Resize Embeddings ---
# =========================================================================
# Fine-tuning starts from the SOURCE checkpoint, not from base GPT-2.
model = GPT2LMHeadModel.from_pretrained(HF_REPO_ID_SOURCE)

# The tokenizer may now hold more entries than the checkpoint's embedding
# table, so resize the table to the tokenizer's current length.
target_vocab_size = len(tokenizer)
model.resize_token_embeddings(target_vocab_size)
print("Model embeddings resized to match new vocabulary size.")

# =========================================================================
# --- STEP 5: Load, Format, and Tokenize Dataset (Synthetic-Persona-Chat) ---
# =========================================================================
print("\nLoading and processing Synthetic-Persona-Chat dataset...")

# No split is requested here; later code indexes the result by split name.
raw_datasets = load_dataset(path="google/Synthetic-Persona-Chat")

# Column names tried in order. The first entry of each tuple is the name this
# notebook originally used.
# NOTE(review): the published google/Synthetic-Persona-Chat files appear to use
# the second spelling, with plain-string cells (one persona fact / one
# "User N: ..." turn per line) -- confirm against the dataset card.
_PERSONA_COLUMNS = ('User 2 Persona', 'user 2 personas')
_CONVERSATION_COLUMNS = ('Conversation', 'Best Generated Conversation')

# Speaker labels recognised at the start of a line in a string-shaped conversation.
_SPEAKER_ROLES = {"user 1": "<|user|>", "user 2": "<|assistant|>"}


def _lookup_column(example, candidates):
    """Return the value of the first candidate column present in `example`."""
    for name in candidates:
        if name in example:
            return example[name]
    raise KeyError(
        f"None of the expected columns {candidates} were found; "
        f"available columns: {list(example)}"
    )


def _non_empty_lines(text):
    """Split `text` into stripped lines, dropping blank ones."""
    return [line.strip() for line in text.splitlines() if line.strip()]


def _persona_text(raw):
    """Flatten a persona given as a list of facts or a multi-line string."""
    if raw is None:
        return ""
    if isinstance(raw, str):
        # Joining a str directly would interleave spaces between characters.
        return " ".join(_non_empty_lines(raw))
    return " ".join(raw)


def _split_speaker(line):
    """Return (role_token, utterance); role_token is None if `line` has no speaker label."""
    label, sep, rest = line.partition(":")
    role = _SPEAKER_ROLES.get(label.strip().lower()) if sep else None
    if role is None:
        return None, line
    return role, rest.strip()


def _conversation_turns(raw):
    """Turn a conversation (list of utterances or multi-line string) into (role_token, utterance) pairs."""
    alternating = ("<|user|>", "<|assistant|>")
    if raw is None:
        return []
    if not isinstance(raw, str):
        # List input: even positions are User 1, odd positions are User 2.
        return [(alternating[i % 2], utterance) for i, utterance in enumerate(raw)]

    parsed = [_split_speaker(line) for line in _non_empty_lines(raw)]
    if not any(role for role, _ in parsed):
        # No speaker labels anywhere: fall back to strict alternation per line.
        return [(alternating[i % 2], text) for i, (_, text) in enumerate(parsed)]

    turns = []
    for role, text in parsed:
        if role is not None:
            turns.append((role, text))
        elif turns:
            # Unlabelled line inside a labelled dialogue: treat it as a
            # continuation of the previous speaker's utterance.
            prev_role, prev_text = turns[-1]
            turns[-1] = (prev_role, f"{prev_text} {text}")
        else:
            turns.append((alternating[0], text))
    return turns


def format_and_tokenize_persona(example, max_length=512):
    """
    Flatten one Synthetic-Persona-Chat row into a single training sequence for
    Causal Language Modeling (CLM) and tokenize it.

    The sequence layout is:
        <|persona|> {persona} <|user|> {turn} <|assistant|> {turn} ... <|endofmessage|>
    where User 1 is mapped to <|user|> and User 2 (the side the model is
    trained to play) is mapped to <|assistant|>.

    Args:
        example: Mapping for one dataset row. The User 2 persona and the
            conversation are looked up under the column names listed in
            `_PERSONA_COLUMNS` / `_CONVERSATION_COLUMNS`. Each may be a list
            of strings or a single newline-separated string.
        max_length: Maximum number of tokens kept; longer sequences are
            truncated. Defaults to 512.

    Returns:
        Whatever the module-level `tokenizer` returns for the built text.

    Raises:
        KeyError: If neither candidate column name is present for the persona
            or for the conversation.
    """
    persona_str = _persona_text(_lookup_column(example, _PERSONA_COLUMNS))
    turns = _conversation_turns(_lookup_column(example, _CONVERSATION_COLUMNS))

    pieces = [f"<|persona|> {persona_str} "]
    pieces.extend(f"{role} {utterance} " for role, utterance in turns)
    pieces.append("<|endofmessage|>")
    full_text = "".join(pieces)

    # NOTE(review): truncation may cut off the closing <|endofmessage|> for
    # conversations longer than max_length tokens -- confirm this is acceptable.
    return tokenizer(
        full_text,
        max_length=max_length,
        truncation=True
    )

# Run the formatter over every split. The raw columns are dropped so each row
# keeps only what the tokenizer returned.
source_columns = raw_datasets['train'].column_names
tokenized_datasets = raw_datasets.map(
    format_and_tokenize_persona,
    batched=False,
    remove_columns=source_columns,
)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Sanity check: decode the first training row back to text and report sizes.
print("--- Example of the Tokenized Input Sequence ---")
first_sample_ids = train_dataset[0]['input_ids']
print(tokenizer.decode(first_sample_ids))
print(f"Training Samples: {len(train_dataset)}, Evaluation Samples: {len(eval_dataset)}")

# =========================================================================
# --- STEP 6: Configure Training and Checkpointing ---
# =========================================================================
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and no longer accept the old keyword; older releases only know the old one.
# The install cell does not pin a version, so pick whichever name the
# installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=DRIVE_PATH,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="steps",
    save_steps=1000,
    # Checkpoints are written to Google Drive (output_dir); keep only the most
    # recent ones so a multi-epoch run does not exhaust the Drive quota.
    save_total_limit=2,
    eval_steps=500,
    # --- Hugging Face Hub Configuration ---
    push_to_hub=True,
    # Push to the TARGET repo so the SOURCE repo is never written to.
    hub_model_id=HF_REPO_ID_TARGET,
    hub_token=HfFolder.get_token(),
    hub_private_repo=False,
    # Evaluate on a step schedule (every `eval_steps` steps).
    **{eval_strategy_kwarg: "steps"},
)

# =========================================================================
# --- STEP 7: Initialize Trainer and Train ---
# =========================================================================
# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only accept
# `tokenizer`. Use whichever the installed Trainer exposes.
_trainer_param_names = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_param_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

print("\nStarting training...")
print(f"Training will create a brand new repo: {HF_REPO_ID_TARGET}")

# Training continues from the SOURCE (recipe-tuned) weights loaded earlier.
trainer.train()

# =========================================================================
# --- STEP 8: Save Final Model and Push ---
# =========================================================================
# Write the trained model, then the tokenizer, into one folder under DRIVE_PATH.
final_model_path = os.path.join(DRIVE_PATH, "final_persona_model")
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(final_model_path)

# Upload the result to the TARGET repository configured in the training args.
print(f"\nPushing final model to Hugging Face Hub: {HF_REPO_ID_TARGET}")
trainer.push_to_hub(
    commit_message=(
        "Initial model fine-tuned for conversational persona "
        "using Synthetic-Persona-Chat dataset."
    )
)
print(
    "Training complete. The new model is saved separately "
    "and your original recipe model is untouched."
)