In [None]:
# NOTE: This cell previously held a single-cell copy of the whole pipeline wrapped
# in a triple-quoted string. It was never executed and duplicated Steps 1-8 below,
# so it has been removed to keep one source of truth. Edit the step-by-step cells
# that follow instead.

In [None]:
# =========================================================================
# --- STEP 1: Setup and Installs ---
# =========================================================================
# Install the necessary libraries
!pip install transformers datasets torch huggingface_hub

In [None]:
# =========================================================================
# --- STEP 2: Imports, Configuration, and Login ---
# =========================================================================
import inspect
import os

import torch
from datasets import load_dataset
from google.colab import drive
from huggingface_hub import notebook_login, HfFolder
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [None]:
# --- Configuration ---
# 1. SOURCE: the existing, already fine-tuned model that training starts from.
HF_REPO_ID_SOURCE = "EhabBelllkasy01/gpt2-all-recipes"

# 2. TARGET: the new Hub repository that receives the persona-tuned model.
HF_REPO_ID_TARGET = "EhabBelllkasy01/gpt2-recipe-persona"

# Directory on the mounted Drive used as the Trainer output/checkpoint folder.
DRIVE_PATH = "/content/drive/MyDrive/GPT2_Persona_Temp_Files"

# Mount Google Drive so checkpoints are written outside the Colab VM.
drive.mount('/content/drive')
# exist_ok=True makes this cell safe to re-run and avoids the check-then-create race.
os.makedirs(DRIVE_PATH, exist_ok=True)

# Login to Hugging Face Hub (interactive prompt in the notebook).
notebook_login()

In [None]:
# =========================================================================
# --- STEP 3: Load Tokenizer and Add Custom Tokens (REVISED) ---
# =========================================================================
# Start from the SOURCE model's tokenizer so its vocabulary is carried over.
tokenizer = GPT2Tokenizer.from_pretrained(HF_REPO_ID_SOURCE)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Register the markers that delimit each section of a training sequence:
#   <|persona|>       opens the persona description block
#   <|context|>       opens the dialogue history/context (replaces <|user|>)
#   <|assistant|>     opens the model's (assistant's/persona's) turn
#   <|endofmessage|>  closes a full conversational turn
tokenizer.add_special_tokens(
    dict(
        additional_special_tokens=[
            '<|persona|>',
            '<|context|>',
            '<|assistant|>',
            '<|endofmessage|>',
        ]
    )
)
print(f"New vocabulary size after adding persona tokens: {len(tokenizer)}")

In [None]:
# =========================================================================
# --- STEP 4: Load Model and Resize Embeddings ---
# =========================================================================
# Initialise from the SOURCE model's weights.
model = GPT2LMHeadModel.from_pretrained(HF_REPO_ID_SOURCE)

# The tokenizer gained new tokens in Step 3, so the embedding matrix must be
# resized to the tokenizer's current length before training.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
print("Model embeddings resized to match new vocabulary size.")

In [None]:
# =========================================================================
# --- STEP 5: Load, Format, and Tokenize Dataset (REVISED) ---
# =========================================================================
print("\nLoading and processing Persona-Chat dataset...")

# Fetch Persona-Chat by its base name.
# NOTE(review): assumes "personachat" resolves to a working dataset whose rows have
# the columns read by the formatting function below; confirm against the Hub.
raw_datasets = load_dataset(path="personachat")

In [None]:
# Define the REVISED function to format the dialogue turn into a continuous sequence
def _as_turn_list(value):
    """Normalize a persona/history field to a list so it can be space-joined."""
    if value is None:
        return []
    if isinstance(value, str):
        # A bare string is one turn; joining it directly would space out its characters.
        return [value]
    return list(value)


def format_and_tokenize_personachat_clm(example, tokenizer, max_length=512):
    """
    Turn one dataset row into a single tokenized sequence for causal LM training.

    The sequence layout is:
        <|persona|> {persona} <|context|> {history} <|assistant|> {response} <|endofmessage|>

    Args:
        example: Mapping for one row. Reads 'personality' (persona sentences),
            'history' (falling back to 'utterances' when 'history' is absent or
            None) and 'utterance' (the target response).
        tokenizer: Callable applied to the formatted text. Its result must expose
            'input_ids'; it is also called with `max_length=` and `truncation=True`
            when the sequence cannot be shortened any other way.
        max_length: Maximum number of tokens allowed in the returned sequence.

    Returns:
        Whatever `tokenizer` returns for the final text.

    Raises:
        KeyError: If 'personality' or 'utterance' is missing from `example`.
    """
    # 1. Persona block.
    persona_str = " ".join(_as_turn_list(example['personality']))

    # 2. Dialogue history (context), oldest turn first.
    history_turns = example.get('history')
    if history_turns is None:
        history_turns = example.get('utterances')
    history_turns = _as_turn_list(history_turns)

    # 3. Target response.
    assistant_response = example['utterance']

    def build_text(turns):
        # 4. Assemble the full CLM sequence for the given history turns.
        dialogue_context = " ".join(turns)
        return (
            f"<|persona|> {persona_str} "
            f"<|context|> {dialogue_context} "
            f"<|assistant|> {assistant_response} <|endofmessage|>"
        )

    # 5. Tokenize. Plain right-side truncation would cut off the response and the
    #    closing <|endofmessage|> marker, which is exactly what the model must learn,
    #    so over-long examples shed their oldest history turns first.
    full_text = build_text(history_turns)
    encoded = tokenizer(full_text)
    while len(encoded["input_ids"]) > max_length and history_turns:
        history_turns = history_turns[1:]
        full_text = build_text(history_turns)
        encoded = tokenizer(full_text)

    # Last resort: persona + response alone exceed the budget, so hard-truncate.
    if len(encoded["input_ids"]) > max_length:
        encoded = tokenizer(full_text, max_length=max_length, truncation=True)
    return encoded

In [None]:
# Run the formatting/tokenization function over every row of every split.
# The tokenizer is handed to the function through `fn_kwargs`, and the raw text
# columns are dropped so only the tokenizer outputs remain.
tokenized_datasets = raw_datasets.map(
    function=format_and_tokenize_personachat_clm,
    fn_kwargs={"tokenizer": tokenizer},
    batched=False,
    remove_columns=raw_datasets['train'].column_names,
)

In [None]:
# Pick out the splits used for training and evaluation.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["validation"]

# Decode the first training example back to text to check the sequence layout.
print(
    "--- Example of the Tokenized Input Sequence ---",
    tokenizer.decode(train_dataset[0]['input_ids']),
    sep="\n",
)
print(f"Training Samples: {len(train_dataset)}, Evaluation Samples: {len(eval_dataset)}")

In [None]:
# =========================================================================
# --- STEP 6: Configure Training and Checkpointing ---
# =========================================================================
# Causal LM objective (mlm=False): the collator pads each batch and builds labels.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Newer transformers releases call this argument `eval_strategy`; older ones only
# accept `evaluation_strategy`. Pick whichever the installed version exposes so the
# unpinned install in Step 1 does not break this cell with a TypeError.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=DRIVE_PATH,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="steps",
    save_steps=1000,
    eval_steps=500,
    # --- Hugging Face Hub Configuration ---
    push_to_hub=True,
    # Push to the TARGET repo so the SOURCE repo is never written to.
    hub_model_id=HF_REPO_ID_TARGET,
    hub_token=HfFolder.get_token(),
    hub_private_repo=False,
    **{eval_strategy_kwarg: "steps"},
)

In [None]:
# =========================================================================
# --- STEP 7: Initialize Trainer and Train ---
# =========================================================================
# Newer transformers releases take the tokenizer as `processing_class`; older ones
# only know `tokenizer`. Use whichever keyword the installed Trainer accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Announce the run and the Hub repository it will publish to.
print(
    "\nStarting training...",
    f"Training will create a brand new repo: {HF_REPO_ID_TARGET}",
    sep="\n",
)

# Fine-tuning starts from the recipe model's weights and adds persona dialogue on top.
trainer.train()

In [None]:
# =========================================================================
# --- STEP 8: Save Final Model and Push ---
# =========================================================================
# Write the final weights and the tokenizer files into one folder under DRIVE_PATH.
final_model_path = os.path.join(DRIVE_PATH, "final_persona_model")
trainer.save_model(output_dir=final_model_path)
tokenizer.save_pretrained(save_directory=final_model_path)

# Publish the trained model to the TARGET repository on the Hub.
print(f"\nPushing final model to Hugging Face Hub: {HF_REPO_ID_TARGET}")
trainer.push_to_hub(
    commit_message="Initial model fine-tuned for conversational persona, starting from recipe model."
)
print("Training complete. The new model is saved separately and your original model is untouched.")