In [None]:
# Install transformers, datasets, accelerate and torch into the kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may resolve to different releases.
%pip install transformers datasets accelerate torch

In [None]:
# Install tf-keras, then transformers, torch and tensorflow.
# NOTE(review): transformers and torch are also installed by the previous cell,
# and none of these installs pin a version.
%pip install tf-keras
%pip install transformers torch tensorflow

In [None]:
from datasets import load_dataset

# Fetch the persona-chat corpus through `load_dataset`
dataset = load_dataset("Cynaptics/persona-chat")

# Show the dataset overview first, then the first record of the train split
print(dataset, dataset['train'][0], sep="\n")

In [None]:
from transformers import AutoTokenizer

# Load the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 ships without a padding token, while the preprocessing that follows pads
# with padding="max_length". Reuse the end-of-sequence token so padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def preprocess_function(examples):
    """Turn a batch of persona dialogues into tokenized prompt/reply pairs.

    Each "Person 2" turn is paired with the "Person 1" turn that immediately
    precedes it, so the prompt list and the reply list always have the same
    length and stay aligned. A "Person 2" turn with no pending prompt and a
    "Person 1" turn that never receives a reply are skipped. When several
    "Person 1" turns occur in a row, only the last one is used as the prompt.

    Args:
        examples: Batch mapping with a 'persona_b' column (an iterable of
            persona strings per row) and a 'dialogue' column (per row, an
            iterable of turns indexed by 'speaker' and 'utterance').

    Returns:
        Whatever the module-level `tokenizer` returns for the prompts, with
        the replies passed as `text_target`, truncated and padded to 512 tokens.

    Raises:
        ValueError: If a required column is missing, or if the batch yields
            no prompt/reply pair at all.
    """
    # NOTE(review): a later cell in this notebook treats dialogue turns as plain
    # strings prefixed with "Persona A: " / "Persona B: " -- confirm which schema
    # the dataset actually uses before relying on this version.
    if 'persona_b' not in examples or 'dialogue' not in examples:
        raise ValueError("Dataset must contain 'persona_b' and 'dialogue' columns. "
                       f"Available columns: {examples.keys()}")

    inputs = []
    outputs = []

    for persona, dialog in zip(examples['persona_b'], examples['dialogue']):
        persona_text = "\n".join(persona)
        pending_prompt = None  # most recent Person 1 prompt still awaiting a reply

        for turn in dialog:
            speaker = turn['speaker']
            if speaker == "Person 1":
                pending_prompt = f"Persona:\n{persona_text}\nDialogue:\n{turn['utterance']}"
            elif speaker == "Person 2" and pending_prompt is not None:
                # Emit the pair atomically so inputs[i] always matches outputs[i].
                inputs.append(pending_prompt)
                outputs.append(turn['utterance'])
                pending_prompt = None

    # inputs and outputs grow together, so checking one of them is enough.
    if not inputs:
        raise ValueError("No valid input-output pairs were found in the data")

    tokenized = tokenizer(inputs, text_target=outputs,
                         truncation=True,
                         padding="max_length",
                         max_length=512)
    return tokenized

In [None]:
# 1. Import required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm.auto import tqdm
import torch

# 2. Load tokenizer and set padding token
# Any failure inside the try block is reported with a short message and then
# re-raised, so the cell still stops with the original traceback.
try:
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # Reuse the end-of-sequence token as the padding token so padded
    # tokenization (used by the preprocessing below) has a pad token to use.
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer loaded successfully")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

# 3. Load dataset
# Rebinds `dataset` (an earlier cell assigns it from the same source) and reports
# the size of the 'train' split. Errors are printed and then re-raised.
try:
    dataset = load_dataset("Cynaptics/persona-chat")
    print(f"Dataset loaded with {len(dataset['train'])} examples")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# 4. Preprocessing function
def _strip_speaker_prefix(turn, prefix):
    """Return `turn` without a leading `prefix`, trimmed of surrounding whitespace."""
    text = turn.strip()
    # Only a leading tag is removed; the same text inside the utterance is kept.
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


def preprocess_function(examples, max_length=512):
    """Build "Background / Human / Assistant" prompts with their replies and tokenize them.

    Dialogue turns are consumed two at a time: the turn at an even index is the
    prompt utterance and the following turn is the reply. A trailing turn with
    no reply is ignored, and pairs whose reply is empty are skipped.

    Args:
        examples: Batch mapping with 'persona_b' (a list of persona strings, or
            any value convertible with `str`) and 'dialogue' (a list of turn
            strings, optionally tagged "Persona A: " / "Persona B: ").
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        The module-level `tokenizer` output for the prompts, with the replies
        passed as `text_target`. If the batch produces no pair, empty
        'input_ids', 'attention_mask' and 'labels' columns are returned
        without calling the tokenizer.
    """
    # NOTE(review): `text_target` yields labels tokenized separately from the
    # prompts, so they are not position-aligned with `input_ids` as a
    # decoder-only model expects. Confirm before feeding this output to a
    # causal-LM trainer.
    inputs = []
    outputs = []

    for persona, dialogue in zip(examples['persona_b'], examples['dialogue']):
        persona_text = " | ".join(persona) if isinstance(persona, list) else str(persona)

        # range stops at len - 1, so dialogue[i + 1] always exists.
        for i in range(0, len(dialogue) - 1, 2):
            current = _strip_speaker_prefix(dialogue[i], "Persona A: ")
            response = _strip_speaker_prefix(dialogue[i + 1], "Persona B: ")

            if response:
                inputs.append(f"Background: {persona_text}\nHuman: {current}\nAssistant:")
                outputs.append(response)

    if not inputs:
        # A batched map may return zero rows; skip tokenizing empty lists.
        return {"input_ids": [], "attention_mask": [], "labels": []}

    return tokenizer(
        inputs,
        text_target=outputs,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors=None
    )

# 5. Process dataset with progress tracking
# Runs preprocess_function over the loaded dataset in batches of 16 rows and
# stores the result in `tokenized_datasets`. Errors are printed and re-raised.
try:
    tokenized_datasets = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=16,
        # Drop the source columns, using the column names of the 'train' split.
        # NOTE(review): assumes any other split has the same columns -- confirm.
        remove_columns=dataset["train"].column_names,
        desc="Processing conversations"
    )
    print("Dataset preprocessing completed successfully")
except Exception as e:
    print(f"Error during preprocessing: {e}")
    raise

In [None]:
# No fine-tuning: test the base GPT-2 model
import torch
from transformers import AutoModelForCausalLM

# Pick the GPU when one is available, then load GPT-2 straight onto that device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

def generate_conversation(initial_prompt, num_turns=3, follow_ups=None, max_new_tokens=60):
    """Run a short two-speaker exchange, with the model answering as "Person 2".

    The running transcript ("Person 1: ...\\nPerson 2: ...\\n") is fed back to
    the model on every turn. Each turn is printed as it is produced.

    Args:
        initial_prompt: Person 1's first line.
        num_turns: Maximum number of Person 1 / Person 2 exchanges.
        follow_ups: Optional sequence of Person 1 lines for the turns after the
            first. When given, nothing is read from `input()` and the
            conversation stops early once the sequence is exhausted. When
            omitted, later Person 1 lines are read interactively.
        max_new_tokens: Upper bound on the tokens generated per reply.

    Returns:
        A list of `(person1_text, person2_text)` tuples, one per completed turn.
    """
    scripted = list(follow_ups) if follow_ups is not None else None
    conversation = []
    context = ""

    # The tokenizer in scope may have no pad token; fall back to EOS.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    for turn in range(num_turns):
        # Pick Person 1's line for this turn
        if turn == 0:
            person1_text = initial_prompt
        elif scripted is not None:
            if turn - 1 >= len(scripted):
                break
            person1_text = scripted[turn - 1]
        else:
            person1_text = input("Person 1: ")  # Interactive input

        # Update context and build the prompt for Person 2's response
        context += f"Person 1: {person1_text}\n"
        encoded = tokenizer(context + "Person 2:", return_tensors="pt").to(device)
        prompt_len = len(encoded["input_ids"][0])

        # max_new_tokens bounds only the reply. A total max_length would be
        # eaten up by the transcript, which grows with every turn.
        outputs = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens, then cut at the next speaker
        # tag so turns the model invents after its reply are dropped.
        person2_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        for marker in ("Person 1:", "Person 2:"):
            person2_text = person2_text.split(marker)[0]
        person2_text = person2_text.strip()

        # Print conversation turn
        print(f"Person 1: {person1_text}")
        print(f"Person 2: {person2_text}\n")

        conversation.append((person1_text, person2_text))
        context += f"Person 2: {person2_text}\n"

    return conversation

# Example usage
# Only the first Person 1 line comes from `initial_prompt`; with the default
# num_turns=3 the function asks for the remaining Person 1 lines via input().
initial_prompt = "Hello! How are you today?"
generate_conversation(initial_prompt)

In [None]:
# Show the keys of the tokenized result, then the column names of the raw train split
print(tokenized_datasets.keys(), dataset['train'].column_names, sep="\n")

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

import inspect  # stdlib; used to pick keyword names that differ between transformers releases

MAX_LENGTH = 128  # token budget per training example; longer texts are truncated

# Load the dataset
dataset = load_dataset("Cynaptics/persona-chat")

# Load the tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a padding token, and padding="max_length" below needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Tokenize the dataset
def preprocess_function(examples):
    """Build one training text per row and tokenize it to a fixed length.

    If the batch has a 'text' column it is tokenized as-is. Otherwise each row
    becomes "Background: <persona>" followed by its dialogue turns (one per
    line) and the tokenizer's end-of-sequence token.

    Args:
        examples: Batch mapping with either a 'text' column, or 'persona_b'
            and 'dialogue' columns.

    Returns:
        Tokenizer output truncated and padded to MAX_LENGTH tokens.
    """
    if 'text' in examples:
        texts = examples['text']
    else:
        # NOTE(review): assumes each dialogue is a list of turn strings, as the
        # earlier preprocessing cell in this notebook does -- confirm.
        texts = []
        for persona, dialogue in zip(examples['persona_b'], examples['dialogue']):
            persona_text = " | ".join(persona) if isinstance(persona, list) else str(persona)
            turns = "\n".join(str(turn).strip() for turn in dialogue)
            texts.append(f"Background: {persona_text}\n{turns}{tokenizer.eos_token}")
    return tokenizer(texts, truncation=True, padding="max_length", max_length=MAX_LENGTH)


# Use the dataset's own test split when it has one; otherwise hold out 10% of train
if "test" in dataset:
    train_raw, test_raw = dataset["train"], dataset["test"]
else:
    held_out = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_raw, test_raw = held_out["train"], held_out["test"]

# Tokenize both splits, dropping the raw columns so only model inputs remain
train_data = train_raw.map(preprocess_function, batched=True, remove_columns=train_raw.column_names)
test_data = test_raw.map(preprocess_function, batched=True, remove_columns=test_raw.column_names)

# Return torch tensors for the model inputs
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Causal-LM collator: copies input_ids into labels and ignores padding positions.
# The pad token is the EOS token here, so EOS positions are ignored in the loss too.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Not using masked language modeling
)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever name the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-persona-chat",
    overwrite_output_dir=True,
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,  # needs matching eval and save strategies (both "epoch")
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    report_to="tensorboard",
    **{eval_strategy_key: "epoch"},
)

# Likewise, Trainer's `tokenizer` argument was renamed to `processing_class`.
tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    **{tokenizer_key: tokenizer},
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./gpt2-persona-chat")
tokenizer.save_pretrained("./gpt2-persona-chat")
