In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import load_dataset
from datasets import load_metric
from tqdm import tqdm

In [None]:
rouge_metric = load_metric('rouge')
bleu_metric = load_metric('bleu')

dataset = load_dataset("bavard/personachat_truecased")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  #gpt2-medium
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Concatenate 'persona', 'history', and 'candidates' to form the input text
def chunking(examples):
    inputs = [
        tokenizer.bos_token + "\n-----\n".join(history) + "\n-----\n" + candidate + tokenizer.eos_token
        for history, candidates in zip(examples["history"], examples["candidates"])
        for candidate in candidates[-2:-1]
    ]
    return {"chunks": inputs}

In [None]:
def tokenize(examples):
    input_data = tokenizer(examples["chunks"], padding='max_length', truncation=True, max_length=256)
    outputs = {
        "input_ids": input_data['input_ids'],
        "attention_mask": input_data['attention_mask'],
    }
    return outputs

In [None]:
tokenized_datasets = (
    dataset
    .map(chunking, batched=True, remove_columns=dataset["train"].column_names)
    .map(tokenize, batched=True, remove_columns=["chunks"])
)

In [None]:
tokenized_datasets.set_format("torch")
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
valid_dataset = tokenized_datasets["validation"]

In [None]:
train_dataloader = DataLoader(
    train_dataset.select(list(range(1000))),
    shuffle=False,
    batch_size=8,
)
valid_dataloader = DataLoader(
    valid_dataset.select(list(range(1000))),
    shuffle=True,
    batch_size=8,
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def calculate_bleu_score(generated_texts, reference_texts):
    return bleu_metric.compute(predictions=generated_texts, references=reference_texts)

In [None]:
def calculate_rouge_scores(generated_texts, reference_texts):
    rouge_hypotheses = generated_texts
    rouge_references = [{"summary": ref} for ref in reference_texts]
    rouge_results = rouge_metric.compute(predictions=rouge_hypotheses, references=rouge_references)
    return rouge_results

In [None]:
def generate_text(input_ids, attention_mask, num_return_sequences=1, max_length=513, temperature=1.0, top_k=50):
    output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length,
                            num_return_sequences=num_return_sequences,
                            temperature=temperature, top_k=top_k, repetition_penalty=5.0,
                            pad_token_id=tokenizer.eos_token_id)
    generated_texts = []
    for generated in output:
        text = tokenizer.decode(generated, skip_special_tokens=True)
        generated_texts.append(text)
    return generated_texts

In [None]:
model.train()

num_epochs = 10
accumulation_steps = 4  # Accumulate gradients over 4 steps before updating
optimizer = AdamW(model.parameters(), lr=1e-5)
optimizer.zero_grad()

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}', leave=False)

    for step, batch in enumerate(progress_bar):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        output_ids = batch["input_ids"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=output_ids)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Perform gradient accumulation
        loss = loss / accumulation_steps  # Scale loss for gradient accumulation
        loss.backward()

        if (step + 1) % accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # Update model parameters after accumulation_steps or at the end of epoch
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()

        # Update progress bar
        progress_bar.set_postfix({'Loss': total_loss / (step + 1)})

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss:.4f}")

    # Validation loop
    model.eval()
    valid_progress_bar = tqdm(valid_dataloader, desc=f'Epoch {epoch + 1}', leave=False)

    with torch.no_grad():
        total_bleu_score = 0
        total_rouge_scores = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

        for step, batch in enumerate(valid_progress_bar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            generated_texts = generate_text(input_ids, attention_mask=attention_mask, num_return_sequences=1,
                                            max_length=512)

            # BLEU evaluation
            bleu_score = calculate_bleu_score(generated_texts, batch["input_ids"])
            total_bleu_score += bleu_score

            # ROUGE evaluation
            rouge_scores = calculate_rouge_scores(generated_texts, batch["input_ids"])
            for key, value in rouge_scores.items():
                if key in total_rouge_scores:
                    total_rouge_scores[key] += value.mid.fmeasure
            valid_progress_bar.set_postfix({
                'Val_bleu_score': total_bleu_score / (step + 1),
                'Val_rouge1': total_rouge_scores['rouge1'] / (step + 1),
                'Val_rouge2': total_rouge_scores['rouge2'] / (step + 1),
                'Val_rougeL': total_rouge_scores['rougeL'] / (step + 1)
            })

        avg_bleu_score = total_bleu_score / len(valid_dataloader)
        avg_rouge_scores = {key: value / len(valid_dataloader) for key, value in total_rouge_scores.items()}

        print("Validation Results - Epoch {}: BLEU: {:.4f}, ROUGE: {}".format(epoch, avg_bleu_score, avg_rouge_scores))

    model.train()

In [45]:
output_path = 'GPT2-persona-model-new.pth'
torch.save(model.state_dict(), output_path)

In [3]:
model_path = 'GPT2-persona-model-new.pth'
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.load_state_dict(torch.load(model_path))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id


In [22]:
# Example prompt
prompt = (tokenizer.bos_token + "Hello, my name is Sasha. What's your name? And how are you?" + "\n-----\n")
# prompt = (tokenizer.bos_token + "Hi. What brought you here?" + "\n-----\n")

# Generate multiple responses
num_return_sequences = 3
generated_responses = []
for _ in range(num_return_sequences):
    # Generate one response at a time
    input_data = tokenizer(prompt, truncation=True, padding=True, max_length=256)

    input_ids = torch.tensor([input_data['input_ids']])
    attention_mask = torch.tensor([input_data['attention_mask']])
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    generated_response = model.generate(input_ids=input_ids, attention_mask=attention_mask,max_new_tokens=64,
                                        num_return_sequences=1, top_k=50, repetition_penalty=1.02,
                                        pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(generated_response[0], skip_special_tokens=True)
    generated_responses.append(generated_text)
    prompt = generated_text.replace('__ SILENCE __',"")
    
# Print the generated responses
for i, response in enumerate(generated_responses):
    print(f"Generated Response {i + 1}: {response}")

Generated Response 1: Hello, my name is Sasha. What's your name? And how are you?
-----
Hi! I am jessica. How are you doing today?
-----
I'm good thanks for asking. Just got done with a long day at work.
-----
That sounds like fun. Do you have any hobbies?
-----
Yes, but mostly just watching movies and reading books. You?
Generated Response 2: Hello, my name is Sasha. What's your name? And how are you?
-----
Hi! I am jessica. How are you doing today?
-----
I'm good thanks for asking. Just got done with a long day at work.
-----
That sounds like fun. Do you have any hobbies?
-----
Yes, but mostly just watching movies and reading books. You?
-----
I love to read too. My favorite book is the bible.
-----
Oh wow that's very interesting. Are you married?
-----
No, not yet. But I do plan on starting a new life soon.
-----
What kind of music do you like?
Generated Response 3: Hello, my name is Sasha. What's your name? And how are you?
-----
Hi! I am jessica. How are you doing today?
-----
I'm