In [None]:
%pip install -U transformers datasets peft tf-keras sacrebleu rouge_score pycocoevalcap

In [None]:
# Import libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

In [None]:
import pandas as pd
from itertools import groupby
from datasets import Dataset
# Fetch the E2E NLG Challenge corpus through `datasets.load_dataset`.
dataset_e2e = load_dataset('e2e_nlg')

# Registry of corpora to train on, keyed by the name used in log messages.
datasets = {'E2E': dataset_e2e}

# Per-corpus hyperparameters, keyed exactly like `datasets`.
hyperparams = {
    'E2E': dict(
        weight_decay=0.01,
        dropout_prob=0.1,
        label_smooth=0.1,
        length_penalty=0.9,
    ),
}

In [None]:
# Grouping function for E2E NLG test dataset
def group_e2e_test_data(test_data):
    """Collapse the E2E test split to one row per meaning representation.

    The rows are loaded into a pandas DataFrame, ordered by
    ``meaning_representation``, and every ``human_reference`` that shares a
    meaning representation is gathered into a single list.

    Args:
        test_data: Row-oriented data accepted by ``pd.DataFrame`` that
            provides ``meaning_representation`` and ``human_reference``.

    Returns:
        The result of ``Dataset.from_pandas`` on a frame with the columns
        ``meaning_representation`` and ``human_reference`` (one list of
        references per row).
    """
    frame = pd.DataFrame(test_data).sort_values(by='meaning_representation')
    references_by_mr = frame.groupby('meaning_representation')['human_reference'].apply(list)
    return Dataset.from_pandas(references_by_mr.reset_index())

def preprocess_e2e(examples):
    """Tokenize training examples as ``"<meaning representation> <reference>"``.

    Args:
        examples: Batch mapping with parallel ``meaning_representation`` and
            ``human_reference`` sequences of strings.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the joined
        texts when called with ``truncation=True``.
    """
    pairs = zip(examples['meaning_representation'], examples['human_reference'])
    # A single space is the only thing placed between prompt and reference;
    # nothing is appended after the reference.
    joined_texts = [' '.join((mr, reference)) for mr, reference in pairs]
    return tokenizer(joined_texts, truncation=True)

def preprocess_e2e_eval(examples):
    """Tokenize evaluation prompts and carry the raw text columns along.

    Only the meaning representation is tokenized (``truncation=True``); the
    original ``meaning_representation`` and ``human_reference`` values are
    stored on the tokenizer output under those same keys.

    Args:
        examples: Batch mapping with ``meaning_representation`` and
            ``human_reference`` entries.

    Returns:
        The notebook-level ``tokenizer`` output with the two extra keys added.
    """
    prompts, gold = examples['meaning_representation'], examples['human_reference']
    encoded = tokenizer(prompts, truncation=True)
    for column, values in (("meaning_representation", prompts), ("human_reference", gold)):
        encoded[column] = values
    return encoded

In [None]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

def custom_collate_fn(batch):
    """Collate evaluation examples while keeping their raw text fields.

    The notebook-level ``data_collator`` only receives ``input_ids`` and
    ``attention_mask``; the ``human_reference`` and ``meaning_representation``
    values are set aside first and attached, as plain Python lists, to the
    collated batch afterwards.

    Args:
        batch: List of examples, each providing ``input_ids``,
            ``attention_mask``, ``human_reference`` and
            ``meaning_representation``.

    Returns:
        The collator output with ``human_reference`` and
        ``meaning_representation`` lists added.
    """
    gold_texts = []
    prompt_texts = []
    model_features = []
    for example in batch:
        gold_texts.append(example['human_reference'])
        prompt_texts.append(example['meaning_representation'])
        # Hand the collator only the fields it should see.
        model_features.append({
            'input_ids': example['input_ids'],
            'attention_mask': example['attention_mask'],
        })

    collated = data_collator(model_features)
    collated['human_reference'] = gold_texts
    collated['meaning_representation'] = prompt_texts
    return collated

def generate_predictions(test_dataloader, model=None, tokenizer=None, length_penalty=0.9, max_new_tokens=64):
    """Generate one beam-search prediction per evaluation example.

    Args:
        test_dataloader: Iterable of batches that provide ``input_ids`` and
            ``attention_mask`` tensors plus a ``human_reference`` list with
            one entry per example.
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
            When omitted, the notebook-level ``model`` is used.
        tokenizer: Tokenizer exposing ``decode()``, ``pad_token_id`` and
            ``eos_token_id``. When omitted, the notebook-level ``tokenizer``
            is used.
        length_penalty: Beam-search length penalty passed to ``generate``.
        max_new_tokens: Upper bound on generated tokens per example. Without
            an explicit bound the library's default total length applies,
            which can be shorter than the prompt itself.

    Returns:
        A ``(predictions, references)`` pair of equally long lists: the
        decoded continuations (prompt removed, whitespace stripped) and the
        matching ``human_reference`` entries.

    Raises:
        KeyError: If ``model``/``tokenizer`` are omitted and no notebook-level
            object of that name exists.
    """
    # Keep the original one-argument call working by falling back to the
    # notebook namespace when no explicit model/tokenizer is supplied.
    gen_model = model if model is not None else globals()['model']
    gen_tokenizer = tokenizer if tokenizer is not None else globals()['tokenizer']

    gen_model.eval()
    predictions = []
    references = []
    for batch in tqdm(test_dataloader, desc="Generating predictions"):
        # Only the tokenized meaning representation is fed to the model.
        input_ids = batch['input_ids'].to(gen_model.device)
        attention_mask = batch['attention_mask'].to(gen_model.device)
        with torch.no_grad():
            output_ids = gen_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=10,
                length_penalty=length_penalty,
                no_repeat_ngram_size=4,
                early_stopping=True,
                max_new_tokens=max_new_tokens,
                pad_token_id=gen_tokenizer.pad_token_id,
                eos_token_id=gen_tokenizer.eos_token_id
            )
        # Each output row starts with the (padded) prompt, so dropping the
        # first `prompt_length` tokens leaves only the continuation. This
        # avoids relying on the decoded text matching the prompt string.
        prompt_length = input_ids.shape[1]
        for output, example_references in zip(output_ids, batch['human_reference']):
            continuation = gen_tokenizer.decode(output[prompt_length:], skip_special_tokens=True)
            predictions.append(continuation.strip())
            references.append(example_references)
    return predictions, references

In [None]:
# Update the training and evaluation loop
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
import sacrebleu
import torch

# Fine-tune GPT-2 medium with LoRA on every registered corpus, then score the
# generated test outputs with corpus-level BLEU.
for dataset_name, dataset in datasets.items():
    print(f"Training on {dataset_name} dataset")
    params = hyperparams[dataset_name]

    # Load tokenizer and model; EOS doubles as the pad token and batches are
    # padded on the left.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    model.resize_token_embeddings(len(tokenizer))

    # Apply LoRA to the attention projection only.
    lora_config = LoraConfig(
        r=4,
        lora_alpha=32,
        target_modules=["c_attn"],
        lora_dropout=params['dropout_prob'],
        init_lora_weights="gaussian",
        bias="none"
    )
    model = get_peft_model(model, lora_config)

    # Preprocess the dataset using the appropriate function
    if dataset_name == 'E2E':
        test_data = group_e2e_test_data(dataset['test'])
        preprocess_function = preprocess_e2e
        preprocess_function_eval = preprocess_e2e_eval
    else:
        # Fail loudly instead of hitting a NameError (or silently reusing
        # objects left over from a previous iteration) further down.
        raise ValueError(f"No preprocessing defined for dataset {dataset_name!r}")

    train_data = dataset['train']

    train_tokenized = train_data.map(preprocess_function, batched=True, remove_columns=train_data.column_names)
    # Drop the test split's own columns; the eval preprocessing re-adds the
    # raw text fields it wants to keep.
    test_tokenized = test_data.map(preprocess_function_eval, batched=True, remove_columns=test_data.column_names)

    # Data collator and DataLoaders
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, return_tensors="pt", mlm=False)
    train_dataloader = DataLoader(train_tokenized, shuffle=True, batch_size=8, collate_fn=data_collator)
    test_dataloader = DataLoader(test_tokenized, batch_size=8, collate_fn=custom_collate_fn)

    # Optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=params['weight_decay'])
    num_epochs = 5
    num_training_steps = num_epochs * len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [{dataset_name}]")
        for batch in progress_bar:
            inputs = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            progress_bar.set_postfix(loss=loss.item())

    # Evaluation. generate_predictions picks up `model` and `tokenizer` from
    # the notebook namespace, so only the dataloader is passed.
    predictions, references = generate_predictions(test_dataloader)

    # `references` holds one list of references per prediction, but sacrebleu
    # expects reference *streams*: stream k contains the k-th reference of
    # every prediction. Transpose, padding missing entries with None because
    # the number of references varies between meaning representations.
    max_refs = max((len(example_refs) for example_refs in references), default=0)
    reference_streams = [
        [example_refs[k] if k < len(example_refs) else None for example_refs in references]
        for k in range(max_refs)
    ]
    bleu = sacrebleu.corpus_bleu(predictions, reference_streams)
    print(f"{dataset_name} BLEU score: {bleu.score}")
