In [None]:
# Task1
"""
This notebook loads the GEM E2E NLG dataset, tokenizes it with the T5 tokenizer, and fine-tunes a T5ForConditionalGeneration model to generate sentences.
The code cells below cover training and evaluation of the model with the METEOR, ROUGE and BERTScore metrics.
"""

In [None]:
%%capture
!pip install git+https://github.com/huggingface/datasets.git
!pip install rouge_score
!pip install sentencepiece
!pip install transformers
!pip install bert-score


In [None]:
import datasets
# Load the GEM E2E NLG dataset. No split is requested, so `data` holds all of
# the dataset's splits (displayed in the next cell).
data = datasets.load_dataset('GEM/e2e_nlg')



  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Display the loaded dataset to inspect its splits, features and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 33525
    })
    validation: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 1484
    })
    test: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 1847
    })
    challenge_train_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
    challenge_validation_sample: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
    challenge_test_scramble: Dataset({
        features: ['gem_id', 'gem_parent_id', 'meaning_representation', 'target', 'references'],
        num_rows: 500
    })
})

In [None]:
# Optional: load only the first 500 training examples for a quicker experiment.
# The statement is wrapped in a string literal, so it is currently disabled.
"""train_data = datasets.load_dataset('GEM/e2e_nlg', split='train[:500]')"""



In [None]:
# Inspect the first training example to see the fields of a single record.
data['train'][0]

{'gem_id': 'e2e_nlg-train-0',
 'gem_parent_id': 'e2e_nlg-train-0',
 'meaning_representation': 'name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]',
 'target': 'The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.',
 'references': []}

In [None]:
def construct_input_for_batch(batch):
    """Construct the (source, target) string lists for a batch.

    Args:
        batch: Mapping with a "meaning_representation" entry (one item per
            example) and a "target" entry.

    Returns:
        A ``(source, target)`` tuple. ``source`` is a list with one input
        string per example; ``target`` is ``batch["target"]`` unchanged.
    """
    source = []
    for concepts in batch["meaning_representation"]:
        if isinstance(concepts, str):
            # In GEM/e2e_nlg the meaning representation is already a single
            # string. Joining a string would insert a space between every
            # character ("n a m e [ ..."), so it is passed through untouched.
            source.append(concepts)
        else:
            # A sequence of concept tokens is joined into one input string.
            source.append(' '.join(concepts))
    target = batch["target"]
    return source, target

In [None]:
def batch_tokenize(batch, tokenizer, max_length=32):
    """Construct the batch (source, target) and run them through a tokenizer.

    Args:
        batch: Batch of examples accepted by ``construct_input_for_batch``.
        tokenizer: Callable tokenizer returning a mapping with "input_ids";
            its ``pad_token_id`` identifies padding positions in the labels.
        max_length: Length the target sequences are padded/truncated to.

    Returns:
        A dict with "input_ids" (unpadded source token ids) and "labels"
        (target token ids of length ``max_length`` in which padding positions
        are replaced by -100).
    """
    source, target = construct_input_for_batch(batch)
    input_ids = tokenizer(source)["input_ids"]
    label_ids = tokenizer(
        target,
        padding="max_length",
        truncation=True,
        max_length=max_length
    )["input_ids"]
    # Padding positions must not contribute to the training loss. -100 is the
    # label value the loss ignores, and the metric functions in this notebook
    # already map -100 back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    masked_labels = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in label_ids
    ]
    return {
        "input_ids": input_ids,
        "labels": masked_labels,
    }

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

MODEL_NAME = "google/t5-v1_1-base"
MAX_LENGTH = 32

# Load the tokenizer from MODEL_NAME so it cannot drift from the model name
# used elsewhere in the notebook.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


def _tokenize_batch(batch):
    """Tokenize one batch with the notebook tokenizer and MAX_LENGTH targets."""
    return batch_tokenize(batch, tokenizer, max_length=MAX_LENGTH)


# Tokenize the training and validation splits batch-wise with the same helper.
train_data_tokenized = data['train'].map(_tokenize_batch, batched=True)
valid_data_tokenized = data['validation'].map(_tokenize_batch, batched=True)
     

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--google--t5-v1_1-base/snapshots/b5fc947a416ea3cb079532cb3c2bbadeb7f800fc/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--t5-v1_1-base/snapshots/b5fc947a416ea3cb079532cb3c2bbadeb7f800fc/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--google--t5-v1_1-base/snapshots/b5fc947a416ea3cb079532cb3c2bbadeb7f800fc/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--t5-v1_1-base/snapshots/b5fc947a416ea3cb079532cb3c2bbadeb7f800fc/config.json
Model config T5Config {
  "_name_or_path": "google/t5-v1_1-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate"

In [None]:
import torch
from transformers import (
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)
  

In [None]:
from datasets import load_metric
from bert_score import score

# NOTE(review): `bertscore_scorer` does not appear to be referenced anywhere in
# this notebook; the metric function in this cell calls `bert_score.score`
# directly - confirm whether this load is still needed.
bertscore_scorer = load_metric("bertscore")

def bertscore_metric_builder(tokenizer):
    """Build a ``compute_metrics`` callback that reports BERTScore.

    Args:
        tokenizer: Tokenizer used to decode ids; must provide ``batch_decode``
            and ``pad_token_id``.

    Returns:
        A function that takes a prediction object exposing ``predictions`` and
        ``label_ids`` and returns a dict with the corpus-averaged "precision",
        "recall" and "f1", each rounded to 4 decimals.
    """
    def compute_bertscore_metrics(pred):
        # Local import: numpy is not imported at the top of the notebook.
        import numpy as np

        pad_id = tokenizer.pad_token_id
        label_arr = np.asarray(pred.label_ids)
        pred_arr = np.asarray(pred.predictions)
        # -100 cannot be decoded, so it is swapped for the pad token.
        # np.where builds new arrays, leaving the arrays on `pred` unmodified.
        # Predictions are handled defensively in case they contain -100 too.
        labels_ids = np.where(label_arr != -100, label_arr, pad_id)
        pred_ids = np.where(pred_arr != -100, pred_arr, pad_id)
        # All special tokens are removed.
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        # Use the GPU when present instead of failing on CPU-only machines.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # `score` returns three tensors (precision, recall, F1) with one value
        # per sentence pair; average each to obtain a single number.
        precision, recall, f1 = score(
            pred_str,
            label_str,
            lang='en',
            model_type='bert-base-uncased',
            device=device,
        )
        return {
            "precision": round(precision.mean().item(), 4),
            "recall": round(recall.mean().item(), 4),
            "f1": round(f1.mean().item(), 4),
        }
    return compute_bertscore_metrics

# Bind the BERTScore callback to the notebook's tokenizer.
# NOTE(review): the trainers in this notebook are given `rouge_metric_fn`;
# this callback is not passed to any of them - confirm the intended use.
bertscore_metric_fn = bertscore_metric_builder(tokenizer)



'\nfrom datasets import load_metric\nfrom bert_score import score\n\nbertscore_scorer = load_metric("bertscore")\n\ndef bertscore_metric_builder(tokenizer):\n    def compute_bertscore_metrics(pred):\n        labels_ids = pred.label_ids\n        pred_ids = pred.predictions\n        # All special tokens are removed.\n        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n        labels_ids[labels_ids == -100] = tokenizer.pad_token_id\n        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)\n        # Compute the metric.\n        _, _, bertscore_results = score(pred_str, label_str, lang=\'en\', model_type=\'bert-base-uncased\', device=torch.device(\'cuda\'))\n        return {\n            "precision": round(bertscore_results[\'precision\'].item(), 4),\n            "recall": round(bertscore_results[\'recall\'].item(), 4),\n            "f1": round(bertscore_results[\'f1\'].item(), 4),\n        }\n    return compute_bertscore_metrics\n\nbertsc

In [None]:
from datasets import load_metric
# Load the ROUGE metric once; it is used by the metric callback defined in this
# cell and by the stand-alone ROUGE evaluation later in the notebook.
rouge_scorer = load_metric("rouge")

def rouge_metric_builder(tokenizer):
    """Build a ``compute_metrics`` callback that reports ROUGE-2 and ROUGE-L.

    Args:
        tokenizer: Tokenizer used to decode ids; must provide ``batch_decode``
            and ``pad_token_id``.

    Returns:
        A function that takes a prediction object exposing ``predictions`` and
        ``label_ids`` and returns a dict with the mid F-measure of "rouge2"
        and "rougeL", each rounded to 4 decimals.
    """
    def compute_rouge_metrics(pred):
        # Local import: numpy is not imported at the top of the notebook.
        import numpy as np

        pad_id = tokenizer.pad_token_id
        label_arr = np.asarray(pred.label_ids)
        pred_arr = np.asarray(pred.predictions)
        # -100 cannot be decoded, so it is swapped for the pad token.
        # np.where builds new arrays, leaving the arrays on `pred` unmodified.
        # Predictions are handled defensively in case they contain -100 too.
        labels_ids = np.where(label_arr != -100, label_arr, pad_id)
        pred_ids = np.where(pred_arr != -100, pred_arr, pad_id)
        # All special tokens are removed.
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        # Compute the metric.
        rouge_results = rouge_scorer.compute(
            predictions=pred_str,
            references=label_str,
            rouge_types=["rouge2", "rougeL"],
            use_aggregator=True,
            use_stemmer=False,
        )
        return {
            "rouge2": round(rouge_results['rouge2'].mid.fmeasure, 4),
            "rougeL": round(rouge_results['rougeL'].mid.fmeasure, 4),
        }
    return compute_rouge_metrics

# Bind the ROUGE callback to the notebook's tokenizer; this is the function
# passed as `compute_metrics` to the trainers in this notebook.
rouge_metric_fn = rouge_metric_builder(tokenizer)


In [None]:
# Run-wide settings: seed handed to the training arguments and the beam width
# used for generation.
RANDOM_SEED = 42
BEAM_SIZE = 4

# Prefer the first CUDA device and fall back to the CPU when none is available.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

# Load the pretrained weights named by MODEL_NAME and place them on DEVICE.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--t5-v1_1-base/snapshots/b5fc947a416ea3cb079532cb3c2bbadeb7f800fc/config.json
Model config T5Config {
  "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 32128
}

loading weights file pytorch_model.bin from c

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best ROUGE-2 score when training finishes.
train_args = Seq2SeqTrainingArguments(
    output_dir="results/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    # optimization args, the trainer uses the Adam optimizer
    # and has a linear warmup for the learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-04,
    num_train_epochs=3,
    warmup_steps=1000,
    # misc args
    seed=RANDOM_SEED,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    predict_with_generate=True,
    # Generation settings applied during evaluation. These are the supported
    # way to set output length and beam width; the private trainer attributes
    # assigned below are not read by recent transformers releases.
    generation_max_length=MAX_LENGTH,
    generation_num_beams=BEAM_SIZE,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
    tokenizer=tokenizer,
    compute_metrics=rouge_metric_fn,
)

# Legacy private attributes, kept for compatibility with older transformers
# versions that read them instead of the generation_* arguments above.
trainer._max_length = MAX_LENGTH
trainer._num_beams = BEAM_SIZE

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the model. With the training arguments defined above, evaluation
# and checkpoint saving both run at the end of every epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, meaning_representation, gem_id, gem_parent_id, references. If target, meaning_representation, gem_id, gem_parent_id, references are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 200
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 75
  Number of trainable parameters = 247577856


Epoch,Training Loss,Validation Loss,Rouge2,Rougel
1,No log,23.920404,0.0,0.0423


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, meaning_representation, gem_id, gem_parent_id, references. If target, meaning_representation, gem_id, gem_parent_id, references are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1484
  Batch size = 8
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transfo

KeyboardInterrupt: ignored

In [None]:
# Task2
"""
The code below:
1. Loads the best available checkpoint and generates text using beam search decoding
2. Writes the generated sentences to a submission file in .json format
3. Evaluates the model and the submission using GEM-metrics
"""

In [None]:
# In the following steps we load the model from the best available checkpoint.

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, T5Tokenizer, T5ForConditionalGeneration

# Path of the trained checkpoint to restore.
# NOTE(review): the training arguments in this notebook write checkpoints under
# "results/"; confirm that this path points at the intended checkpoint.
checkpoint_path = "checkpoints/checkpoint-8382"


# Load the tokenizer from the same pre-trained model name that was used for
# training (MODEL_NAME) so the vocabulary matches the checkpoint.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Load the model from the checkpoint and place it on DEVICE explicitly, since
# the generation helper moves its inputs to DEVICE.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
model = model.to(DEVICE)

# Create the Seq2SeqTrainer, reusing the `train_args` defined earlier in the
# notebook.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
    tokenizer=tokenizer,
    compute_metrics=rouge_metric_fn,
)


loading file spiece.model from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_p

In [None]:
# Training configuration for the trainer wrapped around the restored model:
# evaluate and checkpoint once per epoch, selecting the best model by ROUGE-2.
train_args = Seq2SeqTrainingArguments(
    output_dir="results/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    # optimization args, the trainer uses the Adam optimizer
    # and has a linear warmup for the learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-04,
    num_train_epochs=3,
    warmup_steps=1000,
    # misc args
    seed=RANDOM_SEED,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    predict_with_generate=True,
    # Generation settings applied during evaluation. These are the supported
    # way to set output length and beam width; the private trainer attributes
    # assigned below are not read by recent transformers releases.
    generation_max_length=MAX_LENGTH,
    generation_num_beams=BEAM_SIZE,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
    tokenizer=tokenizer,
    compute_metrics=rouge_metric_fn,
)

# Legacy private attributes, kept for compatibility with older transformers
# versions that read them instead of the generation_* arguments above.
trainer._max_length = MAX_LENGTH
trainer._num_beams = BEAM_SIZE


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def beam_generate_sentences(
    batch,
    model,
    tokenizer,
    num_beams=4,
    max_length=32,
    device='cuda:0'
):
    """Decode one batch with beam search and return the generated sentences.

    Args:
        batch: Batch of examples accepted by ``construct_input_for_batch``.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        num_beams: Beam width passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        A list of decoded strings (special tokens removed), one per generated
        sequence.
    """
    # Only the source side is needed for generation; targets are discarded.
    inputs, _unused_targets = construct_input_for_batch(batch)

    # Encode the inputs as padded PyTorch tensors and move each to `device`.
    encoded = tokenizer(inputs, padding=True, return_tensors='pt')
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Beam search decoding.
    output_ids = model.generate(
        **on_device,
        num_beams=num_beams,
        max_length=max_length,
    )

    # Turn every generated id sequence back into text.
    sentences = []
    for sequence in output_ids:
        sentences.append(
            tokenizer.decode(sequence.tolist(), skip_special_tokens=True)
        )
    return sentences

In [None]:
def _generate_for_validation(batch):
    """Return the beam-search generations for one validation batch."""
    sentences = beam_generate_sentences(
        batch,
        model,
        tokenizer,
        num_beams=BEAM_SIZE,
        max_length=MAX_LENGTH,
        device=DEVICE,
    )
    return {'generated': sentences}


# Add a 'generated' column to the validation split, eight examples at a time.
valid_output = data['validation'].map(
    _generate_for_validation,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/1484 [00:00<?, ? examples/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

In [None]:
# Score the generated validation sentences against their targets (ROUGE-2/L).
validation_predictions = valid_output["generated"]
validation_references = valid_output["target"]
rouge_results = rouge_scorer.compute(
    predictions=validation_predictions,
    references=validation_references,
    rouge_types=["rouge2", "rougeL"],
    use_stemmer=False,
    use_aggregator=True,
)

# The last expression is displayed as the cell output.
f"R-2: {rouge_results['rouge2'].mid.fmeasure:.3f} R-L: {rouge_results['rougeL'].mid.fmeasure:.3f}"

'R-2: 0.144 R-L: 0.269'

In [None]:
def _generate_for_challenge_sample(batch):
    """Return the beam-search generations for one challenge-sample batch."""
    sentences = beam_generate_sentences(
        batch,
        model,
        tokenizer,
        num_beams=BEAM_SIZE,
        max_length=MAX_LENGTH,
        device=DEVICE,
    )
    return {'generated': sentences}


# Add a 'generated' column to the challenge train sample, eight examples at a
# time.
challenge_train_sample_output = data["challenge_train_sample"].map(
    _generate_for_challenge_sample,
    batched=True,
    batch_size=8,
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

In [None]:
# Generated sentences and their gem_id keys for the validation split.
valid_formatted = [row['generated'] for row in valid_output]
valid_keys = [row['gem_id'] for row in data['validation']]

# The same pair of lists for the challenge train sample.
challenge_train_sample_formatted = [
    row['generated'] for row in challenge_train_sample_output
]
challenge_train_sample_keys = [
    row['gem_id'] for row in data['challenge_train_sample']
]

In [None]:
SUBMISSION_NAME = "An identifying name of your system"
DESCRIPTION = "An optional brief description of the system that will be shown on the website"

# Total number of elements over all model parameters.
total_parameter_count = sum(p.numel() for p in model.parameters())

# Outputs and keys of the validation task.
validation_task = {
    "values": valid_formatted,
    "keys": valid_keys
}

submission_dict = {
    "submission_name": SUBMISSION_NAME,
    "param_count": total_parameter_count,
    "description": DESCRIPTION,
    "tasks": {
        "e2e_validation": validation_task
    }
}

# The format scales to more tasks: every additional task is just another entry
# in the `tasks` sub-dictionary.
new_task_name = "e2e_challenged_train_sample"
new_task_data = {
    "values": challenge_train_sample_formatted,
    "keys": challenge_train_sample_keys
}
submission_dict["tasks"][new_task_name] = new_task_data

In [None]:
import json
import os

# The output directory may not exist yet in a session that skipped training
# (for example when the model was restored from a checkpoint), so create it.
os.makedirs('results', exist_ok=True)

# Serialize the submission dictionary to the JSON submission file.
with open('results/gem_submission.json', 'w') as f:
    json.dump(submission_dict, f)

In [None]:
# Task3
"""
Below is a human evaluation of a few phrases
taken from the gem_submission.json file written above.
Input: 
"name[The Eagle], eatType[coffee shop], food[Japanese], priceRange[less than £20], customer rating[low], area[riverside], familyFriendly[yes], near[Burger King]",
The Eagle is a low rated coffee shop near Burger King and the riverside that is family friendly and is less than £20 for Japanese food.

"name[The Mill], eatType[coffee shop], food[French], area[riverside], near[The Sorrento]",
"Located near The Sorrento is a French Theme eatery and coffee shop called The Mill, with a price range at £20-£25 it is in the riverside area."

"name[Loch Fyne], food[French], area[riverside], near[The Rice Boat]",
"For luxurious French food, the Loch Fyne is located by the river next to The Rice Boat."

"name[The Rice Boat], eatType[restaurant], food[French], customer rating[5 out of 5], area[riverside]",
The Rice Boat is an adult French restaurant with high customer rating  located in the Riverside area.

"name[The Wrestlers], eatType[coffee shop], food[Japanese], priceRange[less than £20], area[riverside], familyFriendly[no], near[Raja Indian Cuisine]",
The Wrestlers coffee shop serves Japanese food.  It is situated near Raja Indian Cuisine at the riverside.  Prices are less than £20.  It is not family-friendly.

"name[Aromi], eatType[coffee shop], food[French], customer rating[low], area[city centre]",
"In the city centre lies Aromi, a French coffee shop for adults with a low customer rating."

"""
"""
Output: 
"The Golden Curry is a family-friendly Japanese restaurant with a price range of less than \u00a320. It is located in the city centre.", 
"The Golden Curry is a family-friendly restaurant with a price range of less than \u00a320. It is located in the city centre near Caf\u00e9 Rouge", 
"The Golden Curry is a family-friendly Japanese restaurant with a price range of less than \u00a320. It is located in the city centre.", 
"The Golden Curry is a family-friendly Japanese restaurant with a price range of less than \u00a320. It is located in the city centre.", 
"The Golden Curry is a family-friendly fast food restaurant with a high customer rating. It is located in the city centre near Caf\u00e9 Rouge.", 
"Located in the city centre, The Golden Palace is a coffee shop that serves Indian food. It is not family-friendly and has a low", 
"The Golden Curry is a family friendly restaurant with a price range of less than \u00a320. It is located in the city centre near Caf\u00e9 Rouge.", 
"Located in the city centre, The Golden Palace is a coffee shop providing Indian food in the high price range. It is near Caf\u00e9 Rouge.", 
"The Golden Curry is a family friendly restaurant with a price range of less than \u00a320. It is located in the city centre.",
"Located in the city centre, The Golden Palace is a coffee shop providing Indian food in the high price range. It is near Caf\u00e9 Rouge.", 
"The Waterman is a family-friendly Japanese restaurant with a price range of less than \u00a320. It is located in the city centre.",
"""

# Human error evaluation:
"""
1. priceRange - The expected price range is missing and is replaced with \u00a320
2. near - The expected "near" parameter is also missing and is replaced with Caf\u00e9
3. 
"""

In [None]:
# Further steps are for metrics evaluation using a GEM-metrics.git package
%%capture

!git clone https://github.com/GEM-benchmark/GEM-metrics.git
%cd GEM-metrics
!pip install -r requirements.txt

In [None]:
!python run_metrics.py  results/gem_submission.json