In [1]:
from transformers import GPT2Tokenizer, AdamW, get_scheduler, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm
import numpy as np
import evaluate
from peft import LoraConfig, get_peft_model
import re
import time
from matplotlib import pyplot as plt 
%matplotlib inline
myrank=32

In [2]:
model_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.padding_side = "left"

raw_dataset = load_dataset("e2e_nlg_cleaned")
def mytokenize(example):
    ans = []
    for i, rep in enumerate(example["meaning_representation"]):
        ans.append("Write a restaurant description for the following attributes:\n" + rep + "\n" + "Description: " + example["human_reference"][i])
    return tokenizer(ans, truncation=True, padding="max_length", max_length=1024)

dataset = raw_dataset.map(mytokenize, batched=True, remove_columns=['meaning_representation', 'human_reference'])
'''print(dataset)
print(dataset["train"][0])'''

def collate_fn(batch):
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]),
        'labels': torch.tensor([item['input_ids'] for item in batch])  # Labels are input_ids for language modeling
    }

train_loader = DataLoader(dataset['train'], batch_size=4, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(dataset['validation'], batch_size=4, collate_fn=collate_fn)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/112k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/133k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33525 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4299 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4693 [00:00<?, ? examples/s]

Map:   0%|          | 0/33525 [00:00<?, ? examples/s]

Map:   0%|          | 0/4299 [00:00<?, ? examples/s]

Map:   0%|          | 0/4693 [00:00<?, ? examples/s]

In [3]:
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [4]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125029632 || trainable%: 0.4717473694555863




In [5]:
from transformers import Trainer, TrainingArguments
trainer = Trainer(
    model=model,
    train_dataset=dataset['train'],
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=200,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=collate_fn
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  0%|          | 0/200 [00:00<?, ?it/s]



{'loss': 9.7129, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 9.5852, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}
{'loss': 9.5952, 'learning_rate': 6e-06, 'epoch': 0.0}
{'loss': 9.3773, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}
{'loss': 9.6648, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 9.3136, 'learning_rate': 1.2e-05, 'epoch': 0.0}
{'loss': 9.645, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.0}
{'loss': 9.6284, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.0}
{'loss': 9.2037, 'learning_rate': 1.8e-05, 'epoch': 0.0}
{'loss': 9.4466, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 9.2932, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.01}
{'loss': 9.5133, 'learning_rate': 2.4e-05, 'epoch': 0.01}
{'loss': 9.357, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.01}
{'loss': 9.5052, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.01}
{'loss': 9.4774, 'learning_rate': 3e-05, 'epoch': 0.01}
{'loss': 9.2953, 'learning_rate'

TrainOutput(global_step=200, training_loss=2.491693864390254, metrics={'train_runtime': 556.0954, 'train_samples_per_second': 5.754, 'train_steps_per_second': 0.36, 'train_loss': 2.491693864390254, 'epoch': 0.1})

In [38]:
mybatch = None
i = 2
for b in valid_loader:
    mybatch = b
    i -= 1
    if i == 0:
        break

test_text = tokenizer.batch_decode(mybatch['input_ids'], skip_special_tokens=True)
for d in test_text:
    print(d)

Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no]
Description: Alimentum is not a family-friendly place in the city centre.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no]
Description: Alimentum in city centre is not a family-friendly place.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no], near[Burger King]
Description: Alimentum is not family-friendly, and is near the Burger King in the city centre.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no], near[Burger King]
Description: Near Burger King in city centre is the adult establishment Alimentum.


In [33]:
mybatch["input_ids"].device

device(type='cpu')

In [39]:
device = "cuda" if torch.cuda.is_available() else "cpu"
for k in mybatch.keys():
    mybatch[k] = mybatch[k].to(device)

with torch.no_grad():
    generated_tokens = model.generate(**mybatch).cpu().numpy()

decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
for dec in decoded_preds:
    print(dec)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no]
Description: Alimentum is not a family-friendly place in the city centre.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no]
Description: Alimentum in city centre is not a family-friendly place.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no], near[Burger King]
Description: Alimentum is not family-friendly, and is near the Burger King in the city centre.
Write a restaurant description for the following attributes:
name[Alimentum], area[city centre], familyFriendly[no], near[Burger King]
Description: Near Burger King in city centre is the adult establishment Alimentum.


In [42]:
test_text = [
    "Write a restaurant description for the following attributes:\n name[Alimentum], area[city centre], familyFriendly[no]"
    "Write a restaurant description for the following attributes:\n name[Alimentum], area[Shimokitazawa], near[Senpai's house]"
]
tokenized_test = tokenizer(test_text, return_tensors="pt", padding=True).to(device)

with torch.no_grad():
    generated_tokens = model.generate(**tokenized_test).cpu().numpy()

decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
for dec in decoded_preds:
    print(dec)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Write a restaurant description for the following attributes:
 name[Alimentum],  familyFriendly[no]Write a restaurant description for the following attributes:
 name[Alimentum], area[Shimokitazawa], near[Senpai's house]Write


