<a href="https://colab.research.google.com/github/Tuhinm2002/t5-recipe/blob/main/T5_Fine_Tuning_Text_Gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q transformers

In [None]:
! pip install -q sentencepiece

In [None]:
from transformers import T5Tokenizer,T5ForConditionalGeneration
import torch

In [None]:
# Token-length caps applied when encoding prompts (source) and recipe steps (target)
max_source_length = 512
max_target_length = 128

# Tokenizer and seq2seq model are both loaded from the same pretrained checkpoint
checkpoint = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
! pip install git+https://github.com/huggingface/datasets.git

Collecting git+https://github.com/huggingface/datasets.git
  Cloning https://github.com/huggingface/datasets.git to /tmp/pip-req-build-bmk_lku1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/datasets.git /tmp/pip-req-build-bmk_lku1
  Resolved https://github.com/huggingface/datasets.git to commit 439e115d34a2d8737af719660c1b586ac32279dc
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from datasets import load_dataset

# Load the recipe dataset by its Hugging Face Hub identifier
dataset = load_dataset("m3hrdadfi/recipe_nlg_lite")
# Show the available splits with their columns and row counts
print(dataset)

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['uid', 'name', 'description', 'link', 'ner', 'ingredients', 'steps'],
        num_rows: 6118
    })
    test: Dataset({
        features: ['uid', 'name', 'description', 'link', 'ner', 'ingredients', 'steps'],
        num_rows: 1080
    })
})


In [None]:
# Inspect the train split (columns and number of rows)
dataset['train']

Dataset({
    features: ['uid', 'name', 'description', 'link', 'ner', 'ingredients', 'steps'],
    num_rows: 6118
})

In [None]:
# Inspect the test split (columns and number of rows)
dataset['test']

Dataset({
    features: ['uid', 'name', 'description', 'link', 'ner', 'ingredients', 'steps'],
    num_rows: 1080
})

In [None]:
# Name of the first training recipe.
# Index the row first so only one record is read instead of the whole column.
dataset['train'][0]['name']

'pork chop noodle soup'

In [None]:
# Description of the first training recipe.
# Index the row first so only one record is read instead of the whole column.
dataset['train'][0]['description']

"we all know how satisfying it is to make great pork tenderloin, ribs, or a roast but the end of the meal creates a new quandary what do you do with the leftover pork contrary to what you might think, it's not that difficult . how to repurpose your meal is where real cooking creativity comes into play, so let us present to you our favorite pork chop soup recipe . with this recipe, you'll discover how the natural bold flavor of pork gives this hearty soup a lift that a vegetable soup or chicken noodle soup just can't get . it's a dinner recipe to warm you up on a cold winter night or a midday restorative for a long work week . throw all the ingredients in a large pot and let it simmer on the stove for a couple hours, or turn it into a slow cooker recipe and let it percolate for an afternoon . this foolproof recipe transforms your favorite comfort food into an easy meal to warm you up again and again . the health benefits of pork pork is a great option if you're on a low carb diet or try

`['description','steps','ingredients']` <br>
`task_prefix = dataset['train']['name']`

In [None]:
# Toy check of the 80% slicing rule used for the train / hold-out partition
a = list(range(1, 11))
cut = int(0.8*len(a))
print(a[:cut])

[1, 2, 3, 4, 5, 6, 7, 8]


In [None]:
def data_prepare(split,dataset,train_fraction=0.8):
  """Slice the name / ingredients / steps columns of ``dataset['train']``.

  The rows of the ``train`` entry are cut at ``int(n * train_fraction)``.
  ``split == 'train'`` returns the leading part; any other value returns the
  remainder (a hold-out carved from ``train`` — ``dataset['test']`` is never read).

  Args:
    split: ``'train'`` for the leading rows, anything else for the remaining rows.
    dataset: mapping with a ``'train'`` entry that supports ``len()`` (row count)
      and column access by the keys ``'name'``, ``'ingredients'`` and ``'steps'``.
    train_fraction: share of rows assigned to the leading part, within ``[0, 1]``.

  Returns:
    A tuple ``(names, ingredients, steps)`` of three lists.

  Raises:
    ValueError: if ``train_fraction`` lies outside ``[0, 1]``.
  """
  if not 0 <= train_fraction <= 1:
    raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

  train_rows = dataset['train']
  # Compute the cut index once; it is shared by all three columns
  cut = int(len(train_rows) * train_fraction)
  part = slice(None, cut) if split == 'train' else slice(cut, None)

  # list(...) copies each column once, whatever sequence type the dataset returns
  return tuple(list(train_rows[column])[part] for column in ('name', 'ingredients', 'steps'))

# Leading 80% of dataset['train'] -> training partition
train_name,train_ingre,train_steps = data_prepare('train',dataset)
# NOTE: these "test_*" lists are the remaining 20% of dataset['train'], not dataset['test']
test_name,test_ingre,test_steps = data_prepare('test',dataset)

In [None]:
# Sizes of the two partitions returned by data_prepare
len(train_ingre),len(test_name)

(4894, 1224)

In [None]:
# Total number of rows in the original train split
len(dataset['train'])

6118

In [None]:
# Prompt prefix prepended to every recipe name. The trailing space keeps the
# prefix and the name as separate words, matching the "make recipe of: <name>"
# prompts used for preprocessing and generation elsewhere in this notebook.
task_prefix = "make recipe of: "

In [None]:
# Prompts for the training partition: task prefix followed by the recipe name
train_prompts = [task_prefix + title for title in train_name]

# Encode as PyTorch tensors, padded to the longest prompt and capped at max_source_length
encoding = tokenizer(
    train_prompts,
    max_length=max_source_length,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

In [None]:
# Prompts for the hold-out partition: task prefix followed by the recipe name
test_prompts = [task_prefix + title for title in test_name]

# Encode as PyTorch tensors, padded to the longest prompt and capped at max_source_length
encoding_test = tokenizer(
    test_prompts,
    max_length=max_source_length,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

In [None]:
# Number of encoded training prompts
len(encoding.input_ids)

4894

In [None]:
# Encode the training recipe steps (the generation targets) as PyTorch tensors.
# truncation=True is what makes max_length take effect: with padding='longest'
# and no truncation, max_length is ignored and over-long targets are kept whole.
target_encoding = tokenizer(
    list(train_steps),
    padding = 'longest',
    max_length = max_target_length,
    truncation=True,
    return_tensors='pt'
)

In [None]:
# Encode the hold-out recipe steps (the generation targets) as PyTorch tensors.
# truncation=True is what makes max_length take effect: with padding='longest'
# and no truncation, max_length is ignored and over-long targets are kept whole.
target_encoding_test = tokenizer(
    list(test_steps),
    padding = 'longest',
    max_length = max_target_length,
    truncation=True,
    return_tensors='pt'
)

In [None]:
def preprocess_function(sample,padding="max_length"):
    """Turn a batch of recipes into tokenized model inputs with labels.

    Args:
        sample: batch providing the columns ``"name"`` (recipe titles) and
            ``"steps"`` (target text).
        padding: padding strategy forwarded to the tokenizer for both the
            inputs and the targets.

    Returns:
        The tokenizer output for the prompts, with an added ``"labels"`` entry
        holding the target token ids.
    """
    # Build the T5 prompts: fixed task prefix followed by the recipe title
    prompts = ["make recipe of: " + title for title in sample["name"]]
    model_inputs = tokenizer(prompts, max_length=max_source_length, padding=padding, truncation=True)

    # Targets are tokenized through the `text_target` keyword argument
    target_batch = tokenizer(text_target=sample["steps"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = target_batch["input_ids"]

    # With fixed-length padding, swap every pad token id for -100 so that
    # padded positions are ignored in the loss
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in sequence] for sequence in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Raw columns are dropped once each batch has been converted to token ids
raw_columns = ['uid', 'name', 'description', 'link', 'ner', 'ingredients', 'steps']
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_keys = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {tokenized_keys}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
# Number of encoded training targets
len(target_encoding.input_ids)

4894

In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions filled with this value are ignored in the loss
label_pad_token_id = -100

# Seq2seq collator settings: labels padded with -100, lengths padded to a multiple of 8
collator_options = dict(
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

In [None]:
! pip install transformers[torch]



In [None]:
! pip install accelerate -U



In [None]:
from transformers import Trainer, TrainingArguments

# Define training args
training_args = TrainingArguments(
    output_dir = './t5-mod',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=500,
    # NOTE(review): newer transformers releases name this argument `eval_strategy`
    evaluation_strategy="epoch",
    # save and evaluation strategies must match for load_best_model_at_end
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Create Trainer instance.
# `args` must be passed explicitly: without it the Trainer silently falls back
# to default TrainingArguments and every setting defined above is ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Step,Training Loss
500,2.791
1000,2.5187
1500,2.4206
2000,2.3779


TrainOutput(global_step=2295, training_loss=2.5059497176691856, metrics={'train_runtime': 2845.0773, 'train_samples_per_second': 6.451, 'train_steps_per_second': 0.807, 'total_flos': 1.117681312333824e+16, 'train_loss': 2.5059497176691856, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model through the Trainer (no path given, so its configured output directory is used)
trainer.save_model()

In [None]:
# Tokenize the prompt and place it on the same device as the model, so the
# cell works wherever the model lives (GPU or CPU) instead of assuming CUDA.
input_ids = tokenizer("make recipe of: pork.", return_tensors="pt").input_ids.to(model.device)
# Generate with the default generation settings
outputs = model.generate(input_ids)



In [None]:
# Raw generated token ids
print(outputs)

tensor([[    0,  1678,  1043,    16,     3,     9,   508, 22869,   147,  2768,
          1678,     3,     5,   617, 13654,    11,  3989,    21,   204,   676]],
       device='cuda:0')


In [None]:
# Decode the first generated sequence back to text, dropping special tokens
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

heat oil in a large skillet over medium heat. add pork and cook for 2 minutes


In [None]:
# Tokenize the prompt and place it on the same device as the model, so the
# cell works wherever the model lives (GPU or CPU) instead of assuming CUDA.
input_ids = tokenizer("make recipe of: chicken tanduri.", return_tensors="pt").input_ids.to(model.device)
# Allow up to 60 tokens so the generated recipe is not cut off as early
outputs = model.generate(input_ids,max_length=60)
main_string = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(main_string)

preheat the oven to 400degf. spray a 9x13 inch baking dish with nonstick cooking spray. set aside. place the chicken breasts in a large mixing bowl and mix well. add the chicken breasts to the bowl and mix well
