In [1]:
import datasets
import pandas as pd 
import numpy as np

from datasets import load_dataset, load_metric, Dataset


model_checkpoint = 'google/flan-t5-small'
metric = load_metric("sacrebleu")

In [2]:
raw_data = pd.read_csv('../../data/finefoods/Reviews.csv')[['Summary', 'Text']]
df = pd.DataFrame()
df['Text'] = raw_data['Text'].astype(str)
df['Summary'] = raw_data['Summary'].astype(str)

df = df[0:150]
splits = Dataset.from_pandas(df).train_test_split(0.2)
train = splits['train']
val = splits['test']

In [3]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
if model_checkpoint in ["google/flan-t5-small", "google/flan-t5-large"], "google/flan-t5-mediun": 
    prefix = "Answer the following question: "

max_input_length = 1000
max_target_length = 1000

def preprocess(d):
    inputs = [prefix + review for review in d['Text']]
    targets = [review for review in d['Summary']]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation = True, padding=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_input_length, truncation = True,  padding=True)
    
    model_inputs["labels"] = labels['input_ids']
    return model_inputs



In [8]:
train['Text'][0]

'This is the same stuff you can buy at the big box stores.  There is nothing healthy about it.  It is just carbs and sugars.  Save your money and get something that at least has some taste.'

In [9]:
tokenized_train = train.map(preprocess, batched = True,)
tokenized_val = val.map(preprocess, batched = True,)


  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
from transformers import  AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [11]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    num_train_epochs=1,
    fp16=True,
    #load_best_model_at_end=True,
    report_to="wandb",
    #push_to_hub=True,
)

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [14]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [15]:
trainer.train()


The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Summary, Text. If Summary, Text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 120
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwinsontruong[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/8 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: CUDA out of memory. Tried to allocate 400.00 MiB (GPU 0; 10.00 GiB total capacity; 9.18 GiB already allocated; 0 bytes free; 9.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF