In [None]:
!pip install datasets
!pip install accelerate -U

In [None]:
from google.colab import drive
import pandas as pd
import os
import json
import re
from datasets import Dataset
from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import T5ForConditionalGeneration, T5Tokenizer, MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
# Weights & Biases switches, set before any trainer object is created.
os.environ.update({
    'WANDB_SILENT': "true",
    "WANDB_DISABLED": "true",
})

In [None]:
drive.mount('/content/drive')
!cd /content/drive/MyDrive/Shared-Tasks/Num-Eval

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/Shared-Tasks/Num-Eval

/content/drive/MyDrive/Shared-Tasks/Num-Eval


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint identifier of the seq2seq model that this notebook fine-tunes.
model_id = "google/flan-t5-base"

# The weights and the tokenizer are both resolved from the same identifier.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Read the training and development splits back from their on-disk folders
# (paths are relative to the current working directory).
train_data, dev_data = (
    load_from_disk(split_dir)
    for split_dir in ('./Dataset/train-drop-20k', './Dataset/dev')
)

In [None]:
def collator(batch, max_length=512):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Written to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``'t5-input'`` entry (list of model input
            strings) and an ``'ans'`` entry (list of target strings).
        max_length: Token limit at which inputs and targets are truncated.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        The tokenizer output: the encoded inputs plus the labels the
        tokenizer produces from ``text_target``. Sequences are NOT padded.
    """
    sources = batch['t5-input']  # text fed to the model
    targets = batch['ans']       # text the model has to generate

    # Deliberately no `padding='max_length'`: padding here would fill the
    # labels with the tokenizer's pad id, which the loss does not ignore, and
    # would blow every example up to `max_length` tokens. Padding is left to
    # DataCollatorForSeq2Seq, which pads per batch and fills labels with -100.
    # `return_tensors` is dropped as well: unpadded sequences are ragged and
    # `Dataset.map` stores plain lists anyway.
    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

In [None]:
# Tokenize both splits with the same batching / multiprocessing settings.
# The raw columns are removed so that only the tokenizer outputs remain.
_map_options = dict(batched=True, batch_size=8, num_proc=4)
train_tokenized = train_data.map(
    collator,
    remove_columns=train_data.column_names,
    **_map_options,
)
dev_tokenized = dev_data.map(
    collator,
    remove_columns=dev_data.column_names,
    **_map_options,
)


In [None]:
# Training hyper-parameters, plus the evaluation / checkpoint interval derived
# from the size of the tokenized training set.
TRAIN_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 5e-5
data_size = len(train_tokenized)

# NOTE(review): the factor 2 equals the gradient_accumulation_steps value used
# in the training arguments; if that is the intent, EVAL_STEPS is the number
# of optimizer steps in one epoch on a single device - confirm.
# Floor division keeps the arithmetic in integers, and the lower bound of 1
# prevents a step interval of 0 when the dataset has fewer than
# TRAIN_BATCH_SIZE * 2 examples.
EVAL_STEPS = max(1, data_size // (TRAIN_BATCH_SIZE * 2))
SAVE_STEPS = EVAL_STEPS




In [None]:
# Training configuration. Evaluation, checkpointing and logging all share the
# EVAL_STEPS interval; at most two checkpoints are kept under ./Outputs.
training_args = Seq2SeqTrainingArguments(
    output_dir="./Outputs",
    group_by_length=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=EPOCHS,
    fp16=False,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=EVAL_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    save_total_limit=2,
    # Supported replacement for the deprecated WANDB_DISABLED environment
    # variable. Note that "none" switches off every logging integration
    # (e.g. TensorBoard too), not only Weights & Biases.
    report_to="none",
)


# Batch collator handed to the trainer; it is built around the same tokenizer
# used for preprocessing and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    return_tensors="pt",
)


# Seq2Seq trainer wiring together the model, the data and the training config.
trainer = Seq2SeqTrainer(
    # The model is passed as-is: the Trainer moves it to the device selected
    # by the training arguments. An explicit `.cuda()` is redundant and
    # raises on a runtime without a CUDA device.
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Set WANDB_SILENT again (same value as in the import cell) and start
# fine-tuning; the value returned by train() is shown as the cell output.
os.environ.update(WANDB_SILENT="true")
trainer.train()

In [None]:
# Persist the fine-tuned model in its own sub-folder of ./Outputs.
outputDir = "./Outputs"
final_model_dir = f"{outputDir}/base-drop-20k"
trainer.save_model(final_model_dir)