In [1]:
import pandas as pd
import os
import json
import re
from datasets import Dataset
from torch.utils.data import DataLoader

from transformers import T5ForConditionalGeneration, T5Tokenizer, MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
os.environ['WANDB_SILENT']="true"
os.environ["WANDB_DISABLED"] = "true"



In [2]:
# cd /content/drive/MyDrive/Shared-Tasks/Num-Eval

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [4]:
df_train = pd.read_csv('train-cot.csv')
df_dev = pd.read_csv('val-cot.csv')

In [5]:
train_data = Dataset.from_pandas(df_train)
dev_data = Dataset.from_pandas(df_dev)

In [6]:
def collator(batch):

    input = batch['inputs'] #load original sentences
    label = batch['ans_sent'] #load noisy sentences
    inputs = tokenizer(input, text_target=label, return_tensors="pt", max_length = 512, padding='max_length',truncation=True) #tokenized sentences

    return inputs

In [7]:
train_tokenized = train_data.map(collator, remove_columns=train_data.column_names, batch_size=8, num_proc=4, batched=True)
dev_tokenized = dev_data.map(collator, remove_columns=dev_data.column_names, batch_size=8, num_proc=4, batched=True)


Map (num_proc=4):   0%|          | 0/20995 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1274 [00:00<?, ? examples/s]

In [8]:
#define evaluation steps based on total dataset size
TRAIN_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 5e-5
data_size = len(train_tokenized)
EVAL_STEPS=int(data_size/(TRAIN_BATCH_SIZE*2))
SAVE_STEPS= EVAL_STEPS




In [9]:
#define training argument object
training_args = Seq2SeqTrainingArguments(
  output_dir="./Outputs",
  group_by_length=True,
  per_device_train_batch_size=TRAIN_BATCH_SIZE,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=EPOCHS,
  fp16=False,
  save_steps=SAVE_STEPS,
  eval_steps=EVAL_STEPS,
  logging_steps=EVAL_STEPS,
  learning_rate=LEARNING_RATE,
  warmup_steps=100,
  save_total_limit=2,
)


#define training data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors='pt')


#define Seq2Seq Training object
trainer = Seq2SeqTrainer(
    model=model.cuda(),
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
os.environ['WANDB_SILENT']="true"
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5248,0.6417,0.007098
10496,0.0094,0.005702
15744,0.0075,0.005184
20992,0.0067,0.004896
26240,0.0062,0.004846


TrainOutput(global_step=26245, training_loss=0.1342715281369312, metrics={'train_runtime': 12706.3733, 'train_samples_per_second': 8.262, 'train_steps_per_second': 2.065, 'total_flos': 1.95138557313024e+16, 'train_loss': 0.1342715281369312, 'epoch': 5.0})

In [11]:
outputDir = "./Outputs"
trainer.save_model(f"{outputDir}/Trial-COT-20k")

###### 