# Required Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import json
import re
import pickle
import optuna
import torch
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
#from torchtune import Trial, RandomSearchScheduler, Reporter

In [None]:
! pip install datasets
! pip install transformers[torch]
! pip install tokenizers
! pip install evaluate
! pip install rouge_score
! pip install sentencepiece
! pip install huggingface_hub
!pip install timexy

In [None]:
import nltk
import evaluate
import datasets
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq,TFMT5ForConditionalGeneration, MT5Tokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer
from transformers import TrainingArguments, Trainer, T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset, DatasetDict,load_dataset
import timexy
from timexy import Timexy
from timexy import rule
from timexy.languages import en

In [None]:
# Load the small English spaCy pipeline and register the timexy component
# ahead of the built-in named-entity recogniser.
nlp = spacy.load("en_core_web_sm")

# Explicit timexy settings (each value shown here equals the library default).
config = dict(
    kb_id_type="timex3",  # alternatives: 'timestamp'
    label="timexy",
    overwrite=False,
)
nlp.add_pipe("timexy", before="ner", config=config)

<timexy.timexy.Timexy at 0x7f52dea22ad0>

In [None]:
# Directory passed to the training arguments as `output_dir`; logs go to `<output_dir>/logs`.
output_dir = "./results"

In [None]:
# Local path where the base model and tokenizer are saved once and reloaded from afterwards.
chkpnt = './input/checkpoints/chkpnt1'

In [None]:
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [None]:
# Pretrained model identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "google/flan-t5-base"

In [None]:
# `df` is later split into the train/validation sets; `testing` becomes the test split.
# Both files are expected to provide the 'Inputs' and 'Outputs' columns read by
# preprocess_fun; `df` additionally needs the 'Chunk' column used for splitting.
df = pd.read_csv('dataset.csv')
testing = pd.read_csv('testing_data.csv')

# Model Preparation

In [None]:
# First run only: load the pretrained model and tokenizer named by MODEL_NAME
# and store a local copy of each under `chkpnt` for later cells to reload.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.save_pretrained(chkpnt)

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)
tokenizer.save_pretrained(chkpnt)

In [None]:
# Hyper-parameters consumed by Seq2SeqTrainingArguments and the model setup below.
L_RATE = 3e-4  # optimizer learning rate
BATCH_SIZE = 4  # per-device training batch size
PER_DEVICE_EVAL_BATCH = 2  # per-device evaluation batch size
WEIGHT_DECAY = 0.01  # weight decay passed to the training arguments
SAVE_TOTAL_LIM = 3  # value for `save_total_limit` (how many checkpoints are kept)
NUM_EPOCHS = 5  # number of training epochs
drop_out = 0.5  # dropout rate applied to the model configuration

In [None]:
# Reload the cached checkpoint with the experiment's dropout rate.
# The rate is passed to `from_pretrained` so it is part of the configuration
# the network is built from. Assigning `model.config.dropout_rate` after
# loading only edits the config object and leaves the already-constructed
# dropout layers at the checkpoint's original rate.
model = T5ForConditionalGeneration.from_pretrained(chkpnt, dropout_rate=drop_out)
tokenizer = T5Tokenizer.from_pretrained(chkpnt)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Training configuration shared by every Seq2SeqTrainer built in this notebook.
training_args = Seq2SeqTrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir=output_dir,
    save_total_limit=SAVE_TOTAL_LIM,
    push_to_hub=False,
    report_to="tensorboard",
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,  # emit training metrics every 10 steps
    # save_steps=num_train_steps_per_epoch,
    # --- optimisation ---
    num_train_epochs=NUM_EPOCHS,
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    # --- evaluation: once per epoch, scoring generated sequences ---
    evaluation_strategy="epoch",
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    predict_with_generate=True,
)

In [None]:
def preprocess_fun(batch):
    """Tokenize one batch of raw examples into model inputs and labels.

    Every entry of ``batch['Inputs']`` is split on newlines and the segment
    after the final newline is discarded. From each remaining line only the
    text before the first ``~__True__`` / ``~__False__`` marker is kept (a
    line without a marker is kept whole). The kept pieces are concatenated
    without a separator, prefixed with ``"Generate: "`` and tokenized.
    ``batch['Outputs']`` is tokenized as the target text.

    Args:
        batch: Mapping with ``'Inputs'`` and ``'Outputs'`` lists of strings,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token ids. Inputs and labels are both truncated
        and padded to 512 tokens.
    """
    prefix = "Generate: "  # instruction prefix telling the model to generate
    # Per-line flag appended to every sentence (per the original notes it
    # records whether the sentence carries temporal sense). It is not used
    # for training, so it is only stripped here.
    marker = re.compile(r'~__(True|False)__')

    clean_batch = []
    for document in batch['Inputs']:
        lines = document.split('\n')[:-1]
        clean_batch.append(
            "".join(marker.split(line, maxsplit=1)[0] for line in lines)
        )

    # Model inputs: the prefixed, marker-free documents.
    inputs = [prefix + doc for doc in clean_batch]
    model_inputs = tokenizer(
        inputs, max_length=512, truncation=True, padding='max_length'
    )

    # Labels: the tokenized target texts.
    # NOTE(review): padded label positions keep the pad token id instead of
    # -100, so they presumably count towards the loss. Mask them only if
    # compute_metrics can decode -100-masked labels.
    labels = tokenizer(
        text_target=batch["Outputs"],
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Fetch the NLTK "punkt" resource without console output, then load the ROUGE
# metric that compute_metrics uses to score predictions.
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
    """Score generated sequences against the references with ROUGE.

    Args:
        eval_preds: Pair ``(predictions, references)`` of token-id arrays as
            handed over by the trainer. ``predictions`` may also be a tuple
            whose first element holds the token ids.

    Returns:
        The dictionary produced by the ROUGE metric (computed with stemming).
    """
    preds, refs = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Work on numpy arrays of at least one dimension; the previous
    # `.unsqueeze(0)` call only exists on torch tensors.
    preds = np.atleast_1d(np.asarray(preds))
    refs = np.atleast_1d(np.asarray(refs))

    # -100 is the "ignore" id used for padding labels/predictions; it is not a
    # valid vocabulary id, so swap it for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    refs = np.where(refs != -100, refs, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(refs, skip_special_tokens=True)

    return metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

# Dataset Preparation

In [None]:
# Randomise the order of the chunks while keeping each chunk's rows together.
chunk_column = df["Chunk"]
unique_chunks = chunk_column.unique()

# In-place shuffle; no seed is set in this notebook, so the order differs per run.
np.random.shuffle(unique_chunks)

# Rebuild the frame chunk by chunk in the shuffled order, with a fresh 0..n-1 index.
df = pd.concat(
    [df.loc[chunk_column.eq(chunk_id)] for chunk_id in unique_chunks],
    ignore_index=True,
)

In [None]:
# Size of the training split: the number of rows whose chunk id is <= 1046,
# counted with a vectorised comparison instead of a Python-level row loop.
c = int((df['Chunk'] <= 1046).sum())

# The first `c` rows of the (chunk-shuffled) frame form the training set and
# the remainder the validation set.
# NOTE(review): the cut is positional, so it may fall inside a chunk and put
# rows of one chunk on both sides — confirm this is acceptable.
train = df.iloc[:c, :]
valid = df.iloc[c:, :]

In [None]:
# Wrap the three pandas frames as Hugging Face datasets.
trd, vsd, tsd = (
    Dataset.from_pandas(frame) for frame in (train, valid, testing)
)

# Bundle the splits and tokenize all of them in batches.
dataset_dict = DatasetDict({'train': trd, 'valid': vsd, 'test': tsd})
tokenized_dataset = dataset_dict.map(preprocess_fun, batched=True)

In [None]:
# Persist the tokenized splits so they can be reloaded without re-tokenizing.
with open("tokenized_dataset.pickle", mode='wb') as pickle_out:
    pickle.dump(tokenized_dataset, pickle_out)

# Validation

In [None]:
# Percentage of the chunk-id range that cross-validation folds are drawn from.
perc = 80
# Upper chunk id covered by the folds: `perc` percent of the largest chunk id, truncated to an int.
train_size = int((perc/100)*max(df['Chunk']))

In [None]:
def crossValid(k=5):
    """Train and evaluate one model per fold of a k-fold split over chunk ids.

    The chunk-id range ``0 .. train_size`` is cut into ``k`` consecutive,
    non-overlapping intervals. For each interval, the rows of the global
    ``df`` whose ``Chunk`` lies inside it form the validation set and all
    other rows form the training set. A fresh model is loaded from
    ``chkpnt`` for every fold and trained with the shared ``training_args``.

    Args:
        k: Number of folds. Must be at least 1 and no larger than
            ``train_size`` so that every fold spans at least one chunk id.

    Returns:
        A list with one ``(metrics, model)`` tuple per fold, where ``metrics``
        is the result of ``trainer.evaluate()`` on that fold's validation set.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than ``train_size``.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    sm = train_size // k  # number of chunk ids per fold
    if sm == 0:
        raise ValueError(
            f"k={k!r} exceeds train_size={train_size!r}; folds would be empty"
        )

    chunk_ids = df['Chunk']
    evals = []  # (evaluation metrics, trained model) per fold

    # NOTE(review): rows with a chunk id above train_size are never validated
    # and are part of every training set — confirm this is intended.
    # NOTE(review): all folds write checkpoints to the same output_dir.
    for fold in range(k):
        lower = fold * sm
        if fold == k - 1:
            # The last fold absorbs the remainder, up to and including train_size.
            in_valid = (chunk_ids >= lower) & (chunk_ids <= train_size)
        else:
            # Half-open interval so a boundary chunk belongs to exactly one
            # fold (the previous inclusive bounds validated it twice).
            in_valid = (chunk_ids >= lower) & (chunk_ids < lower + sm)

        valid = df[in_valid]
        train = df[~in_valid]

        dataset_dict = DatasetDict(
            {
                'train': Dataset.from_pandas(train),
                'valid': Dataset.from_pandas(valid),
                'test': tsd,
            }
        )
        tokenized_dataset = dataset_dict.map(preprocess_fun, batched=True)

        # Pass the dropout rate at load time so the dropout layers are built
        # with it; editing model.config afterwards has no effect on them.
        model = T5ForConditionalGeneration.from_pretrained(
            chkpnt, dropout_rate=drop_out
        )
        fold_tokenizer = T5Tokenizer.from_pretrained(chkpnt)
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=fold_tokenizer, model=model
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["valid"],
            tokenizer=fold_tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        trainer.train()
        evals.append((trainer.evaluate(), model))
    return evals

In [None]:
# Run cross-validation with the default k=5; each entry is an (evaluation result, model) tuple.
evals = crossValid()

In [None]:
# Write the textual form of the cross-validation results to a timestamped file.
# NOTE(review): the default datetime format contains spaces and colons, which
# some filesystems reject in file names.
s = f'evals_{datetime.now()}.txt'
with open(s, mode='w') as report:
    report.write(f"{evals}\n")

# Training

In [None]:
# Trainer for the final run, using the module-level model, tokenizer and
# tokenized train/validation splits.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
)

In [None]:
# Fine-tune the model; evaluation on the validation split is configured to run once per epoch.
trainer.train()

# Testing

In [None]:
# Run prediction on the tokenized test split; the result is shown as the cell output.
trainer.predict(tokenized_dataset["test"])

# Saving

In [None]:
# Store the fine-tuned model and its tokenizer in sibling directories named
# after the run's hyper-parameters.
c = datetime.now()  # timestamp of the save; not used by the paths below
s1 = "./model_lr_3e_4_drop_out_5_epoch_5/model"
s2 = "./model_lr_3e_4_drop_out_5_epoch_5/tokenizer"
model.save_pretrained(save_directory=s1)
tokenizer.save_pretrained(save_directory=s2)

# Manual Testing

In [None]:
# Example prompt for a quick manual check of the fine-tuned model.
text = '''
Generate:
How many days in a week?'''

# Tokenize the prompt and move the tensors to the model's device: after
# training the model may sit on the GPU, while the tokenizer returns CPU
# tensors, and `generate` fails when the two differ.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)

# Decode the generated token ids back to text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated output:", result)