<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Language/Fine-Tuning%20T5%20model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers SentencePiece datasets livelossplot

### Importing Dependencies

In [8]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import *



import numpy as np
import pandas as pd

import os
from pathlib import Path
from livelossplot import PlotLossesKeras

In [9]:
# Read the spreadsheet export of natural-language / SQL pairs into a DataFrame.
csv_path = "/content/FHR_NLP_Queries - Sheet1.csv"
df = pd.read_csv(csv_path)

In [10]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,S. No.,Natural Language,SQL query,Result
0,1,Retrieve the mother's prenatal delivery record...,SELECT * FROM mother_prenatal_delivery WHERE m...,Not executed
1,2,Get the number of prenatal delivery records fo...,"SELECT contributor_system_cd, COUNT(*) FROM mo...",Not executed
2,3,Find the prenatal delivery records where the r...,SELECT * FROM mother_prenatal_delivery WHERE r...,Not executed
3,4,Retrieve the prenatal delivery records with ab...,SELECT * FROM mother_prenatal_delivery WHERE c...,Not executed
4,5,Get the earliest and latest valid dates for th...,"SELECT MIN(valid_from_dt_tm), MAX(valid_until_...",Not executed


### Global Variables

In [11]:
# Name of the pretrained checkpoint passed to `from_pretrained` for both the
# tokenizer and the model.
checkpoint = "google/mt5-small"

In [12]:
# Instantiate the seq2seq model and its matching tokenizer from the same checkpoint.
model = TFMT5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = MT5Tokenizer.from_pretrained(checkpoint)

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


### Tokenization

In [13]:
# Load the CSV through `datasets` (it is exposed as a single "train" split) and
# shuffle the rows with a fixed seed so the order is reproducible.
raw_splits = load_dataset("csv", data_files="/content/FHR_NLP_Queries - Sheet1.csv")
dataset = raw_splits["train"].shuffle(seed=42)



  0%|          | 0/1 [00:00<?, ?it/s]



In [14]:
# List the column names. Note that 'Natural Language ' carries a trailing space,
# so any code indexing that column must reproduce the name exactly.
df.columns

Index(['S. No.', 'Natural Language ', 'SQL query', 'Result'], dtype='object')

In [15]:
def preprocess_function(examples, max_length=200, padding="max_length",
                        ignore_pad_token_for_loss=True):
    """Tokenize a batch of natural-language questions and their SQL targets.

    Args:
        examples: Batch mapping with the columns "Natural Language " (note the
            trailing space) and "SQL query", each holding a list of strings.
        max_length: Maximum number of tokens kept for inputs and targets;
            longer sequences are truncated.
        padding: Padding strategy forwarded to the tokenizer.
        ignore_pad_token_for_loss: When True, padding positions in the labels
            are replaced by -100 so that they do not contribute to the loss.

    Returns:
        The tokenizer output for the inputs, extended with a "labels" entry
        holding the token ids of the targets.
    """
    inputs = list(examples["Natural Language "])
    targets = list(examples["SQL query"])

    model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)

    label_ids = labels["input_ids"]
    if ignore_pad_token_for_loss:
        # Labels are padded up to `max_length`; without masking, the model would
        # also be trained to predict those padding tokens.
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids

    return model_inputs

In [16]:
# Tokenize every example in batches; adds input ids, attention mask and labels.
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

# Pad label sequences with -100 rather than the pad token id: -100 is the value
# the Transformers loss ignores, so positions added by the collator do not
# contribute to the training loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=64,
    return_tensors="np")

# Wrap the tokenized dataset as a shuffled, batched tf.data pipeline for Keras.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True)



### Compiling the model

In [20]:
# No `loss` argument is given to compile; only the optimizer and an accuracy
# metric are configured here.
optimizer = Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, metrics=["accuracy"])

### Callbacks

In [21]:
def callbacks(monitor: str = "val_loss", run_name: str = "run 1", save_dir="models") -> list:
    """Build the list of Keras callbacks used during training.

    Args:
        monitor: Name of the logged quantity watched by the checkpoint, early
            stopping and learning-rate callbacks. The default "val_loss" is only
            logged when `fit` is given validation data; pass "loss" when
            training without a validation set.
        run_name: Name of the sub-directory under `<save_dir>/logs` that
            receives this run's TensorBoard logs.
        save_dir: Directory the best model is saved to; its `logs`
            sub-directory holds the TensorBoard logs.

    Returns:
        A list with, in order: ModelCheckpoint, EarlyStopping,
        ReduceLROnPlateau, TensorBoard and PlotLossesKeras callbacks.
    """
    save_path = Path(save_dir)
    log_root = save_path / "logs"
    os.makedirs(log_root, exist_ok=True)

    # Named `checkpoint_cb` so it does not shadow the notebook-level
    # `checkpoint` variable that holds the pretrained model name.
    checkpoint_cb = ModelCheckpoint(save_path, monitor=monitor, save_best_only=True,
                                    verbose=1)

    early_stopping_cb = EarlyStopping(monitor=monitor, verbose=1, restore_best_weights=True,
                                      patience=5)

    tensorboard_cb = TensorBoard(log_root / run_name, histogram_freq=2, write_graph=True,
                                 write_images=True)

    # Cut the learning rate by 10x after 2 epochs without improvement.
    reduce_lr_cb = ReduceLROnPlateau(monitor=monitor, factor=0.1, patience=2, verbose=1,
                                     min_delta=0.0001, cooldown=0, min_lr=0)

    return [checkpoint_cb, early_stopping_cb, reduce_lr_cb, tensorboard_cb, PlotLossesKeras()]

### Training the model

In [None]:
# NOTE(review): no validation data is passed to `fit`, while the callbacks built
# by `callbacks()` monitor "val_loss" — confirm whether a validation split
# should be supplied here.
training_callbacks = callbacks()
model.fit(tf_train_dataset, epochs=10, callbacks=training_callbacks)

Epoch 1/10

