In [2]:
pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from transformers import Trainer, TrainingArguments

In [3]:
class SQLDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.questions = df['question'].tolist()
        self.queries = df['sql'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        query = self.queries[idx]

        inputs = self.tokenizer.encode_plus(
            question,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )
        labels = self.tokenizer.encode(
            query,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs.input_ids.squeeze(),
            'attention_mask': inputs.attention_mask.squeeze(),
            'labels': labels.squeeze()
        }

In [4]:
df = pd.read_csv('cleaned_data.csv')

In [5]:
df.head()

Unnamed: 0,question,sql
0,Tell me what the notes are for South Australia,SELECT Notes FROM table WHERE Current slogan =...
1,What is the current series where the new serie...,SELECT Current series FROM table WHERE Notes =...
2,What is the format for South Australia?,SELECT Format FROM table WHERE State/territory...
3,Name the background colour for the Australian ...,SELECT Text/background colour FROM table WHERE...
4,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...


In [6]:
train_data = df
valid_data = df[40000:45000]
#test_data = df[45000:]

In [7]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
dataset1 = SQLDataset(train_data, tokenizer)
dataset2 = SQLDataset(valid_data, tokenizer)

In [9]:
dataset1[0]

{'input_ids': tensor([8779,  140,  125,    8, 3358,   33,   21, 1013, 2051,    1,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [10]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    save_total_limit=1,
    fp16=True,  # Enable mixed-precision training if available
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset1,
    eval_dataset=dataset2,
)


In [13]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [14]:
trainer.train()



Step,Training Loss
100,6.155
200,1.0516
300,0.4956
400,0.3884
500,0.3322
600,0.3123
700,0.292
800,0.2768
900,0.2558
1000,0.2512


TrainOutput(global_step=4053, training_loss=0.39829802636655504, metrics={'train_runtime': 1461.7334, 'train_samples_per_second': 44.36, 'train_steps_per_second': 2.773, 'total_flos': 9871511884922880.0, 'train_loss': 0.39829802636655504, 'epoch': 1.0})

In [15]:
output_dir = './fine-tuned-model'
trainer.save_model(output_dir)

In [17]:
results = trainer.evaluate(dataset2)
print(results)

{'eval_loss': 0.13556520640850067, 'eval_runtime': 28.4914, 'eval_samples_per_second': 175.491, 'eval_steps_per_second': 10.986, 'epoch': 1.0}


In [18]:
input_question = "select average working hours of employee named aman"
input_encoded = tokenizer.encode_plus(
    input_question,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='pt'
).to(device)

generated = model.generate(
    input_ids=input_encoded.input_ids,
    attention_mask=input_encoded.attention_mask,
    max_length=64,
    num_beams=4,
    early_stopping=True
)
generated_query = tokenizer.decode(generated.squeeze())

print("Generated SQL Query:", generated_query)

Generated SQL Query: <pad> SELECT Average Working Hours FROM table WHERE Name = aman</s>
