In [None]:
!pip install datasets transformers torch



In [None]:
import torch
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Dataset identifier handed to load_dataset
SPIDER_DATASET_ID = "xlangai/spider"
dataset = load_dataset(SPIDER_DATASET_ID)

# Checkpoint name the tokenizer is loaded from
TOKENIZER_CHECKPOINT = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Define preprocessing function with explicit padding and truncation
def preprocess_function(examples):
    """Tokenize question/query pairs into model inputs and training labels.

    Args:
        examples: Mapping with a 'question' entry (source text) and a 'query'
            entry (target text), as passed in by ``dataset.map``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' tensors, each
        padded/truncated to 128 tokens. Padding positions in 'labels' are
        replaced with -100 so they are excluded from the loss.
    """
    # Source side: fixed-length encoding of the questions.
    inputs = tokenizer(examples['question'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    # Target side: fixed-length encoding of the SQL queries.
    targets = tokenizer(examples['query'], padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # With return_tensors="pt" the tokenizer output already carries a batch
    # dimension, so no squeeze/unsqueeze round-trip is needed.
    labels = targets['input_ids']

    # Label positions equal to -100 are ignored by the loss; without this the
    # model is also trained (and scored) on predicting padding tokens.
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        labels = labels.masked_fill(labels == pad_id, -100)

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels
    }

# Run the tokenization step over the dataset in batches
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Return the three model-facing columns as PyTorch tensors
model_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_datasets.set_format(type='torch', columns=model_columns)

# Pretrained t5-base checkpoint that will be fine-tuned
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed TrainingArguments declares so this cell
# runs on both older and newer versions (falls back to the original name).
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Define training arguments with 'output_dir'
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save model checkpoints and logs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{_eval_strategy_kwarg: "epoch"},  # Run evaluation once per epoch
)



In [None]:
# Pick the splits used for fitting and for evaluation
train_split = tokenized_datasets['train']
validation_split = tokenized_datasets['validation']

# Wire model, arguments and data together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=validation_split,
    train_dataset=train_split,
)

# Run fine-tuning
trainer.train()

# Score the trained model on the validation split and report the metrics
results = trainer.evaluate()
print("Evaluation results:", results)

def generate_query(question, max_new_tokens=128):
    """Generate a query string for a natural-language question.

    Args:
        question: Question text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens. Without
            an explicit limit, generation falls back to the library's short
            default length, which cuts longer SQL statements off.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the appropriate device
    model.eval()  # Make sure dropout is disabled regardless of prior training state

    inputs = tokenizer(question, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Perform inference with no gradient calculation
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)



Epoch,Training Loss,Validation Loss
1,0.2798,0.294733
2,0.1843,0.279092
3,0.2134,0.277712


Evaluation results: {'eval_loss': 0.2777121365070343, 'eval_runtime': 23.3283, 'eval_samples_per_second': 44.324, 'eval_steps_per_second': 5.573, 'epoch': 3.0}


In [None]:
# Try the generation helper on one example question
sample_question = "Total number of employees with salary above 200000 usd"
generated_sql = generate_query(sample_question)
print("Generated SQL query:", generated_sql)



Generated SQL query: SELECT count(*) FROM employees WHERE salary > 200000
