### Language to SQL transformers 
#### with graph relationships Graphix T5

In [1]:
#let's exlore the data
import datasets


from datasets import load_dataset

ds = load_dataset("xlangai/spider")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 7000/7000 [00:00<00:00, 156800.60 examples/s]
Generating validation split: 100%|██████████| 1034/1034 [00:00<00:00, 117166.29 examples/s]


In [None]:
print(ds['train'][0])


{'db_id': 'department_management', 'query': 'SELECT count(*) FROM head WHERE age  >  56', 'question': 'How many heads of the departments are older than 56 ?', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'head', 'WHERE', 'age', '>', '56'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'head', 'where', 'age', '>', 'value'], 'question_toks': ['How', 'many', 'heads', 'of', 'the', 'departments', 'are', 'older', 'than', '56', '?']}


In [None]:
import pandas as pd

df = pd.DataFrame(ds['train'])

df_test = pd.DataFrame(ds['validation'])

In [7]:
df.head(2)

Unnamed: 0,db_id,query,question,query_toks,query_toks_no_value,question_toks
0,department_management,SELECT count(*) FROM head WHERE age > 56,How many heads of the departments are older th...,"[SELECT, count, (, *, ), FROM, head, WHERE, ag...","[select, count, (, *, ), from, head, where, ag...","[How, many, heads, of, the, departments, are, ..."
1,department_management,"SELECT name , born_state , age FROM head ORD...","List the name, born state and age of the heads...","[SELECT, name, ,, born_state, ,, age, FROM, he...","[select, name, ,, born_state, ,, age, from, he...","[List, the, name, ,, born, state, and, age, of..."


In [None]:
import pandas as pd

# Set display options to see the full output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

tables = pd.read_json('tables.json')

# Now print the DataFrame
print(tables[tables['db_id'] == 'department_management']['column_names'])

Unnamed: 0,column_names,column_names_original,column_types,db_id,foreign_keys,primary_keys,table_names,table_names_original
159,"[[-1, *], [0, department id], [0, name], [0, creation], [0, ranking], [0, budget in billions], [0, num employees], [1, head id], [1, name], [1, born state], [1, age], [2, department id], [2, head id], [2, temporary acting]]","[[-1, *], [0, Department_ID], [0, Name], [0, Creation], [0, Ranking], [0, Budget_in_Billions], [0, Num_Employees], [1, head_ID], [1, name], [1, born_state], [1, age], [2, department_ID], [2, head_ID], [2, temporary_acting]]","[text, number, text, text, number, number, number, number, text, text, number, number, number, text]",department_management,"[[12, 7], [11, 1]]","[1, 7, 11]","[department, head, management]","[department, head, management]"


In [20]:
def extract_schema_for_db(db_id, tables_df):
    # Find the schema for the specific database ID
    
    tables_data = tables_df.to_dict(orient='records')
    schema = next((db for db in tables_data if db["db_id"] == db_id), None)
    if not schema:
        raise ValueError(f"Schema for db_id {db_id} not found.")

    # Extract tables and columns
    tables = schema["table_names"]
    columns = schema["column_names"]
    primary_keys = schema["primary_keys"]
    foreign_keys = schema["foreign_keys"]

    # Prepare nodes and edges for the graph
    nodes = tables + [f"{col[1]}" for col in columns if col[0] != -1]  # Exclude global columns
    edges = []

    # Table-Column edges
    for col in columns:
        if col[0] != -1:  # Skip global columns
            edges.append((tables[col[0]], col[1]))

    # Primary Key edges
    for pk in primary_keys:
        edges.append((f"{columns[pk][1]}", f"{columns[pk][1]} (PK)"))

    # Foreign Key edges
    for fk in foreign_keys:
        from_col = f"{columns[fk[0]][1]}"
        to_col = f"{columns[fk[1]][1]}"
        edges.append((from_col, to_col))

    return {"nodes": nodes, "edges": edges}




In [21]:
# Example usage
db_id = "department_management"

schema_graph = extract_schema_for_db(db_id, tables)
schema_graph

{'nodes': ['department',
  'head',
  'management',
  'department id',
  'name',
  'creation',
  'ranking',
  'budget in billions',
  'num employees',
  'head id',
  'name',
  'born state',
  'age',
  'department id',
  'head id',
  'temporary acting'],
 'edges': [('department', 'department id'),
  ('department', 'name'),
  ('department', 'creation'),
  ('department', 'ranking'),
  ('department', 'budget in billions'),
  ('department', 'num employees'),
  ('head', 'head id'),
  ('head', 'name'),
  ('head', 'born state'),
  ('head', 'age'),
  ('management', 'department id'),
  ('management', 'head id'),
  ('management', 'temporary acting'),
  ('department id', 'department id (PK)'),
  ('head id', 'head id (PK)'),
  ('department id', 'department id (PK)'),
  ('head id', 'head id'),
  ('department id', 'department id')]}

In [None]:
def prepare_t5_inputs_targets(data, tables_data):
    # Convert data to list of dictionaries if it's a DataFrame
    if hasattr(data, "to_dict"):
        data = data.to_dict(orient="records")

    inputs = []
    targets = []

    for record in data:
        db_id = record["db_id"]
        question = record["question"]
        query = record["query"]

        # Extract schema for the db_id
        schema = extract_schema_for_db(db_id, tables_data)

        # Serialize the schema
        tables = ", ".join(schema["nodes"][: len(schema["nodes"]) // 2])
        columns = ", ".join([node for node in schema["nodes"] if '.' in node])
        primary_keys = ", ".join([f"{edge[0]} (PK)" for edge in schema["edges"] if "(PK)" in edge[0]])
        foreign_keys = ", ".join([f"{edge[0]} -> {edge[1]}" for edge in schema["edges"] if "->" in edge[1]])

        serialized_schema = (
            f"Tables: {tables} | Columns: {columns} | "
            f"Primary Keys: {primary_keys} | Foreign Keys: {foreign_keys}"
        )

        # Prepare input and target
        input_text = f"translate natural language to SQL: {question}? <schema> {serialized_schema}"
        target_text = query

        inputs.append(input_text)
        targets.append(target_text)
        
        print(inputs)
        print(targets)

    return inputs, targets

In [65]:
print(len(df))

7000


In [None]:
#let's test on a subset
data = df[:1]
inputs, targets = prepare_t5_inputs_targets(data, tables)


In [95]:
print(inputs)

['translate natural language to SQL: How many heads of the departments are older than 56 ?? <schema> Tables: department, head, management, department id, name, creation, ranking, budget in billions | Columns:  | Primary Keys:  | Foreign Keys: ']


In [96]:
print(inputs)
print(targets)

['translate natural language to SQL: How many heads of the departments are older than 56 ?? <schema> Tables: department, head, management, department id, name, creation, ranking, budget in billions | Columns:  | Primary Keys:  | Foreign Keys: ']
['SELECT count(*) FROM head WHERE age  >  56']


In [41]:
import wandb
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW

# Custom Dataset Class
class TextToSQLDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=512):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize inputs and targets
        input_encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encodings = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": input_encodings["input_ids"].squeeze(),
            "attention_mask": input_encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze(),
        }

# Initialize Tokenizer and Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare Dataset
train_dataset = TextToSQLDataset(inputs, targets, tokenizer)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move Model to GPU if Available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize W&B
wandb.init(
    project="text-to-sql", 
    name="t5-finetuning", 
    config={
        "epochs": 10,
        "batch_size": 8,
        "learning_rate": 5e-5,
        "max_length": 512,
    }
)

# Training Loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward Pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Backward Pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Log loss to W&B
        wandb.log({"batch_loss": loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

    # Log epoch loss to W&B
    wandb.log({"epoch": epoch + 1, "epoch_loss": avg_loss})

# Finish W&B Logging
wandb.finish()




Epoch 1/10, Loss: 7.3578
Epoch 2/10, Loss: 2.0614
Epoch 3/10, Loss: 1.3021
Epoch 4/10, Loss: 1.1318
Epoch 5/10, Loss: 0.9810
Epoch 6/10, Loss: 0.7947
Epoch 7/10, Loss: 0.6875
Epoch 8/10, Loss: 0.5651
Epoch 9/10, Loss: 0.4937
Epoch 10/10, Loss: 0.4715


0,1
batch_loss,█▆▅▃▃▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
epoch_loss,█▃▂▂▂▁▁▁▁▁

0,1
batch_loss,0.49255
epoch,10.0
epoch_loss,0.47147


In [50]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ------ --------------------------------- 1.8/11.0 MB 11.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.7/11.0 MB 10.4 MB/s eta 0:00:01
   -------------------- ------------------- 5.8/11.0 MB 9.5 MB/s eta 0:00:01
   ----------------------------- ---------- 8.1/11.0 MB 10.1 MB/s eta 0:00:01
   ------------------------------------ --- 10.0/11.0 MB 9.7 MB/s eta 0:00:01
   ------------------------------

In [91]:
# #training with validation

# import wandb
# import torch
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# from torch.utils.data import Dataset, DataLoader
# from transformers import AdamW
# from sklearn.model_selection import train_test_split

# # Custom Dataset Class
# class TextToSQLDataset(Dataset):
#     def __init__(self, inputs, targets, tokenizer, max_length=512):
#         self.inputs = inputs
#         self.targets = targets
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.inputs)

#     # def __getitem__(self, idx):
#     #     input_text = self.inputs[idx]
#     #     target_text = self.targets[idx]

#     #     # Tokenize inputs and targets
#     #     input_encodings = self.tokenizer(
#     #         input_text,
#     #         max_length=self.max_length,
#     #         padding="max_length",
#     #         truncation=True,
#     #         return_tensors="pt",
#     #     )
#     #     target_encodings = self.tokenizer(
#     #         target_text,
#     #         max_length=self.max_length,
#     #         padding="max_length",
#     #         truncation=True,
#     #         return_tensors="pt",
#     #     )

#     #     return {
#     #         "input_ids": input_encodings["input_ids"].squeeze(),
#     #         "attention_mask": input_encodings["attention_mask"].squeeze(),
#     #         "labels": target_encodings["input_ids"].squeeze(),
#     #     }
# def __getitem__(self, idx):
#     input_text = self.inputs[idx]
#     target_text = self.targets[idx]

#     # Tokenize inputs and targets
#     input_encodings = self.tokenizer(
#         input_text,
#         max_length=self.max_length,
#         padding="max_length",
#         truncation=True,
#         return_tensors="pt",
#     )
#     target_encodings = self.tokenizer(
#         target_text,
#         max_length=self.max_length,
#         padding="max_length",
#         truncation=True,
#         return_tensors="pt",
#     )

#     # Replace padding token id with -100 in labels
#     labels = target_encodings["input_ids"]
#     labels[labels == self.tokenizer.pad_token_id] = -100

#     return {
#         "input_ids": input_encodings["input_ids"].squeeze(0),
#         "attention_mask": input_encodings["attention_mask"].squeeze(0),
#         "labels": labels.squeeze(0),
#     }



In [89]:
# create text class

class TextToSQLDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=512):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize inputs and targets
        input_encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encodings = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Replace padding token id with -100 in labels
        labels = target_encodings["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_encodings["input_ids"].squeeze(0),
            "attention_mask": input_encodings["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }


In [90]:
## add training loop

import wandb
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sklearn.model_selection import train_test_split

# Initialize Tokenizer and Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.config.pad_token_id = tokenizer.pad_token_id

# Split data into train and validation
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)

# # Debugging Dataset
# train_dataset = TextToSQLDataset(
#     inputs=["Translate this English question into SQL: What are the names of all employees?"],
#     targets=["SELECT * FROM employees"],
#     tokenizer=tokenizer,
#     max_length=20  # Keep it short for overfitting
# )
# print("this should be input ids", train_dataset[0])  # Ensure this works
# print(len(train_dataset))

# print(train_dataset)

#Prepare Datasets
train_dataset = TextToSQLDataset(train_inputs, train_targets, tokenizer)
val_dataset = TextToSQLDataset(val_inputs, val_targets, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move Model to GPU if Available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize W&B
wandb.init(
    project="text-to-sql", 
    name="t5-finetuning", 
    config={
        "epochs": 10,
        "batch_size": 1,
        "learning_rate": 1e-3,
        "max_length": 512,
    }
)

# Training Loop
epochs = 10
for epoch in range(epochs):
    # Training Phase
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Verify labels
        print("Sample labels:", labels[0])  # Check if padding is replaced with -100

        # Forward Pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Backward Pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    wandb.log({"batch_loss": loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
    
    # Debug the model's output every 10 epochs
    if (epoch + 1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            inputs = tokenizer("Translate this English question into SQL: What are the names of all employees?", return_tensors="pt", padding=True, truncation=True).to(device)
            outputs = model.generate(inputs["input_ids"], max_length=20)
            print(f"Epoch {epoch + 1}: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

    # Log epoch loss to W&B
    wandb.log({"epoch": epoch + 1, "epoch_loss": avg_loss})

    # # Validation Phase
    # model.eval()
    # val_loss = 0
    # with torch.no_grad():
    #     for batch in val_loader:
    #         # Move batch to device
    #         input_ids = batch["input_ids"].to(device)
    #         attention_mask = batch["attention_mask"].to(device)
    #         labels = batch["labels"].to(device)

    #         # Forward Pass
    #         outputs = model(
    #             input_ids=input_ids,
    #             attention_mask=attention_mask,
    #             labels=labels
    #         )
    #         val_loss += outputs.loss.item()

    # avg_val_loss = val_loss / len(val_loader)
    # print(f"Validation Loss: {avg_val_loss:.4f}")

    # # Log validation loss to W&B
    # wandb.log({"val_loss": avg_val_loss})

# Finish W&B Logging
wandb.finish()




Sample labels: tensor([    3, 23143, 14196,   332,  5787,  8725,   834, 14814,   834,  5525,
         1152, 21680,  4341,  6157,   332,   536,     3, 15355,  3162,  4775,
          834,  3974,     7,     7,  4128,  6157,   332,   357,  9191,   332,
         5411,  1608, 12416,   342,   834,    23,    26,  3274,   332,  4416,
         1608, 12416,   342,   834,    23,    26,     3, 15355,  3162,   151,
         6157,   332,   519,  9191,   332,  5411,  1608, 12416,   342,   834,
           23,    26,  3274,   332,  5787,  6075,   834,    23,    26,   549,
        17444,   427,   332,  4416,     9,  2260,     7,   297,   834,   670,
          287,    15,   834,  4978,  3274,    96,   371,     9,   173,   121,
            1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,

0,1
batch_loss,█▄▄▆▄▁▁▃▃▃
epoch,▁▂▃▃▄▅▆▆▇█
epoch_loss,█▅▄▃▃▂▂▁▁▁

0,1
batch_loss,1.48244
epoch,10.0
epoch_loss,0.88459


In [None]:
train_dataset = TextToSQLDataset(
    inputs=["Translate this to SQL"],
    targets=["SELECT * FROM table"],
    tokenizer=tokenizer,
)

# Check if the dataset and DataLoader work
print(train_dataset[0])  # Should output a dictionary with tokenized tensors
train_loader = DataLoader(train_dataset, batch_size=1)
for batch in train_loader:
    print("tris is batch", batch)

{'input_ids': tensor([30355,    15,    48,    12, 12558,     1,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [92]:
def test_inference(question, model, tokenizer, device):
    """
    Run inference on a single question to generate an SQL query.

    Args:
    - question (str): The natural language question.
    - model (T5ForConditionalGeneration): The fine-tuned T5 model.
    - tokenizer (T5Tokenizer): The T5 tokenizer.
    - device (torch.device): The device to run the model on.

    Returns:
    - str: The generated SQL query.
    """
    # Add the task prefix
    #input_text = f"translate natural language to SQL: {question}"
    input_text = question

    # Tokenize the input
    input_ids = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).input_ids.to(device)
    
    print("Tokenized input text:", tokenizer.convert_ids_to_tokens(input_ids[0]))
    print("Input ids:", input_ids)
    print(type(input_ids))
    # Ensure model is in eval mode
    model.eval()

    # Generate SQL query
    outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)
    
    print("Generated output tensor:", outputs)

    # Decode the output tokens
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Decoded SQL query:", sql_query)

    return sql_query





In [None]:
# # Example Question
# example_question = "What are the names of all employees??"

# # Run Inference
# model.eval()
# generated_query = test_inference(example_question, model, tokenizer, device)
# print(len(generated_query))
# print(f"Input Question: {example_question}")
# print(f"Generated SQL Query: {generated_query}")

Tokenized input text: ['▁What', '▁are', '▁the', '▁names', '▁of', '▁all', '▁employees', '??', '</s>']
Input ids: tensor([[ 363,   33,    8, 3056,   13,   66, 1652, 8546,    1]])
<class 'torch.Tensor'>
Generated output tensor: tensor([[   0,  363,   33,    8, 3056,   13, 1652,   58,    1]])
Decoded SQL query: What are the names of employees?
32
Input Question: What are the names of all employees??
Generated SQL Query: What are the names of employees?


In [93]:
example_question = df.iloc[0]["question"]  # First question in the dataset
expected_query = df.iloc[0]["query"]      # First query in the dataset
# Run Inference
model.eval()
generated_query = test_inference(example_question, model, tokenizer, device)
print(f"Input Question: {example_question}")
print(f"Expected Query: {expected_query}")
print(f"Generated SQL Query: {generated_query}")

Tokenized input text: ['▁How', '▁many', '▁heads', '▁of', '▁the', '▁departments', '▁are', '▁older', '▁than', '▁56', '▁', '?', '</s>']
Input ids: tensor([[  571,   186,  7701,    13,     8, 10521,    33,  2749,   145, 11526,
             3,    58,     1]])
<class 'torch.Tensor'>
Generated output tensor: tensor([[    0,   571,   186,  7701,    13, 10521,    33,  2749,   145, 11526,
             3,    58,     1]])
Decoded SQL query: How many heads of departments are older than 56?
Input Question: How many heads of the departments are older than 56 ?
Expected Query: SELECT count(*) FROM head WHERE age  >  56
Generated SQL Query: How many heads of departments are older than 56?


TypeError: 'float' object is not iterable