In [1]:
!pip install datasets


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


### Language to SQL transformers 
#### with graph relationships Graphix T5

In [3]:
#let's exlore the data
import datasets


from datasets import load_dataset

ds = load_dataset("xlangai/spider")

In [4]:
print(ds['train'][0])


{'db_id': 'department_management', 'query': 'SELECT count(*) FROM head WHERE age  >  56', 'question': 'How many heads of the departments are older than 56 ?', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'head', 'WHERE', 'age', '>', '56'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'head', 'where', 'age', '>', 'value'], 'question_toks': ['How', 'many', 'heads', 'of', 'the', 'departments', 'are', 'older', 'than', '56', '?']}


In [5]:
import pandas as pd

df = pd.DataFrame(ds['train'])

df_test = pd.DataFrame(ds['validation'])

In [6]:
df.head(2)

Unnamed: 0,db_id,query,question,query_toks,query_toks_no_value,question_toks
0,department_management,SELECT count(*) FROM head WHERE age > 56,How many heads of the departments are older th...,"[SELECT, count, (, *, ), FROM, head, WHERE, ag...","[select, count, (, *, ), from, head, where, ag...","[How, many, heads, of, the, departments, are, ..."
1,department_management,"SELECT name , born_state , age FROM head ORD...","List the name, born state and age of the heads...","[SELECT, name, ,, born_state, ,, age, FROM, he...","[select, name, ,, born_state, ,, age, from, he...","[List, the, name, ,, born, state, and, age, of..."


In [7]:
import pandas as pd

# Set display options to see the full output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

tables = pd.read_json('tables.json')

# Now print the DataFrame
print(tables[tables['db_id'] == 'department_management']['column_names'])

159    [[-1, *], [0, department id], [0, name], [0, creation], [0, ranking], [0, budget in billions], [0, num employees], [1, head id], [1, name], [1, born state], [1, age], [2, department id], [2, head id], [2, temporary acting]]
Name: column_names, dtype: object


In [8]:
def extract_schema_for_db(db_id, tables_df):
    # Find the schema for the specific database ID
    
    tables_data = tables_df.to_dict(orient='records')
    schema = next((db for db in tables_data if db["db_id"] == db_id), None)
    if not schema:
        raise ValueError(f"Schema for db_id {db_id} not found.")

    # Extract tables and columns
    tables = schema["table_names"]
    columns = schema["column_names"]
    primary_keys = schema["primary_keys"]
    foreign_keys = schema["foreign_keys"]

    # Prepare nodes and edges for the graph
    nodes = tables + [f"{col[1]}" for col in columns if col[0] != -1]  # Exclude global columns
    edges = []

    # Table-Column edges
    for col in columns:
        if col[0] != -1:  # Skip global columns
            edges.append((tables[col[0]], col[1]))

    # Primary Key edges
    for pk in primary_keys:
        edges.append((f"{columns[pk][1]}", f"{columns[pk][1]} (PK)"))

    # Foreign Key edges
    for fk in foreign_keys:
        from_col = f"{columns[fk[0]][1]}"
        to_col = f"{columns[fk[1]][1]}"
        edges.append((from_col, to_col))

    return {"nodes": nodes, "edges": edges}




In [9]:
# Example usage
db_id = "department_management"

schema_graph = extract_schema_for_db(db_id, tables)
schema_graph

{'nodes': ['department',
  'head',
  'management',
  'department id',
  'name',
  'creation',
  'ranking',
  'budget in billions',
  'num employees',
  'head id',
  'name',
  'born state',
  'age',
  'department id',
  'head id',
  'temporary acting'],
 'edges': [('department', 'department id'),
  ('department', 'name'),
  ('department', 'creation'),
  ('department', 'ranking'),
  ('department', 'budget in billions'),
  ('department', 'num employees'),
  ('head', 'head id'),
  ('head', 'name'),
  ('head', 'born state'),
  ('head', 'age'),
  ('management', 'department id'),
  ('management', 'head id'),
  ('management', 'temporary acting'),
  ('department id', 'department id (PK)'),
  ('head id', 'head id (PK)'),
  ('department id', 'department id (PK)'),
  ('head id', 'head id'),
  ('department id', 'department id')]}

In [10]:
def prepare_t5_inputs_targets(data, tables_data):
    # Convert data to list of dictionaries if it's a DataFrame
    if hasattr(data, "to_dict"):
        data = data.to_dict(orient="records")

    inputs = []
    targets = []

    for record in data:
        db_id = record["db_id"]
        question = record["question"]
        query = record["query"]

        # Extract schema for the db_id
        schema = extract_schema_for_db(db_id, tables_data)

        # Serialize the schema
        tables = ", ".join(schema["nodes"][: len(schema["nodes"]) // 2])
        columns = ", ".join([node for node in schema["nodes"] if '.' in node])
        primary_keys = ", ".join([f"{edge[0]} (PK)" for edge in schema["edges"] if "(PK)" in edge[0]])
        foreign_keys = ", ".join([f"{edge[0]} -> {edge[1]}" for edge in schema["edges"] if "->" in edge[1]])

        serialized_schema = (
            f"Tables: {tables} | Columns: {columns} | "
            f"Primary Keys: {primary_keys} | Foreign Keys: {foreign_keys}"
        )

        # Prepare input and target
        input_text = f"translate natural language to SQL: {question}? <schema> {serialized_schema}"
        target_text = query

        inputs.append(input_text)
        targets.append(target_text)

    return inputs, targets



In [11]:
#let's test on a subset
data = df
inputs, targets = prepare_t5_inputs_targets(data, tables)

In [12]:
# create text class
from torch.utils.data import Dataset

class TextToSQLDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length=512):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize inputs and targets
        input_encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encodings = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Replace padding token id with -100 in labels
        labels = target_encodings["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_encodings["input_ids"].squeeze(0),
            "attention_mask": input_encodings["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }


In [13]:
import wandb
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sklearn.model_selection import train_test_split

In [14]:
# Initialize Tokenizer and Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.config.pad_token_id = tokenizer.pad_token_id

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [15]:
# Split data into train and validation
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)

#Prepare Datasets
train_dataset = TextToSQLDataset(train_inputs, train_targets, tokenizer)
val_dataset = TextToSQLDataset(val_inputs, val_targets, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move Model to GPU if Available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [16]:
# Initialize W&B
wandb.init(
    project="text-to-sql", 
    name="t5-finetuning", 
    config={
        "epochs": 5,
        "batch_size": 16,
        "learning_rate": 5e-5,
        "max_length": 512,
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mliquid-candidate[0m ([33mliquid-candidate-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
# Training Loop
epochs = 5
for epoch in range(epochs):
    # Training Phase
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Verify labels
       # print("Sample labels:", labels[0])  # Check if padding is replaced with -100

        # Forward Pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Backward Pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    wandb.log({"batch_loss": loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
    
    # Debug the model's output every 10 epochs
    if (epoch + 1) % 2 == 0:
        model.eval()
        with torch.no_grad():
            inputs = tokenizer("Translate this English question into SQL: What are the names of all employees?", return_tensors="pt", padding=True, truncation=True).to(device)
            outputs = model.generate(inputs["input_ids"], max_length=20)
            print(f"Epoch {epoch + 1}: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

    # Log epoch loss to W&B
    wandb.log({"epoch": epoch + 1, "epoch_loss": avg_loss})

    # Validation Phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward Pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Log validation loss to W&B
    wandb.log({"val_loss": avg_val_loss})

#inish W&B Logging
wandb.finish()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5, Loss: 1.4201
Validation Loss: 0.6996
Epoch 2/5, Loss: 0.7121
Epoch 2: SELECT T1.name FROM ENG
Validation Loss: 0.4896
Epoch 3/5, Loss: 0.5180
Validation Loss: 0.3943
Epoch 4/5, Loss: 0.4150
Epoch 4: SELECT Name FROM Employees
Validation Loss: 0.3443
Epoch 5/5, Loss: 0.3459
Validation Loss: 0.2924


0,1
batch_loss,█▇▅▅▁
epoch,▁▃▅▆█
epoch_loss,█▃▂▁▁
val_loss,█▄▃▂▁

0,1
batch_loss,0.06244
epoch,5.0
epoch_loss,0.34594
val_loss,0.29238


In [18]:
print("hello world")

hello world


In [25]:
def test_inference(question, model, tokenizer, device):
    """
    Run inference on a single question to generate an SQL query.

    Args:
    - question (str): The natural language question.
    - model (T5ForConditionalGeneration): The fine-tuned T5 model.
    - tokenizer (T5Tokenizer): The T5 tokenizer.
    - device (torch.device): The device to run the model on.

    Returns:
    - str: The generated SQL query.
    """
    # Add the task prefix
    #input_text = f"translate natural language to SQL: {question}"
    
    prefix = "translate natural language to SQL:"
    input_text = prefix + question

    # Tokenize the input
    input_ids = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).input_ids.to(device)
    
    print("Tokenized input text:", tokenizer.convert_ids_to_tokens(input_ids[0]))
    print("Input ids:", input_ids)
    print(type(input_ids))
    # Ensure model is in eval mode
    model.eval()

    # Generate SQL query
    outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)
    
    print("Generated output tensor:", outputs)

    # Decode the output tokens
    sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Decoded SQL query:", sql_query)

    return sql_query

In [28]:
#example_question = df.iloc[0]["question"]  # First question in the dataset
#print(example_question)
example_question = "get all student names"
#expected_query = df.iloc[0]["query"]      # First query in the dataset
# Run Inference
model.eval()
generated_query = test_inference(example_question, model, tokenizer, device)
print(f"Input Question: {example_question}")
#print(f"Expected Query: {expected_query}")
print(f"Generated SQL Query: {generated_query}")

Tokenized input text: ['▁translate', '▁natural', '▁language', '▁to', '▁SQL', ':', 'get', '▁all', '▁student', '▁names', '</s>']
Input ids: tensor([[13959,   793,  1612,    12, 12558,    10,  2782,    66,  1236,  3056,
             1]], device='cuda:0')
<class 'torch.Tensor'>
Generated output tensor: tensor([[    0,     3, 23143, 14196,   564, 21680,  5097, 10161,  6431,     1]],
       device='cuda:0')
Decoded SQL query: SELECT name FROM STUDENT
Input Question: get all student names
Generated SQL Query: SELECT name FROM STUDENT


In [None]:
#save the model weights 
