In [1]:
import os
import json
import sqlite3
import re
import torch
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
from peft import get_peft_model, LoraConfig, TaskType
from torch.optim import AdamW
from torch.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from google.colab import drive

In [2]:
drive.mount('/content/drive')
spider_dir = "/content/drive/MyDrive/spider-upload"

Mounted at /content/drive


In [3]:
# load the training data
def load_spider_data(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    return data

In [4]:
training_data = load_spider_data(os.path.join(spider_dir, "train_spider.json"))

In [5]:
len(training_data)

7000

In [6]:
def format_schema(db_id, tables_json):
    db_info = None
    for db in tables_json:
        if db['db_id'] == db_id:
            db_info = db
            break

    if not db_info:
        return "", {}

    # get tables present in each db
    tables = db_info['table_names_original']
    table_schema_text = []

    schema_dict = {
        'db_id': db_id,
        'tables': {}
    }

    for table_idx, table_name in enumerate(tables):
        columns = []
        table_dict = {
            'columns': [],
            'primary_keys': [],
            'foreign_keys': {}
        }

        # get columns in each table
        for col_idx, (tab_idx, col_name) in enumerate(db_info['column_names_original']):
            if tab_idx == table_idx and col_name != '*':
                # get column type
                col_type = db_info['column_types'][col_idx]

                # check primary key
                is_primary = col_idx in db_info['primary_keys']
                pk_marker = " [PK]" if is_primary else ""

                if is_primary:
                    table_dict['primary_keys'].append(col_name)

                # check foreign key
                fk_info = ""
                for fk_col1, fk_col2 in db_info['foreign_keys']:
                    if col_idx == fk_col1 or col_idx == fk_col2:
                        # Find the other table/column in the relationship
                        other_col_idx = fk_col2 if col_idx == fk_col1 else fk_col1
                        other_tab_idx = db_info['column_names_original'][other_col_idx][0]
                        other_col_name = db_info['column_names_original'][other_col_idx][1]
                        other_tab_name = tables[other_tab_idx]

                        fk_info = f" [FK -> {other_tab_name}.{other_col_name}]"

                        table_dict['foreign_keys'][col_name] = {
                            'table': other_tab_name,
                            'column': other_col_name
                        }
                        break

                columns.append(f"{col_name} ({col_type}){pk_marker}{fk_info}")

                table_dict['columns'].append({
                    'name': col_name,
                    'type': col_type,
                    'is_primary_key': is_primary,
                    'is_foreign_key': bool(fk_info)
                })

        # Create the table schema text
        if columns:
            cols_text = ", ".join(columns)
            table_schema_text.append(f"Table: {table_name} ({cols_text})")
            schema_dict['tables'][table_name] = table_dict

    schema_text = "\n".join(table_schema_text)

    return schema_text, schema_dict

In [7]:
# load schemas from tables.json file
def load_all_schemas(tables_json_path):
    with open(tables_json_path, 'r') as f:
        tables_json = json.load(f)

    # test with 5 tables
    # tables_json = tables_json[:5]

    text_schema_dict = {}
    struct_schema_dict = {}
    db_ids = set(db['db_id'] for db in tables_json)

    for db_id in db_ids:
        schema_text, schema_dict = format_schema(db_id, tables_json)
        text_schema_dict[db_id] = schema_text
        struct_schema_dict[db_id] = schema_dict

    print(f"Loaded schema information for {len(struct_schema_dict)} databases")
    return text_schema_dict, struct_schema_dict

In [8]:
text_schema_dict, struct_schema_dict = load_all_schemas(os.path.join(spider_dir, "tables.json"))

Loaded schema information for 166 databases


In [9]:
text_schema_dict

{'soccer_2': 'Table: College (cName (text) [PK] [FK -> Tryout.cName], state (text), enr (number))\nTable: Player (pID (number) [PK] [FK -> Tryout.pID], pName (text), yCard (text), HS (number))\nTable: Tryout (pID (number) [PK] [FK -> Player.pID], cName (text) [FK -> College.cName], pPos (text), decision (text))',
 'school_bus': 'Table: driver (Driver_ID (number) [PK] [FK -> school_bus.Driver_ID], Name (text), Party (text), Home_city (text), Age (number))\nTable: school (School_ID (number) [PK] [FK -> school_bus.School_ID], Grade (text), School (text), Location (text), Type (text))\nTable: school_bus (School_ID (number) [PK] [FK -> school.School_ID], Driver_ID (number) [FK -> driver.Driver_ID], Years_Working (number), If_full_time (others))',
 'orchestra': 'Table: conductor (Conductor_ID (number) [PK] [FK -> orchestra.Conductor_ID], Name (text), Age (number), Nationality (text), Year_of_Work (number))\nTable: orchestra (Orchestra_ID (number) [PK] [FK -> performance.Orchestra_ID], Orches

In [10]:
training_text_schema = {}
training_struct_schema = {}

dataset_dir = os.path.join(spider_dir, "database")
db_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]

for db in db_dirs:
    if db in text_schema_dict:
        training_text_schema.update({db: text_schema_dict[db]})
        training_struct_schema.update({db: struct_schema_dict[db]})

In [11]:
print(f"Loaded schema information for {len(training_struct_schema)} databases")

Loaded schema information for 133 databases


In [12]:
final_training_data = []

for data in training_data:
    if data['db_id'] in training_text_schema.keys():
        final_training_data.append(data)

len(final_training_data)

6654

In [13]:
train_examples, temp_examples = train_test_split(
        final_training_data, test_size=0.2, random_state=42)

val_examples, test_examples = train_test_split(
    temp_examples, test_size=0.5, random_state=42)

print(f"Training examples: {len(train_examples)}")
print(f"Validation examples: {len(val_examples)}")
print(f"Test examples: {len(test_examples)}")

Training examples: 5323
Validation examples: 665
Test examples: 666


In [14]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [15]:
model.gradient_checkpointing_enable()

In [16]:
def create_input_text(question, db_id, schema_dict):
    schema = schema_dict[db_id]
    return f"translate to SQL: {question} \n{schema}"

In [17]:
class SQLDataset(Dataset):
    def __init__(self, examples, schema_dict, tokenizer, max_input_length=512, max_target_length=128):
        self.examples = examples
        self.schema_dict = schema_dict
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        question = ex["question"]
        sql = ex["query"]
        db_id = ex["db_id"]

        input_text = create_input_text(question, db_id, self.schema_dict)

        # Tokenize inputs and outputs
        input_encodings = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encodings = self.tokenizer(
            sql,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Important: Replace padding token id with -100 so it's ignored in loss calculation
        target_encodings.input_ids[target_encodings.input_ids == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encodings.input_ids.squeeze(),
            'attention_mask': input_encodings.attention_mask.squeeze(),
            'labels': target_encodings.input_ids.squeeze(),
            'question': question,
            'sql': sql,
            'db_id': db_id
        }

In [18]:
train_dataset = SQLDataset(train_examples, training_text_schema, tokenizer)
val_dataset = SQLDataset(val_examples, training_text_schema, tokenizer)
test_dataset = SQLDataset(test_examples, training_text_schema, tokenizer)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [20]:
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 5
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1*num_training_steps),
    num_training_steps=num_training_steps
)
scaler = GradScaler()

In [None]:
def save_checkpoint(checkpoint_dir, model, optimizer, scheduler, scaler, epoch, step,
                   train_losses, val_losses, best_val_loss, is_best=False, is_latest=True):
    checkpoint = {
        'epoch': epoch,
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'train_losses': train_losses,
        'val_losses': val_losses,
        'best_val_loss': best_val_loss
    }

    # latest model checkpoint
    if is_latest:
        torch.save(checkpoint, os.path.join(checkpoint_dir, 'latest_checkpoint.pt'))

    # best model checkpoint
    if is_best:
        torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'best_model.pt'))

In [None]:
def load_checkpoint(checkpoint_path, model, optimizer, scheduler, scaler):
    if not os.path.exists(checkpoint_path):
        print(f"No checkpoint found at {checkpoint_path}, starting from scratch")
        return 0, 0, [], [], float('inf')

    checkpoint = torch.load(checkpoint_path)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])

    epoch = checkpoint['epoch']
    step = checkpoint['step']
    train_losses = checkpoint['train_losses']
    val_losses = checkpoint['val_losses']
    best_val_loss = checkpoint['best_val_loss']

    print(f"Loaded checkpoint from epoch {epoch+1}, step {step+1}")
    print(f"Best validation loss so far: {best_val_loss:.4f}")

    return epoch + 1, step, train_losses, val_losses, best_val_loss

In [None]:
def plot_training_progress(train_losses, val_losses, checkpoint_dir):
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.savefig(os.path.join(checkpoint_dir, 'training_progress.png'))
    plt.close()

In [None]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=5,
                gradient_accumulation_steps=4, start_epoch=0, checkpoint_dir=f"{spider_dir}/checkpoints"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_losses = []
    val_losses = []
    best_val_loss = float('inf')

    # Training loop
    for epoch in range(start_epoch, num_epochs):
        model.train()
        total_train_loss = 0

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for step, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass with mixed precision
            with autocast(device_type="cuda"):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / gradient_accumulation_steps

            # Backward pass with gradient scaling
            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            total_train_loss += loss.item() * gradient_accumulation_steps
            progress_bar.set_postfix({"loss": loss.item() * gradient_accumulation_steps})

            # Save periodic checkpoint (every 1000 steps)
            if (step + 1) % 1000 == 0:
                save_checkpoint(checkpoint_dir, model, optimizer, scheduler, scaler, epoch, step,
                    train_losses, val_losses, best_val_loss, is_best=False, is_latest=True)

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation
        model.eval()
        total_val_loss = 0

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        print(f"Average validation loss: {avg_val_loss:.4f}")

        # Check if this is the best model
        is_best = avg_val_loss < best_val_loss
        if is_best:
            best_val_loss = avg_val_loss
            print(f"New best model with validation loss: {best_val_loss:.4f}")

        # Save epoch checkpoint
        save_checkpoint(checkpoint_dir, model, optimizer, scheduler, scaler,epoch, len(train_loader) - 1,
            train_losses, val_losses, best_val_loss, is_best=is_best, is_latest=True)

        # Plot training progress after each epoch
        plot_training_progress(train_losses, val_losses, checkpoint_dir)

    return train_losses, val_losses

In [None]:
checkpoint_dir = f"{spider_dir}/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
start_epoch = 0
train_losses = []
val_losses = []

In [None]:
checkpoint_path = os.path.join(checkpoint_dir, 'latest_checkpoint.pt')

if os.path.exists(checkpoint_path):

    # Load from checkpoint
    start_epoch, _, train_losses, val_losses, _ = load_checkpoint(
        checkpoint_path, model, optimizer, scheduler, scaler)
else:
    print("No checkpoint found. Starting training from scratch.")

No checkpoint found. Starting training from scratch.


In [None]:
print(f"Starting model training from epoch {start_epoch+1}...")
train_losses, val_losses = train_model(model=model, train_loader=train_loader, val_loader=val_loader, optimizer=optimizer,
    scheduler=scheduler, num_epochs=5, gradient_accumulation_steps=4, start_epoch=start_epoch, checkpoint_dir=checkpoint_dir)

Starting model training from epoch 1...


Epoch 1/5 [Train]:   0%|          | 0/666 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Average training loss: 2.5908


Epoch 1/5 [Val]:   0%|          | 0/84 [00:00<?, ?it/s]

Average validation loss: 0.8172
New best model with validation loss: 0.8172


Epoch 2/5 [Train]:   0%|          | 0/666 [00:00<?, ?it/s]

Average training loss: 0.7154


Epoch 2/5 [Val]:   0%|          | 0/84 [00:00<?, ?it/s]

Average validation loss: 0.4072
New best model with validation loss: 0.4072


Epoch 3/5 [Train]:   0%|          | 0/666 [00:00<?, ?it/s]

Average training loss: 0.4405


Epoch 3/5 [Val]:   0%|          | 0/84 [00:00<?, ?it/s]

Average validation loss: 0.2958
New best model with validation loss: 0.2958


Epoch 4/5 [Train]:   0%|          | 0/666 [00:00<?, ?it/s]

Average training loss: 0.3360


Epoch 4/5 [Val]:   0%|          | 0/84 [00:00<?, ?it/s]

Average validation loss: 0.2359
New best model with validation loss: 0.2359


Epoch 5/5 [Train]:   0%|          | 0/666 [00:00<?, ?it/s]

Average training loss: 0.2754


Epoch 5/5 [Val]:   0%|          | 0/84 [00:00<?, ?it/s]

Average validation loss: 0.2069
New best model with validation loss: 0.2069


In [21]:
print("Loading best model for evaluation...")
# best_model_path = os.path.join(checkpoint_dir, 'best_model.pt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.load_state_dict(torch.load(best_model_path))
model.to(device)
model.eval()

Loading best model for evaluation...


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [22]:
def normalize_sql(sql):
    sql = re.sub(r'\s+', ' ', sql).strip()
    sql = sql.lower()
    sql = re.sub(r'(=|<|>|<=|>=)', lambda m: f" {m.group(0)} ", sql)
    sql = re.sub(r'\s+', ' ', sql).strip()
    return sql

In [23]:
def execute_sql(sql, db_path):
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute(sql)
        results = cursor.fetchall()
        conn.close()
        return results
    except Exception as e:
        return str(e)

In [24]:
def evaluate_model(model, tokenizer, val_dataset, schema_dict, db_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    exact_match = 0
    execution_match = 0
    execution_true = 0
    simple_count = 0
    simple_exact_match = 0
    simple_execution_match = 0
    complex_count = 0
    complex_exact_match = 0
    complex_execution_match = 0
    total = len(val_dataset)

    simple_query = True

    for i in tqdm(range(total), desc="Evaluating"):
        example = val_dataset[i]
        question = example['question']
        db_id = example['db_id']
        true_sql = example['sql']

        # Format input
        input_text = create_input_text(question, db_id, schema_dict)

        # Generate SQL
        input_ids = tokenizer(
            input_text,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).input_ids.to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                max_length=128,
                num_beams=5,
                early_stopping=True
            )

        # Decode prediction
        pred_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

        print("True", true_sql)
        print("Pred", pred_sql)

        # Normalize for comparison
        pred_sql_norm = normalize_sql(pred_sql)
        true_sql_norm = normalize_sql(true_sql)

        if re.search(r'\bjoin\b', true_sql_norm, re.IGNORECASE):
            complex_count += 1
            simple_query = False
        else:
            simple_count += 1
            simple_query = True

        # Exact match check
        if pred_sql_norm == true_sql_norm:
            exact_match += 1

            if simple_query:
                simple_exact_match += 1
            else:
                complex_exact_match += 1


        # Execution match check
        try:
            db_path = os.path.join(db_dir, f"{db_id}/{db_id}.sqlite")
            pred_results = execute_sql(pred_sql, db_path)
            true_results = execute_sql(true_sql, db_path)

            if str(pred_results):
                execution_true += 1

            if str(pred_results) == str(true_results):
                execution_match += 1

                if simple_query:
                    simple_execution_match += 1
                else:
                    complex_execution_match += 1
        except:
            pass

    # Calculate metrics
    exact_match_acc = exact_match / total
    execution_acc = execution_match / total
    execution_true_acc = execution_true / total
    simple_exact_match_acc = simple_exact_match / simple_count
    simple_execution_acc = simple_execution_match / simple_count
    complex_exact_match_acc = complex_exact_match / complex_count
    complex_execution_acc = complex_execution_match / complex_count


    print(f"Exact Match Accuracy: {exact_match_acc:.4f}")
    print(f"Execution Match Accuracy: {execution_acc:.4f}")
    print(f"Execution True Accuracy: {execution_true_acc:.4f}")
    print(f"Simple Count: {simple_count}")
    print(f"Simple Exact Match Accuracy: {simple_exact_match_acc:.4f}")
    print(f"Simple Execution Match Accuracy: {simple_execution_acc:.4f}")
    print(f"Complex Count: {complex_count}")
    print(f"Complex Exact Match Accuracy: {complex_exact_match_acc:.4f}")
    print(f"Complex Execution Match Accuracy: {complex_execution_acc:.4f}")

    return {
        "exact_match": exact_match_acc,
        "execution_match": execution_acc,
        "execution_true": execution_true_acc,
        "simple_count": simple_count,
        "simple_exact_match": simple_exact_match_acc,
        "simple_execution_match": simple_execution_acc,
        "complex_count": complex_count,
        "complex_exact_match": complex_exact_match_acc,
        "complex_execution_match": complex_execution_acc
    }


In [25]:
print("Evaluating on validation set...")
val_metrics = evaluate_model(model, tokenizer, val_dataset, text_schema_dict, dataset_dir)

Evaluating on validation set...


Evaluating:   0%|          | 0/665 [00:00<?, ?it/s]

True SELECT eid ,  name FROM Employee ORDER BY salary DESC LIMIT 1
Pred SQL: Show the ID and name of the employee with maximum salary. Table: flight (flno (number) [PK], origin (text), destination (text), distance (number), departure_date (time), arrival_date (time), price (number), aid (number) [FK -> aircraft.aid]) Table: aircraft (aid (number) [PK] [FK -> flight.aid], name (text), distance (number)) Table: employee (eid (
True SELECT t3.customer_details FROM claim_headers AS t1 JOIN policies AS t2 ON t1.policy_id  =  t2.policy_id JOIN customers AS t3 ON t2.customer_id  =  t3.customer_id WHERE t1.amount_piad  =  (SELECT min(amount_piad) FROM claim_headers)
Pred SQL: Which customer made the smallest amount of claim in one claim? Return the customer details. Table: Customers (Customer_ID (number) [PK] [FK -> Policies.Customer_ID], Customer_Details (text)) Table: Staff (Staff_ID (number) [PK] [FK -> Claims_Documents.Created_by_Staff_ID], Staff_Details (text)) Table: Policies (Policy_ID


KeyboardInterrupt: 

In [None]:
print("Evaluating on test set...")
val_metrics = evaluate_model(model, tokenizer, test_dataset, text_schema_dict, dataset_dir)

Evaluating on test set...


Evaluating:   0%|          | 0/666 [00:00<?, ?it/s]

Exact Match Accuracy: 0.2943
Execution Match Accuracy: 0.4249
Execution True Accuracy: 1.0000
Simple Count: 399
Simple Exact Match Accuracy: 0.4712
Simple Execution Match Accuracy: 0.5815
Complex Count: 267
Complex Exact Match Accuracy: 0.0300
Complex Execution Match Accuracy: 0.1910
