# Final ML Tuning
## Team A

Python 3.12 works
1. delete any prior virtual env that are located within the base directory of this repo
2. select another kernel -> python environment -> create python environment -> venv -> python 3.12.X -> select requirement.txt and click ok
3. before running through the notebook, you need to decide if you want to use your cpu or cuda

Cuda will speed up model training significantly. Please note that your MUST have a nvidia GPU in order to use CUDA.
If you want to use your CPU, run 'pip install torch' to install torch. no other setup is needed.

If you choose to use cuda, first install release version 12.8 of [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
Afterwards, in a new terminal instance, run 'nvcc --version' and verify that you see something similar to `Cuda compilation tools, release 12.8, V12.8.93`
Next, run `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126` to install the proper version of torch, torchvision, and torchaudio, which makes cuda functionality available. The index says 12.6, however, this still works with version 12.8 of nvcc.
We provide the --index-url because by default, pip does not include the cuda compatible version of torch.

4. finally, restart vscode/the python notebook kernel

In [1]:
import pandas as pd
import sqlite3
import evaluate
import os
import json
import matplotlib.pyplot as plt
import seaborn
from typing import Set
import numpy as np
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def execute_query(query, print_padding, verbose=True):
    try:
        conn = sqlite3.connect("data\\reza_data_cars.db")
        cursor = conn.cursor()
        cursor.execute(query)
        if verbose:
            print(f"{'SQL query:':<{print_padding}}{query}")
            print(f"{'Query results:':<{print_padding}}{cursor.fetchall()}")
        return cursor.fetchall()
    except sqlite3.Error as error:
        print(f"Error executing sql {error}")
        
    finally:
        conn.close()

# Load the tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql').to(device)

# fine-tune the model
# t5 fine-tuning article https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887


model.eval()

def generate_sql(input_prompt):
    """Generate SQL query from natural language input."""
    inputs = tokenizer(input_prompt, padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

padding = 30
print("Good examples:\n")
# example 
natural_language_query = "How many makers are there?"
input_prompt = f"""tables:
CREATE TABLE "car_makers" ( 
	Id INTEGER PRIMARY KEY, 
	Maker TEXT, 
	FullName TEXT, 
	Country TEXT,
	FOREIGN KEY (Country) REFERENCES countries(CountryId)
)
### Write an SQL query to answer the question:
{natural_language_query}"""

generated_sql = generate_sql(input_prompt)
print(f"{'Original query:':<{padding}}{natural_language_query}\n{'Generated SQL:':<{padding}}{generated_sql}")
execute_query(generated_sql, padding)

# example 
natural_language_query = "Who is the heaviest car?"
input_prompt = f"""tables:
CREATE TABLE "cars_data" (
	Id INTEGER PRIMARY KEY, 
	MPG TEXT, 
	Cylinders INTEGER, 
	Edispl REAL, 
	Horsepower TEXT, 
	Weight INTEGER, 
	Accelerate REAL, 
	Year INTEGER,
	FOREIGN KEY (Id) REFERENCES car_names (MakeId)
)
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY, 
	Model TEXT, 
	Make TEXT,
	FOREIGN KEY (Model) REFERENCES model_list (Model)
)
### Write an SQL query to answer the question:
{natural_language_query}"""

generated_sql = generate_sql(input_prompt)
print(f"\n{'Original query:':<{padding}}{natural_language_query}\n{'Generated SQL:':<{padding}}{generated_sql}")
execute_query(generated_sql, padding)

# example 
natural_language_query = "How many makers are there?"
input_prompt = f"""tables:
CREATE TABLE "model_list" (
	Model TEXT UNIQUE
)
### Write an SQL query to answer the question:
{natural_language_query}"""

generated_sql = generate_sql(input_prompt)
print(f"\n{'Original query:':<{padding}}{natural_language_query}\n{'Generated SQL:':<{padding}}{generated_sql}")
execute_query(generated_sql, padding)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Good examples:

Original query:               How many makers are there?
Generated SQL:                SELECT COUNT(*) FROM "car_makers"
SQL query:                    SELECT COUNT(*) FROM "car_makers"
Query results:                [(22,)]

Original query:               Who is the heaviest car?
Generated SQL:                SELECT Car_names FROM "heaviest"
Error executing sql no such table: heaviest

Original query:               How many makers are there?
Generated SQL:                SELECT COUNT(*) FROM "model_list"
SQL query:                    SELECT COUNT(*) FROM "model_list"
Query results:                [(36,)]


[]

## Improve model with fine-tuning


Training data to fine-tune the model

In [3]:
# Example data to fine-tune the model
training_data = [
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" ( 
    MakeId INTEGER PRIMARY KEY, 
    Model TEXT, 
    FOREIGN KEY (Model) REFERENCES model_list (Model)
)
CREATE TABLE "model_list" ( 
    ModelId INTEGER PRIMARY KEY, 
    Maker INTEGER, 
    Model TEXT UNIQUE,
    FOREIGN KEY (Maker) REFERENCES car_makers (Id)
)
CREATE TABLE "car_makers" ( 
    Id INTEGER PRIMARY KEY, 
    Maker TEXT, 
    FullName TEXT
)
### Write an SQL query to answer the question:
List all unique car models along with their maker full names.""",
        "target": """
select DISTINCT(car_names.Model), car_makers.FullName 
from car_names 
join model_list on car_names.Model = model_list.Model 
join car_makers on model_list.Maker = car_makers.Id
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY, 
    Year INTEGER,
    FOREIGN KEY (Id) REFERENCES car_names (MakeId)
)
CREATE TABLE "car_names" ( 
    MakeId INTEGER PRIMARY KEY, 
    Model TEXT,
    FOREIGN KEY (Model) REFERENCES model_list (Model)
)
CREATE TABLE "model_list" (
    ModelId INTEGER PRIMARY KEY,
    Maker INTEGER,
    Model TEXT UNIQUE,
    FOREIGN KEY (Maker) REFERENCES car_makers (Id)
)
CREATE TABLE "car_makers" ( 
    Id INTEGER PRIMARY KEY,
    Country TEXT,
    FOREIGN KEY (Country) REFERENCES countries(CountryId)
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY, 
    CountryName TEXT
)
### Write an SQL query to answer the question:
Show each car's model, year, and country of origin.""",
        "target": """
SELECT car_names.Model, cars_data.Year, countries.CountryName
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
JOIN model_list ON car_names.Model = model_list.Model
JOIN car_makers ON model_list.Maker = car_makers.Id
JOIN countries ON car_makers.Country = countries.CountryId
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_makers" ( 
    Id INTEGER PRIMARY KEY, 
    FullName TEXT, 
    Country TEXT,
    FOREIGN KEY (Country) REFERENCES countries(CountryId)
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY, 
    CountryName TEXT
)
### Write an SQL query to answer the question:
Find all car makers from Japan.""",
        "target": """
SELECT car_makers.FullName
FROM car_makers
JOIN countries ON car_makers.Country = countries.CountryId
WHERE countries.CountryName LIKE 'japan'
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY, 
    Horsepower TEXT,
    FOREIGN KEY (Id) REFERENCES car_names (MakeId)
)
CREATE TABLE "car_names" ( 
    MakeId INTEGER PRIMARY KEY, 
    Model TEXT,
    FOREIGN KEY (Model) REFERENCES model_list (Model)
)
CREATE TABLE "model_list" (
    ModelId INTEGER PRIMARY KEY,
    Maker INTEGER,
    Model TEXT UNIQUE,
    FOREIGN KEY (Maker) REFERENCES car_makers (Id)
)
CREATE TABLE "car_makers" ( 
    Id INTEGER PRIMARY KEY, 
    Country TEXT,
    FOREIGN KEY (Country) REFERENCES countries(CountryId)
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY, 
    CountryName TEXT
)
### Write an SQL query to answer the question:
What is the average horsepower of cars made in Germany?""",
        "target": """
SELECT AVG(CAST(cars_data.Horsepower AS FLOAT)) AS avg_hp
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
JOIN model_list ON car_names.Model = model_list.Model
JOIN car_makers ON model_list.Maker = car_makers.Id
JOIN countries ON car_makers.Country = countries.CountryId
WHERE countries.CountryName LIKE 'Germany'
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_makers" ( 
    Id INTEGER PRIMARY KEY, 
    Country TEXT,
    FOREIGN KEY (Country) REFERENCES countries(CountryId)
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY, 
    Continent INTEGER
)
CREATE TABLE "continents" ( 
    ContId INTEGER PRIMARY KEY, 
    Continent TEXT 
)
### Write an SQL query to answer the question:
Which continent has the most car makers?""",
        "target": """
SELECT continents.Continent, COUNT(*) AS maker_count
FROM car_makers
JOIN countries ON car_makers.Country = countries.CountryId
JOIN continents ON countries.Continent = continents.ContId
GROUP BY continents.Continent
ORDER BY maker_count DESC
LIMIT 1
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    MPG TEXT,
    Year INTEGER
)
### Write an SQL query to answer the question:
What is the average MPG per year?""",
        "target": """
SELECT Year, AVG(CAST(MPG AS FLOAT)) AS avg_mpg
FROM cars_data
GROUP BY Year
ORDER BY Year
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Cylinders INTEGER
)
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
What car models have more than 6 cylinders?""",
        "target": """
SELECT car_names.Model
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
WHERE cars_data.Cylinders > 6
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_makers" (
    Id INTEGER PRIMARY KEY,
    Country TEXT
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY,
    Continent INTEGER
)
CREATE TABLE "continents" (
    ContId INTEGER PRIMARY KEY,
    Continent TEXT
)
### Write an SQL query to answer the question:
How many car makers are there in each continent?""",
        "target": """
SELECT continents.Continent, COUNT(*) AS maker_count
FROM car_makers
JOIN countries ON car_makers.Country = countries.CountryId
JOIN continents ON countries.Continent = continents.ContId
GROUP BY continents.Continent
ORDER BY maker_count DESC
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Weight INTEGER
)
CREATE TABLE "model_list" (
    ModelId INTEGER PRIMARY KEY,
    Maker INTEGER,
    Model TEXT UNIQUE
)
CREATE TABLE "car_makers" (
    Id INTEGER PRIMARY KEY,
    FullName TEXT
)
### Write an SQL query to answer the question:
What is the lightest car made by each maker?""",
        "target": """
SELECT car_makers.FullName, MIN(cars_data.Weight) AS lightest
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
JOIN model_list ON car_names.Model = model_list.Model
JOIN car_makers ON model_list.Maker = car_makers.Id
GROUP BY car_makers.FullName
ORDER BY lightest ASC
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Year INTEGER
)
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
Which car models were released in the 1970s?""",
        "target": """
SELECT car_names.Model
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
WHERE Year BETWEEN 1970 AND 1979
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT
)
### Write an SQL query to answer the question:
Find all car models that contain the word 'Civic'.""",
        "target": """
SELECT Make
FROM car_names
WHERE Make LIKE '%Civic%'
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Accelerate REAL
)
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
What are the top 3 fastest accelerating cars?""",
        "target": """
SELECT car_names.Model, cars_data.Accelerate
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
ORDER BY cars_data.Accelerate ASC
LIMIT 3
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Weight INTEGER
)
CREATE TABLE "model_list" (
    ModelId INTEGER PRIMARY KEY,
    Maker INTEGER,
    Model TEXT
)
CREATE TABLE "car_makers" (
    Id INTEGER PRIMARY KEY,
    FullName TEXT
)
### Write an SQL query to answer the question:
Which makers have built a car lighter than 2000 pounds?""",
        "target": """
SELECT DISTINCT car_makers.FullName
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
JOIN model_list ON car_names.Model = model_list.Model
JOIN car_makers ON model_list.Maker = car_makers.Id
WHERE cars_data.Weight < 2000
""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Year INTEGER
)
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
Show me all car models not made in the 1980s.""",
        "target": """
SELECT car_names.Model
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
WHERE Year < 1980 OR Year > 1989
""",
    },
    # Complex queries seemed to have intorduced some catastrophic forgetting in the model on how to handle simple queries.
    # Adding in more single table queries to help the model learn how to handle them again and rebalance training data.
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    MPG TEXT,
    Cylinders INTEGER,
    Year INTEGER
)
### Write an SQL query to answer the question:
List all car IDs with MPG listed.""",
        "target": """SELECT Id FROM cars_data WHERE MPG IS NOT NULL""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
Show all car models.""",
        "target": """SELECT Model FROM car_names""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Year INTEGER
)
### Write an SQL query to answer the question:
List cars made in 1975.""",
        "target": """SELECT Id FROM cars_data WHERE Year = 1975""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Model TEXT
)
### Write an SQL query to answer the question:
Show all distinct car models.""",
        "target": """SELECT DISTINCT Model FROM car_names""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Cylinders INTEGER
)
### Write an SQL query to answer the question:
Find all cars with exactly 8 cylinders.""",
        "target": """SELECT Id FROM cars_data WHERE Cylinders = 8""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    Id INTEGER PRIMARY KEY,
    Year INTEGER
)
### Write an SQL query to answer the question:
Get the newest car's year.""",
        "target": """SELECT MAX(Year) FROM cars_data""",
    },
    {
        "input": "translate English to SQL: What are all the car models?",
        "target": "SELECT Model FROM car_names",
    },
    {
        "input": "translate English to SQL: List all cars made after 1980.",
        "target": "SELECT Id FROM cars_data WHERE Year > 1980",
    },
    {
        "input": "translate English to SQL: Show cars that have more than 4 cylinders.",
        "target": "SELECT Id FROM cars_data WHERE Cylinders > 4",
    },
    {
        "input": "translate English to SQL: Get all model names from the car_names table.",
        "target": "SELECT Model FROM car_names",
    },
    {
        "input": "translate English to SQL: How many unique models are there?",
        "target": "SELECT COUNT(DISTINCT Model) FROM car_names",
    },
    {
        "input": "translate English to SQL: Which years are present in the cars_data table?",
        "target": "SELECT DISTINCT Year FROM cars_data",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    Id INTEGER PRIMARY KEY,
    Model TEXT,
    Make TEXT
)
### Write an SQL query to answer the question:
What are all the makes from the model ford?""",
        "target": """SELECT Make FROM car_names WHERE Model = 'ford'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "continents" (
    ContId INTEGER PRIMARY KEY,
    Continent TEXT
)
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY,
    CountryName TEXT,
    Continent INTEGER,
    FOREIGN KEY REFERENCES continents(contId)
)

### Write an SQL query to answer the question:
What country names are part of the European continent?""",
        "target": """SELECT CountryName FROM countries cr LEFT JOIN continents co ON cr.Continent = ContId WHERE co.Continent = 'europe'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "countries" (
    CountryId INTEGER PRIMARY KEY
)

### Write an SQL query to answer the question:
How many countries produce vehicles?""",
        "target": """SELECT COUNT(CountryId) FROM countries""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT
)

### Write an SQL query to answer the question:
How many makes start with toyota?""",
        "target": """SELECT COUNT(Make) FROM car_names WHERE Make LIKE 'toyota %'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT
)

### Write an SQL query to answer the question:
How many makes contain the word ford?""",
        "target": """SELECT COUNT(Make) FROM car_names WHERE Make LIKE '%ford%'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT
)

### Write an SQL query to answer the question:
What makes contain the word mustang?""",
        "target": """SELECT Make FROM car_names WHERE Make LIKE '%mustang%'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "car_makers" (
    id INTEGER PRIMARY KEY,
    Maker TEXT,
    FullName TEXT
)

### Write an SQL query to answer the question:
What is the full name of each Maker?""",
        "target": """SELECT FullName FROM car_makers""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Horsepower TEXT
)

### Write an SQL query to answer the question:
How many cars have an unknown Horsepower?""",
        "target": """SELECT COUNT(id) FROM cars_data WHERE Horsepower IS NULL""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    Edispl REAL
)

### Write an SQL query to answer the question:
What is the largest engine displacement for each cylinder count?""",
        "target": """SELECT Cylinders, MAX(Edispl) FROM cars_data GROUP BY Cylinders""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER
)

### Write an SQL query to answer the question:
How many cars exist per cylinder?""",
        "target": """SELECT Cylinders, COUNT(id) FROM cars_data GROUP BY Cylinders""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Accelerate REAL
)

### Write an SQL query to answer the question:
For all cars, what is the lowest acceleration time?""",
        "target": """SELECT MIN(Accelerate) FROM cars_data""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    Accelerate REAL
)

### Write an SQL query to answer the question:
For all cars, what is the lowest acceleration time per cylinder?""",
        "target": """SELECT Cylinders, MIN(Accelerate) FROM cars_data GROUP BY Cylinders""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Accelerate REAL,
    Year INTEGER
)

### Write an SQL query to answer the question:
For every year, what is the slowest acceleration time?""",
        "target": """SELECT YEAR, MIN(Accelerate) FROM cars_data GROUP BY Year""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    Accelerate REAL
)

CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT,
    FOREIGN KEY MakeId REFERENCES cars_data(id)
)

### Write an SQL query to answer the question:
What car make has the highest acceleration four cylinder?""",
        "target": """SELECT cn.Make, MAX(cd.Accelerate) FROM car_names cn LEFT JOIN cars_data cd ON cn.MakeId = cd.id WHERE Cylinders = 4""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    Weight INTEGER
)

CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT,
    FOREIGN KEY MakeId REFERENCES cars_data(id)
)

### Write an SQL query to answer the question:
What make produces the heaviest six cylinder?""",
        "target": """SELECT cn.Make, MAX(cd.Weight) FROM car_names cn LEFT JOIN cars_data cd ON cn.MakeId = cd.id WHERE Cylinders = 6""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    MPG TEXT
)

CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT,
    FOREIGN KEY MakeId REFERENCES cars_data(id)
)

### Write an SQL query to answer the question:
What eight cylinder make has the maximum mpg?""",
        "target": """SELECT cn.Make, MAX(CAST(cd.MPG AS FLOAT)) FROM car_names cn LEFT JOIN cars_data cd ON cn.MakeId = cd.id WHERE Cylinders = 8""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    EDispl REAL,
    Horsepower TEXT
)

CREATE TABLE "car_names" (
    MakeId INTEGER PRIMARY KEY,
    Make TEXT,
    FOREIGN KEY MakeId REFERENCES cars_data(id)
)

### Write an SQL query to answer the question:
What car has the highest horse power to displacement ratio, excluding any cars that have an unknown Horspower and Engine Displacement?""",
        "target": """SELECT cn.Make, cd.horsePower, cd.Edispl, MIN(CAST(cd.horsePower AS FLOAT)/CAST(cd.Edispl AS FLOAT)) FROM car_names cn LEFT JOIN cars_data cd ON cn.MakeId = cd.id WHERE cd.Edispl IS NOT NULL AND cd.Horsepower != 'null'""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    Cylinders INTEGER,
    MPG TEXT
)

### Write an SQL query to answer the question:
How many eight cylinders have a mpg greater than 20?""",
        "target": """SELECT COUNT(*) FROM cars_data WHERE Cylinders = 8 AND CAST(MPG AS FLOAT) > 20""",
    },
    {
        "input": """### Given the following table definitions:
CREATE TABLE "cars_data" (
    id INTEGER PRIMARY KEY,
    EDispl REAL,
    Year TEXT
)

### Write an SQL query to answer the question:
What is the greatest engine displacement for every year?""",
        "target": """SELECT max(EDispl) FROM cars_data GROUP BY YEAR""",
    }
]

In [4]:
# re-pull model to ensure I'm running against the right model in noteboook
# Use pre-trained model to test output of training data before fine-tuning
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql').to(device)

match_count = 0
for example in training_data:
    input_prompt = example["input"]
    expected_sql = example["target"]
    generated_sql = generate_sql(input_prompt)
    generated_results = execute_query(generated_sql, padding, verbose=False)
    print(f"\n{'Generated SQL:':<{padding}}{generated_results}")
    expected_sql_results = execute_query(expected_sql, padding, verbose=False)
    print(f"{'Expected SQL:':<{padding}}{expected_sql_results}")
    print("**" * 50)
    if generated_results == expected_sql_results:
        match_count += 1

print(f"Total Matches: {match_count} out of {len(training_data)}")

Error executing sql no such column: Model

Generated SQL:                None
Expected SQL:                 [('chevrolet', 'General Motors'), ('buick', 'General Motors'), ('plymouth', 'Chrysler'), ('amc', 'American Motor Company'), ('ford', 'Ford Motor Company'), ('pontiac', 'General Motors'), ('citroen', 'Citroen'), ('dodge', 'Chrysler'), ('toyota', 'Toyota'), ('datsun', 'Nissan Motors'), ('volkswagen', 'Volkswagen'), ('peugeot', 'Peugeaut'), ('audi', 'Volkswagen'), ('saab', 'Saab'), ('bmw', 'BMW'), ('mercury', 'Ford Motor Company'), ('opel', 'Opel'), ('fiat', 'Fiat'), ('oldsmobile', 'General Motors'), ('chrysler', 'Chrysler'), ('mazda', 'Mazda'), ('volvo', 'Volvo'), ('renault', 'Renault'), ('honda', 'Honda'), ('subaru', 'Subaru'), ('capri', 'Ford Motor Company'), ('mercedes-benz', 'Daimler Benz'), ('cadillac', 'General Motors'), ('mercedes', 'Daimler Benz'), ('triumph', 'Triumph'), ('nissan', 'Nissan Motors')]
**************************************************************************

In [5]:
class TextToSQLDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data[idx]['input']
        target_text = self.data[idx]['target']
        
        # Tokenize inputs and targets
        inputs = self.tokenizer(input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        targets = self.tokenizer(target_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors="pt")
        
        labels = targets.input_ids.squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100 # ! T5 expects padding in labels to be set to -100

        
        return {
            'input_ids': inputs.input_ids.squeeze(0),
            'attention_mask': inputs.attention_mask.squeeze(0),
            'labels': labels
        }

In [6]:
from sklearn.model_selection import train_test_split

train_examples, val_examples = train_test_split(training_data, test_size=0.3, random_state=123)
train_dataset = TextToSQLDataset(tokenizer, train_examples)
val_dataset   = TextToSQLDataset(tokenizer, val_examples)

In [12]:
meteor_metric = evaluate.load("meteor")
bleu_metric = evaluate.load("bleu")


def hallucination_rate(pred: str, schema_tokens: Set[str], reference: str):
    # tokens in pred not in schema or reference
    pred_tokens = set(pred.split())
    ref_tokens = set(reference.split())
    extra = pred_tokens - schema_tokens - ref_tokens
    print(reference)
    print(pred)
    print(extra)
    return len(extra) / len(pred_tokens)


def compute_metrics(eval_pred):
    # With Seq2SeqTrainer & predict_with_generate=True,
    # eval_pred.predictions are already token IDs from .generate()
    preds, labels = eval_pred.predictions, eval_pred.label_ids
    
    # Decode predictions & labels
    decoded_preds  = tokenizer.batch_decode(preds,  skip_special_tokens=True)
    # replace -100 in the labels as pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # clean whitespace
    preds_clean = [p.strip() for p in decoded_preds]
    labs_clean  = [[l.strip()] for l in decoded_labels]

    # BLEU & METEOR
    bleu   = bleu_metric.compute(predictions=preds_clean, references=labs_clean)["bleu"]
    meteor = meteor_metric.compute(predictions=preds_clean, references=labs_clean)["meteor"]

    # Exact match
    exact = sum(p == l[0] for p,l in zip(preds_clean, labs_clean)) / len(labs_clean)

    # TODO
    # we need to train the model to use full table names or standardize them across the board
    # and use those as tokens. I don't think providing the 'table.column' name has any value
    schema_tokens = {
        # car_makers tables and columns
        "car_makers",
        "car_makers.Id",
        "Id",
        "car_makers.Maker",
        "Maker",
        "car_makers.FullName",
        "FullName",
        "car_makers.Country",
        "Country",
        # car_names tables and columns
        "car_names",
        "car_names.MakeId",
        "MakeId",
        "car_names.Model",
        "Model",
        "car_names.Make",
        "Make",
        # cars_data tables and columns
        "cars_data",
        "cars_data.Id",
        "Id",
        "cars_data.MPG",
        "MPG",
        "cars_data.Cylinders",
        "Cylinders",
        "cars_data.Edispl",
        "Edispl",
        "cars_data.Horsepower",
        "Horsepower",
        "cars_data.Weight",
        "Weight",
        "cars_data.Accelerate",
        "Accelerate",
        "cars_data.Year",
        "Year",
        # continents tables and columns
        "continents",
        "continents.ContId",
        "ContId",
        "continents.Continent",
        "Continent",
        # countries tables and columns
        "countries",
        "countries.CountryId",
        "CountryId",
        "countries.CountryName",
        "CountryName",
        "countries.Continent",
        "Continent",
        # model_list tables and columns
        "model_list",
        "model_list.ModelId",
        "ModelId",
        "model_list.Maker",
        "Maker",
        "model_list.Model",
        "Model",
    }  # column names and table names for tokens to validate against hallucinations
    rates = [
        hallucination_rate(p, schema_tokens, l)
        for p, l in zip(decoded_preds, decoded_labels)
    ]
    hallu = sum(rates) / len(rates)

    return {
        "bleu": float(bleu),
        "meteor": float(meteor),
        "exact_match": float(exact),
        "hallucination_rate": float(hallu),
    }

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\spenc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\spenc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\spenc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
# re-pull model to ensure I'm running against the right model in noteboook
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql').to(device)

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=50, # Will be overfitting the model
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-4,
    weight_decay=0.0,
    logging_dir='./logs',
    # logging_steps=1, # commented out for smaller export
    save_strategy="no",
    remove_unused_columns=False,
    report_to="none",
    predict_with_generate=True,
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

Step,Training Loss


('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\spiece.model',
 './fine-tuned-model\\added_tokens.json')

In [14]:
trainer.evaluate()

SELECT car_makers.FullName, MIN(cars_data.Weight) AS lightest FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId JOIN model_list ON car_names.Model = model_list.Model JOIN car_makers ON model_list.Maker = car_makers.Id GROUP BY car_makers.FullName ORDER BY lightest ASC
SELECT cn.FullName, MIN(CAST(cd
{'MIN(CAST(cd', 'cn.FullName,'}
SELECT car_names.Model, cars_data.Accelerate FROM cars_data JOIN car_names ON cars_data.Id = car_names.MakeId ORDER BY cars_data.Accelerate ASC LIMIT 3
SELECT fastest AS c1 FROM cars_data
{'fastest', 'AS', 'c1'}
SELECT CountryName FROM countries cr LEFT JOIN continents co ON cr.Continent = ContId WHERE co.Continent = 'europe'
SELECT CountryName FROM countries WHERE continents.Continent = 'Europe'
{"'Europe'"}
SELECT COUNT(Make) FROM car_names WHERE Make LIKE '%ford%'
SELECT COUNT(Make) FROM car_names WHERE Make LIKE
set()
SELECT Id FROM cars_data WHERE Cylinders = 8
SELECT Id FROM cars_data WHERE Cylinders = 8
set()
SELECT car_names.Model FROM 

{'eval_loss': 0.8281341195106506,
 'eval_bleu': 0.07704865484337504,
 'eval_meteor': 0.4522193475567605,
 'eval_exact_match': 0.07142857142857142,
 'eval_hallucination_rate': 0.21751700680272107,
 'eval_runtime': 0.9115,
 'eval_samples_per_second': 15.36,
 'eval_steps_per_second': 4.389,
 'epoch': 50.0}

In [10]:
# Use the newly fine-tuned model to generate SQL queries
# I am using the same questions as what was provided in the fine-tuning data
# to see if the fine-tuned model can generate the correct SQL queries after the training.
fine_tuned_model_path = './fine-tuned-model'
tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path).to(device)

def generate_sql_fine_tuned(input_prompt):
    """Generate SQL query from natural language input using the fine-tuned model."""
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
# Print the generated SQL query's results and compare them with the expected query's results from training
match_count = 0
for example in training_data:
    input_prompt = example["input"]
    expected_sql = example["target"]
    generated_sql = generate_sql_fine_tuned(input_prompt)
    generated_results = execute_query(generated_sql, padding, verbose=False)
    expected_sql_results = execute_query(expected_sql, padding, verbose=False)
    if generated_results == expected_sql_results:
        match_count += 1
    else:
        print(f"\n{'Generated SQL:':<{padding}}{generated_sql}")
        print(f"{'Expected SQL:':<{padding}}{expected_sql}")

print(f"Total Matches: {match_count} out of {len(training_data)}")

Error executing sql no such table: country

Generated SQL:                SELECT country, COUNT(*) AS c1 FROM car_names JOIN countries ON car_names.Country = countries.CountryId JOIN model_list ON countries.Maker = car_names.Id JOIN country ON countries.Country = countries.CountryId JOIN car_names ON countries.Maker = car_makers.Id JOIN countries ON countries.Country = countries.CountryId WHERE countries.Country LIKE '%Country%'
Expected SQL:                 
SELECT car_names.Model, cars_data.Year, countries.CountryName
FROM cars_data
JOIN car_names ON cars_data.Id = car_names.MakeId
JOIN model_list ON car_names.Model = model_list.Model
JOIN car_makers ON model_list.Maker = car_makers.Id
JOIN countries ON car_makers.Country = countries.CountryId

Error executing sql no such column: country.Country

Generated SQL:                SELECT country.Country, COUNT(*) AS c1 FROM car_names JOIN countries ON car_names.MakeId = countries.CountryId JOIN model_list ON countries.Country = countries.

# Why does it still NOT understand `<` and `>` :(

In [11]:
# ************************************************************** #
# Queries that weren't part of the training data
input_prompt = """### Given the following table definitions:
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY, 
	Model TEXT
)
### Write an SQL query to answer the question:
how many cars did bmw make"""
generated_sql = generate_sql_fine_tuned(input_prompt)
print(f"Original query: {input_prompt}\nGenerated SQL: {generated_sql}")
execute_query(generated_sql, padding)
print("\n" + "*" * 50 + "\n")

input_prompt = """### Given the following table definitions:
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY, 
	Model TEXT
)
CREATE TABLE "cars_data" (
	Id INTEGER PRIMARY KEY,
	Year INTEGER,
	FOREIGN KEY (Id) REFERENCES car_names (MakeId)
)
### Write an SQL query to answer the question:
how many cars with of the BMW model were made just the year 1970"""
generated_sql = generate_sql_fine_tuned(input_prompt)
print(f"Original query: {input_prompt}\nGenerated SQL: {generated_sql}")
execute_query(generated_sql, padding)
print("\n" + "*" * 50 + "\n")

input_prompt = """### Given the following table definitions:
CREATE TABLE "car_names" ( 
	"MakeId" INTEGER PRIMARY KEY,
	"Make" TEXT
)
### Write an SQL query to answer the question:
How many makes of cars have 'civic' in the name?"""
generated_sql = generate_sql_fine_tuned(input_prompt)
print(f"Original query: {input_prompt}\nGenerated SQL: {generated_sql}")
execute_query(generated_sql, padding)
print("\n" + "*" * 50 + "\n")

input_prompt = """### Given the following table definitions:
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY,
	Make TEXT
)
CREATE TABLE "model_list" ( 
	ModelId INTEGER PRIMARY KEY, 
	Maker INTEGER, 
	Model TEXT UNIQUE,
	FOREIGN KEY (Maker) REFERENCES car_makers (Id)
)
CREATE TABLE "car_makers" ( 
	Id INTEGER PRIMARY KEY, 
	Maker TEXT
)
### Write an SQL query to answer the question:
What is the name of the maker that produces the car with the make name 'civic'?"""
generated_sql = generate_sql_fine_tuned(input_prompt)
print(f"Original query: {input_prompt}\nGenerated SQL: {generated_sql}")
execute_query(generated_sql, padding)
print("\n" + "*" * 50 + "\n")


Original query: ### Given the following table definitions:
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY, 
	Model TEXT
)
### Write an SQL query to answer the question:
how many cars did bmw make
Generated SQL: SELECT COUNT(Make) FROM car_names WHERE Model = 'bmw'
SQL query:                    SELECT COUNT(Make) FROM car_names WHERE Model = 'bmw'
Query results:                [(2,)]

**************************************************

Original query: ### Given the following table definitions:
CREATE TABLE "car_names" ( 
	MakeId INTEGER PRIMARY KEY, 
	Model TEXT
)
CREATE TABLE "cars_data" (
	Id INTEGER PRIMARY KEY,
	Year INTEGER,
	FOREIGN KEY (Id) REFERENCES car_names (MakeId)
)
### Write an SQL query to answer the question:
how many cars with of the BMW model were made just the year 1970
Generated SQL: SELECT COUNT(*) FROM cars_data WHERE Year = 1970
SQL query:                    SELECT COUNT(*) FROM cars_data WHERE Year = 1970
Query results:                [(35,)]

**********

The last question I'm asking it to test is so close!

> What is the name of the car maker that produces a make with the name 'civic'?

The query it generates it this: ❌
```SQL
SELECT car_makers.Maker FROM car_names 
JOIN model_list ON car_names.Model = model_list.Model 
JOIN car_makers ON model_list.Maker = car_makers.Id 
WHERE car_makers.Maker LIKE '%civic%'
```

but the query I would expect is this: ✅
```SQL
SELECT car_makers.Maker FROM car_names 
JOIN model_list ON car_names.Model = model_list.Model 
JOIN car_makers ON model_list.Maker = car_makers.Id 
WHERE car_names.make LIKE '%civic%';
```

All it has to do is switch the where clause, I think having such similarly named columns are whats throwing it off there.