# EDA YOUNG

In [27]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------ --------------------- 0.8/1.7 MB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 9.4 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.18.0


In [28]:
import pandas as pd
import sqlite3
from transformers import pipeline, AutoTokenizer
import evaluate
import os
import json
import matplotlib.pyplot as plt
import seaborn
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from torch.utils.data import Dataset, DataLoader




Examples of queries that show the need for fine-tuning:
- Query: which player has the longest name.
     - Output: SELECT player_name FROM Player ORDER BY height DESC LIMIT 1

- Query: how many players are over 6 feet tall
     - Output: SELECT COUNT(*) FROM Player WHERE height > 6

In [78]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

def execute_query(query, print_padding):
    """Run ``query`` against the local SQLite database and print the outcome.

    Args:
        query: SQL text to execute (here, the SQL produced by the model).
        print_padding: Width used to left-align the "Query results:" label.

    Returns:
        None. The fetched rows, or an error message, are printed to stdout.
        sqlite3 errors and warnings are reported instead of being raised.
    """
    # Stays None when connect() itself fails, so the cleanup below does not
    # trip over an unbound name and hide the real error.
    conn = None
    try:
        conn = sqlite3.connect("cyoung_eda.db")
        cursor = conn.cursor()
        cursor.execute(query)

        print(f"{'Query results:':<{print_padding}}{cursor.fetchall()}")
    except (sqlite3.Error, sqlite3.Warning) as exc:
        # sqlite3.Warning is not a subclass of sqlite3.Error, so it is listed
        # explicitly; the detail is printed so a bad query can be diagnosed.
        print(f"Error executing sql: {exc}")
    finally:
        if conn is not None:
            conn.close()

# Pick the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer is loaded from the base 't5-small' checkpoint, while the
# weights are loaded from the text-to-SQL checkpoint and moved to `device`.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('cssupport/t5-small-awesome-text-to-sql')
model = model.to(device)

# Fine-tuning is not performed in this cell; reference guide for fine-tuning T5:
# https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887

# The model is only used for generation below, so switch it to eval mode.
model.eval()

def generate_sql(input_prompt):
    """Turn a schema-plus-question prompt into SQL text with the loaded model.

    Args:
        input_prompt: Prompt string passed to the module-level ``tokenizer``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens stripped.
    """
    encoded = tokenizer(
        input_prompt,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # No gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**encoded, max_length=512)

    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Width used to left-align the labels in the printed report.
padding = 30


def _player_prompt(columns, question):
    """Build the model prompt: a Player CREATE TABLE listing ``columns``, then the question."""
    column_lines = ",\n".join(f"    {column}" for column in columns)
    return f"tables:\nCREATE TABLE Player (\n{column_lines}\n)\nquery for: {question}"


# Each section is (heading, [(schema columns shown to the model, question), ...]).
# Only the columns relevant to a question are included in its prompt.
example_sections = [
    (
        "Good examples:\n",
        [
            (
                (
                    "id INTEGER PRIMARY KEY AUTOINCREMENT",
                    "player_api_id INTEGER UNIQUE",
                    "player_name TEXT",
                    "player_fifa_api_id INTEGER UNIQUE",
                    "birthday TEXT",
                    "height INTEGER",
                    "weight INTEGER",
                ),
                "How many players are there?",
            ),
            (("player_name TEXT", "weight INTEGER"), "Who is the heaviest player?"),
            (("birthday TEXT",), "How old is the oldest player?"),
        ],
    ),
    (
        "\nExamples showing need for fine-tuning:",
        [
            (("player_name TEXT",), "Who has the shortest name?"),
            (("birthday TEXT",), "Does anybody have a birthday on January 1st?"),
            (("player_name TEXT", "birthday TEXT"), "Who has the first birthday of the year?"),
        ],
    ),
]

# The loop runs at module level on purpose: natural_language_query,
# input_prompt and generated_sql remain defined (holding the last example)
# exactly as they did when each example was written out by hand.
is_first_example = True
for heading, examples in example_sections:
    print(heading)
    for columns, natural_language_query in examples:
        input_prompt = _player_prompt(columns, natural_language_query)
        generated_sql = generate_sql(input_prompt)

        # Every report after the very first is separated by a blank line.
        lead = "" if is_first_example else "\n"
        print(f"{lead}{'Original query:':<{padding}}{natural_language_query}\n{'Generated SQL:':<{padding}}{generated_sql}")
        execute_query(generated_sql, padding)
        is_first_example = False



Good examples:

Original query:               How many players are there?
Generated SQL:                SELECT COUNT(*) FROM Player
Query results:                [(11060,)]

Original query:               Who is the heaviest player?
Generated SQL:                SELECT player_name FROM Player WHERE weight = (SELECT MAX(weight) FROM Player)
Query results:                [('Kristof van Hout',), ('Tim Wiese',)]

Original query:               How old is the oldest player?
Generated SQL:                SELECT MIN(birthday) FROM Player
Query results:                [('1967-01-23 00:00:00',)]

Examples showing need for fine-tuning:

Original query:               Who has the shortest name?
Generated SQL:                SELECT player_name FROM Player ORDER BY player_name LIMIT 1
Query results:                [('Aaron Appindangoye',)]

Original query:               Does anybody have a birthday on January 1st?
Generated SQL:                SELECT DISTINCT birthday FROM Player WHERE birthday = "Jan