In [1]:
import json
import os
# Move the working directory up one level so the relative './data/...' path
# below resolves.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell moves up one more level each time.
os.chdir('./..')

# Parse the JSON dev set; later cells read the 'SQL' field of each entry.
with open('./data/grid_search_subsampled_dev_set.json', 'r') as f:
  dev_set = json.load(f)

In [2]:
# Collect the 'SQL' field of every dev-set entry, then keep the first 100.
candidates = [q['SQL'] for q in dev_set][:100]

In [3]:
from collections import Counter
from typing import List, Tuple

import numpy as np
import torch
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from transformers import AutoModel, AutoTokenizer

def cluster_sql_queries_kmeans(embeddings: np.ndarray, n_clusters: int = 5) -> List[int]:
    """
    Label each embedding with the K-means cluster it falls into.

    Args:
        embeddings: Array of embedding vectors handed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to fit.

    Returns:
        The fitted cluster labels converted to a plain Python list.
    """
    # The seed is fixed at 42, as in every other run of this helper.
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(embeddings)
    return labels.tolist()

def cluster_sql_queries_agglomerative(embeddings: np.ndarray, n_clusters: int = 5) -> List[int]:
    """
    Label each embedding using agglomerative (hierarchical) clustering.

    Args:
        embeddings: Array of embedding vectors handed to
            ``AgglomerativeClustering.fit_predict``.
        n_clusters: Number of clusters to produce.

    Returns:
        The fitted cluster labels converted to a plain Python list.
    """
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(embeddings)
    return labels.tolist()

def cluster_sql_queries_dbscan(embeddings: np.ndarray, eps: float = 10, min_samples: int = 1) -> List[int]:
    """
    Standardize the embeddings with ``StandardScaler`` and label them with DBSCAN.

    Args:
        embeddings: Array of embedding vectors.
        eps: DBSCAN ``eps`` setting; it is applied to the standardized
            features, not to the raw embeddings.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        The DBSCAN labels converted to a plain Python list.
    """
    # Scale first, so the clustering below runs in the standardized space.
    standardized = StandardScaler().fit_transform(embeddings)

    density_model = DBSCAN(eps=eps, min_samples=min_samples)
    return density_model.fit_predict(standardized).tolist()


# Load the pretrained CodeBERT tokenizer and model once at module level;
# embed_sql_queries() below reads both names as globals.
tokenizer = AutoTokenizer.from_pretrained('microsoft/codebert-base')
model = AutoModel.from_pretrained('microsoft/codebert-base')


def embed_sql_queries(queries: List[str]) -> np.ndarray:
    """
    Encode SQL queries with CodeBERT, running one forward pass per query.

    Args:
        queries: SQL strings to embed.

    Returns:
        ``np.array`` built from one vector per query: the last hidden state at
        the first token position, squeezed and converted to NumPy.
    """
    def first_token_vector(sql: str) -> np.ndarray:
        # The tokenizer truncates inputs at max_length=512.
        encoded = tokenizer(sql, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        return hidden[:, 0, :].squeeze().cpu().numpy()

    return np.array([first_token_vector(sql) for sql in queries])


def calculate_semantic_entropy(clusters: List[int], total_candidates: int) -> float:
    """
    Compute the entropy of the cluster-size distribution.

    The value is -sum(p_i * ln(p_i)), where p_i is the number of labels equal
    to cluster i divided by ``total_candidates`` (natural logarithm).

    Args:
        clusters: One cluster label per candidate.
        total_candidates: Denominator used to turn cluster sizes into shares.

    Returns:
        The entropy; 0 when ``clusters`` is empty.
    """
    entropy = 0
    # Only the cluster sizes matter; the label values themselves are ignored.
    for size in Counter(clusters).values():
        share = size / total_candidates
        entropy -= share * np.log(share)

    return entropy


def process_sql_queries_for_entropy(queries: List[str]) -> Tuple[float, float, float]:
    """
    Embed SQL queries, cluster them three ways and compute each clustering's entropy.

    Steps:
    1. Embed the queries using CodeBERT (``embed_sql_queries``).
    2. Cluster the embeddings with DBSCAN, K-means and agglomerative
       clustering, each called with its default parameters.
    3. Calculate the semantic entropy of every clustering.

    The embeddings, the three label lists and the three entropies are printed
    along the way.

    NOTE(review): the K-means and agglomerative helpers default to 5 clusters;
    presumably very short query lists (fewer than 5 entries) are rejected by
    scikit-learn — confirm before calling with small inputs.

    Args:
        queries: SQL strings to analyse.

    Returns:
        ``(entropy_db, entropy_agg, entropy_kmeans)``. Mind the order: the
        agglomerative value is second and the K-means value is last, which
        differs from the DB / KM / AGG order of the printed lines.
    """
    # Step 1: Embed SQL queries
    embeddings = embed_sql_queries(queries)
    print(embeddings)

    # Step 2: Cluster the same embeddings with all three algorithms
    clusters_db = cluster_sql_queries_dbscan(embeddings)
    clusters_kmeans = cluster_sql_queries_kmeans(embeddings)
    clusters_agg = cluster_sql_queries_agglomerative(embeddings)
    print(f"DB CLUSTERS: {clusters_db}")
    print(f"KM CLUSTERS: {clusters_kmeans}")
    print(f"AGG CLUSTERS: {clusters_agg}")

    # Step 3: Calculate semantic entropy, normalising by the number of queries
    total = len(queries)
    entropy_db = calculate_semantic_entropy(clusters_db, total)
    entropy_kmeans = calculate_semantic_entropy(clusters_kmeans, total)
    entropy_agg = calculate_semantic_entropy(clusters_agg, total)
    print(f"DB ENT: {entropy_db}")
    print(f"KM ENT: {entropy_kmeans}")
    print(f"AGG ENT: {entropy_agg}")

    return entropy_db, entropy_agg, entropy_kmeans


# Example usage: 20 candidate queries, 15 of which are the identical HR lookup.
queries = ["SELECT * FROM employees WHERE department = 'HR';"] * 15 + [
    # Multi-line variant; each line break (and the space before it) is part of the string.
    (
        "SELECT cost \n"
        "FROM employees \n"
        "WHERE department = 'HR' \n"
        "ORDER BY cost DESC \n"
        "LIMIT 1;\n"
    ),
    "SELECT max(cost) FROM employees WHERE department = 'HR';",
    "SELECT * FROM employees WHERE department = 'ISG';",
    "SELECT * FROM employees WHERE department = 'ISG';",
    "SELECT * FROM employees WHERE department = 'CFS AI COE';",
]

# The three values come back in the order (DBSCAN, agglomerative, K-means).
_1, _2, _3 = process_sql_queries_for_entropy(queries)
print(f"Semantic Entropy: {_1} {_2} {_3}")


  from .autonotebook import tqdm as notebook_tqdm


[[-0.16112804  0.4182117  -0.01404823 ... -0.29508042 -0.4093419
   0.42982963]
 [-0.16112804  0.4182117  -0.01404823 ... -0.29508042 -0.4093419
   0.42982963]
 [-0.16112804  0.4182117  -0.01404823 ... -0.29508042 -0.4093419
   0.42982963]
 ...
 [-0.16436335  0.4135335  -0.00980083 ... -0.30677006 -0.39898145
   0.42767558]
 [-0.16436335  0.4135335  -0.00980083 ... -0.30677006 -0.39898145
   0.42767558]
 [-0.15903974  0.38337898  0.03022209 ... -0.28304553 -0.4233109
   0.40668446]]
DB CLUSTERS: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 3, 4]
KM CLUSTERS: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 4, 3]
AGG CLUSTERS: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 0, 0, 2]
DB ENT: 0.8953799046713389
KM ENT: 0.8953799046713389
AGG ENT: 0.8953799046713389
Semantic Entropy: 0.8953799046713389 0.8953799046713389 0.8953799046713389


In [1]:
import re

def preprocess_table(headers, rows):
    """
    Serialize a table into one normalized string per line for the embedding model.

    The first returned string is the header line, prefixed with ``<thead>``;
    every following string is one row. Cells within a line are separated by a
    single space. Each cell (header or value) is normalized as follows:

    1. Non-string values (ints, floats, ``None``, ...) are converted with ``str()``.
    2. Spaces are replaced with underscores.
    3. Every digit is replaced with ``@`` (so ``"30"`` becomes ``"@@"``).
    4. Every remaining character that is not a word character, ``@`` or ``_``
       is replaced with ``*``.

    Args:
        headers (list): List of header values.
        rows (list of tuples): List of rows, each an iterable of cell values.

    Returns:
        list: ``["<thead> <headers>", "<row 1>", "<row 2>", ...]``.
    """
    def preprocess_cell(cell):
        # Cells are not always strings (e.g. numbers or None); coerce them so
        # the string operations below do not raise.
        if not isinstance(cell, str):
            cell = str(cell)
        # Replace spaces with underscores so a cell stays a single token
        cell = cell.replace(" ", "_")
        # Replace each digit with '@'
        cell = re.sub(r'\d', '@', cell)
        # Replace special characters (excluding '@' and underscores) with '*'
        cell = re.sub(r'[^\w@_]', '*', cell)
        return cell

    # Preprocess headers
    serialized_headers = "<thead> " + " ".join(preprocess_cell(header) for header in headers)

    # Preprocess rows
    preprocessed_rows = [
        " ".join(preprocess_cell(cell) for cell in row)
        for row in rows
    ]

    # Combine headers and rows
    return [serialized_headers] + preprocessed_rows


In [12]:
# Tabular embeddings: serialize a toy table and embed it with the web-table model.

# Preprocess the data
# `header` is a raw string that is NOT passed through preprocess_table().
header = 'Departments'
rows = [("Alice", "30", "HR"), ("Bob", "40", "Finance")]
headers = ["Name", "Age", "Department"]
preprocessed = preprocess_table(headers, rows)
print(preprocessed)

# NOTE(review): this rebinds the global name `model`; any cell that used
# `model` for something else must be re-run before it is used again.
from src.table_embed.embedding.fasttext_web_table_embeddings import FastTextWebTableModel as TableEmbeddingModel
print('downloading model...')
model = TableEmbeddingModel.load_model('ddrg/web_table_embeddings_plain64')

print('getting embedding...')
# embedding = model.get_header_vector(preprocessed[0])
header_vec = model.get_header_vector(header)
# NOTE(review): preprocessed[0] is the serialized header line, so the data and
# plain vectors below are computed from the header, not from a data row —
# confirm this is intended.
data_vec = model.get_data_vector(preprocessed[0])
plain_vec = model.get_plain_vector(preprocessed[0])

['<thead> Name Age Department', 'Alice @@ HR', 'Bob @@ Finance']
downloading model...
getting embedding...


In [15]:
# Rebuild the list of the first 100 'SQL' strings from the dev set.
candidates = [q['SQL'] for q in dev_set][:100]


# FIXME: this cell raises TypeError — preprocess_table() is called with no
# arguments although it requires `headers` and `rows`.
# NOTE(review): the result is bound to `proprocessed` while the next line reads
# `preprocessed`, and the loop iterates `queries` rather than the `candidates`
# built above — confirm which names were intended.
proprocessed = [preprocess_table() for q in queries]
embeddings = [model.get_data_vector(p[0]) for p in preprocessed]

TypeError: preprocess_table() missing 1 required positional argument: 'rows'

In [1]:
import sqlite3
import os
os.chdir('./..')

import json
import re

def preprocess_table(headers, rows):
    """
    Turn a table into a list of normalized, space-joined strings.

    The first element is the header line (the normalized headers joined by
    single spaces, with no prefix); each subsequent element is one row,
    serialized the same way.

    Normalization applied to every header and cell value:
    1. Non-string values are converted with ``str()``.
    2. Spaces become underscores.
    3. Each decimal (``12.5``) or integer (``12``) run collapses to a single ``@``.
    4. Any remaining character other than a word character, ``@`` or ``_``
       becomes ``*``.

    Args:
        headers (list): Column names.
        rows (list of tuples): Row values; each row is an iterable of cells.

    Returns:
        list: ``[header_line, row_line_1, row_line_2, ...]``.
    """
    def normalize(value):
        text = value if isinstance(value, str) else str(value)
        text = text.replace(" ", "_")
        # Decimals are tried before plain integers so "12.5" yields one '@'.
        text = re.sub(r'\d+\.\d+|\d+', '@', text)
        return re.sub(r'[^\w@_]', '*', text)

    def serialize(cells):
        return " ".join(normalize(cell) for cell in cells)

    # Header line first, then one line per row.
    header_line = serialize(headers)
    row_lines = [serialize(row) for row in rows]
    return [header_line] + row_lines

with open('./data/grid_search_subsampled_dev_set.json', 'r') as f:
    dev_set = json.load(f)

# (SQL text, database id) pairs for the first 100 dev-set entries
candidates = [(q['SQL'], q['db_id']) for q in dev_set][:100]

# The table-embedding model is intentionally not loaded in this cell:
# from src.table_embed.embedding.fasttext_web_table_embeddings import FastTextWebTableModel as TableEmbeddingModel
# print('downloading model...')
# model = TableEmbeddingModel.load_model('ddrg/web_table_embeddings_plain64')

for candidate in candidates:
    # Database file path: <DB_ROOT_DIRECTORY>/<db_id>/<db_id>.sqlite
    db_path = f"{os.environ['DB_ROOT_DIRECTORY']}/{candidate[1]}/{candidate[1]}.sqlite"

    # SQL text to execute against that database
    query = candidate[0]

    # Reset on every iteration: if sqlite3.connect() itself fails, the
    # `finally` clause must not hit an unbound name (first iteration) or the
    # previous iteration's connection.
    conn = None
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Execute the query; the result column names come from cursor.description
        cursor.execute(query)
        column_names = [description[0] for description in cursor.description]

        # Fetch all results
        results = cursor.fetchall()
        rows = results
        headers = column_names

        # Serialize the result set: header line first, then one string per row
        preprocessed = preprocess_table(headers=headers, rows=rows)

        print(preprocessed)
        # TODO: embed the header and each row string in preprocessed[1:]
        # (model.get_header_vector / model.get_data_vector) once the model is loaded.

    except sqlite3.Error as e:
        # Report the failure and continue with the next candidate
        print(f"An error occurred: {e}")
    finally:
        # Close the connection only if it was actually opened
        if conn is not None:
            conn.close()

downloading model...


  from .autonotebook import tqdm as notebook_tqdm


['District_Code', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@', '@']
['County COUNT*Virtual*', 'San_Diego @']
['School', 'Millikan_High', 'Polytechnic_High', 'Troy_High']
['COUNT*School*', '@']
['CAST*T@**FRPM_Count_*K*@**_AS_REAL*_*_T@**Enrollment_*K*@**', '@', '@', '@', '@', '@']
['MailStreet 

In [4]:
# Sanity check: a whole run of digits is replaced by a single '@'.
re.sub(r'\d+', '@', str(30))

'@'

In [6]:
# Keep the first element (the header line) of whatever table `preprocessed` currently holds.
header = preprocessed[0]

'team_long_name'

In [None]:
# Single-column example table: one header tuple and three one-value rows.
# NOTE(review): `header` here is a list of tuples; confirm no other cell
# expects the same name to hold a plain string.
header = [('School',)]
data = [
    ('Millikan_High',),
    ('Polytechnic_High',),
    ('Troy_High',)
]

# Generate HTML content: open the table and write the header cell.
# header[0][0] is the only column name. Values are inserted as-is, without
# HTML escaping.
html_content = """
<body>
    <table>
        <thead>
            <tr>
                <th>{}</th>
            </tr>
        </thead>
        <tbody>
""".format(header[0][0])

# Add rows for the data: one <tr> with a single <td> per row (row[0] is the only value).
for row in data:
    html_content += f"""
            <tr>
                <td>{row[0]}</td>
            </tr>
    """

# Close the HTML table and body
html_content += """
        </tbody>
    </table>
</body>
"""

In [4]:
# Show the assembled HTML snippet as plain text.
print(html_content)


<body>
    <table>
        <thead>
            <tr>
                <th>School</th>
            </tr>
        </thead>
        <tbody>

            <tr>
                <td>Millikan_High</td>
            </tr>
    
            <tr>
                <td>Polytechnic_High</td>
            </tr>
    
            <tr>
                <td>Troy_High</td>
            </tr>
    
        </tbody>
    </table>
</body>

