In [22]:
import json
import re
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
# read data from .json file

with open("plain_statement.json") as f:
    json_data = json.load(f)

plain_sql = [item['sql'] for item in json_data]
plain_sql = [sql.lower() for sql in plain_sql]

In [24]:
# split data into tokens

pattern = r'[\s()\-,:;]'
string_literal_pattern = r"'([^']*)'"
placeholder = "<string>"

# replace content inside single quotes by <string>
plain_sql_ph = [re.sub(string_literal_pattern, placeholder, sql) for sql in plain_sql]

# split the statements with placeholder
tokenized_sql = [re.split(pattern, sql) for sql in plain_sql_ph]

# remove empty tokens
tokenized_sql = [token for token in tokenized_sql if token]

# replace numbers by placeholder
for sql in tokenized_sql:
    for i, token in enumerate(sql):
        # if re.match(r'^[\'\"].*[\'\"]$', token):  # Check if token is a string literal
        #     sql[i] = '<string>'
        if re.match(r'^[0-9]+(\.[0-9]+)?$', token):  # Check if token is a number
            sql[i] = '<number>'

# remove empty tokens
for i, sql in enumerate(tokenized_sql):
    tokenized_sql[i] = [token for token in tokenized_sql[i] if token]

In [25]:
# build the vocab
vocab_set = set()
for sql in tokenized_sql:
    vocab_set.update(sql)

vocab_dict = {word: idx for idx, word in enumerate(vocab_set)}

In [26]:
# get the runtimes
runtime = [item['runtime_ms'] for item in json_data]
runtime = np.array(runtime)

# classify the runtimes, label 0 for runtime <=3000ms, 1 for runtime >3000ms
label = np.where(runtime > 3000, 1, 0)

sklearn CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

vectorizer = CountVectorizer(vocabulary=vocab_set)
X = vectorizer.fit_transform(plain_sql_ph)

In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2)

# Initialize the random forest classifier
clf = xgb.XGBClassifier()

# Train the random forest classifier
clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7291861552853134


PyTorch Embedding

In [29]:
# define embedding layer
vocab_size = len(vocab_dict)
embedding_dim = 10
embedding = nn.Embedding(vocab_size, embedding_dim)

In [30]:
# convert tokens to indices for each sample
indices = [torch.LongTensor([vocab_dict[token] for token in sql]) for sql in tokenized_sql]

X_torch = []

for index in indices:
    emb = embedding(index)
    sum = torch.sum(emb, dim=0)
    X_torch.append(sum.tolist())

X_torch = np.array(X_torch)

In [31]:
# Split the data into training and testing sets
X_train_torch, X_test_torch, y_train_torch, y_test_torch = train_test_split(X_torch, label, test_size=0.2)

# Initialize the random forest classifier
clf_torch = xgb.XGBClassifier()

# Train the random forest classifier
clf_torch.fit(X_train_torch, y_train_torch)

# Predict the labels for the test set
y_pred_torch = clf_torch.predict(X_test_torch)

# Evaluate the accuracy of the model
accuracy_torch = accuracy_score(y_test_torch, y_pred_torch)
print("Accuracy:", accuracy_torch)

Accuracy: 0.7492984097287184


In [34]:
# Fine tune on embedding dimension
vocab_size = len(vocab_dict)
dim_candidate = [10, 30, 50, 70, 90] 
for embedding_dim in dim_candidate:
    embedding = nn.Embedding(vocab_size, embedding_dim)
    indices = [torch.LongTensor([vocab_dict[token] for token in sql]) for sql in tokenized_sql]

    X_torch = []

    for index in indices:
        emb = embedding(index)
        sum = torch.sum(emb, dim=0)
        X_torch.append(sum.tolist())
    X_torch = np.array(X_torch)
    # Split the data into training and testing sets
    X_train_torch, X_test_torch, y_train_torch, y_test_torch = train_test_split(X_torch, label, test_size=0.2)

    # Initialize the XGboost classifier
    clf_torch = xgb.XGBClassifier()

    # Train the XGboost classifier
    clf_torch.fit(X_train_torch, y_train_torch)

    # Predict the labels for the test set
    y_pred_torch = clf_torch.predict(X_test_torch)

    # Evaluate the accuracy of the model
    accuracy_torch = accuracy_score(y_test_torch, y_pred_torch)
    print("Accuracy at embedding_dim",embedding_dim, ": ", accuracy_torch)

Accuracy at embedding_dim 10 :  0.7464920486435921
Accuracy at embedding_dim 30 :  0.7759588400374181
Accuracy at embedding_dim 50 :  0.8063610851262862
Accuracy at embedding_dim 70 :  0.8129092609915809
Accuracy at embedding_dim 90 :  0.8208606173994387
