In [17]:
import json
import re
import torch
import torch.nn as nn
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
# read data from .json file

with open("plain_statement.json") as f:
    json_data = json.load(f)

plain_sql = [item['sql'] for item in json_data]
plain_sql = [sql.lower() for sql in plain_sql]

In [19]:
# split data into tokens

pattern = r'[\s()\-,:;]'
string_literal_pattern = r"'([^']*)'"
placeholder = "<string>"

# replace content inside single quotes by <string>
plain_sql_ph = [re.sub(string_literal_pattern, placeholder, sql) for sql in plain_sql]

# split the statements with placeholder
tokenized_sql = [re.split(pattern, sql) for sql in plain_sql_ph]

# remove empty tokens
tokenized_sql = [token for token in tokenized_sql if token]

# replace numbers by placeholder
for sql in tokenized_sql:
    for i, token in enumerate(sql):
        # if re.match(r'^[\'\"].*[\'\"]$', token):  # Check if token is a string literal
        #     sql[i] = '<string>'
        if re.match(r'^[0-9]+(\.[0-9]+)?$', token):  # Check if token is a number
            sql[i] = '<number>'

# remove empty tokens
for i, sql in enumerate(tokenized_sql):
    tokenized_sql[i] = [token for token in tokenized_sql[i] if token]

In [20]:
# build the vocab
vocab_set = set()
for sql in tokenized_sql:
    vocab_set.update(sql)

vocab_dict = {word: idx for idx, word in enumerate(vocab_set)}

In [21]:
# get the runtimes
runtime = [item['runtime_ms'] for item in json_data]
runtime = np.array(runtime)

# classify the runtimes, label 0 for runtime <=3000ms, 1 for runtime >3000ms
label = np.where(runtime > 3000, 1, 0)

sklearn CountVectorizer

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 3: Count Vectorization
vectorizer = CountVectorizer(vocabulary=vocab_set)
X = vectorizer.fit_transform(plain_sql_ph)

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2)

# Initialize the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the random forest classifier
rf_classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7048643592142189


PyTorch Embedding

In [24]:
# define embedding layer
vocab_size = len(vocab_dict)
embedding_dim = 10
embedding = nn.Embedding(vocab_size, embedding_dim)

In [25]:
# convert tokens to indices for each sample
indices = [torch.LongTensor([vocab_dict[token] for token in sql]) for sql in tokenized_sql]

X = []

for index in indices:
    emb = embedding(index)
    sum = torch.sum(emb, dim=0)
    X.append(sum.tolist())

X = np.array(X)

In [26]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=0.2)

# Initialize the random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the random forest classifier
rf_classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7675397567820393
