In [1]:
import json
import re
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
# Random seed passed as `random_state` to the train/test splits and the
# random forest inside run_exp, so repeated runs use the same split.
seed = 42

In [28]:

def tokenize(path, slow_threshold_ms=3000):
  """Load SQL statements from a JSON file and derive text, vocabulary and labels.

  The file is expected to hold a JSON list of objects, each with an 'sql'
  string and a numeric 'runtime_ms' entry.

  Args:
    path: Path of the JSON file to read.
    slow_threshold_ms: Runtime (in ms) above which a statement is labelled 1.
      Defaults to 3000, the threshold used throughout this notebook.

  Returns:
    A tuple (vocab_set, plain_sql_ph, label):
      vocab_set: set of distinct tokens over all statements, where numeric
        tokens are collapsed to '<number>' and quoted literals to '<string>'.
      plain_sql_ph: list of lower-cased statements with single-quoted
        literals replaced by '<string>'. Numbers are NOT replaced in these
        strings; the '<number>' placeholder only exists in the vocabulary.
      label: numpy array with 1 where runtime_ms > slow_threshold_ms, else 0.
  """
  # SQL dumps are JSON, which is UTF-8 by specification; do not rely on the
  # platform default encoding.
  with open(path, encoding="utf-8") as f:
    json_data = json.load(f)

  delimiter_pattern = re.compile(r'[\s()\-,:;]')
  string_literal_pattern = re.compile(r"'([^']*)'")
  number_pattern = re.compile(r'^[0-9]+(\.[0-9]+)?$')
  placeholder = "<string>"

  # lower-case every statement, then replace content inside single quotes
  # by <string>
  plain_sql_ph = [
      string_literal_pattern.sub(placeholder, item['sql'].lower())
      for item in json_data
  ]

  # build the vocab: split on delimiters, drop the empty strings re.split
  # leaves between adjacent delimiters, and collapse numbers to <number>
  vocab_set = set()
  for sql in plain_sql_ph:
    for token in delimiter_pattern.split(sql):
      if not token:
        continue
      vocab_set.add('<number>' if number_pattern.match(token) else token)

  # classify the runtimes: label 0 for runtime <= threshold, 1 otherwise
  runtime = np.array([item['runtime_ms'] for item in json_data])
  label = np.where(runtime > slow_threshold_ms, 1, 0)

  return vocab_set, plain_sql_ph, label

In [29]:
# Prepare both corpora: each call returns the token vocabulary, the
# placeholder-substituted statements and the 0/1 runtime labels for one file.
vocab_set_15k, plain_sql_ph_15k, label_15k = tokenize("../datasets/plain_text/plain_statement.json")
vocab_set_5k, plain_sql_ph_5k, label_5k = tokenize("../datasets/plain_text/plain_statement_5000.json")

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words matrices. The previous version passed an undefined name
# (`vocab_set`) as the vocabulary, which only worked with leftover kernel
# state; each corpus now uses the vocabulary tokenize() returned for it.
# The vocabulary is sorted so the column order is the same on every run.
# NOTE(review): CountVectorizer's default token pattern only yields
# alphanumeric tokens of 2+ characters, so vocabulary entries such as
# '<string>', '<number>' or '=' presumably never get a non-zero count --
# confirm whether a custom tokenizer is wanted here.
vectorizer_15k = CountVectorizer(vocabulary=sorted(vocab_set_15k))
cv_mat_15k = vectorizer_15k.fit_transform(plain_sql_ph_15k)

vectorizer_5k = CountVectorizer(vocabulary=sorted(vocab_set_5k))
cv_mat_5k = vectorizer_5k.fit_transform(plain_sql_ph_5k)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# fit_transform re-learns vocabulary and IDF weights on every call, so the
# two matrices below are fitted independently of each other. Both are
# converted to dense arrays right away.
tfidf_mat_15k = tfidf_vectorizer.fit_transform(plain_sql_ph_15k).toarray()
tfidf_mat_5k = tfidf_vectorizer.fit_transform(plain_sql_ph_5k).toarray()
# tfidf_mat.shape

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score

In [46]:
def _fit_and_score(model, X_train, X_test, y_train, y_test):
  """Fit `model` on the training split and score it on the test split.

  Returns:
    A 5-tuple (accuracy, precision for label 1, recall for label 1,
    precision for label 0, recall for label 0).
  """
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  return (
      accuracy_score(y_test, y_pred),
      precision_score(y_test, y_pred, pos_label=1),
      recall_score(y_test, y_pred, pos_label=1),
      precision_score(y_test, y_pred, pos_label=0),
      recall_score(y_test, y_pred, pos_label=0),
  )


def run_exp(cv_mat, tfidf_mat, label):
  """Train and evaluate the classifiers on BoW and TF-IDF features.

  Args:
    cv_mat: Bag-of-words feature matrix, one row per statement.
    tfidf_mat: TF-IDF feature matrix with the same number of rows.
    label: 0/1 labels, one per row.

  Returns:
    A tuple (result_cv, result_tfidf) of dicts mapping a model name to the
    5-tuple produced by _fit_and_score. result_cv has the keys
    "LR", "XGB", "RF" and "SVM"; result_tfidf has "LR", "XGB" and "RF".
  """
  # One call splits all three inputs with the same shuffled indices, so the
  # BoW and TF-IDF experiments see exactly the same train/test rows.
  (X_train_cv, X_test_cv,
   X_train_tfidf, X_test_tfidf,
   y_train, y_test) = train_test_split(
       cv_mat, tfidf_mat, label, test_size=0.2, random_state=seed)

  # Factories rather than instances: every experiment gets a fresh,
  # unfitted estimator instead of re-fitting one shared object.
  model_factories = {
      "LR": lambda: LogisticRegression(max_iter=1000),
      "XGB": lambda: xgb.XGBClassifier(),
      "RF": lambda: RandomForestClassifier(n_estimators=100, random_state=seed),
  }

  result_cv = dict()
  result_tfidf = dict()

  for key, make_model in model_factories.items():
    result_cv[key] = _fit_and_score(
        make_model(), X_train_cv, X_test_cv, y_train, y_test)
    print(f"Completed training of model {key} for BoW")
    result_tfidf[key] = _fit_and_score(
        make_model(), X_train_tfidf, X_test_tfidf, y_train, y_test)
    print(f"Completed training of model {key} for TF-IDF")

  # The SVM is evaluated on the BoW features only.
  # NOTE(review): presumably skipped for TF-IDF because of training cost on
  # the dense matrix -- confirm this omission is intentional.
  result_cv["SVM"] = _fit_and_score(
      SVC(), X_train_cv, X_test_cv, y_train, y_test)
  print("Completed training of model SVM for BoW")

  return result_cv, result_tfidf

In [47]:
# Run the full model comparison once per corpus; each call returns the
# BoW results dict and the TF-IDF results dict.
r_cv_5k, r_tfidf_5k = run_exp(cv_mat_5k, tfidf_mat_5k, label_5k)
r_cv_15k, r_tfidf_15k = run_exp(cv_mat_15k, tfidf_mat_15k, label_15k)

Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model SVM for BoW
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model SVM for BoW


In [48]:
# Report the BoW metrics for both corpora, one model per line.
for heading, results in (("Results of BoW - 5k", r_cv_5k),
                         ("Results of BoW - 15k", r_cv_15k)):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Results of BoW - 5k
LR: (0.711, 0.5682539682539682, 0.5391566265060241, 0.7766423357664234, 0.7964071856287425)
XGB: (0.71, 0.5734265734265734, 0.4939759036144578, 0.7647058823529411, 0.8173652694610778)
RF: (0.692, 0.54, 0.4879518072289157, 0.7571428571428571, 0.7934131736526946)
SVM: (0.731, 0.64, 0.43373493975903615, 0.7574193548387097, 0.8787425149700598)
Results of BoW - 15k
LR: (0.7146866230121609, 0.5768463073852296, 0.42066957787481807, 0.7568723274282224, 0.853893866299104)
XGB: (0.7240411599625819, 0.5987780040733197, 0.4279475982532751, 0.761384335154827, 0.8642315644383184)
RF: (0.7226379794200187, 0.5967078189300411, 0.42212518195050946, 0.7596852300242131, 0.864920744314266)
SVM: (0.7188961646398503, 0.6452702702702703, 0.2780203784570597, 0.7307274701411509, 0.9276361130254996)


In [49]:
# Report the TF-IDF metrics for both corpora, one model per line.
for heading, results in (("Results of TF-IDF - 5k", r_tfidf_5k),
                         ("Results of TF-IDF - 15k", r_tfidf_15k)):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Results of TF-IDF - 5k
LR: (0.796, 0.6927710843373494, 0.6927710843373494, 0.8473053892215568, 0.8473053892215568)
XGB: (0.786, 0.6766467065868264, 0.6807228915662651, 0.8408408408408409, 0.8383233532934131)
RF: (0.794, 0.688622754491018, 0.6927710843373494, 0.8468468468468469, 0.844311377245509)
Results of TF-IDF - 15k
LR: (0.8016838166510758, 0.7138211382113822, 0.6390101892285298, 0.8371634931057124, 0.8787043418332184)
XGB: (0.7792329279700655, 0.6616541353383458, 0.6404657933042213, 0.8323150033944331, 0.844934527911785)
RF: (0.7834424695977549, 0.6812297734627831, 0.6128093158660844, 0.825, 0.8642315644383184)


In [None]:
### PyTorch Embedding

# define embedding layer

# vocab_size = len(vocab_dict)
# embedding_dim = 10
# embedding = nn.Embedding(vocab_size, embedding_dim)

In [None]:
# convert tokens to indices for each sample
# indices = [torch.LongTensor([vocab_dict[token] for token in sql]) for sql in tokenized_sql]

# X_torch = []

# for index in indices:
#     emb = embedding(index)
#     sum = torch.sum(emb, dim=0)
#     X_torch.append(sum.tolist())

# X_torch = np.array(X_torch)