In [1]:
import json
import re
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shared random seed: passed as random_state to train_test_split and
# RandomForestClassifier in the experiment functions below.
seed = 42

In [3]:

def tokenize(path, threshold_ms=3000):
  """Load SQL statements from a JSON file and normalize them into tokens.

  The file must hold a JSON list of objects, each providing an ``'sql'``
  string and a numeric ``'runtime_ms'``.  Every statement is lower-cased,
  single-quoted literals are collapsed to ``<string>``, and the text is split
  on whitespace and the characters ``( ) - , : ;``.  Empty tokens produced by
  adjacent delimiters are dropped; statements themselves are never dropped,
  so the outputs stay aligned with the input order.

  Args:
    path: Location of the JSON file (read as UTF-8).
    threshold_ms: Runtime boundary in milliseconds. Statements strictly
      slower than this receive label 1, all others label 0.

  Returns:
    A tuple ``(vocab_set, res, label)`` where ``vocab_set`` is the set of all
    distinct tokens, ``res`` is a list with one space-joined token string per
    statement, and ``label`` is a NumPy array of 0/1 labels aligned with
    ``res``.
  """
  # JSON is UTF-8 by specification; do not depend on the platform default.
  with open(path, encoding='utf-8') as f:
    json_data = json.load(f)

  pattern = r'[\s()\-,:;]'
  string_literal_pattern = r"'([^']*)'"
  placeholder = "<string>"

  tokenized_sql = []
  for item in json_data:
    # Replace quoted literals first so their content cannot be split apart.
    sql = re.sub(string_literal_pattern, placeholder, item['sql'].lower())
    # Filter empty *tokens* per statement; the statement list keeps its length.
    tokenized_sql.append([token for token in re.split(pattern, sql) if token])

  # build the vocab
  vocab_set = set()
  for tokens in tokenized_sql:
    vocab_set.update(tokens)

  # classify the runtimes: 0 for runtime <= threshold_ms, 1 for runtime > threshold_ms
  runtime = np.array([item['runtime_ms'] for item in json_data])
  label = np.where(runtime > threshold_ms, 1, 0)

  # Tokens are already lower-case, so a plain join is enough.
  res = [" ".join(tokens) for tokens in tokenized_sql]

  return vocab_set, res, label

In [4]:
# Plain word -> vector lookup parsed from the GloVe text file.
embedding_dict_w2v = {}
with open("glove.6B.300d.txt", 'r', encoding='utf-8') as glove_fh:
    for entry in glove_fh:
        fields = entry.split()
        # First field is the token, the remaining fields are its coefficients.
        embedding_dict_w2v[fields[0]] = np.asarray(fields[1:], "float32")

In [5]:
import gensim
import numpy as np

# Relative path to the unzipped GloVe text file; it is handed to
# load_glove_model() further down in this cell.
glove_file = 'glove.6B.300d.txt'  # Modify the path and file name as necessary

# Function to load GloVe vectors into a Gensim KeyedVectors object
def load_glove_model(glove_file):
    """Read a GloVe text file and return its vectors as a Gensim KeyedVectors.

    Each non-blank line is parsed as ``word v1 v2 ... vN`` separated by
    whitespace. Blank lines are skipped.

    Args:
        glove_file: Path to the GloVe text file (read as UTF-8).

    Returns:
        A ``gensim.models.KeyedVectors`` holding every parsed word vector.

    Raises:
        ValueError: If the file contains no word vectors at all.
    """
    glove_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            glove_vectors[values[0]] = np.asarray(values[1:], dtype='float32')

    if not glove_vectors:
        # Fail with a clear message instead of hitting an unbound local below.
        raise ValueError(f"No word vectors found in {glove_file!r}")

    # Take the dimensionality from a parsed vector rather than a leaked loop variable.
    vector_size = len(next(iter(glove_vectors.values())))

    # Create a KeyedVectors object
    word2vec_model = gensim.models.KeyedVectors(vector_size=vector_size)
    word2vec_model.add_vectors(list(glove_vectors.keys()), list(glove_vectors.values()))
    return word2vec_model

# Build the KeyedVectors model from the GloVe file; it is later passed to
# get_weighted_word2vec() for the TF-IDF-weighted document vectors.
word2vec_model = load_glove_model(glove_file)

In [6]:
def sentence_to_vector(sentence, embeddings, dimension=50):
    """Average the embeddings of the words in ``sentence``.

    The sentence is lower-cased and split on whitespace; words missing from
    ``embeddings`` are ignored.

    Args:
        sentence: Text to embed.
        embeddings: Lookup supporting ``word in embeddings`` and
            ``embeddings[word]``.
        dimension: Length of the zero vector returned when none of the words
            is known. It is not used otherwise, so it should match the width
            of the vectors in ``embeddings``.

    Returns:
        The element-wise mean of the known word vectors, or
        ``np.zeros(dimension)`` when there are none.
    """
    known_vectors = []
    for token in sentence.lower().split():
        if token in embeddings:
            known_vectors.append(embeddings[token])

    # Nothing to average: fall back to an all-zero vector.
    if len(known_vectors) == 0:
        return np.zeros(dimension)

    return np.mean(known_vectors, axis=0)

def vectorize_w2v(plain_sql, embeddings, dimension=50):
    """Embed every statement in ``plain_sql`` via ``sentence_to_vector``.

    Args:
        plain_sql: Iterable of statement strings.
        embeddings: Word-to-vector lookup forwarded to ``sentence_to_vector``.
        dimension: Zero-vector length forwarded to ``sentence_to_vector``.

    Returns:
        A list with one vector per statement, in input order.
    """
    vectors = []
    for statement in plain_sql:
        vectors.append(sentence_to_vector(statement, embeddings, dimension))
    return vectors

In [7]:
# Tokenize both datasets. Each call returns the token vocabulary (a set), the
# normalized statements as space-joined strings, and the 0/1 runtime labels.
vocab_set_15k, plain_sql_ph_15k, label_15k = tokenize("../../datasets/plain_text/plain_statement.json")
vocab_set_5k, plain_sql_ph_5k, label_5k = tokenize("../../datasets/plain_text/plain_statement_5000.json")

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One TF-IDF vectorizer per dataset. The sparse matrices and feature names are
# consumed by get_weighted_word2vec() to weight the word embeddings.
tfidf_vectorizer_15k = TfidfVectorizer()
tfidf_matrix_15k = tfidf_vectorizer_15k.fit_transform(plain_sql_ph_15k)
feature_names_15k = tfidf_vectorizer_15k.get_feature_names_out()

tfidf_vectorizer_5k = TfidfVectorizer()
tfidf_matrix_5k = tfidf_vectorizer_5k.fit_transform(plain_sql_ph_5k)
feature_names_5k = tfidf_vectorizer_5k.get_feature_names_out()

In [9]:
import numpy as np

def get_weighted_word2vec(tfidf_matrix, feature_names, word2vec_model):
    """Build one TF-IDF-weighted average embedding per document.

    For every row of ``tfidf_matrix`` the embeddings of its non-zero terms are
    summed, each scaled by the term's TF-IDF score, and the sum is divided by
    the total score of the terms that have an embedding. Terms absent from
    ``word2vec_model.key_to_index`` are skipped; a document without any known
    term keeps an all-zero vector.

    Args:
        tfidf_matrix: 2-D matrix indexed as ``[row, col]`` whose row slices
            provide ``.nonzero()`` returning (rows, cols).
        feature_names: Sequence mapping a column index to its term.
        word2vec_model: Object exposing ``vector_size``, ``key_to_index`` and
            ``model[word]`` lookup.

    Returns:
        ``np.ndarray`` with one row per document and ``vector_size`` columns.
    """
    known_terms = word2vec_model.key_to_index
    doc_vectors = []

    for row_no in range(tfidf_matrix.shape[0]):
        # Column indices of the terms that occur in this document.
        active_cols = tfidf_matrix[row_no, :].nonzero()[1]

        accumulated = np.zeros((word2vec_model.vector_size,))
        total_weight = 0
        for col in active_cols:
            term = feature_names[col]
            if term not in known_terms:
                continue
            weight = tfidf_matrix[row_no, col]
            accumulated += weight * word2vec_model[term]
            total_weight += weight

        # Normalize only when at least one term contributed.
        if total_weight != 0:
            accumulated /= total_weight

        doc_vectors.append(accumulated)

    return np.array(doc_vectors)

In [10]:
# TF-IDF-weighted document embeddings for both datasets, built from each
# dataset's own TF-IDF matrix and feature names.
weighted_w2v_mat_5k = get_weighted_word2vec(tfidf_matrix_5k, feature_names_5k, word2vec_model)
weighted_w2v_mat_15k = get_weighted_word2vec(tfidf_matrix_15k, feature_names_15k, word2vec_model)

In [11]:
# Unweighted mean embeddings from the plain GloVe dict. dimension=300 sizes the
# zero-vector fallback used for statements without any known word.
w2v_mat_15k = vectorize_w2v(plain_sql_ph_15k, embedding_dict_w2v, dimension=300)
w2v_mat_5k = vectorize_w2v(plain_sql_ph_5k, embedding_dict_w2v, dimension=300)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vocabulary is fixed to the tokens collected from the
# 5k dataset, so both matrices are built over that same vocabulary.
# NOTE(review): the 15k statements are also encoded with the 5k vocabulary —
# confirm this is intended rather than vocab_set_15k.
# NOTE(review): the default CountVectorizer tokenizer presumably never emits
# tokens such as "<string>" or single characters that tokenize() places in the
# vocabulary — verify which columns can actually be non-zero.
vectorizer = CountVectorizer(vocabulary=vocab_set_5k)
cv_mat_15k = vectorizer.fit_transform(plain_sql_ph_15k)
cv_mat_5k = vectorizer.fit_transform(plain_sql_ph_5k)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Dense TF-IDF features for the classifiers. The same vectorizer object is
# fit again for each dataset, and each sparse result is converted to a dense
# array with toarray().
# NOTE(review): after the second fit_transform the vectorizer presumably only
# reflects the 5k data — confirm nothing later relies on the 15k fit.
tfidf_vectorizer = TfidfVectorizer()
tfidf_mat_15k = tfidf_vectorizer.fit_transform(plain_sql_ph_15k)
tfidf_mat_15k = tfidf_mat_15k.toarray()

tfidf_mat_5k = tfidf_vectorizer.fit_transform(plain_sql_ph_5k)
tfidf_mat_5k = tfidf_mat_5k.toarray()
# tfidf_mat.shape

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score

In [15]:
def run_exp(cv_mat, tfidf_mat, label):
  """Train and score classifiers on bag-of-words and TF-IDF features.

  Both feature matrices are split 80/20 with the same ``random_state``.
  Logistic regression, XGBoost and a random forest are fitted on each
  representation; an SVM is fitted on the bag-of-words features only.

  Args:
    cv_mat: Bag-of-words feature matrix, one row per statement.
    tfidf_mat: TF-IDF feature matrix, one row per statement.
    label: Binary (0/1) labels aligned with the rows of both matrices.

  Returns:
    ``(result_cv, result_tfidf)``: two dicts keyed by model name whose values
    are tuples ``(accuracy, precision_positive, recall_positive,
    precision_negative, recall_negative)``.
  """
  tfidf_split = train_test_split(tfidf_mat, label, test_size=0.2, random_state=seed)
  cv_split = train_test_split(cv_mat, label, test_size=0.2, random_state=seed)

  def fit_and_score(model, split):
    """Fit ``model`` on the training part of ``split`` and score the test part."""
    X_train, X_test, y_train, y_test = split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
      accuracy_score(y_test, y_pred),
      precision_score(y_test, y_pred, pos_label=1),
      recall_score(y_test, y_pred, pos_label=1),
      precision_score(y_test, y_pred, pos_label=0),
      recall_score(y_test, y_pred, pos_label=0),
    )

  models = {"LR": LogisticRegression(max_iter=1000), "XGB": xgb.XGBClassifier(), "RF": RandomForestClassifier(n_estimators=100, random_state=seed)}

  result_cv = {}
  result_tfidf = {}

  # The same estimator object is fitted first on BoW, then on TF-IDF.
  for name, estimator in models.items():
    result_cv[name] = fit_and_score(estimator, cv_split)
    print(f"Completed training of model {name} for BoW")
    result_tfidf[name] = fit_and_score(estimator, tfidf_split)
    print(f"Completed training of model {name} for TF-IDF")

  # The SVM is evaluated on the bag-of-words features only.
  result_cv["SVM"] = fit_and_score(SVC(), cv_split)
  print("Completed training of model SVM for BoW")

  return result_cv, result_tfidf

In [16]:
def run_exp_w2v(w2v_mat, label):
  """Train and score classifiers on embedding-based features.

  The data is split 80/20 with ``random_state=seed``; logistic regression,
  XGBoost, a random forest and an SVM are each fitted on the training part
  and scored on the test part.

  Args:
    w2v_mat: Feature matrix with one embedding vector per statement.
    label: Binary (0/1) labels aligned with the rows of ``w2v_mat``.

  Returns:
    Dict keyed ``"LR"``, ``"XGB"``, ``"RF"``, ``"SVM"`` whose values are
    tuples ``(accuracy, precision_positive, recall_positive,
    precision_negative, recall_negative)``.
  """
  X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(w2v_mat, label, test_size=0.2, random_state=seed)

  def experiment_w2v(model):
    """Fit ``model`` on the training split and return its test metrics."""
    model.fit(X_train_w2v, y_train_w2v)
    y_pred = model.predict(X_test_w2v)

    accuracy = accuracy_score(y_test_w2v, y_pred)

    precision_positive = precision_score(y_test_w2v, y_pred, pos_label=1)
    recall_positive = recall_score(y_test_w2v, y_pred, pos_label=1)
    precision_negative = precision_score(y_test_w2v, y_pred, pos_label=0)
    recall_negative = recall_score(y_test_w2v, y_pred, pos_label=0)

    return accuracy, precision_positive, recall_positive, precision_negative, recall_negative

  # SVM shares the loop with the other models, so its progress message names
  # the right feature type (it previously said "BoW" by copy-paste).
  models = {
    "LR": LogisticRegression(max_iter=1000),
    "XGB": xgb.XGBClassifier(),
    "RF": RandomForestClassifier(n_estimators=100, random_state=seed),
    "SVM": SVC(),
  }

  result_w2v = dict()

  for key in models:
    model = models[key]
    result_w2v[key] = experiment_w2v(model)
    print(f"Completed training of model {key} for Word2Vector")

  return result_w2v

In [17]:
# Evaluate the TF-IDF-weighted embedding features on both datasets.
r_tfidf_w2v_5k = run_exp_w2v(weighted_w2v_mat_5k, label_5k)
r_tfidf_w2v_15k = run_exp_w2v(weighted_w2v_mat_15k, label_15k)

# One "model: metrics" line per classifier under a heading per dataset.
for heading, results in (
    ("Results of tfidf w2v - 5k", r_tfidf_w2v_5k),
    ("Results of tfidf w2v - 15k", r_tfidf_w2v_15k),
):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Completed training of model LR for Word2Vector
Completed training of model XGB for Word2Vector
Completed training of model RF for Word2Vector
Completed training of model SVM for BoW
Completed training of model LR for Word2Vector
Completed training of model XGB for Word2Vector
Completed training of model RF for Word2Vector
Completed training of model SVM for BoW
Results of tfidf w2v - 5k
LR: (0.792, 0.691358024691358, 0.6746987951807228, 0.8402366863905325, 0.8502994011976048)
XGB: (0.767, 0.6542056074766355, 0.6325301204819277, 0.8203240058910162, 0.8338323353293413)
RF: (0.776, 0.6719745222929936, 0.6355421686746988, 0.8236151603498543, 0.8458083832335329)
SVM: (0.78, 0.6761006289308176, 0.6475903614457831, 0.8284457478005866, 0.8458083832335329)
Results of tfidf w2v - 15k
LR: (0.7927970065481759, 0.6993464052287581, 0.6229985443959243, 0.8302752293577982, 0.8731909028256375)
XGB: (0.7675397567820393, 0.6512738853503185, 0.5953420669577875, 0.8158940397350993, 0.8490696071674707)
RF: 

In [18]:
# Evaluate the unweighted mean-embedding features on both datasets.
r_w2v_5k = run_exp_w2v(w2v_mat_5k, label_5k)
r_w2v_15k = run_exp_w2v(w2v_mat_15k, label_15k)

# One "model: metrics" line per classifier under a heading per dataset.
for heading, results in (
    ("Results of w2v - 5k", r_w2v_5k),
    ("Results of w2v - 15k", r_w2v_15k),
):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Completed training of model LR for Word2Vector
Completed training of model XGB for Word2Vector
Completed training of model RF for Word2Vector
Completed training of model SVM for BoW
Completed training of model LR for Word2Vector
Completed training of model XGB for Word2Vector
Completed training of model RF for Word2Vector
Completed training of model SVM for BoW
Results of w2v - 5k
LR: (0.724, 0.5921052631578947, 0.5421686746987951, 0.7816091954022989, 0.8143712574850299)
XGB: (0.73, 0.6, 0.5602409638554217, 0.7884057971014493, 0.8143712574850299)
RF: (0.736, 0.6180555555555556, 0.536144578313253, 0.7837078651685393, 0.8353293413173652)
SVM: (0.725, 0.6, 0.5150602409638554, 0.7748251748251749, 0.8293413173652695)
Results of w2v - 15k
LR: (0.7188961646398503, 0.5820610687022901, 0.44395924308588064, 0.7633209417596035, 0.8490696071674707)
XGB: (0.7572497661365762, 0.6329113924050633, 0.5822416302765647, 0.8094289508632138, 0.8401102687801516)
RF: (0.7525724976613658, 0.6338983050847458, 

In [19]:
# Run the bag-of-words and TF-IDF experiments on both datasets; each call
# returns one result dict per representation.
r_cv_5k, r_tfidf_5k = run_exp(cv_mat_5k, tfidf_mat_5k, label_5k)
r_cv_15k, r_tfidf_15k = run_exp(cv_mat_15k, tfidf_mat_15k, label_15k)

Completed training of model LR for BoW
Completed training of model LR for TF-IDF
Completed training of model XGB for BoW
Completed training of model XGB for TF-IDF
Completed training of model RF for BoW
Completed training of model RF for TF-IDF
Completed training of model SVM for BoW
Completed training of model LR for BoW
Completed training of model LR for TF-IDF
Completed training of model XGB for BoW
Completed training of model XGB for TF-IDF
Completed training of model RF for BoW
Completed training of model RF for TF-IDF
Completed training of model SVM for BoW


In [20]:
# Print the bag-of-words metrics, one line per classifier, for each dataset.
for heading, results in (
    ("Results of BoW - 5k", r_cv_5k),
    ("Results of BoW - 15k", r_cv_15k),
):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Results of BoW - 5k
LR: (0.719, 0.5825242718446602, 0.5421686746987951, 0.7800289435600579, 0.8068862275449101)
XGB: (0.727, 0.6013745704467354, 0.5271084337349398, 0.7785613540197461, 0.8263473053892215)
RF: (0.683, 0.5252525252525253, 0.46987951807228917, 0.7496443812233285, 0.7889221556886228)
SVM: (0.739, 0.6651162790697674, 0.4307228915662651, 0.759235668789809, 0.8922155688622755)
Results of BoW - 15k
LR: (0.7179607109448082, 0.584, 0.42503639010189226, 0.7588522588522588, 0.8566505858028945)
XGB: (0.7277829747427502, 0.6035502958579881, 0.44541484716157204, 0.7664009809932557, 0.8614748449345279)
RF: (0.7142188961646398, 0.5688405797101449, 0.4570596797671033, 0.7648171500630517, 0.8359751895244659)
SVM: (0.7202993451824135, 0.6377708978328174, 0.29985443959243085, 0.7349862258953168, 0.9193659545141282)


In [21]:
# Print the TF-IDF metrics, one line per classifier, for each dataset.
for heading, results in (
    ("Results of TF-IDF - 5k", r_tfidf_5k),
    ("Results of TF-IDF - 15k", r_tfidf_15k),
):
    print(heading)
    for model_name, metrics in results.items():
        print(f"{model_name}: {metrics}")

Results of TF-IDF - 5k
LR: (0.795, 0.6906906906906907, 0.6927710843373494, 0.8470764617691154, 0.8458083832335329)
XGB: (0.786, 0.6766467065868264, 0.6807228915662651, 0.8408408408408409, 0.8383233532934131)
RF: (0.795, 0.6895522388059702, 0.6957831325301205, 0.8481203007518797, 0.844311377245509)
Results of TF-IDF - 15k
LR: (0.8021515434985969, 0.7163934426229508, 0.636098981077147, 0.8363874345549738, 0.8807718814610613)
XGB: (0.7792329279700655, 0.6616541353383458, 0.6404657933042213, 0.8323150033944331, 0.844934527911785)
RF: (0.7820392890551918, 0.678513731825525, 0.611353711790393, 0.8242264647794602, 0.8628532046864231)
