In [9]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import xgboost as xgb

In [11]:
pd.set_option('display.max_colwidth', 50)

In [12]:
data = pd.read_csv('../CSE-291-ML-for-Systems/data/runtime/query-runtime-all.csv')
data.head()

Unnamed: 0,query,runtime (ms),db,engine
0,SELECT DISTINCT COURSEalias0.ADVISORY_REQUIREM...,8.092165,advising,mysql
1,"SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...",0.772238,advising,mysql
2,"SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...",0.530958,advising,mysql
3,SELECT COUNT( * ) > 0 FROM COURSE AS COURSEali...,92.988014,advising,mysql
4,"SELECT DISTINCT COURSEalias0.DEPARTMENT , COUR...",1.093864,advising,mysql


### Adding new features 

In [13]:
# converting all queries to lowercase

data['query'] = data['query'].str.lower()
data.head()

Unnamed: 0,query,runtime (ms),db,engine
0,select distinct coursealias0.advisory_requirem...,8.092165,advising,mysql
1,"select distinct coursealias0.department , cour...",0.772238,advising,mysql
2,"select distinct coursealias0.department , cour...",0.530958,advising,mysql
3,select count( * ) > 0 from course as courseali...,92.988014,advising,mysql
4,"select distinct coursealias0.department , cour...",1.093864,advising,mysql


In [20]:
# checking presence of any expensive operation in a SQL query

complex_keywords = ["join", "distinct", "group by", "count", "order by"]

def check_query_complexity(query):
    words = query.split()
    return int(any(keyword in words for keyword in complex_keywords))

data["query_complexity"] = data["query"].apply(check_query_complexity)
data.head()

Unnamed: 0,query,runtime (ms),db,engine,query_complexity
0,select distinct coursealias0.advisory_requirem...,8.092165,advising,mysql,1
1,"select distinct coursealias0.department , cour...",0.772238,advising,mysql,1
2,"select distinct coursealias0.department , cour...",0.530958,advising,mysql,1
3,select count( * ) > 0 from course as courseali...,92.988014,advising,mysql,0
4,"select distinct coursealias0.department , cour...",1.093864,advising,mysql,1


In [21]:
data["query_complexity"].value_counts()

query_complexity
1    1411
0     392
Name: count, dtype: int64

In [22]:
data["engine"].value_counts()

engine
mysql     1799
sqlite       4
Name: count, dtype: int64

In [26]:
data["db"].value_counts()

db
atis           1118
geography       257
advising        204
imdb             95
car_1            42
inn_1            36
student_1        27
formula_1        16
restaurants       4
Name: count, dtype: int64

In [24]:
# We still have 4 queries of sqlite type, to proceed ahead we can drop those and later drop the "engine" column altogether
data = data[data['engine'] != 'sqlite']
del data['engine']

In [25]:
data.head()

Unnamed: 0,query,runtime (ms),db,query_complexity
0,select distinct coursealias0.advisory_requirem...,8.092165,advising,1
1,"select distinct coursealias0.department , cour...",0.772238,advising,1
2,"select distinct coursealias0.department , cour...",0.530958,advising,1
3,select count( * ) > 0 from course as courseali...,92.988014,advising,0
4,"select distinct coursealias0.department , cour...",1.093864,advising,1


### Finding the threshold for determining slow or fast query

In [29]:
data['runtime_boolean'] = (data['runtime (ms)'] >= 2.7).astype(int)
data.head()

Unnamed: 0,query,runtime (ms),db,query_complexity,runtime_boolean
0,select distinct coursealias0.advisory_requirem...,8.092165,advising,1,1
1,"select distinct coursealias0.department , cour...",0.772238,advising,1,0
2,"select distinct coursealias0.department , cour...",0.530958,advising,1,0
3,select count( * ) > 0 from course as courseali...,92.988014,advising,0,1
4,"select distinct coursealias0.department , cour...",1.093864,advising,1,0


In [28]:
counts = data['runtime_boolean'].value_counts()

print("Frequency of 0s (runtime < 4 ms):", counts[0])
print("Frequency of 1s (runtime >= 4 ms):", counts[1])

Frequency of 0s (runtime < 4 ms): 1327
Frequency of 1s (runtime >= 4 ms): 472


In [24]:
def preprocess_query(query):
    query = re.sub(r'[^\w\s]', '', query)  # Remove punctuation
    query = re.sub(r'\s+', ' ', query)     # Remove unnecessary whitespace
    return query.lower()

### BoW (CountVectorizer)

A bag of words is one of the popular word embedding techniques of text where each value in the vector would represent the count of words in a document/sentence. In other words, it extracts features from the text. 

**BoW** represents each document (or in this case, SQL query) as a vector where each element represents the count of a word in the vocabulary. It is simple and efficient, especially for smaller datasets. However, BoW ignores word order and semantic relationships between words.

**CountVectorizer** (provided by the scikit-learn library in Python) is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text. It creates a matrix in which each unique word is represented by a column of the matrix, and each text sample from the document is a row in the matrix. The value of each cell is the count of the word in that particular text sample. 

In [25]:
features = data['query'].apply(preprocess_query)
print(features[0])

select distinct coursealias0advisory_requirement coursealias0enforced_requirement coursealias0name from course as coursealias0 where coursealias0department eecs and coursealias0number 595 


In [26]:
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(features)

In [27]:
# Format is "(i, j) k" which means there is a nonzero entry with value k (occurring k times) at row i and column j in the matrix.
print(features)

  (0, 1374)	1
  (0, 724)	1
  (0, 572)	1
  (0, 577)	1
  (0, 581)	1
  (0, 944)	1
  (0, 546)	1
  (0, 344)	1
  (0, 571)	1
  (0, 1524)	1
  (0, 575)	1
  (0, 743)	1
  (0, 324)	1
  (0, 583)	1
  (0, 189)	1
  (1, 1374)	2
  (1, 724)	1
  (1, 581)	1
  (1, 944)	2
  (1, 546)	1
  (1, 344)	3
  (1, 571)	1
  (1, 1524)	2
  (1, 575)	1
  (1, 324)	2
  :	:
  (1913, 1316)	1
  (1913, 1278)	1
  (1913, 1314)	1
  (1914, 1374)	1
  (1914, 944)	1
  (1914, 344)	3
  (1914, 1042)	2
  (1914, 1191)	2
  (1914, 535)	1
  (1914, 1199)	1
  (1914, 392)	2
  (1914, 972)	1
  (1914, 700)	1
  (1914, 1075)	1
  (1914, 1273)	1
  (1914, 1274)	1
  (1914, 732)	1
  (1914, 727)	1
  (1914, 728)	1
  (1914, 729)	3
  (1914, 1312)	1
  (1914, 1313)	1
  (1914, 1316)	1
  (1914, 1278)	1
  (1914, 1314)	1


In [28]:
X_train, X_test, y_train, y_test = train_test_split(features, data['runtime_boolean'], test_size=0.2, random_state=42)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1532, 1569)
(383, 1569)
(1532,)
(383,)


In [32]:
# Logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8955613577023499


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# XGB Classifier
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9190600522193212


### Word2Vec

**Word2Vec** represents words as dense, continuous-valued vectors in a high-dimensional space where the similarity between words is captured based on their context in the corpus. It captures semantic relationships and similarities between words, even for words not present in the training data. However, Word2Vec requires more data to train effectively and may be computationally expensive.

In [81]:
# Preprocess SQL queries and tokenize
w2v_features = data['query'].apply(lambda x: x.lower().split())
print(w2v_features[0])

['select', 'distinct', 'coursealias0.advisory_requirement', ',', 'coursealias0.enforced_requirement', ',', 'coursealias0.name', 'from', 'course', 'as', 'coursealias0', 'where', 'coursealias0.department', '=', '"eecs"', 'and', 'coursealias0.number', '=', '595', ';']


In [101]:
# Filtering out punctuations
translator = str.maketrans('', '', string.punctuation)
w2v_features = [[word for word in query if word.translate(translator) != ''] for query in w2v_features]
print(w2v_features[0])

['select', 'distinct', 'coursealias0.advisory_requirement', 'coursealias0.enforced_requirement', 'coursealias0.name', 'from', 'course', 'as', 'coursealias0', 'where', 'coursealias0.department', '"eecs"', 'and', 'coursealias0.number', '595']


In [102]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec(w2v_features, window=5, min_count=1, workers=4)

In [103]:
# Each vector is 100-dimensional (default value chosen by Word2Vec)
word2vec_model.wv['select'].shape

(100,)

In [104]:
# Code to see if a word exists in the vocabulary of our model
print(word2vec_model.wv.__contains__("coursealias0.advisory_requirement"))
print(word2vec_model.wv.__contains__("coursealias0.advisory_requirement1"))

True
False


In [105]:
# List of words in the vocabulary of our W2V model
print(word2vec_model.wv.key_to_index.keys())

print('where' in word2vec_model.wv.key_to_index.keys())
print('sdf' in word2vec_model.wv.key_to_index.keys())

dict_keys(['and', 'as', 'select', 'from', 'where', 'city', 'airport_service', 'distinct', 'flight', 'flightalias0.flight_id', 'in', 'cityalias0', 'cityalias0.city_name', 'cityalias0.city_code', 'flightalias0', 'airport_servicealias0.city_code', 'airport_servicealias0.airport_code', 'airport_servicealias0', 'flightalias0.to_airport', 'flightalias0.from_airport', 'cityalias1', 'cityalias1.city_code', 'cityalias1.city_name', 'airport_servicealias1.airport_code', 'airport_servicealias1.city_code', 'airport_servicealias1', 'days', '1991', 'flightalias0.departure_time', 'date_day', 'flightalias0.flight_days', 'farealias0.fare_id', 'daysalias0.days_code', 'daysalias0.day_name', 'flight_fare', 'fare', 'daysalias0', 'date_dayalias0.year', 'date_dayalias0.day_number', 'date_dayalias0.day_name', 'date_dayalias0.month_number', 'flightalias0.arrival_time', 'date_dayalias0', 'farealias0', 'flight_farealias0.fare_id', 'flight_farealias0.flight_id', 'on', 'join', 'flightalias0.airline_code', 'flight_f

In [106]:
word_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.key_to_index.keys()}
word_embeddings

{'and': array([-0.81259793,  0.40010256,  0.6631283 , -0.18050194, -0.22870602,
        -1.3900377 ,  0.4822297 ,  1.448212  , -0.44894147, -0.18075107,
        -0.07376041, -0.803392  , -0.80040246,  0.314153  , -0.09129883,
        -0.17583036, -0.4989852 , -1.0737761 ,  0.76861864, -1.198918  ,
         0.66761935,  1.5712378 ,  0.08272324,  0.18401223,  0.31433287,
         0.9203484 , -0.2180053 ,  0.11232099, -0.6790795 , -0.25907767,
         0.5188856 , -0.2502278 ,  1.5019306 , -0.550312  , -1.1388361 ,
        -0.09119148,  0.41315377, -1.0568789 , -0.27516925, -0.6041101 ,
        -1.3249376 , -0.08094329, -1.0728717 , -0.40266097,  0.22822845,
        -1.6260514 , -0.8219274 , -0.97947973,  0.29563808,  0.5406251 ,
        -1.0702093 , -0.9532464 ,  1.0624471 ,  0.45179197,  0.40385678,
         1.0903213 , -0.00422453, -0.6235346 , -0.03940577,  1.5120412 ,
         0.27868578,  0.19927102, -1.5129067 ,  0.06250916, -0.64255536,
         0.16433252, -0.01804188,  0.2812414

In [107]:
# number of unique keys (words in vocabulary)
len(word_embeddings.keys())

1630

In [108]:
features = np.array([np.mean([word_embeddings[word] for word in query], axis=0) for query in w2v_features])
print(features.shape)

# For each query (datapoint), we represent it in a 100-dimensional vector

(1915, 100)


In [109]:
X_train, X_test, y_train, y_test = train_test_split(features, data['runtime_boolean'], test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1532, 100)
(383, 100)
(1532,)
(383,)


In [110]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8120104438642297


In [111]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9060052219321149
