In [77]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import hstack
import string

from gensim.models import Word2Vec
import xgboost as xgb

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
seed = 291

In [3]:
data_dir = "../data/runtime"
data = pd.read_csv(os.path.join(data_dir, "proccessed-data"))
data.head()

Unnamed: 0,query,runtime (ms),query_complexity,runtime_boolean,table_size
0,select distinct coursealias0advisory_requireme...,8.092165,1,1,11453
1,select distinct coursealias0department coursea...,0.772238,1,0,11531
2,select distinct coursealias0department coursea...,0.530958,1,0,11531
3,select count 0 from course as coursealias0 cou...,92.988014,0,1,326457
4,select distinct coursealias0department coursea...,1.093864,1,0,23060


In [4]:
labels = data['runtime_boolean']

In [84]:
x_basic = data[['query_complexity', 'table_size']].copy()

## BoW (CountVectorizer)

In [6]:
bow_features = data['query'].copy()
vectorizer = CountVectorizer()
bow_features = vectorizer.fit_transform(bow_features)
additional_features = data[['query_complexity', 'table_size']].values
x_bow = hstack([bow_features, additional_features])
print(x_bow.shape)

(1799, 1439)


## Word2Vec

In [10]:
w2v_features = data['query'].apply(lambda x: x.lower().split())
translator = str.maketrans('', '', string.punctuation)
w2v_features = [[word for word in query if word.translate(translator) != ''] for query in w2v_features]

In [11]:
word2vec_model = Word2Vec(w2v_features, window=5, min_count=1, workers=4)

In [12]:
word_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.key_to_index.keys()}
w2v_mean_features = np.array([np.mean([word_embeddings[word] for word in query], axis=0) for query in w2v_features])
x_w2v = np.hstack([w2v_mean_features, additional_features])
print(x_w2v.shape)

(1799, 102)


In [85]:
def train_valid_test_split(X, y, test_size=0.15, valid_size=0.15, random_state=seed):
    valid_perc = valid_size / (1. - test_size)
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=valid_perc, random_state=random_state)
    return X_train, y_train, X_val, y_val, X_test, y_test, X_trainval, y_trainval

In [86]:
# train, validation, test split (70:15:15)
basic_X_train, basic_y_train, basic_X_val, basic_y_val, basic_X_test, basic_y_test, basic_X_trainval, basic_y_trainval= train_valid_test_split(x_basic, labels)
bow_X_train, bow_y_train, bow_X_val, bow_y_val, bow_X_test, bow_y_test, bow_X_trainval, bow_y_trainval= train_valid_test_split(x_bow, labels)
w2v_X_train, w2v_y_train, w2v_X_val, w2v_y_val, w2v_X_test, w2v_y_test, w2v_X_trainval, w2v_y_trainval = train_valid_test_split(x_w2v, labels)

In [100]:
# normalize trainval data
basic_trainval_scaler = MinMaxScaler()
norm_basic_X_trainval = basic_trainval_scaler.fit_transform(basic_X_trainval)
norm_basic_X_test = basic_trainval_scaler.fit_transform(basic_X_test)

bow_trainval_scaler = MinMaxScaler()
norm_bow_X_trainval = bow_trainval_scaler.fit_transform(bow_X_trainval.toarray())
norm_bow_X_test = bow_trainval_scaler.fit_transform(bow_X_test.toarray())

w2v_trainval_scaler = MinMaxScaler()
norm_w2v_X_trainval = w2v_trainval_scaler.fit_transform(w2v_X_trainval)
norm_w2v_X_test = w2v_trainval_scaler.fit_transform(w2v_X_test)

## SVM

In [96]:
svc_params = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

In [97]:
svc = SVC(max_iter=5000)
basic_svc_clf = GridSearchCV(estimator=svc, param_grid=svc_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
basic_svc_clf.fit(norm_basic_X_trainval, basic_y_trainval)
print(basic_svc_clf.best_estimator_)
print(basic_svc_clf.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
SVC(C=1000, degree=2, kernel='linear', max_iter=5000)
0.8116339869281045


In [98]:
svc = SVC()
bow_svc_clf = GridSearchCV(estimator=svc, param_grid=svc_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
bow_svc_clf.fit(norm_bow_X_trainval, bow_y_trainval)
print(bow_svc_clf.best_estimator_)
print(bow_svc_clf.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
SVC(C=10, degree=2)
0.887493839065681


In [101]:
svc = SVC()
w2v_svc_clf = GridSearchCV(estimator=svc, param_grid=svc_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
w2v_svc_clf.fit(norm_w2v_X_trainval, w2v_y_trainval)
print(w2v_svc_clf.best_estimator_)
print(w2v_svc_clf.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
SVC(C=1000, degree=2)
0.8776792028286724


## Random Forest

In [102]:
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [103]:
rf = RandomForestClassifier()
basic_rf_clf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
basic_rf_clf.fit(norm_basic_X_trainval, basic_y_trainval)
print(basic_rf_clf.best_estimator_)
print(basic_rf_clf.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=50)
0.8835636987035251


In [104]:
rf = RandomForestClassifier()
bow_rf_clf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
bow_rf_clf.fit(norm_bow_X_trainval, bow_y_trainval)
print(bow_rf_clf.best_estimator_)
print(bow_rf_clf.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=200)
0.8874938390656808


In [105]:
rf = RandomForestClassifier()
w2v_rf_clf = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
w2v_rf_clf.fit(norm_w2v_X_trainval, w2v_y_trainval)
print(w2v_rf_clf.best_estimator_)
print(w2v_rf_clf.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
RandomForestClassifier(max_depth=20, min_samples_split=5, n_estimators=50)
0.8874959819993571


## XGBoost

In [106]:
xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

In [107]:
xgbc = xgb.XGBClassifier()
basic_xgb_clf = GridSearchCV(estimator=xgbc, param_grid=xgb_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
basic_xgb_clf.fit(norm_basic_X_trainval, basic_y_trainval)
print(basic_xgb_clf.best_estimator_)
print(basic_xgb_clf.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
0.8672174006214508


In [108]:
xgbc = xgb.XGBClassifier()
bow_xgb_clf = GridSearchCV(estimator=xgbc, param_grid=xgb_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
bow_xgb_clf.fit(norm_bow_X_trainval, bow_y_trainval)
print(bow_xgb_clf.best_estimator_)
print(bow_xgb_clf.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits




XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
0.8959905710918248


In [109]:
xgbc = xgb.XGBClassifier()
w2v_xgb_clf = GridSearchCV(estimator=xgbc, param_grid=xgb_params, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
w2v_xgb_clf.fit(norm_w2v_X_trainval, w2v_y_trainval)
print(w2v_xgb_clf.best_estimator_)
print(w2v_xgb_clf.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits




XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
0.8907575270545376


In [110]:
def train_evaluate(train_X, train_y, valid_x, valid_y, model):
    model = model.train(train_X, train_y)
    y_pred = model.predict(valid_x)
    classification_report(valid_y, y_pred, output_dict=True)
    return model, classification_report

In [None]:
# MLP

In [None]:
# Plot feature importance graph

In [None]:
# Result on Test Set, classification report

In [None]:
# Plot results