In [1]:
import json
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
#from official.nlp import optimization
print(torch.version.cuda)

2024-06-10 16:31:01.830367: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


None


In [2]:
seed = 42

In [3]:

def tokenize(path):
  with open(path) as f:
    json_data = json.load(f)
  print(len(json_data))
  plain_sql = [item['sql'] for item in json_data]
  plain_sql = [sql.lower() for sql in plain_sql]

  # split data into tokens

  pattern = r'[\s()\-,:;]'
  string_literal_pattern = r"'([^']*)'"
  placeholder = "<string>"
  
  # replace content inside single quotes by <string>
  plain_sql_ph = [re.sub(string_literal_pattern, placeholder, sql) for sql in   plain_sql]
  
  # split the statements with placeholder
  tokenized_sql = [re.split(pattern, sql) for sql in plain_sql_ph]
  
  # remove empty tokens
  tokenized_sql = [token for token in tokenized_sql if token]
  
  # replace numbers by placeholder
  for sql in tokenized_sql:
      for i, token in enumerate(sql):
          # if re.match(r'^[\'\"].*[\'\"]$', token):  # Check if token is a   string literal
          #     sql[i] = '<string>'
          if re.match(r'^[0-9]+(\.[0-9]+)?$', token):  # Check if token is a  number
              sql[i] = '<number>'
  
  # remove empty tokens
  for i, sql in enumerate(tokenized_sql):
      tokenized_sql[i] = [token for token in tokenized_sql[i] if token]

    # build the vocab
  vocab_set = set()
  for sql in tokenized_sql:
      vocab_set.update(sql)

  vocab_dict = {word: idx for idx, word in enumerate(vocab_set)}

  # get the runtimes
  runtime = [item['runtime_ms'] for item in json_data]
  runtime = np.array(runtime)

  # classify the runtimes, label 0 for runtime <=3000ms, 1 for runtime >3000ms
  label = np.where(runtime > 3000, 1, 0)

  return vocab_set, plain_sql_ph, label

In [4]:
vocab_set_15k, plain_sql_ph_15k, label_15k = tokenize("/Users/zhangchilu/Desktop/CSE291Cloud/291-Query-Classification/datasets/plain_text/plain_statement.json")
vocab_set_5k, plain_sql_ph_5k, label_5k = tokenize("/Users/zhangchilu/Desktop/CSE291Cloud/291-Query-Classification/datasets/plain_text/plain_statement_5000.json")

10687
5000


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=vocab_set_15k)
cv_mat_15k = vectorizer.fit_transform(plain_sql_ph_15k)
vectorizer = CountVectorizer(vocabulary=vocab_set_5k)
cv_mat_5k = vectorizer.fit_transform(plain_sql_ph_5k)
print(cv_mat_15k.shape)
print(cv_mat_5k.shape)

(10687, 77)
(5000, 77)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_mat_15k = tfidf_vectorizer.fit_transform(plain_sql_ph_15k)
tfidf_mat_15k = tfidf_mat_15k.toarray()

tfidf_mat_5k = tfidf_vectorizer.fit_transform(plain_sql_ph_5k)
tfidf_mat_5k = tfidf_mat_5k.toarray()
print(tfidf_mat_15k.shape)
print(tfidf_mat_5k.shape)
print(np.array([len(ql) for ql in plain_sql_ph_15k]).max)
print(np.array([len(ql) for ql in plain_sql_ph_5k]).max)

(10687, 16162)
(5000, 8146)
<built-in method max of numpy.ndarray object at 0x7fcc2d866e10>
<built-in method max of numpy.ndarray object at 0x7fcc2d866e10>


In [7]:

class TextClassificationDataset():
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}
    
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert.requires_grad_(True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        #self.xgb = xgb.XGBClassifier()
        #self.rf = RandomForestClassifier(n_estimators=100, random_state=seed)
        
        self.prev_input_ids = []
        self.prev_labels = []

    def forward(self, input_ids, attention_mask, key):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        if (key == "LR"):
            self.loss = nn.CrossEntropyLoss()
        elif (key == "SVM"):
            self.loss = nn.MultiMarginLoss()
        #logits = torch.from_numpy(self.xgb.predict_proba(x.detach().numpy()))
        return logits
    
    def trainxgb(self, input_ids, labels, attention_mask):
        if len(self.prev_input_ids) == 0 :
            self.prev_input_ids = input_ids
            self.prev_labels = labels
        outputs = self.bert(input_ids=self.prev_input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        self.xgb.fit(x.detach().numpy(), self.prev_labels)
        self.prev_input_ids = input_ids
        self.prev_labels = labels

def train(model, data_loader, optimizer, scheduler,device):
    model.train()
    i = 0
    for batch in data_loader:
        
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        #model.trainxgb(input_ids,labels, attention_mask)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        #loss = nn.CrossEntropyLoss()(outputs, labels)
        loss = model.loss(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        if (i % 10 == 0):
            print(i)
        i+=1
        
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return (accuracy_score(actual_labels, predictions), precision_score(actual_labels, predictions, pos_label = 1), recall_score(actual_labels, predictions, pos_label = 1), precision_score(actual_labels, predictions, pos_label = 0), recall_score(actual_labels, predictions, pos_label = 0))

In [9]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 32
num_epochs = 2
learning_rate = 2e-5


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [11]:
def run_exp(cv_mat, tfidf_mat, bert_mat, label):

  X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(bert_mat, label, test_size=0.2, random_state=seed)
  result_bert = dict()
  
  models = {"LR": LogisticRegression(max_iter=1000), "SVM": SVC()}

  result_cv = dict()
  result_tfidf = dict()

  for key in models:
    model = models[key]

    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    train_dataset = TextClassificationDataset(X_train_bert, y_train_bert, tokenizer, max_length)
    val_dataset = TextClassificationDataset(X_test_bert, y_test_bert, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTClassifier(bert_model_name, num_classes, key).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    for epoch in range(num_epochs):
      print(f"Epoch {epoch + 1}/{num_epochs}")
      train(model, train_dataloader, optimizer, scheduler, device)
      result_bert[key] = evaluate(model, val_dataloader, device)
      print(f"Validation Accuracy:" ,result_bert[key])
      print("Completed training of model {key} for Bert")

  
  return result_cv, result_tfidf, result_bert

In [12]:
r_cv_5k, r_tfidf_5k, r_bert_5k = run_exp(cv_mat_5k, tfidf_mat_5k, plain_sql_ph_5k, label_5k)
r_cv_15k, r_tfidf_15k, r_bert_15k = run_exp(cv_mat_15k, tfidf_mat_15k, plain_sql_ph_15k, label_15k)

125
False




Epoch 1/2
0
10
20
30
40
50
60
70
80
90
100
110
120
Validation Accuracy: (0.738, 0.5795454545454546, 0.7680722891566265, 0.8625, 0.7230538922155688)
(0.738, 0.5795454545454546, 0.7680722891566265, 0.8625, 0.7230538922155688)
Epoch 2/2
0
10
20
30
40
50
60
70
80
90
100
110
120
Validation Accuracy: (0.745, 0.5955334987593052, 0.7228915662650602, 0.8458961474036851, 0.7559880239520959)
(0.745, 0.5955334987593052, 0.7228915662650602, 0.8458961474036851, 0.7559880239520959)
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
268
False




Epoch 1/2
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
Validation Accuracy: (0.7422825070159027, 0.6122112211221122, 0.5400291120815138, 0.793733681462141, 0.8380427291523087)
(0.7422825070159027, 0.6122112211221122, 0.5400291120815138, 0.793733681462141, 0.8380427291523087)
Epoch 2/2
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
Validation Accuracy: (0.7347988774555659, 0.5887573964497042, 0.5793304221251819, 0.8023255813953488, 0.808407994486561)
(0.7347988774555659, 0.5887573964497042, 0.5793304221251819, 0.8023255813953488, 0.808407994486561)
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF
Completed training of model {key} for BoW
Completed training of model {key} for TF-IDF


In [None]:
print("Results of BoW - 5k")
for key, value in r_cv_5k.items():
    print(f"{key}: {value}")
print("Results of BoW - 15k")
for key, value in r_cv_15k.items():
    print(f"{key}: {value}")

Results of BoW - 5k


NameError: name 'r_cv_5k' is not defined

In [None]:
print("Results of TF-IDF - 5k")
for key, value in r_tfidf_5k.items():
    print(f"{key}: {value}")
print("Results of TF-IDF - 15k")
for key, value in r_tfidf_15k.items():
    print(f"{key}: {value}")

In [None]:
### PyTorch Embedding

# define embedding layer

# vocab_size = len(vocab_dict)
# embedding_dim = 10
# embedding = nn.Embedding(vocab_size, embedding_dim)

In [None]:
# convert tokens to indices for each sample
# indices = [torch.LongTensor([vocab_dict[token] for token in sql]) for sql in tokenized_sql]

# X_torch = []

# for index in indices:
#     emb = embedding(index)
#     sum = torch.sum(emb, dim=0)
#     X_torch.append(sum.tolist())

# X_torch = np.array(X_torch)