### 1. Data preparation for generation tasks

In [1]:
import json
import os
import pickle
import spacy
from typing import List, Dict, Tuple
from collections import Counter

# Special vocabulary tokens: padding, out-of-vocabulary, and the
# start/end-of-sequence markers added to decoder sequences.
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"

# Dataset file paths (JSON-lines, one example per line)
train_question_file = "./datasets/question_train.jsonl"
train_query_file    = "./datasets/query_train.jsonl"
dev_question_file   = "./datasets/question_dev.jsonl"
dev_query_file      = "./datasets/query_dev.jsonl"
test_question_file  = "./datasets/question_test.jsonl"
test_query_file     = "./datasets/query_test.jsonl"

# Where the pickled token -> index vocabularies are stored
vocab_dir = "./vocab"
input_vocab_file = os.path.join(vocab_dir, "input_vocab.pkl")
output_vocab_file = os.path.join(vocab_dir, "output_vocab.pkl")
os.makedirs(vocab_dir, exist_ok=True)

# Where the index-encoded datasets are pickled so later cells can load them
# without re-running spaCy
cache_dir = "./cache"
os.makedirs(cache_dir, exist_ok=True)
train_cache = os.path.join(cache_dir, 'train_tokenized.pkl')
dev_cache = os.path.join(cache_dir, 'dev_tokenized.pkl')
test_q_cache = os.path.join(cache_dir, 'test_q_tokenized.pkl')
test_s_cache = os.path.join(cache_dir, 'test_s_tokenized.pkl')

# Load the small English spaCy pipeline
print("Loading spaCy model...")
# Only token.text is read from the pipeline in this cell, so named entity
# recognition, the dependency parser and the lemmatizer are disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "lemmatizer"])

# Dataset loader
def load_dataset(question_path: str, query_path: str) -> List[Tuple[str, str]]:
    """Read two line-aligned files and return (question, sql) pairs.

    The files are read in lockstep, so reading stops at the end of the
    shorter one. A line starting with ``{`` is parsed as JSON and the
    ``"text"`` key (question side) or ``"sql"`` key (query side) is taken,
    defaulting to ``""`` when the key is absent. Any other line is used
    verbatim. A row is skipped when either side is blank after stripping.

    Args:
        question_path: Path of the file supplying the questions.
        query_path: Path of the file supplying the SQL queries.

    Returns:
        The list of ``(question, sql)`` string pairs in file order.
    """
    print(f"Loading dataset from: '{question_path}' and '{query_path}'")

    def _field(raw: str, key: str) -> str:
        # JSON-object lines carry the value under `key`; anything else is raw text.
        if raw.startswith("{"):
            return json.loads(raw).get(key, "")
        return raw

    pairs = []
    with open(question_path, 'r', encoding='utf-8') as q_handle, \
            open(query_path, 'r', encoding='utf-8') as s_handle:
        row_no = 0
        for raw_q, raw_s in zip(q_handle, s_handle):
            row_no += 1  # counts every row read, including skipped ones
            raw_q = raw_q.strip()
            raw_s = raw_s.strip()
            if raw_q and raw_s:
                question = _field(raw_q, "text")
                sql = _field(raw_s, "sql")
                pairs.append((question, sql))
                if row_no % 500 == 0:
                    print(f"  Loaded {row_no} examples...")

    print(f"Completed loading dataset. Total pairs: {len(pairs)}\n")
    return pairs

# spaCy tokenizers
def tokenize_question(question: str) -> List[str]:
    """Return the text of each spaCy token in the stripped question."""
    doc = nlp(question.strip())
    return [tok.text for tok in doc]

def tokenize_sql(sql: str) -> List[str]:
    """Return the text of each spaCy token in the stripped SQL string.

    The same ``nlp`` pipeline as for questions is used; no SQL-specific
    tokenization is applied.
    """
    doc = nlp(sql.strip())
    return [tok.text for tok in doc]

# Load datasets
print("=== Start Loading Data ===")
# Train/dev pairs take the question from the "question_*" file and the SQL
# from the "query_*" file, matched line by line.
train_data = load_dataset(train_question_file, train_query_file)
dev_data = load_dataset(dev_question_file, dev_query_file)
# The two test sets pass the SAME path for both arguments, so each pair's
# question ("text") and SQL ("sql") come from the same line of one file.
# NOTE(review): this differs from how train/dev are paired above — confirm
# against the file schema which of the two pairings is the intended one.
test_q_data = load_dataset(test_question_file, test_question_file)
test_s_data = load_dataset(test_query_file, test_query_file)
print("=== All Data Loaded Successfully ===\n")

# Vocab functions
def build_vocab(train_data):
    """Build token -> index vocabularies for questions and SQL queries.

    Args:
        train_data: Iterable of ``(question, sql)`` string pairs.

    Returns:
        ``(input_vocab, output_vocab)``. Both reserve index 0 for PAD and 1
        for UNK; the output vocabulary also reserves 2 for SOS and 3 for EOS.
        Every other token gets the next free index, in order of first
        appearance in ``train_data``.
    """
    print("Building vocabulary from training data...")
    # Counters collect the distinct tokens; the counts themselves are not used
    # for index assignment.
    input_counter, output_counter = Counter(), Counter()
    for idx, (question, sql) in enumerate(train_data, 1):
        input_counter.update(tokenize_question(question))
        output_counter.update(tokenize_sql(sql))
        if idx % 1000 == 0:
            print(f"  Processed {idx} examples for vocab building...")

    # Reserved special tokens come first so their indices are fixed.
    input_vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    output_vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1, SOS_TOKEN: 2, EOS_TOKEN: 3}

    # Assign indices one token at a time. Building a dict comprehension with
    # `len(vocab)` and then calling update() would evaluate len() before any
    # insertion, giving every token the same index.
    for vocab, counter in ((input_vocab, input_counter), (output_vocab, output_counter)):
        for tok in counter:
            # setdefault keeps the reserved index if a data token happens to
            # equal one of the special tokens.
            vocab.setdefault(tok, len(vocab))

    print(f"Vocabulary built: Input vocab size {len(input_vocab)}, Output vocab size {len(output_vocab)}\n")
    return input_vocab, output_vocab
# save the mapping
def save_vocab(input_vocab, output_vocab):
    """Pickle both vocabularies to ``input_vocab_file`` / ``output_vocab_file``."""
    targets = (
        (input_vocab_file, input_vocab),
        (output_vocab_file, output_vocab),
    )
    for path, vocab in targets:
        with open(path, 'wb') as handle:
            pickle.dump(vocab, handle)
    print("Vocabularies saved successfully.\n")
# in this way, save a lot of time, don't need to process again
def load_vocab():
    """Unpickle and return ``(input_vocab, output_vocab)`` from the vocab files."""

    def _read(path):
        # pickle executes arbitrary code on load; only use trusted local files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    loaded_input = _read(input_vocab_file)
    loaded_output = _read(output_vocab_file)
    print("Vocabularies loaded successfully from files.\n")
    return loaded_input, loaded_output

# Load or build vocabularies.
# NOTE: when both pickle files already exist they are reused as-is and
# build_vocab is not called, so after changing the training data or the
# vocabulary-building code the files under ./vocab must be deleted to force
# a rebuild.
if os.path.exists(input_vocab_file) and os.path.exists(output_vocab_file):
    print("Vocab files exist. Loading vocabularies...")
    input_vocab, output_vocab = load_vocab()
else:
    input_vocab, output_vocab = build_vocab(train_data)
    save_vocab(input_vocab, output_vocab)

# Cache tokenized datasets
def preprocess_and_cache(data_pairs, cache_path):
    """Index-encode every (question, sql) pair and pickle the result.

    Questions are encoded with ``input_vocab`` and SQL with ``output_vocab``;
    tokens missing from a vocabulary map to that vocabulary's UNK index. The
    file at ``cache_path`` is always rewritten.

    Args:
        data_pairs: Iterable of ``(question, sql)`` string pairs.
        cache_path: Destination pickle file.
    """

    def _encode(text, vocab):
        unk_id = vocab[UNK_TOKEN]
        return [vocab.get(tok.text, unk_id) for tok in nlp(text.strip())]

    encoded_pairs = [
        (_encode(question, input_vocab), _encode(sql, output_vocab))
        for question, sql in data_pairs
    ]

    with open(cache_path, 'wb') as handle:
        pickle.dump(encoded_pairs, handle)
    print(f"Tokenized data cached at {cache_path}")

# Encode and pickle every split; these files are what the model cells load.
preprocess_and_cache(train_data, train_cache)
preprocess_and_cache(dev_data, dev_cache)
preprocess_and_cache(test_q_data, test_q_cache)
preprocess_and_cache(test_s_data, test_s_cache)

# Inverse mappings (index -> token), used to turn indices back into text
input_idx_to_token = {idx: tok for tok, idx in input_vocab.items()}
output_idx_to_token = {idx: tok for tok, idx in output_vocab.items()}

# Final vocab sizes (number of distinct tokens, special tokens included)
input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)
print(f"Final Vocab Sizes:\n  Input: {input_vocab_size}\n  Output: {output_vocab_size}")

Loading spaCy model...
=== Start Loading Data ===
Loading dataset from: './datasets/question_train.jsonl' and './datasets/query_train.jsonl'
  Loaded 500 examples...
  Loaded 1000 examples...
  Loaded 1500 examples...
  Loaded 2000 examples...
  Loaded 2500 examples...
  Loaded 3000 examples...
  Loaded 3500 examples...
  Loaded 4000 examples...
Completed loading dataset. Total pairs: 4347

Loading dataset from: './datasets/question_dev.jsonl' and './datasets/query_dev.jsonl'
Completed loading dataset. Total pairs: 121

Loading dataset from: './datasets/question_test.jsonl' and './datasets/question_test.jsonl'
Completed loading dataset. Total pairs: 447

Loading dataset from: './datasets/query_test.jsonl' and './datasets/query_test.jsonl'
Completed loading dataset. Total pairs: 347

=== All Data Loaded Successfully ===

Vocab files exist. Loading vocabularies...
Vocabularies loaded successfully from files.

Tokenized data cached at ./cache\train_tokenized.pkl
Tokenized data cached at .

### 2. LSTM

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from typing import List, Tuple
import random
import pickle
import os

# Device setup: use the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Paths of the pickled, index-encoded datasets written by the data-preparation cell
cache_dir = "./cache"
train_cache_file = os.path.join(cache_dir, "train_tokenized.pkl")
dev_cache_file = os.path.join(cache_dir, "dev_tokenized.pkl")
test_q_cache_file = os.path.join(cache_dir, "test_q_tokenized.pkl")
test_s_cache_file = os.path.join(cache_dir, "test_s_tokenized.pkl")

# Load cached tokenized data
def load_cached_data(cache_file):
    """Return the object unpickled from the file at ``cache_file``."""
    # pickle executes arbitrary code on load; only use trusted local files.
    with open(cache_file, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

print("Loading cached datasets...")
# Each dataset is a list of (question_ids, sql_ids) index lists.
train_data = load_cached_data(train_cache_file)
dev_data = load_cached_data(dev_cache_file)
test_q_data = load_cached_data(test_q_cache_file)
test_s_data = load_cached_data(test_s_cache_file)

# Load vocabularies (token -> index). Context managers make sure the file
# handles are closed instead of being left to the garbage collector.
vocab_dir = "./vocab"
with open(os.path.join(vocab_dir, "input_vocab.pkl"), 'rb') as f:
    input_vocab = pickle.load(f)
with open(os.path.join(vocab_dir, "output_vocab.pkl"), 'rb') as f:
    output_vocab = pickle.load(f)
input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)

# Inverse mappings (index -> token) for decoding predictions
input_idx_to_token = {idx: tok for tok, idx in input_vocab.items()}
output_idx_to_token = {idx: tok for tok, idx in output_vocab.items()}

# Special tokens; must match the strings used when the vocabularies were built
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"

# define seq2seq model, use torch to build model
class Seq2SeqModel(nn.Module):
    """LSTM encoder-decoder without attention.

    The encoder's final (hidden, cell) state initializes the decoder, which
    is run over the whole target-input sequence in one call.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embed_size=128, hidden_size=256):
        super().__init__()
        # Encoder: embedding followed by a single-layer LSTM.
        # Index 0 is treated as padding in both embeddings.
        self.encoder_embed = nn.Embedding(
            num_embeddings=input_vocab_size, embedding_dim=embed_size, padding_idx=0
        )
        self.encoder_lstm = nn.LSTM(
            input_size=embed_size, hidden_size=hidden_size, batch_first=True
        )
        # Decoder: embedding, single-layer LSTM, and a linear vocabulary projection.
        self.decoder_embed = nn.Embedding(
            num_embeddings=output_vocab_size, embedding_dim=embed_size, padding_idx=0
        )
        self.decoder_lstm = nn.LSTM(
            input_size=embed_size, hidden_size=hidden_size, batch_first=True
        )
        self.decoder_out = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, src_batch, src_lengths, tgt_batch):
        """Return logits of shape (batch, tgt_len, output_vocab_size).

        Args:
            src_batch: (batch, src_len) padded source indices.
            src_lengths: Unpadded length of each source row.
            tgt_batch: (batch, tgt_len) decoder input indices.
        """
        # Pack the source so the encoder stops at each row's true length.
        embedded_src = self.encoder_embed(src_batch)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded_src, src_lengths, batch_first=True, enforce_sorted=False
        )
        _, final_state = self.encoder_lstm(packed)

        # Decode every target position at once, starting from the encoder state.
        embedded_tgt = self.decoder_embed(tgt_batch)
        decoded, _ = self.decoder_lstm(embedded_tgt, final_state)
        return self.decoder_out(decoded)

# batchify with cached indices
def batchify(data_pairs, batch_size, shuffle=True):
    """Group index-encoded (question, sql) pairs into padded tensor batches.

    Args:
        data_pairs: Sequence of ``(question_ids, sql_ids)`` pairs, each a
            list of vocabulary indices.
        batch_size: Maximum number of pairs per batch; the last batch may be
            smaller.
        shuffle: When true, batches are drawn from a shuffled copy of
            ``data_pairs``; the caller's list keeps its original order.

    Returns:
        A list of ``(enc_batch, q_lengths, dec_in_batch, dec_tgt_batch)``
        tuples. ``enc_batch`` holds the padded questions, ``q_lengths`` their
        unpadded lengths, ``dec_in_batch`` the SQL prefixed with SOS and
        ``dec_tgt_batch`` the SQL suffixed with EOS. All tensors live on
        ``device``.
    """
    if shuffle:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the list owned by the caller.
        data_pairs = list(data_pairs)
        random.shuffle(data_pairs)

    sos_id = output_vocab[SOS_TOKEN]
    eos_id = output_vocab[EOS_TOKEN]
    pad_in = input_vocab[PAD_TOKEN]
    pad_out = output_vocab[PAD_TOKEN]

    batches = []
    for i in range(0, len(data_pairs), batch_size):
        batch = data_pairs[i:i + batch_size]

        batch_q = [q_idx for q_idx, _ in batch]
        # Decoder input and target are the same sequence shifted by one step.
        batch_s_input = [[sos_id] + s_idx for _, s_idx in batch]
        batch_s_target = [s_idx + [eos_id] for _, s_idx in batch]

        q_lengths = [len(seq) for seq in batch_q]
        max_q_len = max(q_lengths)
        max_s_len = max(len(seq) for seq in batch_s_target)

        # Right-pad every sequence to the longest one in the batch.
        enc_batch = [seq + [pad_in] * (max_q_len - len(seq)) for seq in batch_q]
        dec_in_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_input]
        dec_tgt_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_target]

        batches.append((
            torch.tensor(enc_batch, device=device),
            q_lengths,
            torch.tensor(dec_in_batch, device=device),
            torch.tensor(dec_tgt_batch, device=device)
        ))
    return batches

# training model use cross entropy loss
def train_seq2seq(model, train_data, dev_data, epochs=10, batch_size=64, lr=0.001):
    """Train ``model`` with teacher forcing and report dev accuracy each epoch.

    Args:
        model: Module called as ``model(enc_batch, enc_lengths, dec_in_batch)``
            that returns per-position logits.
        train_data: Index-encoded training pairs, re-batched every epoch.
        dev_data: Index-encoded pairs passed to ``evaluate_seq2seq``.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size handed to ``batchify``.
        lr: Adam learning rate.
    """
    model.to(device)
    # PAD positions in the target contribute nothing to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=output_vocab[PAD_TOKEN])
    adam = optim.Adam(model.parameters(), lr=lr)

    for epoch_no in range(1, epochs + 1):
        # evaluate_seq2seq leaves the model in eval mode, so switch back.
        model.train()
        running_loss = 0.0
        epoch_batches = batchify(train_data, batch_size)

        for src, src_lens, tgt_in, tgt_out in epoch_batches:
            adam.zero_grad()
            scores = model(src, src_lens, tgt_in)
            # Flatten (batch, time) so each position is one classification.
            vocab_dim = scores.size(-1)
            batch_loss = loss_fn(scores.view(-1, vocab_dim), tgt_out.view(-1))
            batch_loss.backward()
            adam.step()
            running_loss += batch_loss.item()

        mean_loss = running_loss / len(epoch_batches)
        print(f"Epoch {epoch_no}/{epochs} - Training loss: {mean_loss:.4f}")

        accuracy = evaluate_seq2seq(model, dev_data)
        print(f"Dev Set Accuracy: {accuracy*100:.2f}%")

# inference 
def infer_seq2seq(model, question_indices, max_len=100):
    """Greedily decode one question into a space-joined token string.

    Decoding starts from SOS and stops at the first EOS prediction or after
    ``max_len`` steps. The model is switched to eval mode and left there.

    Args:
        model: A trained ``Seq2SeqModel``.
        question_indices: List of input-vocabulary indices for one question.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted output tokens joined by single spaces (EOS excluded).
    """
    model.eval()
    sos_id = output_vocab[SOS_TOKEN]
    eos_id = output_vocab[EOS_TOKEN]

    # Batch of one: shape (1, src_len).
    src = torch.tensor([question_indices], device=device)
    src_len = [len(question_indices)]
    decoded = []

    with torch.no_grad():
        embedded = model.encoder_embed(src)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_len, batch_first=True, enforce_sorted=False
        )
        _, state = model.encoder_lstm(packed)

        next_input = torch.tensor([[sos_id]], device=device)
        for _step in range(max_len):
            # One decoder step; the LSTM state is carried between steps.
            step_out, state = model.decoder_lstm(model.decoder_embed(next_input), state)
            step_logits = model.decoder_out(step_out.squeeze(1))
            token_id = step_logits.argmax(1).item()
            if token_id == eos_id:
                break
            decoded.append(output_idx_to_token.get(token_id, UNK_TOKEN))
            # Feed the prediction back in as the next input.
            next_input = torch.tensor([[token_id]], device=device)

    return " ".join(decoded)

def evaluate_seq2seq(model, dataset):
    """Return the exact-match accuracy of greedy decoding on ``dataset``.

    A prediction counts as correct when its token string equals the gold
    token string after stripping and lower-casing both.

    Args:
        model: Model passed through to ``infer_seq2seq``.
        dataset: Sequence of ``(question_ids, sql_ids)`` index lists.

    Returns:
        Fraction of exact matches in ``[0, 1]``; ``0.0`` for an empty dataset
        (instead of raising ZeroDivisionError).
    """
    if not dataset:
        return 0.0

    correct = 0
    for q_idx, sql_idx in dataset:
        pred_sql = infer_seq2seq(model, q_idx)
        gold_sql = " ".join(output_idx_to_token[idx] for idx in sql_idx)
        if pred_sql.strip().lower() == gold_sql.strip().lower():
            correct += 1
    return correct / len(dataset)

# Build the plain LSTM seq2seq model (default embed/hidden sizes) and train it,
# printing dev accuracy after every epoch.
lstm_model = Seq2SeqModel(input_vocab_size, output_vocab_size)
train_seq2seq(lstm_model, train_data, dev_data, epochs=10, batch_size=64)

# Exact-match accuracy on the two cached test sets.
acc_question = evaluate_seq2seq(lstm_model, test_q_data)
acc_query = evaluate_seq2seq(lstm_model, test_s_data)
print(f"Question Split Test Accuracy: {acc_question*100:.2f}%")
print(f"Query Split Test Accuracy: {acc_query*100:.2f}%")

Loading cached datasets...
Epoch 1/10 - Training loss: 0.5194
Dev Set Accuracy: 0.00%
Epoch 2/10 - Training loss: 0.0677
Dev Set Accuracy: 0.00%
Epoch 3/10 - Training loss: 0.0666
Dev Set Accuracy: 0.00%
Epoch 4/10 - Training loss: 0.0663
Dev Set Accuracy: 0.00%
Epoch 5/10 - Training loss: 0.0662
Dev Set Accuracy: 0.00%
Epoch 6/10 - Training loss: 0.0661
Dev Set Accuracy: 0.00%
Epoch 7/10 - Training loss: 0.0662
Dev Set Accuracy: 0.00%
Epoch 8/10 - Training loss: 0.0660
Dev Set Accuracy: 0.00%
Epoch 9/10 - Training loss: 0.0659
Dev Set Accuracy: 0.00%
Epoch 10/10 - Training loss: 0.0658
Dev Set Accuracy: 0.00%
Question Split Test Accuracy: 0.45%
Query Split Test Accuracy: 0.00%


### 3. LSTM Encoder-Decoder with Attention

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pickle
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directory holding the pickled, index-encoded datasets
cache_dir = "./cache"

def load_cached_data(filename):
    """Return the object unpickled from ``filename`` inside ``cache_dir``."""
    full_path = os.path.join(cache_dir, filename)
    # pickle executes arbitrary code on load; only use trusted local files.
    with open(full_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

print("Loading cached datasets...")
# Each dataset is a list of (question_ids, sql_ids) index lists.
train_data = load_cached_data("train_tokenized.pkl")
dev_data = load_cached_data("dev_tokenized.pkl")
test_q_data = load_cached_data("test_q_tokenized.pkl")
test_s_data = load_cached_data("test_s_tokenized.pkl")

# Load vocabularies (token -> index)
vocab_dir = "./vocab"
with open(os.path.join(vocab_dir, "input_vocab.pkl"), 'rb') as f:
    input_vocab = pickle.load(f)
with open(os.path.join(vocab_dir, "output_vocab.pkl"), 'rb') as f:
    output_vocab = pickle.load(f)

input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)

# Inverse mappings (index -> token) for decoding predictions
input_idx_to_token = {idx: tok for tok, idx in input_vocab.items()}
output_idx_to_token = {idx: tok for tok, idx in output_vocab.items()}

# Special tokens; must match the strings used when the vocabularies were built
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"

class Seq2SeqAttnModel(nn.Module):
    """LSTM encoder-decoder with additive (tanh-scored) attention.

    At every decoder step the previous decoder hidden state is scored against
    all encoder outputs. The resulting context vector is concatenated both to
    the decoder input and to the decoder output before the vocabulary
    projection.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embed_size=128, hidden_size=256):
        super().__init__()
        self.hidden_size = hidden_size

        # Encoder
        self.encoder_embed = nn.Embedding(input_vocab_size, embed_size, padding_idx=input_vocab[PAD_TOKEN])
        self.encoder_lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # Decoder: its input is the token embedding concatenated with the context
        self.decoder_embed = nn.Embedding(output_vocab_size, embed_size, padding_idx=output_vocab[PAD_TOKEN])
        self.decoder_lstm = nn.LSTM(embed_size + hidden_size, hidden_size, batch_first=True)

        # Attention layers: score = v^T tanh(W [hidden; encoder_output])
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.attn_v = nn.Linear(hidden_size, 1, bias=False)

        # Output layer over [decoder_output; context]
        self.out_linear = nn.Linear(hidden_size * 2, output_vocab_size)

    def forward(self, src_batch, src_lengths, tgt_batch):
        """Return teacher-forced logits of shape (batch, tgt_len, output_vocab_size).

        Args:
            src_batch: (batch, src_len) padded source indices.
            src_lengths: Unpadded length of each source row.
            tgt_batch: (batch, tgt_len) decoder input indices.
        """
        src_emb = self.encoder_embed(src_batch)
        packed_src = nn.utils.rnn.pack_padded_sequence(src_emb, src_lengths, batch_first=True, enforce_sorted=False)
        enc_outputs, (h_enc, c_enc) = self.encoder_lstm(packed_src)
        enc_outputs, _ = nn.utils.rnn.pad_packed_sequence(enc_outputs, batch_first=True)

        # The decoder starts from the encoder's final state.
        h_dec, c_dec = h_enc, c_enc
        logits = []

        dec_embedded = self.decoder_embed(tgt_batch)

        # Step through the target one position at a time because the context
        # at step t depends on the decoder state produced at step t-1.
        for t in range(tgt_batch.size(1)):
            dec_input_t = dec_embedded[:, t, :].unsqueeze(1)

            attn_weights = self.calculate_attention(h_dec[-1], enc_outputs, src_lengths)
            context = torch.bmm(attn_weights.unsqueeze(1), enc_outputs)

            dec_input_combined = torch.cat([dec_input_t, context], dim=2)
            output, (h_dec, c_dec) = self.decoder_lstm(dec_input_combined, (h_dec, c_dec))

            output_combined = torch.cat([output, context], dim=2).squeeze(1)
            logits.append(self.out_linear(output_combined).unsqueeze(1))

        logits = torch.cat(logits, dim=1)
        return logits

    def calculate_attention(self, hidden, encoder_outputs, src_lengths):
        """Return attention weights of shape (batch, src_len).

        Args:
            hidden: (batch, hidden_size) decoder hidden state.
            encoder_outputs: (batch, src_len, hidden_size) encoder outputs.
            src_lengths: Unpadded length of each source row; positions at or
                beyond a row's length receive zero weight.
        """
        hidden_expanded = hidden.unsqueeze(1).expand_as(encoder_outputs)
        energy = torch.tanh(self.attn(torch.cat((hidden_expanded, encoder_outputs), dim=2)))
        scores = self.attn_v(energy).squeeze(2)

        # Build the padding mask on the same device as the scores rather than
        # relying on a module-level device variable.
        lengths = torch.as_tensor(src_lengths, device=scores.device)
        positions = torch.arange(encoder_outputs.size(1), device=scores.device)
        mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)

        # Out-of-place masked_fill keeps the operation visible to autograd;
        # mutating `scores.data` in place would bypass it.
        scores = scores.masked_fill(mask, float('-inf'))
        return torch.softmax(scores, dim=1)

def batchify(data_pairs, batch_size, shuffle=True):
    """Group index-encoded (question, sql) pairs into padded tensor batches.

    Args:
        data_pairs: Sequence of ``(question_ids, sql_ids)`` pairs, each a
            list of vocabulary indices.
        batch_size: Maximum number of pairs per batch; the last batch may be
            smaller.
        shuffle: When true, batches are drawn from a shuffled copy of
            ``data_pairs``; the caller's list keeps its original order.

    Returns:
        A list of ``(enc_batch, q_lengths, dec_in_batch, dec_tgt_batch)``
        tuples: padded questions, their unpadded lengths, SQL prefixed with
        SOS, and SQL suffixed with EOS. All tensors live on ``device``.
    """
    if shuffle:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the list owned by the caller.
        data_pairs = list(data_pairs)
        random.shuffle(data_pairs)

    sos_id = output_vocab[SOS_TOKEN]
    eos_id = output_vocab[EOS_TOKEN]
    pad_in = input_vocab[PAD_TOKEN]
    pad_out = output_vocab[PAD_TOKEN]

    batches = []
    for i in range(0, len(data_pairs), batch_size):
        batch = data_pairs[i:i + batch_size]

        batch_q = [q_idx for q_idx, _ in batch]
        # Decoder input and target are the same sequence shifted by one step.
        batch_s_input = [[sos_id] + s_idx for _, s_idx in batch]
        batch_s_target = [s_idx + [eos_id] for _, s_idx in batch]

        q_lengths = [len(q) for q in batch_q]
        max_q_len = max(q_lengths)
        max_s_len = max(len(s) for s in batch_s_target)

        # Right-pad every sequence to the longest one in the batch.
        enc_batch = [seq + [pad_in] * (max_q_len - len(seq)) for seq in batch_q]
        dec_in_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_input]
        dec_tgt_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_target]

        batches.append((torch.tensor(enc_batch, device=device), q_lengths,
                        torch.tensor(dec_in_batch, device=device),
                        torch.tensor(dec_tgt_batch, device=device)))
    return batches

# we need to normalize the sql
def normalize_sql(sql):
    """Canonicalize a SQL string for exact-match comparison.

    Lower-cases the text, trims surrounding whitespace, drops trailing
    semicolons, and collapses every whitespace run to a single space.
    """
    lowered = sql.lower()
    trimmed = lowered.strip()
    without_terminator = trimmed.rstrip(';')
    words = without_terminator.split()
    return ' '.join(words)

def train_seq2seq_attn(model, train_data, dev_data=None, epochs=10, batch_size=64, lr=0.001):
    """Train the attention model with teacher forcing.

    Args:
        model: Module called as ``model(enc_batch, q_lengths, dec_input_batch)``
            that returns per-position logits.
        train_data: Index-encoded training pairs, re-batched every epoch.
        dev_data: Optional index-encoded pairs; when non-empty, dev accuracy
            is printed after each epoch.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size handed to ``batchify``.
        lr: Adam learning rate.
    """
    model.to(device)
    adam = optim.Adam(model.parameters(), lr=lr)
    # PAD positions in the target contribute nothing to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=output_vocab[PAD_TOKEN])

    for epoch_no in range(1, epochs + 1):
        # Evaluation leaves the model in eval mode, so switch back.
        model.train()
        running_loss = 0.0
        epoch_batches = batchify(train_data, batch_size)

        for src, src_lens, tgt_in, tgt_out in epoch_batches:
            adam.zero_grad()
            scores = model(src, src_lens, tgt_in)
            # Flatten (batch, time) so each position is one classification.
            vocab_dim = scores.size(-1)
            batch_loss = loss_fn(scores.view(-1, vocab_dim), tgt_out.view(-1))
            batch_loss.backward()
            adam.step()
            running_loss += batch_loss.item()

        mean_loss = running_loss / len(epoch_batches)
        print(f"Epoch {epoch_no}/{epochs} - Loss: {mean_loss:.4f}")

        if not dev_data:
            continue
        accuracy = evaluate_seq2seq_attn(model, dev_data)
        print(f"Dev Accuracy: {accuracy*100:.2f}%")

# inference using token
def infer_seq2seq_attn(model, question_indices, max_len=100):
    """Greedily decode one question with the attention model.

    Decoding starts from SOS and stops at the first EOS prediction or after
    ``max_len`` steps. The model is switched to eval mode and left there.

    Args:
        model: A trained ``Seq2SeqAttnModel``.
        question_indices: List of input-vocabulary indices for one question.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted output tokens joined by single spaces (EOS excluded).
    """
    model.eval()
    eos_id = output_vocab[EOS_TOKEN]

    # Batch of one: shape (1, src_len).
    src = torch.tensor([question_indices], device=device)
    src_len = [len(question_indices)]
    decoded = []

    with torch.no_grad():
        embedded = model.encoder_embed(src)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_len, batch_first=True, enforce_sorted=False
        )
        packed_out, (hidden, cell) = model.encoder_lstm(packed)
        memory, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        current_id = output_vocab[SOS_TOKEN]
        for _step in range(max_len):
            token_emb = model.decoder_embed(torch.tensor([[current_id]], device=device))
            # Attend over the encoder outputs using the latest hidden state.
            weights = model.calculate_attention(hidden[-1], memory, src_len)
            context = torch.bmm(weights.unsqueeze(1), memory)

            step_in = torch.cat([token_emb, context], dim=2)
            step_out, (hidden, cell) = model.decoder_lstm(step_in, (hidden, cell))

            features = torch.cat([step_out, context], dim=2).squeeze(1)
            token_id = model.out_linear(features).argmax(-1).item()
            if token_id == eos_id:
                break
            decoded.append(output_idx_to_token.get(token_id, UNK_TOKEN))
            # Feed the prediction back in as the next input.
            current_id = token_id

    return " ".join(decoded)

def evaluate_seq2seq_attn(model, dataset):
    """Return the exact-match accuracy of greedy decoding on ``dataset``.

    Predicted and gold token strings are compared after ``normalize_sql``.

    Args:
        model: Model passed through to ``infer_seq2seq_attn``.
        dataset: Sequence of ``(question_ids, sql_ids)`` index lists.

    Returns:
        Fraction of exact matches in ``[0, 1]``; ``0.0`` for an empty dataset
        (instead of raising ZeroDivisionError).
    """
    if not dataset:
        return 0.0

    correct = 0
    for q_indices, sql_indices in dataset:
        pred_sql = infer_seq2seq_attn(model, q_indices)
        gold_sql = " ".join(output_idx_to_token[idx] for idx in sql_indices)
        if normalize_sql(pred_sql) == normalize_sql(gold_sql):
            correct += 1
    return correct / len(dataset)

# Build the attention model (default embed/hidden sizes) and train it,
# printing dev accuracy after every epoch.
attn_model = Seq2SeqAttnModel(input_vocab_size, output_vocab_size)
train_seq2seq_attn(attn_model, train_data, dev_data, epochs=10, batch_size=64, lr=0.001)

# Exact-match accuracy (after SQL normalization) on the two cached test sets.
acc_question = evaluate_seq2seq_attn(attn_model, test_q_data)
acc_query = evaluate_seq2seq_attn(attn_model, test_s_data)
print(f"Question Split Accuracy: {acc_question*100:.2f}%")
print(f"Query Split Accuracy: {acc_query*100:.2f}%")

Loading cached datasets...
Epoch 1/10 - Loss: 0.3770
Dev Accuracy: 0.00%
Epoch 2/10 - Loss: 0.0670
Dev Accuracy: 0.00%
Epoch 3/10 - Loss: 0.0663
Dev Accuracy: 0.00%
Epoch 4/10 - Loss: 0.0662
Dev Accuracy: 0.00%
Epoch 5/10 - Loss: 0.0660
Dev Accuracy: 0.00%
Epoch 6/10 - Loss: 0.0658
Dev Accuracy: 0.00%
Epoch 7/10 - Loss: 0.0660
Dev Accuracy: 0.00%
Epoch 8/10 - Loss: 0.0659
Dev Accuracy: 0.00%
Epoch 9/10 - Loss: 0.0658
Dev Accuracy: 0.00%
Epoch 10/10 - Loss: 0.0655
Dev Accuracy: 0.00%
Question Split Accuracy: 0.45%
Query Split Accuracy: 0.00%


### 4. Transformer

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pickle
import os

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directories holding the pickled datasets and vocabularies
cache_dir = "./cache"
vocab_dir = "./vocab"

# Load cached data
def load_cached_data(filename):
    """Return the object unpickled from ``filename`` inside ``cache_dir``."""
    full_path = os.path.join(cache_dir, filename)
    # pickle executes arbitrary code on load; only use trusted local files.
    with open(full_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

# Each dataset is a list of (question_ids, sql_ids) index lists.
train_data = load_cached_data("train_tokenized.pkl")
dev_data = load_cached_data("dev_tokenized.pkl")
test_q_data = load_cached_data("test_q_tokenized.pkl")
test_s_data = load_cached_data("test_s_tokenized.pkl")

# Load vocabularies (token -> index)
with open(os.path.join(vocab_dir, "input_vocab.pkl"), 'rb') as f:
    input_vocab = pickle.load(f)
with open(os.path.join(vocab_dir, "output_vocab.pkl"), 'rb') as f:
    output_vocab = pickle.load(f)

input_vocab_size = len(input_vocab)
output_vocab_size = len(output_vocab)
# Inverse mappings (index -> token) for decoding predictions
input_idx_to_token = {idx: tok for tok, idx in input_vocab.items()}
output_idx_to_token = {idx: tok for tok, idx in output_vocab.items()}

# Special tokens; must match the strings used when the vocabularies were built
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"
# Transformer Model
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer with fixed sinusoidal positional encodings.

    Wraps ``nn.Transformer`` (sequence-first layout) behind a batch-first
    interface and projects decoder outputs to output-vocabulary logits.
    """

    def __init__(self, input_vocab_size, output_vocab_size, embed_size=128, num_heads=8, num_encoder_layers=3,
                 num_decoder_layers=3, ff_hidden_size=512, dropout=0.1, max_seq_len=512):
        super().__init__()
        self.embed_size = embed_size

        self.input_embedding = nn.Embedding(input_vocab_size, embed_size, padding_idx=input_vocab[PAD_TOKEN])
        self.output_embedding = nn.Embedding(output_vocab_size, embed_size, padding_idx=output_vocab[PAD_TOKEN])

        # Stored as a non-trainable parameter so it follows the model across
        # devices and is saved with it, but is never updated by the optimizer.
        self.positional_encoding = nn.Parameter(self._generate_positional_encoding(max_seq_len, embed_size), requires_grad=False)

        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=ff_hidden_size,
            dropout=dropout
        )

        self.fc_out = nn.Linear(embed_size, output_vocab_size)

    def forward(self, src, tgt, src_padding_mask, tgt_padding_mask, tgt_mask):
        """Return logits of shape (batch, tgt_len, output_vocab_size).

        Args:
            src: (batch, src_len) source indices.
            tgt: (batch, tgt_len) decoder input indices.
            src_padding_mask: (batch, src_len) mask, true at PAD positions;
                also used as the memory padding mask.
            tgt_padding_mask: (batch, tgt_len) mask, true at PAD positions.
            tgt_mask: (tgt_len, tgt_len) causal mask for the decoder.
        """
        # The first seq_len rows of the table broadcast over the batch axis.
        src_emb = self.input_embedding(src) + self.positional_encoding[:src.size(1), :]
        tgt_emb = self.output_embedding(tgt) + self.positional_encoding[:tgt.size(1), :]

        # nn.Transformer is built with its default sequence-first layout.
        src_emb = src_emb.transpose(0, 1)
        tgt_emb = tgt_emb.transpose(0, 1)

        output = self.transformer(
            src_emb, tgt_emb,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            tgt_mask=tgt_mask,
            memory_key_padding_mask=src_padding_mask
        )

        # Back to batch-first before the vocabulary projection.
        output = output.transpose(0, 1)
        return self.fc_out(output)

    def _generate_positional_encoding(self, max_len, d_model):
        """Return a (max_len, d_model) sinusoidal positional-encoding table.

        Even columns hold sines and odd columns cosines of
        ``position * 10000 ** (-2i / d_model)``. Odd ``d_model`` values are
        supported: the final sine column then has no cosine partner.
        """
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the angle table to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        return pe

# Generate masks
def generate_square_subsequent_mask(sz):
    """Return an (sz, sz) boolean causal mask on ``device``.

    Entries strictly above the diagonal are True (blocked), so position i
    can only attend to positions <= i.
    """
    all_ones = torch.ones(sz, sz, device=device)
    strictly_upper = torch.triu(all_ones, diagonal=1)
    return strictly_upper.bool()

def create_padding_mask(seq, pad_idx):
    """Return an element-wise mask that is True where ``seq`` equals ``pad_idx``."""
    is_pad = seq == pad_idx
    return is_pad

# Batchify
def batchify(data_pairs, batch_size, shuffle=True):
    """Group index-encoded (question, sql) pairs into padded tensor batches.

    Args:
        data_pairs: Sequence of ``(question_ids, sql_ids)`` pairs, each a
            list of vocabulary indices.
        batch_size: Maximum number of pairs per batch; the last batch may be
            smaller.
        shuffle: When true, batches are drawn from a shuffled copy of
            ``data_pairs``; the caller's list keeps its original order.

    Returns:
        A list of ``(enc_batch, dec_in_batch, dec_tgt_batch)`` tensor tuples
        on ``device``: padded questions, SQL prefixed with SOS, and SQL
        suffixed with EOS.
    """
    if shuffle:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the list owned by the caller.
        data_pairs = list(data_pairs)
        random.shuffle(data_pairs)

    sos_id = output_vocab[SOS_TOKEN]
    eos_id = output_vocab[EOS_TOKEN]
    pad_in = input_vocab[PAD_TOKEN]
    pad_out = output_vocab[PAD_TOKEN]

    batches = []
    for i in range(0, len(data_pairs), batch_size):
        batch = data_pairs[i:i + batch_size]

        batch_q = [q for q, _ in batch]
        # Decoder input and target are the same sequence shifted by one step,
        # so both have the same length.
        batch_s_input = [[sos_id] + s for _, s in batch]
        batch_s_target = [s + [eos_id] for _, s in batch]

        max_q_len = max(len(q) for q in batch_q)
        max_s_len = max(len(s) for s in batch_s_input)

        # Right-pad every sequence to the longest one in the batch.
        enc_batch = [seq + [pad_in] * (max_q_len - len(seq)) for seq in batch_q]
        dec_in_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_input]
        dec_tgt_batch = [seq + [pad_out] * (max_s_len - len(seq)) for seq in batch_s_target]

        batches.append((
            torch.tensor(enc_batch, device=device),
            torch.tensor(dec_in_batch, device=device),
            torch.tensor(dec_tgt_batch, device=device)
        ))
    return batches

# Train function (with masks)
def train(model, train_data, dev_data=None, epochs=10, batch_size=64, lr=1e-4):
    """Train the Transformer with teacher forcing and print the epoch loss.

    Args:
        model: Module called as ``model(src, tgt_input, src_padding_mask,
            tgt_padding_mask, tgt_mask)`` that returns per-position logits.
        train_data: Index-encoded training pairs, re-batched every epoch.
        dev_data: Accepted for interface symmetry; not used by this function.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size handed to ``batchify``.
        lr: Adam learning rate.
    """
    # PAD positions in the target contribute nothing to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=output_vocab[PAD_TOKEN])
    adam = optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    for epoch_no in range(1, epochs + 1):
        model.train()
        running_loss = 0
        epoch_batches = batchify(train_data, batch_size)

        for src, tgt_in, tgt_out in epoch_batches:
            adam.zero_grad()

            # Causal mask plus PAD masks for both sides.
            causal = generate_square_subsequent_mask(tgt_in.size(1))
            src_pad = create_padding_mask(src, input_vocab[PAD_TOKEN])
            tgt_pad = create_padding_mask(tgt_in, output_vocab[PAD_TOKEN])

            scores = model(src, tgt_in, src_pad, tgt_pad, causal)
            # Flatten (batch, time) so each position is one classification.
            batch_loss = loss_fn(scores.view(-1, output_vocab_size), tgt_out.view(-1))
            batch_loss.backward()
            adam.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(epoch_batches)
        print(f"Epoch {epoch_no}/{epochs}, Loss: {mean_loss:.4f}")

# SQL normalization function
def normalize_sql(sql_str):
    """Canonicalize a SQL string for exact-match comparison.

    Lower-cases the text, trims surrounding whitespace, drops trailing
    semicolons, and collapses every whitespace run to a single space.
    """
    lowered = sql_str.lower()
    trimmed = lowered.strip()
    without_terminator = trimmed.rstrip(';')
    words = without_terminator.split()
    return ' '.join(words)

# Inference function
def infer(model, src_sequence, max_len=100):
    """Greedily decode one question with the Transformer.

    The whole prefix generated so far is re-fed to the model at every step.
    Decoding stops at the first EOS prediction or after ``max_len`` steps.
    The model is switched to eval mode and left there.

    Args:
        model: A trained ``TransformerModel``.
        src_sequence: List of input-vocabulary indices for one question.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted output tokens joined by single spaces (SOS and EOS
        excluded).
    """
    model.eval()
    # Batch of one: shape (1, src_len).
    src = torch.tensor([src_sequence], device=device)
    token_ids = [output_vocab[SOS_TOKEN]]

    with torch.no_grad():
        for _step in range(max_len):
            prefix = torch.tensor([token_ids], device=device)
            causal = generate_square_subsequent_mask(prefix.size(1))
            src_pad = create_padding_mask(src, input_vocab[PAD_TOKEN])
            prefix_pad = create_padding_mask(prefix, output_vocab[PAD_TOKEN])

            scores = model(src, prefix, src_pad, prefix_pad, causal)
            # Only the prediction at the last position extends the prefix.
            next_id = scores.argmax(-1)[:, -1].item()
            if next_id == output_vocab[EOS_TOKEN]:
                break
            token_ids.append(next_id)

    predicted = token_ids[1:]  # drop the leading SOS
    return " ".join(output_idx_to_token.get(idx, UNK_TOKEN) for idx in predicted)

# Evaluation function
def evaluate(model, dataset):
    """Return the exact-match accuracy of greedy decoding on ``dataset``.

    Predicted and gold token strings are compared after ``normalize_sql``.
    Gold indices missing from the inverse vocabulary are rendered as UNK.

    Args:
        model: Model passed through to ``infer``.
        dataset: Sequence of ``(question_ids, sql_ids)`` index lists.

    Returns:
        Fraction of exact matches in ``[0, 1]``; ``0.0`` for an empty dataset
        (instead of raising ZeroDivisionError).
    """
    if not dataset:
        return 0.0

    correct = 0
    for q_tokens, sql_tokens in dataset:
        pred_sql = infer(model, q_tokens)
        gold_sql = " ".join(output_idx_to_token.get(idx, UNK_TOKEN) for idx in sql_tokens)
        if normalize_sql(pred_sql) == normalize_sql(gold_sql):
            correct += 1
    return correct / len(dataset)

# Initialize, train, and evaluate the model.
# lr=1e-3 overrides train()'s default of 1e-4. dev_data is passed in but
# train() does not use it, so no dev accuracy is printed during training.
transformer_model = TransformerModel(input_vocab_size, output_vocab_size)
train(transformer_model, train_data, dev_data, epochs=10, batch_size=128, lr=1e-3)

# Exact-match accuracy (after SQL normalization) on the two cached test sets.
acc_question = evaluate(transformer_model, test_q_data)
acc_query = evaluate(transformer_model, test_s_data)
print(f"Transformer - Question Split Test Accuracy: {acc_question*100:.2f}%")
print(f"Transformer - Query Split Test Accuracy: {acc_query*100:.2f}%")

Epoch 1/10, Loss: 0.5655
Epoch 2/10, Loss: 0.0832
Epoch 3/10, Loss: 0.0768
Epoch 4/10, Loss: 0.0748
Epoch 5/10, Loss: 0.0736
Epoch 6/10, Loss: 0.0707
Epoch 7/10, Loss: 0.0670
Epoch 8/10, Loss: 0.0634
Epoch 9/10, Loss: 0.0627
Epoch 10/10, Loss: 0.0609
Transformer - Question Split Test Accuracy: 0.45%
Transformer - Query Split Test Accuracy: 0.00%
