# Environment setup and imports
Import core libraries, set deterministic seeds, and prepare Keras/TensorFlow utilities.

In [1]:
import numpy as np
import pandas as pd
import nltk
!pip install tensorflow
import tensorflow as tf
from tensorflow import keras

# Reproducible randomness: a single call seeds Python's built-in `random`,
# NumPy and TensorFlow, so no source of randomness is left unseeded.
SEED = 42
keras.utils.set_random_seed(SEED)

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.12.19-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.46.3-py3-none-any.whl.metadata (2.4 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.7.



In [2]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, LSTM, Embedding, InputLayer, Bidirectional, TimeDistributed, Input, Concatenate, Reshape, Lambda, Flatten, RepeatVector, Dot
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import os

# Loading dataset
Read parallel NL (natural language) and SQL files into memory and perform basic sanity checks.

In [3]:
# Load parallel NL (natural language) and SQL files. Strip empty lines and trailing whitespace.

def load_lines(path):
    """Read a UTF-8 text file and return its non-blank lines.

    Each returned line has leading and trailing whitespace removed; lines that
    are empty after stripping are dropped.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().splitlines()

    kept = []
    for raw in raw_lines:
        cleaned = raw.strip()
        if cleaned:
            kept.append(cleaned)
    return kept

# Directory holding the parallel corpora; change it here if the files live elsewhere.
DATA_DIR = '/content'

ftrain_x = load_lines(f'{DATA_DIR}/new_train.nl')
ftrain_y = load_lines(f'{DATA_DIR}/new_train.sql')
ftest_x = load_lines(f'{DATA_DIR}/test.nl')
ftest_y = load_lines(f'{DATA_DIR}/test.sql')

# Sanity check: blank lines are dropped from each file independently, so confirm
# the NL and SQL sides still have the same number of lines. A mismatch would
# silently pair questions with the wrong query in the cells that zip them.
assert len(ftrain_x) == len(ftrain_y), (
    f"Train NL/SQL line counts differ: {len(ftrain_x)} vs {len(ftrain_y)}")
assert len(ftest_x) == len(ftest_y), (
    f"Test NL/SQL line counts differ: {len(ftest_x)} vs {len(ftest_y)}")

print(f"Loaded {len(ftrain_x)} train NL lines and {len(ftrain_y)} train SQL lines.")
print(f"Loaded {len(ftest_x)} test NL lines and {len(ftest_y)} test SQL lines.")

Loaded 37113 train NL lines and 37113 train SQL lines.
Loaded 448 test NL lines and 448 test SQL lines.


In [4]:
# Discard every (NL, SQL) pair whose SQL side has more than max_sql_len
# whitespace-separated tokens, so extremely long targets never reach the model.
max_sql_len = 50

train_pairs = [(nl, sql) for nl, sql in zip(ftrain_x, ftrain_y)
               if len(sql.split()) <= max_sql_len]
test_pairs = [(nl, sql) for nl, sql in zip(ftest_x, ftest_y)
              if len(sql.split()) <= max_sql_len]

train_x = [nl for nl, _ in train_pairs]
train_y = [sql for _, sql in train_pairs]
test_x = [nl for nl, _ in test_pairs]
test_y = [sql for _, sql in test_pairs]

print(f"Filtered train pairs: {len(train_x)}")
print(f"Filtered test pairs: {len(test_x)}")

Filtered train pairs: 9401
Filtered test pairs: 186


# Creating Index for NL Data

In [5]:
# NL vocabulary. Ids 0 and 1 are reserved for <PAD> and <UNK>; every distinct
# training word follows in sorted order, which keeps the mapping deterministic.
train_corpus_nl = sorted({token for line in train_x for token in line.split()})

nl_tokens = ['<PAD>', '<UNK>'] + train_corpus_nl
word2idx_nl = {token: idx for idx, token in enumerate(nl_tokens)}
idx2word_nl = dict(enumerate(nl_tokens))

len_x = len(idx2word_nl)
print(f"NL vocab size (including special tokens): {len_x}")

NL vocab size (including special tokens): 741


In [6]:
# Whitespace-tokenize each training question, then replace every token by its
# vocabulary id; tokens missing from the vocabulary fall back to <UNK>.
words_x = [sentence.split() for sentence in train_x]
words_x_idx = [
    [word2idx_nl.get(token, word2idx_nl['<UNK>']) for token in tokens]
    for tokens in words_x
]

# Creating vocabulary for SQL targets
Build deterministic token-index mappings for the SQL (target) side, reserving special tokens (<PAD>, <UNK>, <SOS>, <EOS>).

In [7]:
# SQL vocabulary. The four special tokens take ids 0-3 and the distinct training
# tokens follow in sorted order, so the mapping is identical on every run.
train_corpus_sql = sorted({token for line in train_y for token in line.split()})

special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']
sql_tokens = special_tokens + train_corpus_sql
word2idx_sql = {token: idx for idx, token in enumerate(sql_tokens)}
idx2word_sql = dict(enumerate(sql_tokens))

len_y = len(idx2word_sql)
print(f"SQL vocab size (including special tokens): {len_y}")

SQL vocab size (including special tokens): 397


In [8]:
# Targets are the SQL tokens followed by <EOS>; decoder inputs are the same
# sequence with <SOS> in front (so they also end in <EOS>), i.e. the target is
# the decoder input shifted one step to the left.
words_target = [query.split() + ['<EOS>'] for query in train_y]
words_y = [['<SOS>'] + target for target in words_target]

words_y_idx = [
    [word2idx_sql.get(token, word2idx_sql['<UNK>']) for token in seq]
    for seq in words_y
]
words_target_idx = [
    [word2idx_sql.get(token, word2idx_sql['<UNK>']) for token in seq]
    for seq in words_target
]

In [9]:
# Longest encoder and decoder-input sequences; these become the padded lengths.
max_x = max(map(len, words_x_idx))
max_y = max(map(len, words_y_idx))

print(f"max_x={max_x}, max_y={max_y}, len_x={len_x}, len_y={len_y}")

max_x=23, max_y=52, len_x=741, len_y=397


In [10]:
# (Formatting cell) Small placeholder to separate preprocessing steps

In [11]:
# Right-pad every sequence with the <PAD> id: NL inputs up to max_x, decoder
# inputs and targets up to max_y.
words_x_idx_pad = [
    [*seq, *([word2idx_nl['<PAD>']] * (max_x - len(seq)))]
    for seq in words_x_idx
]
words_y_idx_pad = [
    [*seq, *([word2idx_sql['<PAD>']] * (max_y - len(seq)))]
    for seq in words_y_idx
]
words_target_idx_pad = [
    [*seq, *([word2idx_sql['<PAD>']] * (max_y - len(seq)))]
    for seq in words_target_idx
]

In [12]:
# One-hot encode the padded id matrices with `to_categorical`: NL inputs use the
# NL vocabulary size, decoder inputs and targets use the SQL vocabulary size.
words_x_idx_arr, words_y_idx_arr, words_target_idx_arr = (
    to_categorical(np.asarray(padded), num_classes=vocab_size)
    for padded, vocab_size in (
        (words_x_idx_pad, len_x),
        (words_y_idx_pad, len_y),
        (words_target_idx_pad, len_y),
    )
)

In [13]:
# The previous manual loop-based one-hot encoding has been replaced by `to_categorical` above (faster, although the dense one-hot arrays are still memory-hungry).

In [14]:
# Width of the last axis of the one-hot targets (one slot per SQL vocabulary id).
target_dim = words_target_idx_arr.shape[-1]
print("One-hot vector dimension for targets:", target_dim)

One-hot vector dimension for targets: 397


In [15]:
# Report the shape of each one-hot array, one per line.
print("Converted one-hot arrays shapes:")
for label, arr in (
    ("words_x_idx_arr:", words_x_idx_arr),
    ("words_y_idx_arr:", words_y_idx_arr),
    ("words_target_idx_arr:", words_target_idx_arr),
):
    print(label, arr.shape)

Converted one-hot arrays shapes:
words_x_idx_arr: (9401, 23, 741)
words_y_idx_arr: (9401, 52, 397)
words_target_idx_arr: (9401, 52, 397)


In [16]:
# Shapes of encoder input, decoder input and target, printed on a single line.
print("Shape check (encoder input, decoder input, target):")
print(*(arr.shape for arr in (words_x_idx_arr, words_y_idx_arr, words_target_idx_arr)))

Shape check (encoder input, decoder input, target):
(9401, 23, 741) (9401, 52, 397) (9401, 52, 397)


# Encoder model
Build the encoder that consumes sequences of input one-hot vectors and returns final states (hidden and cell).

In [17]:
# Reference: earlier attempts at using Embedding for encoder. Kept for reference in case you want to switch from one-hot inputs to embeddings.
# Example: enEmbed = Embedding(input_dim=len(idx2word_nl), output_dim=300)
# enLSTM = LSTM(256, return_state=True)

In [18]:
"""
Encoder: processes a sequence of one-hot vectors (timesteps x len_x) and returns the final hidden and cell states.
Input shape: (batch, Tx, len_x) where each timestep is a one-hot vector of size len_x.
"""

# Encoder graph: a variable-length sequence of len_x-wide one-hot vectors goes
# through one LSTM; only its final hidden and cell states are kept (enFinal).
enInput = Input(name='encoder_input', shape=(None, len_x))
enLSTM = LSTM(
    units=64,
    return_sequences=False,
    return_state=True,
    name='encoder_lstm',
)
enOutput, enHiddenState, enCellState = enLSTM(enInput)
enFinal = [enHiddenState, enCellState]


In [19]:
# Quick verification of the symbolic shape of the encoder's cell state.
print(f"Encoder cell-state shape: {enCellState.shape}")

Encoder cell-state shape: (None, 64)


# Decoder model
Configure the decoder to generate output sequences using initial states from the encoder and a softmax output over the target vocabulary.

In [20]:
# Reference: earlier decoder sketches using Embedding; kept as comments for quick experimentation.
# If you move from one-hot inputs to embeddings, uncomment and adapt these.
# deEmbed = Embedding(input_dim=len(idx2word_sql), output_dim=300)
# deLSTM = LSTM(256, return_sequences=True, return_state=True)

In [21]:
'''
Decoder: consumes one-hot vectors per timestep with initial states from the encoder.
Return sequences (outputs over all timesteps), then project to SQL vocabulary with a Dense+softmax.
'''

# Decoder graph: one-hot SQL tokens go through an LSTM whose initial state is
# the encoder's final [hidden, cell] pair; each timestep's output is projected
# onto the SQL vocabulary with a softmax Dense layer.
deInput = Input(name='decoder_input', shape=(None, len_y))
deLSTM = LSTM(
    units=64,
    return_state=True,
    return_sequences=True,
    name='decoder_lstm',
)
deOutput, deHiddenState, deCellState = deLSTM(deInput, initial_state=enFinal)

deDense = Dense(units=len_y, activation='softmax', name='decoder_output_dense')
deDenseOutput = deDense(deOutput)

# Symbolic shape of the per-timestep vocabulary distribution.
print(f'Decoder output shape: {deDenseOutput.shape}')

Decoder output shape: (None, None, 397)


In [22]:
# Training model: encoder input + decoder input -> per-timestep SQL token distribution.
model = Model(
    name='seq2seq_model',
    inputs=[enInput, deInput],
    outputs=[deDenseOutput],
)
model.summary()

In [23]:
# Compile and train the seq2seq model with Adam (learning rate 1e-3) and early
# stopping on validation loss (patience 3, best weights restored).
callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): padded timesteps are ordinary <PAD> one-hot targets and no mask
# or sample weights are passed, so the reported loss/accuracy include padding.
# Keep the History object so the learning curves can be inspected later; assigning
# it also stops its repr from being echoed as the cell output.
history = model.fit(
    x=[words_x_idx_arr, words_y_idx_arr],
    y=words_target_idx_arr,
    batch_size=64,
    validation_split=0.2,
    epochs=20,
    verbose=1,
    shuffle=True,
    callbacks=callbacks,
)

Epoch 1/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 91ms/step - accuracy: 0.2583 - loss: 4.6718 - val_accuracy: 0.3879 - val_loss: 2.5067
Epoch 2/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 77ms/step - accuracy: 0.4265 - loss: 2.3528 - val_accuracy: 0.5301 - val_loss: 2.0333
Epoch 3/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 76ms/step - accuracy: 0.5624 - loss: 1.9156 - val_accuracy: 0.6121 - val_loss: 1.7016
Epoch 4/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 76ms/step - accuracy: 0.6369 - loss: 1.5722 - val_accuracy: 0.7602 - val_loss: 1.3376
Epoch 5/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 78ms/step - accuracy: 0.8121 - loss: 1.1697 - val_accuracy: 0.8273 - val_loss: 0.9790
Epoch 6/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 78ms/step - accuracy: 0.8475 - loss: 0.8327 - val_accuracy: 0.8515 - val_loss: 0.7827
Epoch 7/20
[1m118/11

<keras.src.callbacks.history.History at 0x7fc033475700>

# Inference models
Build lightweight encoder/decoder models for step-by-step decoding during inference (greedy decoding below).

In [24]:
# Inference-time encoder: maps an input sequence to the final [hidden, cell]
# states, reusing the layers already wired up for training.
enInfModel = Model(
    name='encoder_inference',
    inputs=enInput,
    outputs=enFinal,
)

In [25]:
# Quick verification of the decoder input's symbolic shape.
print(f'Decoder input symbolic shape: {deInput.shape}')

Decoder input symbolic shape: (None, None, 397)


In [26]:
# Decoder inference model: consumes ONE timestep (a one-hot SQL token) plus the
# previous hidden/cell states and returns the next-token distribution together
# with the updated states. It reuses the trained deLSTM and deDense layers.

# Take the state width from the trained decoder LSTM instead of repeating the
# literal, so the inference inputs cannot drift out of sync with the layer.
decoder_units = deLSTM.units

deInfInput = Input(shape=(1, len_y), name='decoder_infer_input')
deInfHiddenInput = Input(shape=(decoder_units,), name='decoder_infer_hidden')
deInfCellInput = Input(shape=(decoder_units,), name='decoder_infer_cell')

deInfOutput, deInfHiddenOutput, deInfCellOutput = deLSTM(
    deInfInput, initial_state=[deInfHiddenInput, deInfCellInput])
deInfOutput2 = deDense(deInfOutput)

deInfModel = Model(inputs=[deInfInput, deInfHiddenInput, deInfCellInput],
                   outputs=[deInfOutput2, deInfHiddenOutput, deInfCellOutput],
                   name='decoder_inference')

# Test data
Tokenize and encode test data using the same vocabulary and padding scheme as training data.

In [27]:
# Whitespace-tokenize the test questions and map tokens to NL vocabulary ids
# (<UNK> for unseen words), keeping at most max_x tokens per question.
test_words_x = [sentence.split() for sentence in test_x]
test_words_x_idx = [
    [word2idx_nl.get(token, word2idx_nl['<UNK>']) for token in tokens[:max_x]]
    for tokens in test_words_x
]


In [28]:
# Test-side decoder inputs (<SOS> + tokens + <EOS>) and targets (tokens + <EOS>),
# mapped to SQL vocabulary ids; tokens unseen in training become <UNK>.
test_words_target = [query.split() + ['<EOS>'] for query in test_y]
test_words_y = [['<SOS>'] + target for target in test_words_target]

test_words_y_idx = [
    [word2idx_sql.get(token, word2idx_sql['<UNK>']) for token in seq]
    for seq in test_words_y
]
test_words_target_idx = [
    [word2idx_sql.get(token, word2idx_sql['<UNK>']) for token in seq]
    for seq in test_words_target
]


In [29]:
# Right-pad the test sequences with <PAD> to the lengths used for training
# (max_x for NL inputs, max_y for decoder inputs and targets).
test_words_x_idx_pad = [
    [*seq, *([word2idx_nl['<PAD>']] * (max_x - len(seq)))]
    for seq in test_words_x_idx
]

test_words_y_idx_pad = [
    [*seq, *([word2idx_sql['<PAD>']] * (max_y - len(seq)))]
    for seq in test_words_y_idx
]

test_words_target_idx_pad = [
    [*seq, *([word2idx_sql['<PAD>']] * (max_y - len(seq)))]
    for seq in test_words_target_idx
]


In [30]:
# One-hot encode the padded test id matrices with `to_categorical`, using the
# same vocabulary sizes as for the training arrays.
test_words_x_idx_arr, test_words_y_idx_arr, test_words_target_idx_arr = (
    to_categorical(np.asarray(padded), num_classes=vocab_size)
    for padded, vocab_size in (
        (test_words_x_idx_pad, len_x),
        (test_words_y_idx_pad, len_y),
        (test_words_target_idx_pad, len_y),
    )
)


In [31]:
# Shapes of the test arrays; the trailing two dimensions should match training.
print(
    "Test arrays shapes:",
    *(arr.shape for arr in (test_words_x_idx_arr, test_words_y_idx_arr, test_words_target_idx_arr)),
)

Test arrays shapes: (186, 23, 741) (186, 52, 397) (186, 52, 397)


In [32]:
# Next: evaluate model on the test set and inspect sample predictions (see following cells)

In [33]:
# The model consumes (samples, timesteps, vocab) one-hot tensors, which is
# exactly what `to_categorical` produced for the test set. Reshaping them to
# (-1, 1, max_x) / (-1, 1, max_y) would scramble the one-hot vectors into
# arrays the model cannot accept, so this cell only verifies the layout and
# leaves the arrays untouched (which also makes it safe to re-run).
assert test_words_x_idx_arr.shape[1:] == (max_x, len_x), test_words_x_idx_arr.shape
assert test_words_y_idx_arr.shape[1:] == (max_y, len_y), test_words_y_idx_arr.shape

In [38]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Rebuild the one-hot test inputs from the padded id lists so this cell does not
# depend on the current state of the arrays (requires test_words_x_idx_pad,
# test_words_y_idx_pad, len_x and len_y to be in scope).
test_words_x_idx_arr, test_words_y_idx_arr = (
    to_categorical(np.asarray(padded), num_classes=vocab_size)
    for padded, vocab_size in (
        (test_words_x_idx_pad, len_x),
        (test_words_y_idx_pad, len_y),
    )
)

# Evaluate with the gold decoder inputs; eval_result is [loss, accuracy].
eval_result = model.evaluate(
    x=[test_words_x_idx_arr, test_words_y_idx_arr],
    y=test_words_target_idx_arr,
    verbose=1,
)
print(f"Test loss: {eval_result[0]:.4f}, Test accuracy: {eval_result[1] * 100:.2f}%")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5035 - loss: 4.3345
Test loss: 4.1707, Test accuracy: 51.75%


In [39]:
# Report the evaluation accuracy as a percentage rounded to two decimals.
accuracy = 100 * eval_result[1]
print("The test accuracy of the Encode-Decoder Model is {}%".format(round(accuracy, 2)))

The test accuracy of the Encode-Decoder Model is 51.75%
