In [27]:
import pandas as pd 
import numpy as np

def load_data(file_path):
    selected_columns = ["db_id","question","query"]
    df = pd.read_parquet(file_path,columns=selected_columns)
    X_train = df[["db_id","question"]]
    y_train = df[["query"]]
    return X_train,y_train

X_train, y_train = load_data("data/train/0000.parquet")

X_train['question'] = X_train["db_id"] +" "+ X_train['question']

X_train = X_train.drop('db_id',axis=1)

print(X_train.loc[:10])
print(y_train.loc[:10])



                                             question
0   department_management How many heads of the de...
1   department_management List the name, born stat...
2   department_management List the creation year, ...
3   department_management What are the maximum and...
4   department_management What is the average numb...
5   department_management What are the names of th...
6   department_management What are the distinct cr...
7   department_management What are the names of th...
8   department_management In which year were most ...
9   department_management Show the name and number...
10  department_management How many acting statuses...
                                                query
0          SELECT count(*) FROM head WHERE age  >  56
1   SELECT name ,  born_state ,  age FROM head ORD...
2   SELECT creation ,  name ,  budget_in_billions ...
3   SELECT max(budget_in_billions) ,  min(budget_i...
4   SELECT avg(num_employees) FROM department WHER...
5   SELECT name FROM head WH

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize input sequences (questions)
tokenizer_input = Tokenizer()
tokenizer_input.fit_on_texts(X_train['question'])
X_train_sequences = tokenizer_input.texts_to_sequences(X_train['question'])
X_train_padded = pad_sequences(X_train_sequences, padding='post')

print(f"X_train_padded: {X_train_padded[:10]}")

# Tokenize target sequences (SQL queries)
tokenizer_output = Tokenizer()
tokenizer_output.fit_on_texts(y_train['query'])
y_train_sequences = tokenizer_output.texts_to_sequences(y_train['query'])
y_train_padded = pad_sequences(y_train_sequences, padding='post',maxlen=X_train_padded.shape[1])

print(f"y_train_padded: {y_train_padded[:10]}")

# Vocabulary sizes
vocab_size_input = len(tokenizer_input.word_index) + 1
vocab_size_output = len(tokenizer_output.word_index) + 1

print(f"vocab_size_input: {vocab_size_input}")
print(f"vocab_size_output: {vocab_size_output}")





X_train_padded: [[  48  249   19   21  987    2    1  226    6  403   22 2579    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  48  249   24    1    9 1065  174    3   91    2    1  987    2  226
    82   27   91    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  48  249   24    1 1302   73    9    3  330    2   23   48    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  48  249    4    6    1   86    3  144  330    2    1  226    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  48  249    4   10    1   30   14    2   72    2    1  226   42  152
    10  209  237    3  774    0    0    0    0    0    0    0    0    0
     0    0 

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 128
units = 256

model = Sequential([
    Embedding(input_dim=vocab_size_input, output_dim=embedding_dim, input_length=X_train_padded.shape[1]),
    LSTM(units, return_sequences=True),
    Dense(units=vocab_size_output, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [34]:
model.fit(X_train_padded, y_train_padded, epochs=30, batch_size=32, validation_split=0.2)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x298227fd0>

In [37]:
def generate_sql_sequence(question):
    question_sequence = tokenizer_input.texts_to_sequences([question])
    question_padded = pad_sequences(question_sequence, padding='post', maxlen=X_train_padded.shape[1])
    predicted_sequence = model.predict(question_padded)[0]
    predicted_sql_query = [tokenizer_output.index_word[idx] for idx in np.argmax(predicted_sequence, axis=-1) if idx != 0]
    return " ".join(predicted_sql_query)

# Example usage:
new_question = "department_management How many heads of the departments ?"
predicted_sql_query = generate_sql_sequence(new_question)
print("Predicted SQL Query:", predicted_sql_query)


Predicted SQL Query: select count from temporary where 56


In [None]:
model.save("artifacts/baseline.h5")