In [1]:
import pandas as pd 
import logging


def load_data(file_path:str, selected_columns:list):
    logging.info("loading data")
    df = pd.read_parquet(file_path)
    X_train = df[["db_id","question"]]
    y_train = df[["query"]]
    logging.info(f"shape of X_train: {X_train.shape}")
    logging.info(f"shape of y_train: {y_train.shape}")
    X_train['question'] = X_train["db_id"] +" "+ X_train['question']
    X_train = X_train.drop('db_id',axis=1)
    
    return X_train,y_train

In [8]:
X_train, y_train = load_data("~/ML_Projects/text-sql/data/train/0000.parquet",selected_columns=["db_id","question","query"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['question'] = X_train["db_id"] +" "+ X_train['question']


In [9]:
X_train.head()

Unnamed: 0,question
0,department_management How many heads of the de...
1,"department_management List the name, born stat..."
2,"department_management List the creation year, ..."
3,department_management What are the maximum and...
4,department_management What is the average numb...


In [12]:
y_train.head(5)

Unnamed: 0,query
0,SELECT count(*) FROM head WHERE age > 56
1,"SELECT name , born_state , age FROM head ORD..."
2,"SELECT creation , name , budget_in_billions ..."
3,"SELECT max(budget_in_billions) , min(budget_i..."
4,SELECT avg(num_employees) FROM department WHER...


In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [14]:
vocab_size = 1000

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [99]:
len_of_question = [len(seq) for seq in X_train['question']]

In [94]:
max_length_X = max([len(seq) for seq in X_train['question']])
print(max_length_X)

232


In [95]:
tokenizer_input = Tokenizer()
tokenizer_input.fit_on_texts(X_train['question'])
X_train_sequences = tokenizer_input.texts_to_sequences(X_train['question'])
X_train_padded = pad_sequences(X_train_sequences, padding='post', maxlen=max_length_X)

In [96]:
X_train_padded[:6], X_train.loc[0:6]

(array([[ 48, 249,  19, ...,   0,   0,   0],
        [ 48, 249,  24, ...,   0,   0,   0],
        [ 48, 249,  24, ...,   0,   0,   0],
        [ 48, 249,   4, ...,   0,   0,   0],
        [ 48, 249,   4, ...,   0,   0,   0],
        [ 48, 249,   4, ...,   0,   0,   0]], dtype=int32),
                                             question
 0  department_management How many heads of the de...
 1  department_management List the name, born stat...
 2  department_management List the creation year, ...
 3  department_management What are the maximum and...
 4  department_management What is the average numb...
 5  department_management What are the names of th...
 6  department_management What are the distinct cr...)

In [77]:
max_length_y = max([len(seq) for _,seq in enumerate(y_train['query'])])
print(max_length_y)

577


In [78]:
# Tokenize target sequences (SQL queries)
tokenizer_output = Tokenizer()
tokenizer_output.fit_on_texts(y_train['query'])
y_train_sequences = tokenizer_output.texts_to_sequences(y_train['query'])
y_train_padded = pad_sequences(y_train_sequences, padding='post',maxlen=max_length_X)

In [79]:
y_train_padded[:6], y_train.loc[:6]

(array([[   4,   12,    3,  480,    7,   41, 2200,    0],
        [ 948,   85,   41,    3,  480,   14,   11,   41],
        [   4, 1145,    8,  151,   30, 1406,    3,   40],
        [  30, 1406,   76,  151,   30, 1406,    3,   40],
        [   3,   40,    7, 1146,  166,  179,   23,  643],
        [   4,    8,    3,  480,    7,  948,   85, 1124]], dtype=int32),
                                                query
 0         SELECT count(*) FROM head WHERE age  >  56
 1  SELECT name ,  born_state ,  age FROM head ORD...
 2  SELECT creation ,  name ,  budget_in_billions ...
 3  SELECT max(budget_in_billions) ,  min(budget_i...
 4  SELECT avg(num_employees) FROM department WHER...
 5  SELECT name FROM head WHERE born_state != 'Cal...
 6  SELECT DISTINCT T1.creation FROM department AS...)

In [80]:
vocab_size_input = len(tokenizer_input.word_index) + 1
print(vocab_size_input)
vocab_size_output = len(tokenizer_output.word_index) + 1
print(vocab_size_output)

3261
2405


In [81]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 250
units = 256

model = Sequential([
    Embedding(input_dim=vocab_size_input, output_dim=embedding_dim, input_length=max_length_X),
    LSTM(units, return_sequences=True),
    Dense(units=vocab_size_output, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [82]:
history = model.fit(X_train_padded, y_train_padded, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [87]:
import numpy as np

In [88]:
def generate_sql_sequence(question):
    question_sequence = tokenizer_input.texts_to_sequences([question])
    print(f"question sequnec: {question_sequence}")
    question_padded = pad_sequences(question_sequence, padding='post', maxlen=max_length_X)
    print(f"question_padded: {question_padded}")
    predicted_sequence = model.predict(question_padded)[0]
    print(f"predicted_sequence: {predicted_sequence}")
    
    predicted_sql_query = [tokenizer_output.index_word[idx] for idx in np.argmax(predicted_sequence, axis=-1) if idx != 0]
    return " ".join(predicted_sql_query)

# Example usage:
new_question = "department_management How many acting statuses are there?"
predicted_sql_query = generate_sql_sequence(new_question)
print("Predicted SQL Query:", predicted_sql_query)

question sequnec: [[48, 249, 19, 21, 1655, 796, 6, 63]]
question_padded: [[  48  249   19   21 1655  796    6   63]]
predicted_sequence: [[5.1949971e-04 3.8198404e-02 7.0495576e-02 ... 1.0635645e-06
  1.0663899e-06 1.1512800e-06]
 [1.2305073e-04 2.7452616e-02 5.1207867e-02 ... 3.7478998e-07
  3.0290491e-07 3.7767256e-07]
 [7.3684455e-04 3.7868481e-02 3.6765717e-02 ... 2.6146660e-08
  1.9799160e-08 2.4678787e-08]
 ...
 [3.0317156e-02 3.2257389e-02 4.3448593e-02 ... 4.4313083e-08
  3.3861834e-08 5.0213348e-08]
 [1.6772658e-01 3.8796687e-03 5.2882684e-03 ... 2.5343448e-07
  1.9333856e-07 2.4185977e-07]
 [2.6479200e-01 5.4874388e-04 8.1779796e-04 ... 8.3105863e-07
  7.2402213e-07 7.7696461e-07]]
Predicted SQL Query: select count from by where count


In [85]:
for i in range(10,16):

    question = list(X_train.iloc[i,:])[0]
    print(f"question: {question}")
    predicted_sql_query = generate_sql_sequence(question)
    actual_query = list(y_train.iloc[i,:])[0]
    
    print(f"actual query: {actual_query}")
    print(f"predicted_sql_query: {predicted_sql_query}")

question: department_management How many acting statuses are there?
question sequnec: [[48, 249, 19, 21, 1655, 796, 6, 63]]
question_padded: [[  48  249   19   21 1655  796    6   63]]
predicted_sequence: [[5.1949971e-04 3.8198404e-02 7.0495576e-02 ... 1.0635645e-06
  1.0663899e-06 1.1512800e-06]
 [1.2305073e-04 2.7452616e-02 5.1207867e-02 ... 3.7478998e-07
  3.0290491e-07 3.7767256e-07]
 [7.3684455e-04 3.7868481e-02 3.6765717e-02 ... 2.6146660e-08
  1.9799160e-08 2.4678787e-08]
 ...
 [3.0317156e-02 3.2257389e-02 4.3448593e-02 ... 4.4313083e-08
  3.3861834e-08 5.0213348e-08]
 [1.6772658e-01 3.8796687e-03 5.2882684e-03 ... 2.5343448e-07
  1.9333856e-07 2.4185977e-07]
 [2.6479200e-01 5.4874388e-04 8.1779796e-04 ... 8.3105863e-07
  7.2402213e-07 7.7696461e-07]]
actual query: SELECT count(DISTINCT temporary_acting) FROM management
predicted_sql_query: select count from by where count
question: department_management How many departments are led by heads who are not mentioned?
question sequn

In [86]:
Embedding(input_dim=vocab_size_input, output_dim=embedding_dim, input_length=X_train_padded.shape[1])

<keras.src.layers.core.embedding.Embedding at 0x150e8f070>

In [15]:
# onehot_repr = [one_hot(words,vocab_size) for words in X_train]

In [16]:
# onehot_repr

[[854]]