In [6]:
import pandas as pd
import numpy as np   


def load_data(file_path,selected_columns = ["db_id","question","query"]):
    
    df = pd.read_parquet(file_path,columns=selected_columns)
    X_train = df[["db_id","question"]]
    y_train = df[["query"]]
    return X_train,y_train

X_train, y_train = load_data("~/ML_Projects/text-sql/data/train/0000.parquet")

X_train['question'] = (X_train["db_id"] +" "+ X_train['question'])
X_train = X_train.drop('db_id',axis=1)
y_train = y_train['query'].to_list()
X_train = X_train['question'].to_list()


In [12]:
import tiktoken
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
enc = tiktoken.get_encoding("cl100k_base")
import pandas as pd 
import numpy as np





print(X_train[:10])
print(y_train[:10])


X_train_sequences = []
for sent in X_train:
    print(enc.encode(sent))
    X_train_sequences.append(enc.encode(sent))

# X_train_sequences
X_train_padded = pad_sequences(X_train_sequences, padding='post')
# print(f"X_train_padded: {X_train_padded}")

y_train_sequences = []
for sent in y_train:
    y_train_sequences.append(enc.encode(sent))
# print(f"y_train_sequence: {y_train_sequences}")


y_train_padded = pad_sequences(y_train_sequences, padding='post',maxlen=X_train_padded.shape[1])
# print(f"y_train_padded: {y_train_padded}")



vocab_size_enc = enc.n_vocab
print(f'vocab_size: {vocab_size_enc}')




from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

embedding_dim = 128
units = 256

model = Sequential([
    Embedding(input_dim=vocab_size_enc, output_dim=embedding_dim, input_length=X_train_padded.shape[1]),
    LSTM(units, return_sequences=True),
    Dense(units=vocab_size_enc, activation='softmax')
])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# history = model.fit(X_train_padded, y_train_padded, epochs=30, batch_size=32, validation_split=0.2)



['department_management How many heads of the departments are older than 56 ?', 'department_management List the name, born state and age of the heads of departments ordered by age.', 'department_management List the creation year, name and budget of each department.', 'department_management What are the maximum and minimum budget of the departments?', 'department_management What is the average number of employees of the departments whose rank is between 10 and 15?', 'department_management What are the names of the heads who are born outside the California state?', "department_management What are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?", 'department_management What are the names of the states where at least 3 heads were born?', 'department_management In which year were most departments established?', "department_management Show the name and number of employees for the departments managed by heads whose temporary acting value is 'Yes'

In [123]:
model.save('/Users/jagpreetsingh/ML_Projects/text-sql/artifacts/tiktoken-enc.h5')

  saving_api.save_model(


In [165]:
padding_index = 0
question_sequence = enc.encode("How many heads of the departments are older than 56 ?")
# print(f"question_sequence: {question_sequence}")
question_padded = pad_sequences([question_sequence], padding='post',maxlen=X_train_padded.shape[1])
# print(f"question_padded: {question_padded}")
predicted_sequence = model.predict(question_padded)
# print(f"predicted_sequence: {predicted_sequence}")
predicted_indices = np.argmax(predicted_sequence, axis=-1)
predicted_indices[predicted_sequence.argmax(axis=-1) == padding_index] = padding_index
predicted_query = enc.decode(predicted_indices[0])
# predicted_sql_query = enc.decode(predicted_sequence)
print(f"actual query: SELECT count(*) FROM head WHERE age > 56 ")
predicted_query

actual query: SELECT count(*) FROM head WHERE age > 56 


'!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [18]:
def generate_sql(question,padding_index = 0):

    question_sequence = enc.encode(question)
    # print(f"question_sequence: {question_sequence}")
    question_padded = pad_sequences([question_sequence], padding='post',maxlen=X_train_padded.shape[1])
    # print(f"question_padded: {question_padded}")
    predicted_sequence = model.predict(question_padded)
    # print(f"predicted_sequence: {predicted_sequence}")
    predicted_indices = np.argmax(predicted_sequence, axis=-1)
    predicted_indices[predicted_sequence.argmax(axis=-1) == padding_index] = padding_index
    predicted_query = enc.decode(predicted_indices[0])
    # predicted_sql_query = enc.decode(predicted_sequence)
    return predicted_query

In [19]:
from tensorflow.keras.models import load_model


model = load_model('/Users/jagpreetsingh/ML_Projects/text-sql/artifacts/tiktoken-enc.h5')

In [20]:
X_val, y_val = load_data('/Users/jagpreetsingh/ML_Projects/text-sql/data/validation/validation-00000-of-00001.parquet')

In [21]:
X_val.head() ,y_val.head()

(            db_id                                           question
 0  concert_singer                       How many singers do we have?
 1  concert_singer               What is the total number of singers?
 2  concert_singer  Show name, country, age for all singers ordere...
 3  concert_singer  What are the names, countries, and ages for ev...
 4  concert_singer  What is the average, minimum, and maximum age ...,
                                                query
 0                        SELECT count(*) FROM singer
 1                        SELECT count(*) FROM singer
 2  SELECT name ,  country ,  age FROM singer ORDE...
 3  SELECT name ,  country ,  age FROM singer ORDE...
 4  SELECT avg(age) ,  min(age) ,  max(age) FROM s...)

In [22]:
X_val.loc[0,"question"]

'How many singers do we have?'

In [23]:
generate_sql("How many singers do we have?")



'SELECT count FROM FROM!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'

In [24]:
for i in range(11):
    question = X_val.loc[i,"question"]
    
    print(f'Question : {question}')
    
    print(f"Actual Query: {y_val.loc[i,'query']}")
    predicted_query = generate_sql(X_val.loc[i,"question"])
    print(f"Predicted Query: {predicted_query}\n")
    

Question : How many singers do we have?
Actual Query: SELECT count(*) FROM singer


Predicted Query: SELECT count FROM FROM!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Question : What is the total number of singers?
Actual Query: SELECT count(*) FROM singer
Predicted Query:  perpetratorensseseHAensSELECT count_Name FROM FROM BY!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Question : Show name, country, age for all singers ordered by age from the oldest to the youngest.
Actual Query: SELECT name ,  country ,  age FROM singer ORDER BY age DESC
Predicted Query: SELECTSELECT T FROM ,  BY! T!!!! T T T2 T T T          T T BY!!!!!!!!!!!!!!!!!!!!!!

Question : What are the names, countries, and ages for every singer in descending order of age?
Actual Query: SELECT name ,  country ,  age FROM singer ORDER BY age DESC
Predicted Query:  perpetratorensensSELECT T FROM FROM , FROM FROM FROM FROM!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Question : What is the average, minimum, and maximum age of all singers from France?
Actual Query: SELECT avg(age) ,  min(age) ,  max(age) F