In [40]:
import tiktoken
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Masking

import mlflow



enc = tiktoken.get_encoding("cl100k_base")
import pandas as pd 
import numpy as np

def load_data(file_path):
    selected_columns = ["db_id","question","query"]
    df = pd.read_parquet(file_path,columns=selected_columns)
    X_train = df[["db_id","question"]]
    y_train = df[["query"]]
    return X_train,y_train

X_train, y_train = load_data("~/ML_Projects/text-sql/data/train/0000.parquet")

X_train['question'] = (X_train["db_id"] +" "+ X_train['question'])
X_train = X_train.drop('db_id',axis=1)
y_train = y_train['query'].to_list()
X_train = X_train['question'].to_list()




print(f'X_train: {X_train[:2]}')
print(f'y_train: {y_train[:2]}')


X_train_sequences = []
for sent in X_train[:11]:
    print(enc.encode(sent))
    X_train_sequences.append(enc.encode(sent))
X_train_padded = pad_sequences(X_train_sequences, padding='post')
y_train_sequences = []
for sent in y_train[:11]:
    y_train_sequences.append(enc.encode(sent))
y_train_padded = pad_sequences(y_train_sequences, padding='post',maxlen=X_train_padded.shape[1])

vocab_size_enc = enc.n_vocab
print(f'vocab_size: {vocab_size_enc}\n')
print(f'X_train_padded{X_train_padded[:2]}\n')
print(f"y_train_padded: {y_train_padded[:2]}\n")

X_train: ['department_management How many heads of the departments are older than 56 ?', 'department_management List the name, born state and age of the heads of departments ordered by age.']
y_train: ['SELECT count(*) FROM head WHERE age  >  56', 'SELECT name ,  born_state ,  age FROM head ORDER BY age']
[28414, 46463, 2650, 1690, 14971, 315, 279, 26280, 527, 9191, 1109, 220, 3487, 949]
[28414, 46463, 1796, 279, 836, 11, 9405, 1614, 323, 4325, 315, 279, 14971, 315, 26280, 11713, 555, 4325, 13]
[28414, 46463, 1796, 279, 9886, 1060, 11, 836, 323, 8199, 315, 1855, 9476, 13]
[28414, 46463, 3639, 527, 279, 7340, 323, 8187, 8199, 315, 279, 26280, 30]
[28414, 46463, 3639, 374, 279, 5578, 1396, 315, 8420, 315, 279, 26280, 6832, 7222, 374, 1990, 220, 605, 323, 220, 868, 30]
[28414, 46463, 3639, 527, 279, 5144, 315, 279, 14971, 889, 527, 9405, 4994, 279, 7188, 1614, 30]
[28414, 46463, 3639, 527, 279, 12742, 9886, 1667, 315, 279, 26280, 9152, 555, 264, 19607, 9405, 304, 1614, 364, 98911, 71090]


In [41]:
import mlflow

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


In [35]:
# mlflow.end_run()

In [43]:


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking, Dropout
from tensorflow.keras.callbacks import EarlyStopping




embedding_dim = 128
units = 128
params = {'input_dim':vocab_size_enc,
          'output_dim':embedding_dim,
          'units':units,
          'activation':'softmax',
          'optimizer':'adam',
          'loss':'sparse_categorical_crossentropy',
          'metrics':'accuracy'
          }

# print(params.keys())
# with mlflow.start_run():
#     mlflow.log_param(params)
    

# Assuming 0 is the padding value
model = Sequential([
    Masking(mask_value=0),
    Embedding(input_dim=vocab_size_enc, output_dim=embedding_dim),
    LSTM(units, return_sequences=True),
    # Dropout(0.2),  # Adjust the dropout rate as needed
    Dense(units=vocab_size_enc, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with Dropout and EarlyStopping
history = model.fit(
    X_train_padded,
    y_train_padded,
    epochs=30,
    batch_size=32,
    validation_split=0.3,
    callbacks=[early_stopping]  # Add EarlyStopping callback
)



TypeError: log_param() missing 1 required positional argument: 'value'

In [6]:
def generate_sql(question,padding_index = 0):

    question_sequence = enc.encode(question)
    # print(f"question_sequence: {question_sequence}")
    question_padded = pad_sequences([question_sequence], padding='post')
    # print(f"question_padded: {question_padded}")
    predicted_sequence = model.predict(question_padded)
    # print(f"predicted_sequence: {predicted_sequence}")
    predicted_indices = np.argmax(predicted_sequence, axis=-1)
    predicted_indices[predicted_sequence.argmax(axis=-1) == padding_index] = padding_index
    predicted_query = enc.decode(predicted_indices[0])
    # predicted_sql_query = enc.decode(predicted_sequence)
    return predicted_query

In [8]:

print(generate_sql("How many heads of the departments are older than 56 ?"))

!!!!!!!!!!!!


In [10]:
X_val,y_val = load_data('/Users/jagpreetsingh/ML_Projects/text-sql/data/validation/validation-00000-of-00001.parquet')

In [12]:
X_val.head(), y_val.head()

(            db_id                                           question
 0  concert_singer                       How many singers do we have?
 1  concert_singer               What is the total number of singers?
 2  concert_singer  Show name, country, age for all singers ordere...
 3  concert_singer  What are the names, countries, and ages for ev...
 4  concert_singer  What is the average, minimum, and maximum age ...,
                                                query
 0                        SELECT count(*) FROM singer
 1                        SELECT count(*) FROM singer
 2  SELECT name ,  country ,  age FROM singer ORDE...
 3  SELECT name ,  country ,  age FROM singer ORDE...
 4  SELECT avg(age) ,  min(age) ,  max(age) FROM s...)

In [None]:
# from nltk.translate.bleu_score import sentence_bleu

# # Calculate BLEU score for each pair of true and generated queries
# bleu_scores = [sentence_bleu([true_query.split()], generated_query.split()) for true_query, generated_query in zip(y_test, decoded_predictions)]

# # Print average BLEU score
# average_bleu_score = sum(bleu_scores) / len(bleu_scores)
# print(f"Average BLEU Score: {average_bleu_score}")
 