In [2]:
import pandas as pd


train_df = pd.read_csv('/Users/jagpreetsingh/ML_Projects/text-sql/data/train/wikisql_train.csv')
train_df.head()

Unnamed: 0,question,sql,table_header,table_header_types,conds,rows
0,Tell me what the notes are for South Australia,SELECT Notes FROM table WHERE Current slogan =...,"['State/territory', 'Text/background colour', ...","['text', 'text', 'text', 'text', 'text', 'text']",['SOUTH AUSTRALIA'],"[['Australian Capital Territory', 'blue/white'..."
1,What is the current series where the new serie...,SELECT Current series FROM table WHERE Notes =...,"['State/territory', 'Text/background colour', ...","['text', 'text', 'text', 'text', 'text', 'text']",['New series began in June 2011'],"[['Australian Capital Territory', 'blue/white'..."
2,What is the format for South Australia?,SELECT Format FROM table WHERE State/territory...,"['State/territory', 'Text/background colour', ...","['text', 'text', 'text', 'text', 'text', 'text']",['South Australia'],"[['Australian Capital Territory', 'blue/white'..."
3,Name the background colour for the Australian ...,SELECT Text/background colour FROM table WHERE...,"['State/territory', 'Text/background colour', ...","['text', 'text', 'text', 'text', 'text', 'text']",['Australian Capital Territory'],"[['Australian Capital Territory', 'blue/white'..."
4,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...,"['Order Year', 'Manufacturer', 'Model', 'Fleet...","['text', 'text', 'text', 'text', 'text', 'text']",['CNG'],"[['1992-93', 'Gillig', 'Phantom (High Floor)',..."


In [3]:
train_df.columns

Index(['question', 'sql', 'table_header', 'table_header_types', 'conds',
       'rows'],
      dtype='object')

In [4]:
train_df.question.head(), train_df.sql.head(), train_df.conds.head()

(0      Tell me what the notes are for South Australia 
 1    What is the current series where the new serie...
 2              What is the format for South Australia?
 3    Name the background colour for the Australian ...
 4        how many times is the fuel propulsion is cng?
 Name: question, dtype: object,
 0    SELECT Notes FROM table WHERE Current slogan =...
 1    SELECT Current series FROM table WHERE Notes =...
 2    SELECT Format FROM table WHERE State/territory...
 3    SELECT Text/background colour FROM table WHERE...
 4    SELECT COUNT Fleet Series (Quantity) FROM tabl...
 Name: sql, dtype: object,
 0                  ['SOUTH AUSTRALIA']
 1    ['New series began in June 2011']
 2                  ['South Australia']
 3     ['Australian Capital Territory']
 4                              ['CNG']
 Name: conds, dtype: object)

In [73]:
train_df['table_header'].loc[0]

"['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes']"

In [5]:
train_df['rows'].loc[0]

"[['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'black/white', 'aaa·nna', 'NSW', 'CPX·12A', 'Optional white slimline series'], ['Northern Territory', 'ochre/white', 'Ca·nn·aa', 'NT · OUTBACK AUSTRALIA', 'CB·06·ZZ', 'New series began in June 2011'], ['Queensland', 'maroon/white', 'nnn·aaa', 'QUEENSLAND · SUNSHINE STATE', '999·TLG', 'Slogan embossed on plate'], ['South Australia', 'black/white', 'Snnn·aaa', 'SOUTH AUSTRALIA', 'S000·AZD', 'No slogan on current series'], ['Victoria', 'blue/white', 'aaa·nnn', 'VICTORIA - THE PLACE TO BE', 'ZZZ·562', 'Current series will be exhausted this year']]"

In [None]:
## filter query with length 10

In [28]:
train_df = train_df[train_df['sql'].apply(lambda x: len(x.split()) < 10)]

train_df.shape

(11019, 6)

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Masking, Concatenate
from tensorflow.keras.losses import SparseCategoricalCrossentropy


# Assuming you have a DataFrame df with 'question', 'table_header', and 'SQL' columns
# Make sure to preprocess the data accordingly (e.g., lowercasing, tokenization)
MAX_LEN = 20
# Tokenize the natural language input (question)
tokenizer_question = Tokenizer()
tokenizer_question.fit_on_texts(train_df['question'])
question_sequences = tokenizer_question.texts_to_sequences(train_df['question'])
max_len_question = max(len(seq) for seq in question_sequences)
padded_question = pad_sequences(question_sequences, padding='post', maxlen=MAX_LEN)


# Tokenize the context input (table header)
tokenizer_context = Tokenizer()
tokenizer_context.fit_on_texts(train_df['table_header'])
context_sequences = tokenizer_context.texts_to_sequences(train_df['table_header'])
max_len_context = max(len(seq) for seq in context_sequences)
padded_context = pad_sequences(context_sequences, padding='post', maxlen=MAX_LEN)

# Tokenize the target (SQL)
tokenizer_sql = Tokenizer()
tokenizer_sql.fit_on_texts(train_df['sql'])
sql_sequences = tokenizer_sql.texts_to_sequences(train_df['sql'])
max_len_sql = max(len(seq) for seq in sql_sequences)
padded_sql = pad_sequences(sql_sequences, padding='post', maxlen=MAX_LEN)

# Vocabulary sizes
vocab_size_question = len(tokenizer_question.word_index) + 1
vocab_size_context = len(tokenizer_context.word_index) + 1
vocab_size_sql = len(tokenizer_sql.word_index) + 1

In [32]:
print(f"pad_question: {padded_question[0]}\n")
print(f"pad_context: {padded_context[0]}\n\n")
print(f"pad_sql: {padded_sql[0]}\n")



pad_question: [ 14   3   1 412   8   1 756  41 350   0   0   0   0   0   0   0   0   0
   0   0]

pad_context: [ 614  333  183  236  639  886 4248    1 4249 4250 4251    1  702 4252
    0    0    0    0    0    0]


pad_sql: [  3 301   1   2   4 574  16 251   0   0   0   0   0   0   0   0   0   0
   0   0]



In [33]:
# Model architecture
embedding_dim = 128
units = 128

# Input layers
input_question = Input(shape=(MAX_LEN,))
input_context = Input(shape=(MAX_LEN,))

# Embedding layers
embedding_layer_question = Embedding(input_dim=vocab_size_question, output_dim=embedding_dim, mask_zero=True)(input_question)
embedding_layer_context = Embedding(input_dim=vocab_size_question, output_dim=embedding_dim, mask_zero=True)(input_context)

# LSTM layers
lstm_layer_question = LSTM(units, return_sequences=True)(embedding_layer_question)
lstm_layer_context = LSTM(units, return_sequences=True)(embedding_layer_context)

# Attention layer
attention = Attention()([lstm_layer_question, lstm_layer_context])

# Concatenate the attention output with the LSTM output for context
context_combined = Concatenate(axis=-1)([lstm_layer_context, attention])

# Output layer
output_layer = Dense(vocab_size_sql, activation='softmax')(context_combined)




In [35]:
# Model
model = Model(inputs=[input_question, input_context], outputs=output_layer)
model.compile(optimizer='adam', loss=SparseCategoricalCrossentropy(), metrics=['accuracy'])

# Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split([padded_question[0:100], padded_context[0:100]], padded_sql[0:100], test_size=0.2, random_state=42)



In [37]:
# Train the model
history = model.fit([padded_question,padded_context], padded_sql, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10

W0000 00:00:1706995019.119536       1 op_level_cost_estimator.cc:699] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [62]:
train_df = train_df.reset_index()

In [120]:
def generate_sql_sequence(input_question,input_context):
    # Tokenize and pad the input question
    input_question = tokenizer_question.texts_to_sequences([input_question])
    question_sequence = pad_sequences(input_question, padding='post',maxlen=MAX_LEN)

    # Tokenize and pad the input context
    input_context = tokenizer_context.texts_to_sequences([input_context])
    context_sequence= pad_sequences(input_context, padding='post',maxlen=MAX_LEN)

    predicted_sequence = model.predict([question_sequence,context_sequence])[0]
    # print(predicted_sequence)
    predicted_sql_query = [tokenizer_sql.index_word[idx] for idx in np.argmax(predicted_sequence, axis=-1) if idx != 0]
    return " ".join(predicted_sql_query)

# Example usage


In [117]:
val_df = pd.read_csv('~/ML_Projects/text-sql/data/validation/wikisql_val.csv')

val_df = val_df[val_df['sql'].apply(lambda x: len(x.split()) < 10)].reset_index()

val_df.shape

X_val = val_df[['question','table_header']]
y_val = val_df[['sql']]

In [126]:
for i in range(23,29):
    input_question = val_df.loc[i,'question']
    input_context =  val_df.loc[i,'table_header']
    predicted_sql = generate_sql_sequence(input_question, input_context)
    actual_query = val_df.loc[i, 'sql']

    print(f"Input Question: {input_question}")
    print(f"Input Context: {input_context}\n\n")
    print(f"actual query: {actual_query}\n")
    print(f"Predicted SQL: {predicted_sql}\n")

Input Question: What is the vault score for the total of 56.635?
Input Context: ['Rank', 'Gymnast', 'Floor Exercise', 'Pommel Horse', 'Rings', 'Vault', 'Parallel Bars', 'Horizontal Bar', 'Total']


actual query: SELECT Vault FROM table WHERE Total = 56.635

Predicted SQL: select select from from table where

Input Question: What is the total score when the score for floor exercise was 9.287?
Input Context: ['Rank', 'Gymnast', 'Floor Exercise', 'Pommel Horse', 'Rings', 'Vault', 'Parallel Bars', 'Horizontal Bar', 'Total']


actual query: SELECT Total FROM table WHERE Floor Exercise = 9.287

Predicted SQL: select select from from table where

Input Question:  what's the margin where runner(s)-up is phil mickelson
Input Context: ['Year', 'Championship', '54 holes', 'Winning score', 'Margin', 'Runner(s)-up']


actual query: SELECT Margin FROM table WHERE Runner(s)-up = Phil Mickelson

Predicted SQL: select country from from table up up

Input Question: Which Allied Force targetted Woensdrec

In [125]:
# model.save('../artifacts/lstm-context.h5')

  saving_api.save_model(


In [138]:
class PredictTiktoken():
    
    def __init__(self,model):
        self.model = model

    def inference_prediction(self,input_question:str,input_context:str ,padding_index=0) -> str:

        # Tokenize and pad the input question
        input_question = tokenizer_question.texts_to_sequences([input_question])
        question_sequence = pad_sequences(input_question, padding='post',maxlen=MAX_LEN)

        # Tokenize and pad the input context
        input_context = tokenizer_context.texts_to_sequences([input_context])
        context_sequence= pad_sequences(input_context, padding='post',maxlen=MAX_LEN)

        predicted_sequence = model.predict([question_sequence,context_sequence])[0]
        # print(predicted_sequence)
        predicted_sql_query = [tokenizer_sql.index_word[idx] for idx in np.argmax(predicted_sequence, axis=-1) if idx != 0]
        return " ".join(predicted_sql_query)
            # predicted_sql_query = enc.decode(predicted_sequence)
        
    
    def batch_prediction(self, X_val: pd.DataFrame, y_val: pd.DataFrame):
        pred_queries = []
        true_queries = []
        for i in range(len(X_val)):
            question = X_val.loc[i, "question"]
            context =  X_val.loc[i, "table_header"]
            predicted_query = self.inference_prediction(question,context)
            pred_queries.append(predicted_query)
            true_queries.append(y_val.loc[i, 'sql'])

        return true_queries, pred_queries


In [135]:
from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction

In [142]:
def evaluate(X_val: pd.DataFrame, y_val: pd.DataFrame):
        smoothing_function = SmoothingFunction().method1
        y_test, decoded_predictions = PredictTiktoken(model).batch_prediction(X_val, y_val)
        bleu_scores = [sentence_bleu([true_query.split()], generated_query.split(),smoothing_function=smoothing_function) for true_query, generated_query in zip(y_test, decoded_predictions)]
        return bleu_scores

In [168]:
from rouge import Rouge

def evaluate_sql_to_text_rouge(X_val, y_val):
    """
    Evaluate the SQL-to-text model using ROUGE score.

    Args:
    - model: The trained SQL-to-text model.
    - X_val: Validation set of SQL queries.
    - y_val: Reference texts for evaluation.

    Returns:
    - ROUGE-N score.
    """
    rouge = Rouge()
    rouge_scores = []

    for i in range(len(X_val)):
        predicted_sql = generate_sql_sequence(X_val.loc[i, 'question'], X_val.loc[i, 'table_header'])
        reference_text = y_val.loc[i, 'sql']  # Assuming 'text' is the reference column

        # Calculate ROUGE scores
        scores = rouge.get_scores(predicted_sql, reference_text)
        print(f"score: {scores}")

        rouge_score = scores[0]['rouge-l']['f']

        print(f"rouge_score: {rouge_score}")


        rouge_scores.append(rouge_score)

    # Calculate the average ROUGE score
    avg_rouge_score = sum(rouge_scores) / len(rouge_scores)

    return avg_rouge_score

# Example usage:
# avg_rouge = evaluate_sql_to_text_rouge(model, X_validation, y_validation)
# print(f"Average ROUGE-2 Score: {avg_rouge}")


In [172]:
evaluate_sql_to_text_rouge(X_val=X_val,y_val=y_val)

score: [{'rouge-1': {'r': 0.1111111111111111, 'p': 0.16666666666666666, 'f': 0.1333333285333335}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1111111111111111, 'p': 0.16666666666666666, 'f': 0.1333333285333335}}]
rouge_score: 0.1333333285333335
score: [{'rouge-1': {'r': 0.125, 'p': 0.14285714285714285, 'f': 0.13333332835555575}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.125, 'p': 0.14285714285714285, 'f': 0.13333332835555575}}]
rouge_score: 0.13333332835555575
score: [{'rouge-1': {'r': 0.1111111111111111, 'p': 0.25, 'f': 0.15384614958579892}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1111111111111111, 'p': 0.25, 'f': 0.15384614958579892}}]
rouge_score: 0.15384614958579892
score: [{'rouge-1': {'r': 0.125, 'p': 0.3333333333333333, 'f': 0.18181817785123974}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.125, 'p': 0.3333333333333333, 'f': 0.18181817785123974}}]
rouge_score: 0.18181817785123974
score: [{'rouge-1': {'r

0.13557488570896073

In [145]:
score = evaluate(X_val,y_val)
average_score = sum(score) /len(score)
print(average_score)

0.01768906348727101
