# Import Necessary Libraries

In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam




In [2]:
def load_queries_from_json(file_path):
    """Load query identifiers and query texts from a JSON file.

    The file must contain a JSON array of objects, each providing a
    "query number" key and a "query" key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``(query_ids, queries)`` tuple of two parallel lists, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If an entry lacks "query number" or "query".
    """
    # The context manager closes the handle deterministically; the previous
    # `json.load(open(...))` left that to the garbage collector.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        queries_json = json.load(json_file)
    query_ids = [item["query number"] for item in queries_json]
    queries = [item["query"] for item in queries_json]
    return query_ids, queries

In [3]:
def preprocess_queries(queries):
    """Join all queries into one text and cut it into sentences at every period.

    The queries are concatenated with single spaces and the combined string is
    split on '.'. The pieces keep their surrounding whitespace, and a trailing
    period yields a final empty string.
    """
    return ' '.join(queries).split('.')

In [4]:
def tokenize_queries(queries_complete):
    """Return a new ``Tokenizer`` that has been fitted on the given texts."""
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(queries_complete)
    return fitted_tokenizer

In [5]:
def generate_input_sequences(tokenizer, queries_complete):
    """Build the prefix sequences used as training samples.

    Each query is encoded on its own with ``tokenizer.texts_to_sequences``.
    Every leading slice of the encoded query that holds at least two tokens
    becomes one sample, so a query encoded to fewer than two tokens adds nothing.
    """
    prefixes = []
    for text in queries_complete:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Slice ends run from 2 up to the full length, shortest prefix first.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes

In [6]:
def preprocess_input_sequences(input_sequences, max_seq_len):
    """Pad the sequences to ``max_seq_len`` with ``padding='pre'`` and return a NumPy array."""
    padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
    return np.array(padded)

In [7]:
def create_model_and_train(x_sequences, y_sequences, total_words, max_seq_len, n_epochs=200):
    """Build the bidirectional-LSTM next-word model, fit it, and return it.

    Args:
        x_sequences: Model inputs passed to ``fit``; the embedding layer is
            declared with an input length of ``max_seq_len - 1``.
        y_sequences: Targets passed to ``fit``; the model is compiled with
            categorical cross-entropy and ends in a ``total_words``-wide softmax.
        total_words: Size of the embedding vocabulary and of the output layer.
        max_seq_len: Padded sequence length including the label token.
        n_epochs: Number of training epochs.

    Returns:
        The fitted ``Sequential`` model.
    """
    layer_stack = [
        Embedding(total_words, 16, input_length=max_seq_len - 1),
        Bidirectional(LSTM(50)),
        Dense(total_words, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.summary()
    # Author's note: run at least the full 200 epochs to get meaningful completions.
    network.fit(x_sequences, y_sequences, epochs=n_epochs, verbose=1)
    return network

In [8]:
def complete_query(model, tokenizer, reverse_word_index, incomplete_query, next_n_words=1, max_seq_len=0):
    """Extend a partial query by greedily predicting up to ``next_n_words`` words.

    On every step the current text is encoded with ``tokenizer``, padded with
    ``padding='pre'`` to ``max_seq_len - 1`` tokens, and fed to ``model``; the
    word whose index has the highest predicted score is appended to the text.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one score
            vector per input row.
        tokenizer: Object providing ``texts_to_sequences``.
        reverse_word_index: Mapping from word index to word.
        incomplete_query: Text to extend.
        next_n_words: Maximum number of words to append.
        max_seq_len: Padded sequence length used for training (inputs plus the
            label token). Must be at least 2 when any word is to be predicted.

    Returns:
        The query text with the predicted words appended. It holds fewer than
        ``next_n_words`` new words if the model predicts an index that has no
        entry in ``reverse_word_index``.

    Raises:
        ValueError: If a prediction is requested while ``max_seq_len`` is below 2.
    """
    # Fail early with a clear message: the default of 0 would otherwise ask
    # pad_sequences for a negative length.
    if next_n_words > 0 and max_seq_len < 2:
        raise ValueError(
            "max_seq_len must be the padded training sequence length (>= 2), "
            f"got {max_seq_len}"
        )

    seed_text = incomplete_query

    for _ in range(next_n_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')

        # Scores for every vocabulary index, for the single row in the batch.
        predictions = model.predict(token_list, verbose=0)[0]

        # Greedy choice: take the index with the highest score.
        predicted_index = int(np.argmax(predictions))

        # An index without a word (e.g. 0, which Keras reserves for padding)
        # cannot be rendered; stop instead of raising KeyError.
        if predicted_index not in reverse_word_index:
            break
        seed_text += " " + reverse_word_index[predicted_index]
    return seed_text

In [9]:
# Load the query ids and query texts from cran_queries.json (path is relative
# to the notebook's working directory) and report how many queries were read.
query_ids, queries = load_queries_from_json("cran_queries.json")
print("Number of queries:", len(queries))

Number of queries: 225


In [10]:
# Preprocess queries: merge all query texts and split the result on '.'
sentences = preprocess_queries(queries)

In [11]:
# Tokenize queries: fit the tokenizer vocabulary on the sentences
tokenizer = tokenize_queries(sentences)
# One more than the number of known words; used below as the class count and embedding size
total_words = len(tokenizer.word_index) + 1

In [12]:
# Generate input sequences: every prefix (two tokens or longer) of each encoded sentence
input_sequences = generate_input_sequences(tokenizer, sentences)

In [13]:
# Pad every prefix sequence to the length of the longest one
max_sequence_length = max(map(len, input_sequences))
padded_sequences = preprocess_input_sequences(input_sequences, max_sequence_length)

In [14]:
# Create predictors and labels: the last column of each padded row is the
# label, all preceding columns are the model input.
predictors, labels = padded_sequences[:, :-1], padded_sequences[:, -1]
# One-hot encode the labels over total_words classes for categorical cross-entropy
one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [15]:
# Build the model and train it for 200 epochs on the predictor/label pairs
trained_model = create_model_and_train(predictors, one_hot_labels, total_words, max_sequence_length, n_epochs=200)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 49, 16)            15296     
                                                                 
 bidirectional (Bidirection  (None, 100)               26800     
 al)                                                             
                                                                 
 dense (Dense)               (None, 956)               96556     
                                                                 
Total params: 138652 (541.61 KB)
Trainable params: 138652 (541.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/200


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/20

In [16]:
# Invert the tokenizer vocabulary so a predicted index can be mapped back to its word
reverse_word_index = {token_id: token for token, token_id in tokenizer.word_index.items()}

In [17]:
# Queries
# Full reference queries, printed next to the predictions for comparison.
original_queries = [
    "to find an approximate correction for thickness in slender thin-wing theory .",
    "why does the compressibility transformation fail to correlate the high speed data for helium and air .",
    "how is the heat transfer downstream of the mass transfer region effected by mass transfer at the nose of a blunted cone .",
    "does transition in the hypersonic wake depend on body geometry and size",
    "what is a criterion that the transonic flow around an airfoil with a round leading edge be validly analyzed by the linearized transonic flow theory ."
]

# Leading fragments of the queries above, in the same order; these are the
# inputs handed to the model for completion.
incomplete_queries = [
    "to find an approximate correction for",
    "why does the compressibility transformation fail to correlate the high",
    "how is the heat transfer downstream of the mass transfer region effected by mass transfer at",
    "does transition in the hypersonic wake",
    "what is a criterion that the transonic flow around an airfoil with a round leading edge be validly analyzed"
]

In [18]:
# Complete each fragment with six predicted words and print it beside the reference query
for idx, incomplete_query in enumerate(incomplete_queries):
    print(f"Processing Query No. {idx+1}:")
    completed_query_n_words = complete_query(
        model=trained_model,
        tokenizer=tokenizer,
        reverse_word_index=reverse_word_index,
        incomplete_query=incomplete_query,
        next_n_words=6,
        max_seq_len=max_sequence_length,
    )
    print("Incomplete query:", incomplete_query)
    print("Predicted Complete query:", completed_query_n_words)
    print("Original Complete query:", original_queries[idx], "\n")

Processing Query No. 1:
Incomplete query: to find an approximate correction for
Predicted Complete query: to find an approximate correction for thickness in slender thin wing theory
Original Complete query: to find an approximate correction for thickness in slender thin-wing theory . 

Processing Query No. 2:
Incomplete query: why does the compressibility transformation fail to correlate the high
Predicted Complete query: why does the compressibility transformation fail to correlate the high speed data for helium and air
Original Complete query: why does the compressibility transformation fail to correlate the high speed data for helium and air . 

Processing Query No. 3:
Incomplete query: how is the heat transfer downstream of the mass transfer region effected by mass transfer at
Predicted Complete query: how is the heat transfer downstream of the mass transfer region effected by mass transfer at the nose of a blunted cone
Original Complete query: how is the heat transfer downstream o