In [1]:
from masterlibrary import *

2024-11-26 15:31:49.160745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732615309.177665   73164 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732615309.182322   73164 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 15:31:49.199318: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load data
df = pd.read_csv('../materials/paired_nl_bash.csv', delimiter=',')
input_texts = df['natural_language']
output_texts = df['bash_command']

In [3]:
# Define special tokens
SPECIAL_TOKENS = ['<start>', '<end>', '<unk>', '<pad>']

# Initialize tokenizers
input_tokeniser = Tokenizer(oov_token='<unk>')
output_tokeniser = Tokenizer(oov_token='<unk>')

# Prepare texts
input_tokeniser.fit_on_texts(input_texts)
output_texts_with_tokens = ['<start> ' + text + ' <end>' for text in output_texts]
output_tokeniser.fit_on_texts(output_texts_with_tokens)

# Add special tokens if missing
if '<start>' not in output_tokeniser.word_index:
    print("Warning: Special tokens not in vocabulary. Adding them...")
    current_vocab_size = len(output_tokeniser.word_index)
    for i, token in enumerate(SPECIAL_TOKENS, start=1):
        if token not in output_tokeniser.word_index:
            output_tokeniser.word_index[token] = current_vocab_size + i
            output_tokeniser.index_word[current_vocab_size + i] = token

# Create sequences
input_sequences = pad_sequences(input_tokeniser.texts_to_sequences(input_texts), 
                              padding='post')
output_sequences = pad_sequences(output_tokeniser.texts_to_sequences(output_texts_with_tokens),
                               padding='post')

# Define vocabulary sizes
input_vocabsize = len(input_tokeniser.word_index) + 1
output_vocabsize = len(output_tokeniser.word_index) + 1

# Model parameters
embedding_dim = 128
units = 256



In [4]:
def create_training_model():
    # Encoder
    encoder_inputs = tf.keras.Input(shape=(None,))
    encoder_embedding = Embedding(input_vocabsize, embedding_dim, name='embedding')(encoder_inputs)
    encoder_lstm = LSTM(units, return_state=True, name='lstm')
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

    # Decoder
    decoder_inputs = tf.keras.Input(shape=(None,))
    decoder_embedding = Embedding(output_vocabsize, embedding_dim, name='embedding_1')(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True, return_state=True, name='lstm_1')
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
    decoder_dense = Dense(output_vocabsize, activation='softmax', name='dense')
    output = decoder_dense(decoder_outputs)

    # Create and compile model
    model = Model([encoder_inputs, decoder_inputs], output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [5]:
def create_inference_models(trained_model):
    # Get the layers from trained model
    encoder_inputs = trained_model.input[0]
    encoder_embedding = trained_model.get_layer('embedding')
    encoder_lstm = trained_model.get_layer('lstm')
    
    # Recreate encoder model
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding(encoder_inputs))
    encoder_model = Model(encoder_inputs, [state_h, state_c])
    
    # Decoder setup
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = trained_model.get_layer('embedding_1')
    decoder_lstm = trained_model.get_layer('lstm_1')
    decoder_dense = trained_model.get_layer('dense')
    
    decoder_state_input_h = Input(shape=(units,))
    decoder_state_input_c = Input(shape=(units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs = decoder_embedding(decoder_inputs)
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_outputs, initial_state=decoder_states_inputs
    )
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs, state_h, state_c]
    )
    
    return encoder_model, decoder_model

def generate_command(input_query, max_length=50):
    print("Processing input...")
    # Input preprocessing
    input_seq = input_tokeniser.texts_to_sequences([input_query])
    input_seq = pad_sequences(input_seq, maxlen=input_sequences.shape[1], padding='post')
    
    print("Getting encoder predictions...")
    # Get initial states from encoder (reduce verbosity)
    states_value = encoder_model.predict(input_seq, verbose=0)
    
    # Initialize target sequence
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokeniser.word_index.get('<start>', 1)
    
    decoded_sentence = []
    
    print("Generating command...")
    for i in range(max_length):
        # Reduce prediction verbosity
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, 
            verbose=0
        )
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = output_tokeniser.index_word.get(sampled_token_index, '<unk>')
        
        if sampled_word == '<end>':
            break
            
        if sampled_word not in SPECIAL_TOKENS:
            decoded_sentence.append(sampled_word)
        
        # Update for next iteration
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
        
        if i % 10 == 0:  # Print progress every 10 tokens
            print(f"Generated {i} tokens...")
    
    print("Command generation complete!")
    return ' '.join(decoded_sentence)

# Test with timeout warning
import time

print("Starting command generation...")
start_time = time.time()

# Set a reasonable timeout (e.g., 30 seconds)
TIMEOUT = 30

try:
    input_query = "(BSD specific) Display process information twice, waiting one second between each, filtering out the header line."
    generated_command = generate_command(input_query)
    print(f"\nGenerated Command: {generated_command}")
    print(f"\nTotal time taken: {time.time() - start_time:.2f} seconds")
except Exception as e:
    print(f"Error occurred: {str(e)}")
finally:
    if time.time() - start_time > TIMEOUT:
        print("\nWarning: Command generation took longer than expected!")

In [6]:
# Create and train the model
model = create_training_model()
history = model.fit(
    [input_sequences, output_sequences[:, :-1]],
    output_sequences[:, 1:],
    epochs=20,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/20


2024-11-26 15:32:29.088169: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 686ms/step - accuracy: 0.8399 - loss: 2.4754 - val_accuracy: 0.9007 - val_loss: 0.6449
Epoch 2/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 689ms/step - accuracy: 0.8901 - loss: 0.7055 - val_accuracy: 0.9078 - val_loss: 0.6096
Epoch 3/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 699ms/step - accuracy: 0.8929 - loss: 0.6758 - val_accuracy: 0.9094 - val_loss: 0.5837
Epoch 4/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 703ms/step - accuracy: 0.8952 - loss: 0.6426 - val_accuracy: 0.9107 - val_loss: 0.5629
Epoch 5/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 695ms/step - accuracy: 0.8973 - loss: 0.6148 - val_accuracy: 0.9113 - val_loss: 0.5485
Epoch 6/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 707ms/step - accuracy: 0.8998 - loss: 0.5898 - val_accuracy: 0.9126 - val_loss: 0.5373
Epoch 7/20
[1m

In [7]:
# Create and train the model
model = create_training_model()
history = model.fit(
    [input_sequences, output_sequences[:, :-1]],
    output_sequences[:, 1:],
    epochs=20,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 600ms/step - accuracy: 0.8398 - loss: 2.4929 - val_accuracy: 0.9007 - val_loss: 0.6424
Epoch 2/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 600ms/step - accuracy: 0.8895 - loss: 0.7079 - val_accuracy: 0.9068 - val_loss: 0.6106
Epoch 3/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 598ms/step - accuracy: 0.8937 - loss: 0.6705 - val_accuracy: 0.9095 - val_loss: 0.5835
Epoch 4/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 599ms/step - accuracy: 0.8944 - loss: 0.6480 - val_accuracy: 0.9105 - val_loss: 0.5623
Epoch 5/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 601ms/step - accuracy: 0.8974 - loss: 0.6133 - val_accuracy: 0.9118 - val_loss: 0.5443
Epoch 6/20
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 600ms/step - accuracy: 0.8981 - loss: 0.6005 - val_accuracy: 0.9127 - val_loss: 0.5330
Epoch 7/20

In [8]:
# Create inference models
encoder_model, decoder_model = create_inference_models(model)

In [11]:
# Test the model
input_query = "(BSD specific) Display process information twice, waiting one second between each, filtering out the header line."
generated_command = generate_command(input_query)
print(f"Generated Command: {generated_command}")

In [10]:
# Save model
model.save('command_generator_model.keras')

# Save tokenizers
with open('input_tokenizer.pkl', 'wb') as f:
    pickle.dump(input_tokeniser, f)
with open('output_tokenizer.pkl', 'wb') as f:
    pickle.dump(output_tokeniser, f)