In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys

from eval import eval_tools, error_analysis
from encoder_decoder.seq2tree.seq2tree_model import Seq2TreeModel
from encoder_decoder.seq2seq.seq2seq_model import Seq2SeqModel
from encoder_decoder import slot_filling
from encoder_decoder import parse_args
from encoder_decoder import meta_experiments
from encoder_decoder import graph_utils
from encoder_decoder import decode_tools
from encoder_decoder import data_utils
import tensorflow as tf
from tqdm import tqdm
import time
import pickle
import numpy as np
import math


ModuleNotFoundError: No module named '__main__.eval'; '__main__' is not a package

In [None]:
# Using copyNet
sys.argv = "--rnn_cell gru \
    --encoder_topology birnn \
    --num_epochs 1 \
    --num_samples 256 \
    --variational_recurrent_dropout \
    --token_decoding_algorithm beam_search \
    --beam_size 100 \
    --alpha 1.0 \
    --num_nn_slot_filling 10 \
    --dataset bash \
    --channel token \
    --use_copy \
    --copy_fun copynet \
    --batch_size 128 \
    --sc_token_dim 200 \
    --learning_rate 0.0001 \
    --steps_per_epoch 4000 \
    --tg_token_use_attention \
    --tg_token_attn_fun non-linear \
    --universal_keep 0.6 \
    --sc_input_keep 1.0 \
    --tg_input_keep 1.0 \
    --sc_output_keep 1.0 \
    --tg_output_keep 1.0 \
    --attention_input_keep 1.0 \
    --attention_output_keep 1.0 \
    --beta 0 \
    --create_fresh_params \
    --min_vocab_frequency 4 \
    --demo".split(" ")

In [3]:
FLAGS = tf.compat.v1.flags.FLAGS
parse_args.define_input_flags()
FLAGS

<absl.flags._flagvalues.FlagValues at 0x7f2a588f0400>

In [4]:
FLAGS.data_dir = os.path.join(
        os.path.dirname(os.getcwd()), "data", FLAGS.dataset)
print("Reading data from {}".format(FLAGS.data_dir))


# set up encoder/decider dropout rate
if FLAGS.universal_keep >= 0 and FLAGS.universal_keep < 1:
    FLAGS.sc_input_keep = FLAGS.universal_keep
    FLAGS.sc_output_keep = FLAGS.universal_keep
    FLAGS.tg_input_keep = FLAGS.universal_keep
    FLAGS.tg_output_keep = FLAGS.universal_keep
    FLAGS.attention_input_keep = FLAGS.universal_keep
    FLAGS.attention_output_keep = FLAGS.universal_keep

# adjust hyperparameters for batch normalization
if FLAGS.recurrent_batch_normalization:
    # larger batch size
    FLAGS.batch_size *= 4
    # larger initial learning rate
    FLAGS.learning_rate *= 10

if FLAGS.decoder_topology in ['basic_tree']:
    FLAGS.model_root_dir = os.path.join(
        os.path.dirname(os.getcwd()), FLAGS.model_root_dir, "seq2tree")
elif FLAGS.decoder_topology in ['rnn']:
    FLAGS.model_root_dir = os.path.join(
        os.path.dirname(os.getcwd()), FLAGS.model_root_dir, "seq2seq")
else:
    raise ValueError("Unrecognized decoder topology: {}."
                     .format(FLAGS.decoder_topology))
print("Saving models to {}".format(FLAGS.model_root_dir))


train_set, dev_set, test_set = data_utils.load_data(
                FLAGS, use_buckets=True)

Reading data from /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash
Saving models to /home/greenmon/Projects/nlc2cmd/nl2bash/model/seq2seq
Loading data from /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash
source vocabulary size = 1489
target vocabulary size = 1109
max source token size = 27
max target token size = 55
source file: /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash/train.nl.filtered
target file: /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash/train.cm.filtered
source tokenized sequence file: /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash/train.nl.token
target tokenized sequence file: /home/greenmon/Projects/nlc2cmd/nl2bash/data/bash/train.cm.token
9985 data points read.
max_source_length = 44
max_target_length = 64
Group data points into buckets...
max_source_length after filtering = 27
max_target_length after filtering = 22
source vocabulary size = 1489
target vocabulary size = 1109
max source token size = 27
max target token size = 55
source file: /home/greenmon/

In [5]:
print(train_set.buckets)
print(train_set.data_points[0])

[(10, 23), (14, 23), (28, 23)]
[<encoder_decoder.data_utils.DataPoint object at 0x7f2a4abc5e50>, <encoder_decoder.data_utils.DataPoint object at 0x7f2ab85e9bb0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2ab85e9fa0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab69970>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab69bb0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab69f10>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab69f40>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab47160>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab47430>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab474c0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab474f0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab475e0>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab47610>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab47670>, <encoder_decoder.data_utils.DataPoint object at 0x7f2a4ab4

In [6]:

def define_model(session, forward_only, buckets=None):
    """
    Define tensor graphs.
    """
    if FLAGS.decoder_topology in ['basic_tree']:
        return graph_utils.define_model(
            FLAGS, session, Seq2TreeModel, buckets, forward_only)
    elif FLAGS.decoder_topology in ['rnn']:
        return graph_utils.define_model(
            FLAGS, session, Seq2SeqModel, buckets, forward_only)
    else:
        raise ValueError("Unrecognized decoder topology: {}.".format(
            FLAGS.decoder_topology))

In [9]:
print("Set dataset parameters")

vocab = data_utils.load_vocabulary(FLAGS)
FLAGS.max_sc_length = train_set.max_sc_length if not train_set.buckets else \
    train_set.buckets[-1][0]
FLAGS.max_tg_length = train_set.max_tg_length if not train_set.buckets else \
    train_set.buckets[-1][1]
FLAGS.sc_vocab_size = len(vocab.sc_vocab)
FLAGS.tg_vocab_size = len(vocab.tg_vocab)
FLAGS.max_sc_token_size = vocab.max_sc_token_size
FLAGS.max_tg_token_size = vocab.max_tg_token_size

Set dataset parameters
source vocabulary size = 1489
target vocabulary size = 1109
max source token size = 27
max target token size = 55


Need to set proper FLAGS
```
python -m encoder_decoder.translate --rnn_cell gru \
    --encoder_topology birnn \
    --num_epochs 1 \
    --num_samples 256 \
    --variational_recurrent_dropout \
    --token_decoding_algorithm beam_search \
    --beam_size 100 \
    --alpha 1.0 \
    --num_nn_slot_filling 10 \
    --dataset bash \
    --channel token \
    --use_copy \
    --copy_fun copynet \
    --batch_size 128 \
    --sc_token_dim 200 \
    --learning_rate 0.0001 \
    --steps_per_epoch 4000 \
    --tg_token_use_attention \
    --tg_token_attn_fun non-linear \
    --universal_keep 0.6 \
    --sc_input_keep 1.0 \
    --tg_input_keep 1.0 \
    --sc_output_keep 1.0 \
    --tg_output_keep 1.0 \
    --attention_input_keep 1.0 \
    --attention_output_keep 1.0 \
    --beta 0 \
    --create_fresh_params \
    --min_vocab_frequency 4 \
    --demo
```

In [10]:
if sys.version_info > (3, 0):
    from six.moves import xrange
def translate_fun(data_point, sess, model, vocabs, FLAGS,
                  slot_filling_classifier=None):
    tg_ids = [data_utils.ROOT_ID]
    decoder_features = [[tg_ids]]
    if type(data_point) is str:
        source_str = data_point
        encoder_features = decode_tools.query_to_encoder_features(data_point, vocabs, FLAGS)
    else:
        source_str = data_point[0].sc_txt
        encoder_features = [[data_point[0].sc_ids]]
        if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
            encoder_features.append([data_point[0].csc_ids])

    if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
        # append dummy copynet target features (
        # used only for computing training objectives)
        ctg_ids = [data_utils.ROOT_ID]
        decoder_features.append([ctg_ids])
        # tokenize the source string with minimal changes on the token form
        copy_tokens = [decode_tools.query_to_copy_tokens(source_str, FLAGS)]
    else:
        copy_tokens = None
    if FLAGS.normalized:
        _, entities = tokenizer.ner_tokenizer(source_str)
        sc_fillers = [entities[0]]
    else:
        sc_fillers = None

    # Which bucket does it belong to?
    bucket_ids = [b for b in xrange(len(model.buckets))
                  if model.buckets[b][0] > len(encoder_features[0][0])]
    bucket_id = min(bucket_ids) if bucket_ids else (len(model.buckets) - 1)

    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_batch(
        encoder_features, decoder_features, bucket_id=bucket_id)

    # Compute neural network decoding output
    model_outputs = model.step(sess, formatted_example, bucket_id,
                               forward_only=True)
    sequence_logits = model_outputs.sequence_logits

    decoded_outputs = decode_tools.decode(model_outputs, FLAGS, vocabs, sc_fillers=sc_fillers,
                             slot_filling_classifier=slot_filling_classifier,
                             copy_tokens=copy_tokens)

    return decoded_outputs, sequence_logits


In [11]:
def demo(sess, model, invocations, result_cnt, FLAGS):
    n_batch = len(invocations)
    predictions = [[''] * result_cnt for _ in range(n_batch)]
    print(predictions)
    
    for i in range(n_batch):
        sentence = invocations[i]    
        
        # Do not fill argument slots
        batch_outputs, sequence_logits = translate_fun(sentence, sess, model, vocab, FLAGS)
        
        # Use Beam Search
        if batch_outputs:
            top_k_predictions = batch_outputs[0]
            top_k_scores = sequence_logits[0]
            for j in xrange(min(FLAGS.beam_size, result_cnt, len(batch_outputs[0]))):
                if len(top_k_predictions) <= j:
                    break
                top_k_pred_tree, top_k_pred_cmd = top_k_predictions[j]
                predictions[i][j] = top_k_pred_cmd
                print('Prediction {}: {} ({}) '.format(
                    j+1, top_k_pred_cmd, top_k_scores[j]))

    return predictions


In [12]:
def predict(invocations, result_cnt, buckets=None):
    with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                                              log_device_placement=FLAGS.log_device_placement, gpu_options=gpu_options)) as sess:
        # Initialize model parameters.
        model = define_model(sess, forward_only=True, buckets=buckets)
        predictions = demo(sess, model, invocations, result_cnt, FLAGS)
        return predictions

In [13]:
gpu_options= tf.compat.v1.GPUOptions(allow_growth=True)


In [14]:
predict(["Display all lines containing \"IP_MROUTE\" in the current kernel's compile-time config file."], 3, buckets=train_set.buckets)

model_dir=/home/greenmon/Projects/nlc2cmd/nl2bash/model/seq2seq/bash-T-4-birnn-gru-standard-attention-0.6-0.6-0.0-copy-1.0-128-200-1-0.0001-1e-08-0.6-0.6-0.6-0.6
decode_sig=beam_search.100.dev
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
encoder input dimension = 200
encoder output dimension = 400
creating beam search decoder: alpha = 1.0
token_decoder dimension = 400
token_decoder decoding_algorithm = beam_search
creating bucket 0 (10, 23)...
source token embedding size = 1489
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
target token embedding size = 1109
AttentionCellWrapper added!
CopyCellWrapper added!
Instructions for updating:
Deprecated i

[['__SP__UNK',
  'find . -name __SP__UNK -print0 | xargs -0 -I {} grep "IP_MROUTE" {}',
  'find . -name __SP__UNK | xargs -I {} grep "IP_MROUTE" {}']]