From 30e161dee706b54821a2dc0b20da9ed1c4a05560 Mon Sep 17 00:00:00 2001
From: Richard Townsend
Date: Fri, 1 Jan 2016 20:03:03 +0000
Subject: [PATCH] New bidirectional architecture

Replace the hand-wired forward/backward LSTM pairs with a reusable
bidirectional_lstm_layer helper that sums the two directions, and stack
two such layers at both the character and the word level. Parameter
setup moves to a matching param_init_bidirection_lstm initialiser. Also
drop the obsolete 'dim_proj_words' key from model loading, halve the
TweeboDaily547 batch size to 25, and pass 16 rather than 32 as the last
prepare_data argument in server.py.
---
 lstm.py             | 23 ++++++++---------------
 nn_layers.py        |  1 +
 nn_lstm.py          | 21 +++++++++++++++++++++
 nn_params.py        | 39 ++++++++++++++++++++++++---------------
 nn_serialization.py |  2 +-
 server.py           |  2 +-
 6 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/lstm.py b/lstm.py
index 229ea61..92d3cb1 100644
--- a/lstm.py
+++ b/lstm.py
@@ -15,7 +15,7 @@
 from modelio import load_pos_tagged_data, prepare_data, get_max_word_count, get_max_length
 
 from nn_layers import *
-from nn_lstm import lstm_layer, lstm_unmasked_layer
+from nn_lstm import lstm_layer, lstm_unmasked_layer, bidirectional_lstm_layer
 from nn_params import *
 from nn_optimizers import *
 from nn_support import pred_error
@@ -43,26 +43,19 @@ def build_model(tparams, options, maxw, training=True):
     n_samples = xc.shape[1]
 
     emb = embeddings_layer(xc, tparams['Cemb'], n_timesteps, n_samples, options['dim_proj'])
-    # emb2 = embeddings_layer(xw, tparams['Wemb'], n_timesteps, n_samples, options['dim_proj_words'])
-    #emb = tensor.concatenate([emb1, emb2], axis=2)
-
-    #emb = theano.printing.Print("emb", attrs=["shape"])(emb)
-
-    proj_chars_1 = lstm_layer(tparams, emb, options, "lstm_chars_forwards", mask=mask)
-    proj_chars_2 = lstm_layer(tparams, emb, options, "lstm_chars_backwards", mask=mask, go_backwards=True)
-
-    proj = proj_chars_1 + proj_chars_2
+    proj = bidirectional_lstm_layer(tparams, emb, options, "lstm_chars_1", mask=mask)
+    proj = bidirectional_lstm_layer(tparams, proj, options, "lstm_chars_2", mask=mask)
 
     avg_per_word = per_word_averaging_layer(proj, wmask, maxw)
 
     avg_per_word = avg_per_word.dimshuffle(1, 0, 2)
 
-    proj2 = lstm_unmasked_layer(tparams, avg_per_word, options, prefix="lstm_words", mult=3)
-    proj3 = lstm_unmasked_layer(tparams, avg_per_word, options, prefix="lstm_words_2", mult=3, go_backwards=True)
+    #avg_per_word = theano.printing.Print("avg", attrs=["shape"])(avg_per_word)
 
-    proj4 = proj2 + proj3
+    proj2 = bidirectional_lstm_layer(tparams, avg_per_word, options, "lstm_words_1", mult=3)
+    proj2 = bidirectional_lstm_layer(tparams, proj2, options, "lstm_words_2", mult=3)
 
-    pred = softmax_layer(proj4, tparams['U'], tparams['b'], y_mask, maxw, training)
+    pred = softmax_layer(proj2, tparams['U'], tparams['b'], y_mask, maxw, training)
 
     f_pred_prob = theano.function([xc, mask, wmask, y_mask], pred, name='f_pred_prob', on_unused_input='ignore')
     f_pred = theano.function([xc, mask, wmask, y_mask], pred.argmax(axis=2), name='f_pred', on_unused_input='ignore')
@@ -157,7 +150,7 @@ def train_lstm(
         test = load_pos_tagged_data("Data/TweeboDaily547.conll", char_dict, word_dict, pos_dict, 16)
         test, valid = split_at(test, 0.10)
         max_word_count = max(max_word_count, get_max_word_count("Data/TweeboDaily547.conll"))
-        batch_size = 50
+        batch_size = 25
     else:
         # Pre-populate
         test = load_pos_tagged_data("Data/Brown.conll", char_dict, word_dict, pos_dict)
diff --git a/nn_layers.py b/nn_layers.py
index e489c6d..1ba4632 100644
--- a/nn_layers.py
+++ b/nn_layers.py
@@ -36,6 +36,7 @@ def per_word_averaging_layer_distrib(proj, wmask, maxw):
     """
 
     """
+    print maxw, "MAXW"
     dup = [tensor.shape_padaxis(proj, 0) for _ in range(maxw)]
     dup = tensor.concatenate(dup, 0)
     #dup = tensor.shape_padaxis(proj, 0)
diff --git a/nn_lstm.py b/nn_lstm.py
index f0a53c7..f498a8b 100644
--- a/nn_lstm.py
+++ b/nn_lstm.py
@@ -124,3 +124,24 @@ def _step(m_, x_, h_, c_):
                                 name=_p(prefix, '_layers'),
                                 n_steps=nsteps, go_backwards=go_backwards)
     return rval[0]
+
+def bidirectional_lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, mult=1):
+
+    def _p(pp, name):
+        return '%s_%s' % (pp, name)
+
+    prefix_forwards = '%s_forwards' % (prefix,)
+    prefix_backwards = '%s_backwards' % (prefix,)
+
+    if mask is not None:
+        forwards = lstm_layer(tparams, state_below, options, prefix=prefix_forwards, mask=mask, go_backwards=False, mult=mult)
+        backwards = lstm_layer(tparams, state_below, options, prefix=prefix_backwards, mask=mask, go_backwards=True, mult=mult)
+    else:
+        forwards = lstm_unmasked_layer(tparams, state_below, options, prefix=prefix_forwards, mult=mult, go_backwards=False)
+        backwards = lstm_unmasked_layer(tparams, state_below, options, prefix=prefix_backwards, mult=mult, go_backwards=True)
+
+    #forwards = theano.printing.Print(prefix_forwards, attrs=["shape"])(forwards)
+    #backwards = theano.printing.Print(prefix_forwards, attrs=["shape"])(backwards)
+
+    return forwards + backwards
+
diff --git a/nn_params.py b/nn_params.py
index 776a16f..faba47d 100644
--- a/nn_params.py
+++ b/nn_params.py
@@ -41,21 +41,21 @@ def generate_init_params(options, params):
                          options['dim_proj_chars'])*2 - 1
     params['Cemb'] = (0.01 * randn).astype(config.floatX)
 
-    params = param_init_lstm(options,
-                             params,
-                             prefix="lstm_chars_forwards")
-
-    params = param_init_lstm(options,
-                             params,
-                             prefix="lstm_chars_backwards")
-
-    params = param_init_lstm(options,
-                             params,
-                             prefix="lstm_words", mult=3)
-
-    params = param_init_lstm(options,
-                             params,
-                             prefix="lstm_words_2", mult=3)
+    params = param_init_bidirection_lstm(options,
+                                         params,
+                                         prefix="lstm_chars_1")
+
+    params = param_init_bidirection_lstm(options,
+                                         params,
+                                         prefix="lstm_chars_2")
+
+    params = param_init_bidirection_lstm(options,
+                                         params,
+                                         prefix="lstm_words_1", mult=3)
+
+    params = param_init_bidirection_lstm(options,
+                                         params,
+                                         prefix="lstm_words_2", mult=3)
 
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj']*3,
@@ -97,3 +97,12 @@ def param_init_lstm(options, params, prefix='lstm', mult=1):
     params[_p(prefix, 'b')] = b.astype(config.floatX)
 
     return params
+
+def param_init_bidirection_lstm(options, params, prefix='lstm', mult=1):
+    prefix_forwards = '%s_forwards' % (prefix,)
+    prefix_backwards = '%s_backwards' % (prefix,)
+
+    params = param_init_lstm(options, params, prefix_forwards, mult)
+    params = param_init_lstm(options, params, prefix_backwards, mult)
+
+    return params
diff --git a/nn_serialization.py b/nn_serialization.py
index c1b9cf3..50ee74d 100644
--- a/nn_serialization.py
+++ b/nn_serialization.py
@@ -36,6 +36,6 @@ def load_params(path, params):
     logging.info("Loading model from file '%s'...", path)
     with open(path, 'rb') as fin:
         data = pickle.load(fin)
-    for k in ['dim_proj_chars', 'dim_proj_words', 'char_dict', 'pos_dict', 'word_dict']:
+    for k in ['dim_proj_chars', 'char_dict', 'pos_dict', 'word_dict']:
         params[k] = data[k]
     return params
diff --git a/server.py b/server.py
index ffeff0a..92e5ae2 100644
--- a/server.py
+++ b/server.py
@@ -102,7 +102,7 @@ def hello():
     print chars, words
 
     # TODO: 32 is the n_proj
-    xc, xw, mask, wmask, y, y_mask = prepare_data(chars, words, labels, 140, 38, 32)
+    xc, xw, mask, wmask, y, y_mask = prepare_data(chars, words, labels, 140, 38, 16)
 
     pred = model[-3](xc, mask, wmask, y_mask)
     print pred
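
Note (not part of the applied patch): for readers outside this Theano codebase, here is a minimal, self-contained NumPy sketch of the combination rule that bidirectional_lstm_layer introduces. The toy_recurrent_pass step is a hypothetical stand-in for the real LSTM in nn_lstm.py, and the explicit re-alignment of the backward outputs is an assumption of the sketch, not a claim about the scan semantics in the patch.

import numpy as np

def toy_recurrent_pass(x, W, U, go_backwards=False):
    # Stand-in tanh RNN, *not* the LSTM from nn_lstm.py. x is (timesteps, dim).
    steps = x[::-1] if go_backwards else x
    h = np.zeros(U.shape[0])
    outs = []
    for x_t in steps:
        h = np.tanh(np.dot(x_t, W) + np.dot(h, U))
        outs.append(h)
    outs = np.asarray(outs)
    # Sketch's assumption: flip the backward outputs so both sequences
    # are aligned in time before they are combined.
    return outs[::-1] if go_backwards else outs

def bidirectional_sum(x, fwd_params, bwd_params):
    # Mirrors the patch's combination rule: each direction keeps its own
    # weights (the "..._forwards"/"..._backwards" prefixes), and the two
    # output sequences are summed, not concatenated.
    forwards = toy_recurrent_pass(x, *fwd_params, go_backwards=False)
    backwards = toy_recurrent_pass(x, *bwd_params, go_backwards=True)
    return forwards + backwards

rng = np.random.RandomState(0)
dim = 4
x = rng.randn(10, dim)  # 10 timesteps, dim features
fwd = (0.1 * rng.randn(dim, dim), 0.1 * rng.randn(dim, dim))
bwd = (0.1 * rng.randn(dim, dim), 0.1 * rng.randn(dim, dim))
out = bidirectional_sum(x, fwd, bwd)
print(out.shape)  # (10, 4): same width as a single direction

Because the directions are summed rather than concatenated, each bidirectional layer's output has the same width as its input, which is what lets the patch feed lstm_chars_1 straight into lstm_chars_2 (and likewise the word-level layers) without changing any parameter shapes.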