
Commit

Only about ~75% accuracy on GATE
Sentimentron committed Jan 25, 2016
1 parent 30e161d commit 41227a8
Showing 6 changed files with 159,544 additions and 31 deletions.
159,492 changes: 159,492 additions & 0 deletions Data/gate_twitter_bootstrap_corpus.1543K.tokens

Large diffs are not rendered by default.

19 changes: 18 additions & 1 deletion corpus.py
@@ -18,10 +18,11 @@ def process_cmd_line():
     """Read command line arguments"""
     p = ArgumentParser("Download and pre-process command line arguments")
     p.add_argument("--brown", action="store_true")
+    p.add_argument("--tokens", nargs="+")
 
     args = p.parse_args()
 
-    if not args.brown:
+    if not args.brown and not args.tokens:
         raise ValueError("No corpora specified (see --usage)")
 
     return args
@@ -177,6 +178,18 @@ def convert_tag(tag):
     print tags
     print len(tags)
 
+def process_token(t):
+
+    with open(t, 'r') as fin:
+        for line in fin:
+            tokens = line.split()
+            for token in tokens:
+                parts = token.split('_')
+                word = '_'.join(parts[:-1])
+                pos = parts[-1]
+                print "%s\t%s" % (word, pos)
+            print ""
+
 def main():
     """
     Main method.
@@ -188,6 +201,10 @@ def main():
     if args.brown:
         process_brown()
 
+    if args.tokens:
+        for t in args.tokens:
+            process_token(t)
+
 
 if __name__ == "__main__":
     main()
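
For reference, the new --tokens path assumes each line of the .tokens file is one tweet of whitespace-separated word_TAG tokens, with the tag following the final underscore, and re-emits it as tab-separated word/tag pairs with a blank line after each tweet. A minimal sketch of that parsing (the sample line and tags below are made up for illustration):

# Illustration only: mirrors the parsing in process_token(); Python 2, like the rest of the repo.
line = "All_DT the_DT best_JJS for_IN 2016_CD !_."   # hypothetical word_TAG tokens
for token in line.split():
    parts = token.split('_')
    word = '_'.join(parts[:-1])    # keeps any underscores inside the word itself
    pos = parts[-1]                # the tag is whatever follows the last underscore
    print "%s\t%s" % (word, pos)
print ""                           # blank line ends the sentence, CoNLL-style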
37 changes: 18 additions & 19 deletions lstm.py
@@ -45,15 +45,15 @@ def build_model(tparams, options, maxw, training=True):
     emb = embeddings_layer(xc, tparams['Cemb'], n_timesteps, n_samples, options['dim_proj'])
 
     proj = bidirectional_lstm_layer(tparams, emb, options, "lstm_chars_1", mask=mask)
-    proj = bidirectional_lstm_layer(tparams, proj, options, "lstm_chars_2", mask=mask)
+    #proj = bidirectional_lstm_layer(tparams, proj, options, "lstm_chars_2", mask=mask)
 
     avg_per_word = per_word_averaging_layer(proj, wmask, maxw)
     avg_per_word = avg_per_word.dimshuffle(1, 0, 2)
 
     #avg_per_word = theano.printing.Print("avg", attrs=["shape"])(avg_per_word)
 
     proj2 = bidirectional_lstm_layer(tparams, avg_per_word, options, "lstm_words_1", mult=3)
-    proj2 = bidirectional_lstm_layer(tparams, proj2, options, "lstm_words_2", mult=3)
+    #proj2 = bidirectional_lstm_layer(tparams, proj2, options, "lstm_words_2", mult=3)
 
     pred = softmax_layer(proj2, tparams['U'], tparams['b'], y_mask, maxw, training)
 
@@ -88,7 +88,7 @@ def split_at(src, prop):
     return (src_chars, src_words, src_labels), (val_chars, val_words, val_labels)
 
 def train_lstm(
-    dim_proj_chars=16, # character embedding dimension and LSTM number of hidden units.
+    dim_proj_chars=32, # character embedding dimension and LSTM number of hidden units.
     patience=10, # Number of epoch to wait before early stop if no progress
     max_epochs=5000, # The maximum number of epoch to run
     dispFreq=10, # Display to stdout the training progress every N updates
@@ -97,7 +97,7 @@ def train_lstm(
     optimizer=adadelta, # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
     encoder='lstm', # TODO: can be removed must be lstm.
     saveto='lstm_model.npz', # The best model will be saved there
-    validFreq=370, # Compute the validation error after this number of update.
+    validFreq=450, # Compute the validation error after this number of update.
     saveFreq=1110, # Save the parameters after every saveFreq updates
     maxlen=100, # Sequence longer then this get ignored
     batch_size=100, # The batch size during training.
@@ -135,32 +135,31 @@ def train_lstm(
     if not os.path.isfile("substitutions.pkl"):
         raise Exception("substitutions.pkl wasn't found, have you run substitution.py?")
 
-    load_pos_tagged_data("Data/Brown.conll", char_dict, word_dict, pos_dict)
-    load_pos_tagged_data("Data/TweeboOct27.conll", char_dict, word_dict, pos_dict)
-    load_pos_tagged_data("Data/TweeboDaily547.conll", char_dict, word_dict, pos_dict)
+    #load_pos_tagged_data("Data/Brown.conll", char_dict, word_dict, pos_dict)
+    #load_pos_tagged_data("Data/TweeboOct27.conll", char_dict, word_dict, pos_dict)
+    #load_pos_tagged_data("Data/TweeboDaily547.conll", char_dict, word_dict, pos_dict)
+
+    load_pos_tagged_data("Data/Gate.conll", char_dict, word_dict, pos_dict)
 
     with open("substitutions.pkl", "rb") as fin:
         word_dict = pickle.load(fin)
 
     max_word_count = 0
     if not pretrain:
         # Now load the data for real
-        train = load_pos_tagged_data("Data/TweeboOct27.conll", char_dict, word_dict, pos_dict, 0)
-        max_word_count = get_max_word_count("Data/TweeboOct27.conll")
-        test = load_pos_tagged_data("Data/TweeboDaily547.conll", char_dict, word_dict, pos_dict, 16)
-        test, valid = split_at(test, 0.10)
-        max_word_count = max(max_word_count, get_max_word_count("Data/TweeboDaily547.conll"))
-        batch_size = 25
+        data = load_pos_tagged_data("Data/Gate.conll", char_dict, word_dict, pos_dict, 0)
+        train, eval = split_at(data, 0.30)
+        test, valid = split_at(eval, 0.50)
+        max_word_count = max(max_word_count, get_max_word_count("Data/Gate.conll"))
+        batch_size = 100
     else:
         # Pre-populate
         test = load_pos_tagged_data("Data/Brown.conll", char_dict, word_dict, pos_dict)
         max_word_count = get_max_word_count("Data/Brown.conll")
         train, valid = split_at(test, 0.05)
         max_word_count = 38 # HACK: set to the same as Twitter
 
-    ydim = numpy.max(numpy.amax(train[2])) + 1
+    ydim = 27 # Hard-code, one that appears in the testing set, not in the training set
 
     print "ydim =", ydim
 
     model_options['ydim'] = ydim
     model_options['n_chars'] = len(char_dict)+1
@@ -274,7 +273,7 @@ def train_lstm(
             valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, 140, max_word_count, n_proj)
 
             if not pretrain:
-                train_err = pred_error(f_pred, prepare_data, train, kf, 140, max_word_count, n_proj)
+                #train_err = pred_error(f_pred, prepare_data, train, kf, 140, max_word_count, n_proj)
                 test_err = pred_error(f_pred, prepare_data, test, kf_test, 140, max_word_count, n_proj)
                 history_errs.append([valid_err, test_err])
             else:
@@ -287,8 +286,8 @@ def train_lstm(
                 best_p = unzip(tparams)
                 bad_counter = 0
             if not pretrain:
-                logging.info("Train %.4f, Valid %.4f, Test %.4f",
-                             100*(1-train_err), 100*(1-valid_err), 100*(1-test_err))
+                logging.info("Valid %.4f, Test %.4f",
+                             100*(1-valid_err), 100*(1-test_err))
             else:
                 logging.info("Valid %.4f", 100 * (1-valid_err))
 
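
With this change the GATE corpus becomes the only real training source: the data are split 70/30 into training and held-out sentences, and the held-out portion is split again 50/50 into test and validation sets. That reading assumes split_at(src, prop) returns (kept, held_out) with held_out being roughly the prop fraction of src, which this diff does not show directly. A rough sketch of the resulting sizes under that assumption:

# Sketch only: assumes split_at(src, prop) holds out roughly `prop` of src as its second return value.
def split_sizes(n, eval_prop=0.30, valid_prop=0.50):
    n_eval = int(n * eval_prop)          # held out by split_at(data, 0.30)
    n_train = n - n_eval                 # roughly 70% of the GATE sentences
    n_valid = int(n_eval * valid_prop)   # held out again by split_at(eval, 0.50)
    n_test = n_eval - n_valid
    return n_train, n_valid, n_test

print split_sizes(10000)   # -> (7000, 1500, 1500), i.e. roughly 70/15/15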
13 changes: 9 additions & 4 deletions modelio.py
@@ -14,14 +14,19 @@
 
 def build_character_dictionary(path, chars = {}):
     with open(path, 'r') as fin:
+        lineno = 1
         for line in fin:
             line = line.strip()
             if len(line) == 0:
                 continue
-            word, _ = line.split('\t')
-            for c in word:
-                if c not in chars:
-                    chars[c] = len(chars) + 1
+            try:
+                word, _ = line.split('\t')
+                for c in word:
+                    if c not in chars:
+                        chars[c] = len(chars) + 1
+            except ValueError as ex:
+                print ex, lineno, line
+            lineno += 1
     return chars
 
 def build_word_dictionary(path, words = {}):
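
The try/except means a CoNLL line that does not split into exactly two tab-separated fields is now reported with its line number instead of aborting the dictionary build. A small illustration of the kind of lines that would trigger it (the sample lines are made up):

# Illustration only: lines with no tab, or more than one tab, raise ValueError on unpacking.
for lineno, line in enumerate(["Happy\tA", "Happy_A", "new\tA\tyear"], 1):
    try:
        word, _ = line.split('\t')   # expects exactly two fields
    except ValueError as ex:
        print ex, lineno, line       # e.g. "need more than 1 value to unpack 2 Happy_A"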
12 changes: 6 additions & 6 deletions nn_params.py
@@ -45,17 +45,17 @@ def generate_init_params(options, params):
                                          params,
                                          prefix="lstm_chars_1")
 
-    params = param_init_bidirection_lstm(options,
-                                         params,
-                                         prefix="lstm_chars_2")
+    #params = param_init_bidirection_lstm(options,
+    #                                     params,
+    #                                     prefix="lstm_chars_2")
 
     params = param_init_bidirection_lstm(options,
                                          params,
                                          prefix="lstm_words_1", mult=3)
 
-    params = param_init_bidirection_lstm(options,
-                                         params,
-                                         prefix="lstm_words_2", mult=3)
+    #params = param_init_bidirection_lstm(options,
+    #                                     params,
+    #                                     prefix="lstm_words_2", mult=3)
 
     # classifier
     params['U'] = 0.01 * numpy.random.randn(options['dim_proj']*3,
2 changes: 1 addition & 1 deletion server.py
@@ -102,7 +102,7 @@ def hello():
 
     print chars, words
     # TODO: 32 is the n_proj
-    xc, xw, mask, wmask, y, y_mask = prepare_data(chars, words, labels, 140, 38, 16)
+    xc, xw, mask, wmask, y, y_mask = prepare_data(chars, words, labels, 140, 38, 32)
 
     pred = model[-3](xc, mask, wmask, y_mask)
     print pred
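
The last argument to prepare_data now agrees with the "# TODO: 32 is the n_proj" comment above it, presumably to keep the character projection width in step with the new dim_proj_chars=32 default in lstm.py.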
