Initial perceptron ranker; rankers moved to one source file
tuetschek committed May 13, 2014
1 parent 2b3c444 commit 85151cd
Showing 7 changed files with 122 additions and 116 deletions.
50 changes: 0 additions & 50 deletions bagel-data/config.py

This file was deleted.

34 changes: 21 additions & 13 deletions tgen/features.py
@@ -10,13 +10,13 @@
from functools import partial


def find_nodes(node, scope):
def find_nodes(node, scope, incremental=False):
"""Given a parent node and scope specifications (in a list), this returns the
corresponding nodes.
"""
nodes = []
for scope_spec in scope:
if scope_spec == 'node':
if scope_spec == 'node' or incremental:
nodes.append(node)
elif scope_spec == 'tree':
nodes.extend(node.root.get_descendants())
@@ -33,19 +33,19 @@ def find_nodes(node, scope):
return nodes


def same_as_current(cur_node, scope_func, attrib):
def same_as_current(node, context, scope_func, attrib, incremental=False):
"""Return the number of nodes in the given scope that have the same value
of the given attribute as the current node.
@rtype: dict
@return: dictionary with one key ('') and the number of matching values as a value
"""
if attrib == 'right':
value = True if cur_node.parent and cur_node > cur_node.parent else False
value = True if node.parent and node > node.parent else False
else:
value = getattr(cur_node, attrib)
value = getattr(node, attrib)
num_matching = 0.0
for node in scope_func(cur_node):
for node in scope_func(node):
if attrib == 'right': # special handling for 'right'
if node.parent and (node > node.parent) == value:
num_matching += 1
@@ -54,15 +54,15 @@ def same_as_current(cur_node, scope_func, attrib):
return {'': num_matching}


def value(cur_node, scope_func, attrib):
def value(node, context, scope_func, attrib, incremental=False):
"""Return the number of nodes holding the individual values of the given attribute
in the given scope.
@rtype: dict
@return: dictionary with keys for values of the attribute, values for counts of matching nodes
"""
ret = defaultdict(float)
for node in scope_func(cur_node):
for node in scope_func(node):
if attrib == 'right':
if node.parent and node > node.parent:
ret['True'] += 1
@@ -73,8 +73,14 @@ def value(cur_node, scope_func, attrib):
return ret
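
For illustration, a feature function such as value is bound to its scope and
attribute when the spec is parsed (judging by the functools.partial import;
the sketch below and its outputs are invented, not taken from the repository):

    feat_func = partial(value, scope_func=partial(find_nodes, scope=['tree']),
                        attrib='formeme')
    feat_func(node, context)   # -> e.g. {'n:subj': 2.0, 'v:fin': 1.0}

get_features later flattens such dicts into flat feature names like
'label_n:subj' (or just 'label' for the empty-string key).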


def prob(node, parent):
return {'': node[1]}
def prob(node, context):
# TODO this won't work. Use wild attributes? Or some other structure?
return {'': context['node_prob']}


def bias(node, context):
"""A constant feature function, always returning 1"""
return {'': 1}


class Features(object):
@@ -83,7 +89,7 @@ def __init__(self, cfg):
self.features = self.parse_feature_spec(cfg)

def parse_feature_spec(self, spec):
"""Prepares feature feature function from specifications in the following format:
"""Prepares feature functions from specifications in the following format:
Label: value/same_as_current scope param1, ...
@@ -96,6 +102,8 @@ def parse_feature_spec(self, spec):
label, func_name = re.split(r'[:\s]+', feat, 1)
if func_name == 'prob':
features[label] = prob
elif func_name == 'bias':
features[label] = bias
else:
func_name, func_scope, func_params = re.split(r'[:\s]+', func_name, 2)
func_params = re.split(r'[,\s]+', func_params)
@@ -110,7 +118,7 @@ def parse_feature_spec(self, spec):
features[label] = feat_func
return features
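
As an illustration, a feature specification in the format above might look
like this (the labels are hypothetical and the scope/attribute names are only
meant to show the format):

    num_lemmas: value tree t_lemma
    same_formeme: same_as_current tree formeme
    node_prob: prob
    bias: bias

Each line yields one entry in the returned features dict: the first two map to
value and same_as_current bound to the 'tree' scope and the given attribute,
while prob and bias take no scope or parameters.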

def get_features(self, node, feats=defaultdict(float)):
def get_features(self, node, context, feats=None):
"""Return features for the given node. Accumulates features from other nodes
if given in the feats parameter.
@@ -119,7 +127,7 @@ def get_features(self, node, feats=defaultdict(float)):
"""
if feats is None:  # avoid a shared mutable default argument
feats = defaultdict(float)
feats_hier = {}
for name, func in self.features.iteritems():
feats_hier[name] = func(node)
feats_hier[name] = func(node, context)
for name, val in feats_hier.iteritems():
for subname, subval in val.iteritems():
feats[name + '_' + subname if subname else name] += subval
8 changes: 0 additions & 8 deletions tgen/interface.py
@@ -6,11 +6,3 @@

class CandidateGenerator(object):
pass


class Ranker(object):

def get_best_child(self, parent, cdf):
raise NotImplementedError


21 changes: 0 additions & 21 deletions tgen/percrank.py

This file was deleted.

7 changes: 4 additions & 3 deletions tgen/planner.py
@@ -164,6 +164,7 @@ def __init__(self, cfg):
self.debug_out = None
if 'debug_out' in cfg:
self.debug_out = cfg['debug_out']
self.ranker = cfg['ranker']

def generate_tree(self, da, gen_doc=None, gold_ttree=None):
# TODO add future cost ?
@@ -176,15 +177,15 @@ def generate_tree(self, da, gen_doc=None, gold_ttree=None):
cand, score = open_list.pop()
if gold_ttree and cand == gold_ttree:
print >> self.debug_out, "IT %05d: CANDIDATE MATCHES GOLD" % num_iter
score = -1.0
# score = -1.0
close_list.push(cand, score)
if self.debug_out:
print >> self.debug_out, ("\n***\nIT %05d:%s\nO: %d C: %d\n***" %
(num_iter, unicode(cand), len(open_list), len(close_list)))
self.debug_out.flush()
successors = self.candgen.get_all_successors(cand, cdfs)
# TODO add real scoring here
open_list.pushall({s: float(len(s.get_descendants()))
# score successor candidates; negated so that higher-scoring candidates
# are popped first from the (lowest-value-first) open list
open_list.pushall({s: self.ranker.score(s, da) * -1
for s in successors if s not in close_list})
# if self.debug_out:
# print >> self.debug_out, "\n".join(map(unicode, self.open_list.members.keys()))
100 changes: 81 additions & 19 deletions tgen/logreg_rank.py → tgen/rank.py
@@ -2,21 +2,37 @@
# -*- coding: utf-8 -*-

"""
Ranker based on logistic regression.
"""
Candidate tree rankers.
"""
from __future__ import unicode_literals
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
import numpy as np
import cPickle as pickle
import operator

from flect.logf import log_info
from flect.model import Model
from alex.components.nlg.tectotpl.core.util import file_stream

from futil import read_das, read_ttrees
from interface import Ranker
import operator
from flect.dataset import DataSet

from features import Features
from futil import read_das, read_ttrees

class Ranker(object):

@staticmethod
def load_from_file(model_fname):
"""Load a pre-trained model from a file."""
log_info("Loading ranker from %s..." % model_fname)
with file_stream(model_fname, 'rb', encoding=None) as fh:
return pickle.load(fh)

def save_to_file(self, model_fname):
"""Save the model to a file."""
log_info("Saving ranker to %s..." % model_fname)
with file_stream(model_fname, 'wb', encoding=None) as fh:
pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)


class LogisticRegressionRanker(Ranker):
@@ -97,19 +113,6 @@ def train(self, train_arff_fname):
self.model = Model(self.cfg['model'])
self.model.train(train_arff_fname)

@staticmethod
def load_from_file(model_fname):
"""Load a pre-trained model from a file."""
log_info("Loading ranker from %s..." % model_fname)
with file_stream(model_fname, 'rb', encoding=None) as fh:
return pickle.load(fh)

def save_to_file(self, model_fname):
"""Save the model to a file."""
log_info("Saving ranker to %s..." % model_fname)
with file_stream(model_fname, 'wb', encoding=None) as fh:
pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)

def cdf_to_dist(self, cdf):
"""Convert a CDF to a distribution (keep the list format, just discount lower bounds)."""
lo_bound = 0.0
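
For example, assuming the CDF is a list of (value, cumulative probability)
pairs as get_best_child below suggests, cdf_to_dist would turn

    [('child_a', 0.3), ('child_b', 0.8), ('child_c', 1.0)]

into

    [('child_a', 0.3), ('child_b', 0.5), ('child_c', 0.2)]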
@@ -130,3 +133,62 @@ def get_best_child(self, parent, da, cdf):
log_info('Child: %s, score: %s' % (unicode(cdf[index][0]), unicode(rank)))
log_info('Best child: %s, score: %s' % (unicode(cdf[best_index][0]), unicode(ranks[best_index])))
return cdf[best_index][0]


class PerceptronRanker(Ranker):

def __init__(self, cfg):
self.w = None
self.features = ['bias']
self.vectorizer = None
self.alpha = 1
self.passes = 5
self.train_cands = 10
self.language = 'en'  # defaults needed by train(); override via config
self.selector = ''
if cfg:
if 'features' in cfg:
self.features.extend(cfg['features'])
if 'alpha' in cfg:
self.alpha = cfg['alpha']
if 'passes' in cfg:
self.passes = cfg['passes']
if 'train_cands' in cfg:
self.train_cands = cfg['train_cands']
if 'language' in cfg:
self.language = cfg['language']
if 'selector' in cfg:
self.selector = cfg['selector']
# initialize feature functions
self.features = Features(self.features)

def score(self, cand_ttree, da):
feats = self.vectorizer.transform(self.features.get_features(cand_ttree, {'da': da}))
return self._score(feats)

def _score(self, cand_feats):
# dot product of weight vector and (densified) feature vector
return np.dot(self.w, cand_feats.toarray()[0])

def train(self, das_file, ttree_file):
# read input
das = read_das(das_file)
ttrees = read_ttrees(ttree_file)
# compute features for trees
X = []
for da, bundle in zip(das, ttrees.bundles):
ttree = bundle.get_zone(self.language, self.selector).ttree
X.append(self.features.get_features(ttree, {'da': da}))
# vectorize
self.vectorizer = DictVectorizer(sparse=True)
X = self.vectorizer.fit_transform(X)
# initialize weights
self.w = np.zeros(X.shape[1])  # weight vector size = number of feature columns
# 1st pass over training data -- just add weights
for inst in X:
self.w += self.alpha * inst.toarray()[0]
# further passes over training data -- compare the right instance to other, wrong ones
for _ in xrange(self.passes):
for inst in X:
# pick some random 'other' candidates and score them along with the right one
rand_idxs = np.random.choice(X.shape[0], self.train_cands)
cands = [inst] + [X[i] for i in rand_idxs
if not np.array_equal(X[i].toarray(), inst.toarray())]
scores = [self._score(cand) for cand in cands]
top_cand_idx = scores.index(max(scores))
# update weights if the system doesn't give the highest score to the right one
if top_cand_idx != 0:
self.w += (self.alpha * inst.toarray()[0] -
self.alpha * cands[top_cand_idx].toarray()[0])
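
The inner loop above is the standard perceptron update: whenever some wrong
candidate outscores the correct instance, w += alpha * (feats(correct) -
feats(top wrong candidate)). A minimal usage sketch (file names and config
values are hypothetical; a plain dict stands in for the Config object, which
the constructor only probes with 'in' and item access):

    ranker = PerceptronRanker({'features': ['num_lemmas: value tree t_lemma'],
                               'alpha': 0.1, 'passes': 10, 'train_cands': 5})
    ranker.train('train-das.txt', 'train-ttrees.yaml.gz')
    ranker.save_to_file('percrank-model.pickle.gz')
    # later, e.g. inside the planner:
    score = ranker.score(candidate_ttree, da)   # higher means better
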
18 changes: 16 additions & 2 deletions tgen/tgen.py
@@ -14,7 +14,10 @@
rank_create_data -- create training data for logistic regression ranker
- arguments: [-h use-headers] train-das train-ttrees candgen-model ranker-config output-train-data
rank_train -- train logistic regression ranker
logregrank_train -- train logistic regression local ranker
- arguments: ranker-config ranker-train-data output-model
percrank_train -- train perceptron global ranker
- arguments: ranker-config train-das train-ttrees output-model
generate -- generate using the given candidate generator and ranker
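
For example, training the perceptron ranker from the command line might look
like this (all file names are hypothetical):

    ./tgen.py percrank_train percrank-config.py train-das.txt train-ttrees.yaml.gz percrank-model.pickle.gz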
@@ -39,6 +42,7 @@
from getopt import getopt
from eval import tp_fp_fn, f1_from_counts, p_r_f1_from_counts
from alex.components.nlg.tectotpl.core.util import file_stream
from rank import PerceptronRanker


if __name__ == '__main__':
@@ -87,7 +91,7 @@
ranker.create_training_data(fname_ttrees_train, fname_da_train, candgen, fname_rank_train,
header_file=header_file)

elif action == 'rank_train':
elif action == 'logregrank_train':
if len(args) != 3:
sys.exit(__doc__)

@@ -99,6 +103,16 @@
ranker.train(fname_rank_train)
ranker.save_to_file(fname_rank_model)

elif action == 'percrank_train':

if len(args) != 4:
sys.exit(__doc__)

fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = args
log_info('Training perceptron ranker...')

rank_config = Config(fname_rank_config)
ranker = PerceptronRanker(rank_config)
ranker.train(fname_train_das, fname_train_ttrees)
ranker.save_to_file(fname_rank_model)

elif action == 'generate':

opts, files = getopt(args, 'r:n:o:w:')
