Initial perceptron ranker; rankers moved to one source file
tuetschek committed May 13, 2014
1 parent 2b3c444 commit 85151cd
Showing 7 changed files with 122 additions and 116 deletions.
50 changes: 0 additions & 50 deletions bagel-data/config.py

This file was deleted.

34 changes: 21 additions & 13 deletions tgen/features.py
@@ -10,13 +10,13 @@
from functools import partial


def find_nodes(node, scope):
def find_nodes(node, scope, incremental=False):
"""Given a parent node and scope specifications (in a list), this returns the
corresponding nodes.
"""
nodes = []
for scope_spec in scope:
if scope_spec == 'node':
if scope_spec == 'node' or incremental:
nodes.append(node)
elif scope_spec == 'tree':
nodes.extend(node.root.get_descendants())
@@ -33,19 +33,19 @@ def find_nodes(node, scope):
return nodes


def same_as_current(cur_node, scope_func, attrib):
def same_as_current(node, context, scope_func, attrib, incremental=False):
"""Return the number of nodes in the given scope that have the same value
of the given attribute as the current node.
@rtype: dict
@return: dictionary with one key ('') and the number of matching values as a value
"""
if attrib == 'right':
value = True if cur_node.parent and cur_node > cur_node.parent else False
value = True if node.parent and node > node.parent else False
else:
value = getattr(cur_node, attrib)
value = getattr(node, attrib)
num_matching = 0.0
for node in scope_func(cur_node):
for node in scope_func(node):
if attrib == 'right': # special handling for 'right'
if node.parent and (node > node.parent) == value:
num_matching += 1
@@ -54,15 +54,15 @@ def same_as_current(cur_node, scope_func, attrib):
return {'': num_matching}


def value(cur_node, scope_func, attrib):
def value(node, context, scope_func, attrib, incremental=False):
"""Return the number of nodes holding the individual values of the given attribute
in the given scope.
@rtype: dict
@return: dictionary with keys for values of the attribute, values for counts of matching nodes
"""
ret = defaultdict(float)
for node in scope_func(cur_node):
for node in scope_func(node):
if attrib == 'right':
if node.parent and node > node.parent:
ret['True'] += 1
@@ -73,8 +73,14 @@ def value(cur_node, scope_func, attrib):
return ret
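
For illustration, a feature function such as value is bound to its scope and
attribute when the spec is parsed (judging by the functools.partial import;
the sketch below and its outputs are invented, not taken from the repository):

    feat_func = partial(value, scope_func=partial(find_nodes, scope=['tree']),
                        attrib='formeme')
    feat_func(node, context)   # -> e.g. {'n:subj': 2.0, 'v:fin': 1.0}

get_features later flattens such dicts into flat feature names like
'label_n:subj' (or just 'label' for the empty-string key).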


def prob(node, parent):
return {'': node[1]}
def prob(node, context):
# TODO this won't work. Use wild attributes? Or some other structure?
return {'': context['node_prob']}


def bias(node, context):
"""A constant feature function, always returning 1"""
return {'': 1}


class Features(object):
@@ -83,7 +89,7 @@ def __init__(self, cfg):
self.features = self.parse_feature_spec(cfg)

def parse_feature_spec(self, spec):
"""Prepares feature feature function from specifications in the following format:
"""Prepares feature functions from specifications in the following format:
Label: value/same_as_current scope param1, ...
@@ -96,6 +102,8 @@ def parse_feature_spec(self, spec):
label, func_name = re.split(r'[:\s]+', feat, 1)
if func_name == 'prob':
features[label] = prob
elif func_name == 'bias':
features[label] = bias
else:
func_name, func_scope, func_params = re.split(r'[:\s]+', func_name, 2)
func_params = re.split(r'[,\s]+', func_params)
@@ -110,7 +118,7 @@ def parse_feature_spec(self, spec):
features[label] = feat_func
return features
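
As an illustration, a feature specification in the format above might look
like this (the labels are hypothetical and the scope/attribute names are only
meant to show the format):

    num_lemmas: value tree t_lemma
    same_formeme: same_as_current tree formeme
    node_prob: prob
    bias: bias

Each line yields one entry in the returned features dict: the first two map to
value and same_as_current bound to the 'tree' scope and the given attribute,
while prob and bias take no scope or parameters.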

def get_features(self, node, feats=defaultdict(float)):
def get_features(self, node, context, feats=None):
"""Return features for the given node. Accumulates features from other nodes
if given in the feats parameter.
@@ -119,7 +127,7 @@ def get_features(self, node, feats=defaultdict(float)):
"""
if feats is None:  # avoid a shared mutable default argument
feats = defaultdict(float)
feats_hier = {}
for name, func in self.features.iteritems():
feats_hier[name] = func(node)
feats_hier[name] = func(node, context)
for name, val in feats_hier.iteritems():
for subname, subval in val.iteritems():
feats[name + '_' + subname if subname else name] += subval
8 changes: 0 additions & 8 deletions tgen/interface.py
@@ -6,11 +6,3 @@

class CandidateGenerator(object):
pass


class Ranker(object):

def get_best_child(self, parent, cdf):
raise NotImplementedError


21 changes: 0 additions & 21 deletions tgen/percrank.py

This file was deleted.

7 changes: 4 additions & 3 deletions tgen/planner.py
@@ -164,6 +164,7 @@ def __init__(self, cfg):
self.debug_out = None
if 'debug_out' in cfg:
self.debug_out = cfg['debug_out']
self.ranker = cfg['ranker']

def generate_tree(self, da, gen_doc=None, gold_ttree=None):
# TODO add future cost ?
@@ -176,15 +177,15 @@ def generate_tree(self, da, gen_doc=None, gold_ttree=None):
cand, score = open_list.pop()
if gold_ttree and cand == gold_ttree:
print >> self.debug_out, "IT %05d: CANDIDATE MATCHES GOLD" % num_iter
score = -1.0
# score = -1.0
close_list.push(cand, score)
if self.debug_out:
print >> self.debug_out, ("\n***\nIT %05d:%s\nO: %d C: %d\n***" %
(num_iter, unicode(cand), len(open_list), len(close_list)))
self.debug_out.flush()
successors = self.candgen.get_all_successors(cand, cdfs)
# TODO add real scoring here
open_list.pushall({s: float(len(s.get_descendants()))
# score successor candidates; negated so that higher-scoring candidates
# are popped first from the (lowest-value-first) open list
open_list.pushall({s: self.ranker.score(s, da) * -1
for s in successors if s not in close_list})
# if self.debug_out:
# print >> self.debug_out, "\n".join(map(unicode, self.open_list.members.keys()))
100 changes: 81 additions & 19 deletions tgen/logreg_rank.py → tgen/rank.py
@@ -2,21 +2,37 @@
# -*- coding: utf-8 -*-

"""
Ranker based on logistic regression.
"""
Candidate tree rankers.
"""
from __future__ import unicode_literals
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
import numpy as np
import cPickle as pickle
import operator

from flect.logf import log_info
from flect.model import Model
from alex.components.nlg.tectotpl.core.util import file_stream

from futil import read_das, read_ttrees
from interface import Ranker
import operator
from flect.dataset import DataSet

from features import Features
from futil import read_das, read_ttrees

class Ranker(object):

@staticmethod
def load_from_file(model_fname):
"""Load a pre-trained model from a file."""
log_info("Loading ranker from %s..." % model_fname)
with file_stream(model_fname, 'rb', encoding=None) as fh:
return pickle.load(fh)

def save_to_file(self, model_fname):
"""Save the model to a file."""
log_info("Saving ranker to %s..." % model_fname)
with file_stream(model_fname, 'wb', encoding=None) as fh:
pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)


class LogisticRegressionRanker(Ranker):
@@ -97,19 +113,6 @@ def train(self, train_arff_fname):
self.model = Model(self.cfg['model'])
self.model.train(train_arff_fname)

@staticmethod
def load_from_file(model_fname):
"""Load a pre-trained model from a file."""
log_info("Loading ranker from %s..." % model_fname)
with file_stream(model_fname, 'rb', encoding=None) as fh:
return pickle.load(fh)

def save_to_file(self, model_fname):
"""Save the model to a file."""
log_info("Saving ranker to %s..." % model_fname)
with file_stream(model_fname, 'wb', encoding=None) as fh:
pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)

def cdf_to_dist(self, cdf):
"""Convert a CDF to a distribution (keep the list format, just discount lower bounds)."""
lo_bound = 0.0
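
For example, assuming the CDF is a list of (value, cumulative probability)
pairs as get_best_child below suggests, cdf_to_dist would turn

    [('child_a', 0.3), ('child_b', 0.8), ('child_c', 1.0)]

into

    [('child_a', 0.3), ('child_b', 0.5), ('child_c', 0.2)]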
@@ -130,3 +133,62 @@ def get_best_child(self, parent, da, cdf):
log_info('Child: %s, score: %s' % (unicode(cdf[index][0]), unicode(rank)))
log_info('Best child: %s, score: %s' % (unicode(cdf[best_index][0]), unicode(ranks[best_index])))
return cdf[best_index][0]


class PerceptronRanker(Ranker):

def __init__(self, cfg):
self.w = None
self.features = ['bias']
self.vectorizer = None
self.alpha = 1
self.passes = 5
self.train_cands = 10
self.language = 'en'  # defaults needed by train(); override via config
self.selector = ''
if cfg:
if 'features' in cfg:
self.features.extend(cfg['features'])
if 'alpha' in cfg:
self.alpha = cfg['alpha']
if 'passes' in cfg:
self.passes = cfg['passes']
if 'train_cands' in cfg:
self.train_cands = cfg['train_cands']
if 'language' in cfg:
self.language = cfg['language']
if 'selector' in cfg:
self.selector = cfg['selector']
# initialize feature functions
self.features = Features(self.features)

def score(self, cand_ttree, da):
feats = self.vectorizer.transform(self.features.get_features(cand_ttree, {'da': da}))
return self._score(feats)

def _score(self, cand_feats):
# dot product of weight vector and (densified) feature vector
return np.dot(self.w, cand_feats.toarray()[0])

def train(self, das_file, ttree_file):
# read input
das = read_das(das_file)
ttrees = read_ttrees(ttree_file)
# compute features for trees
X = []
for da, bundle in zip(das, ttrees.bundles):
ttree = bundle.get_zone(self.language, self.selector).ttree
X.append(self.features.get_features(ttree, {'da': da}))
# vectorize
self.vectorizer = DictVectorizer(sparse=True)
X = self.vectorizer.fit_transform(X)
# initialize weights
self.w = np.zeros(X.shape[1])  # weight vector size = number of feature columns
# 1st pass over training data -- just add weights
for inst in X:
self.w += self.alpha * inst.toarray()[0]
# further passes over training data -- compare the right instance to other, wrong ones
for _ in xrange(self.passes):
for inst in X:
# pick some random 'other' candidates and score them along with the right one
rand_idxs = np.random.choice(X.shape[0], self.train_cands)
cands = [inst] + [X[i] for i in rand_idxs
if not np.array_equal(X[i].toarray(), inst.toarray())]
scores = [self._score(cand) for cand in cands]
top_cand_idx = scores.index(max(scores))
# update weights if the system doesn't give the highest score to the right one
if top_cand_idx != 0:
self.w += (self.alpha * inst.toarray()[0] -
self.alpha * cands[top_cand_idx].toarray()[0])
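
The inner loop above is the standard perceptron update: whenever some wrong
candidate outscores the correct instance, w += alpha * (feats(correct) -
feats(top wrong candidate)). A minimal usage sketch (file names and config
values are hypothetical; a plain dict stands in for the Config object, which
the constructor only probes with 'in' and item access):

    ranker = PerceptronRanker({'features': ['num_lemmas: value tree t_lemma'],
                               'alpha': 0.1, 'passes': 10, 'train_cands': 5})
    ranker.train('train-das.txt', 'train-ttrees.yaml.gz')
    ranker.save_to_file('percrank-model.pickle.gz')
    # later, e.g. inside the planner:
    score = ranker.score(candidate_ttree, da)   # higher means better
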
18 changes: 16 additions & 2 deletions tgen/tgen.py
@@ -14,7 +14,10 @@
rank_create_data -- create training data for logistic regression ranker
- arguments: [-h use-headers] train-das train-ttrees candgen-model ranker-config output-train-data
rank_train -- train logistic regression ranker
logregrank_train -- train logistic regression local ranker
- arguments: ranker-config ranker-train-data output-model
percrank_train -- train perceptron global ranker
- arguments: ranker-config train-das train-ttrees output-model
generate -- generate using the given candidate generator and ranker
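
For example, training the perceptron ranker from the command line might look
like this (all file names are hypothetical):

    ./tgen.py percrank_train percrank-config.py train-das.txt train-ttrees.yaml.gz percrank-model.pickle.gz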
@@ -39,6 +42,7 @@
from getopt import getopt
from eval import tp_fp_fn, f1_from_counts, p_r_f1_from_counts
from alex.components.nlg.tectotpl.core.util import file_stream
from rank import PerceptronRanker


if __name__ == '__main__':
@@ -87,7 +91,7 @@
ranker.create_training_data(fname_ttrees_train, fname_da_train, candgen, fname_rank_train,
header_file=header_file)

elif action == 'rank_train':
elif action == 'logregrank_train':
if len(args) != 3:
sys.exit(__doc__)

@@ -99,6 +103,16 @@
ranker.train(fname_rank_train)
ranker.save_to_file(fname_rank_model)

elif action == 'percrank_train':

if len(args) != 4:
sys.exit(__doc__)

fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = args
log_info('Training perceptron ranker...')

rank_config = Config(fname_rank_config)
ranker = PerceptronRanker(rank_config)
ranker.train(fname_train_das, fname_train_ttrees)
ranker.save_to_file(fname_rank_model)

elif action == 'generate':

opts, files = getopt(args, 'r:n:o:w:')
