From 07c06e4073217763590f4163fbe6ea75b18ea07e Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Mon, 16 Feb 2015 13:54:26 -0500 Subject: [PATCH 01/13] fix load/save bug in graph search --- Makefile | 4 ++-- vowpalwabbit/search.cc | 9 ++++++--- vowpalwabbit/search_graph.cc | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 28a091774ab..8521fc08396 100644 --- a/Makefile +++ b/Makefile @@ -66,8 +66,8 @@ endif FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -DNDEBUG $(BOOST_INCLUDE) -fPIC #-DVW_LDA_NO_SSE # for profiling -- note that it needs to be gcc -#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -O2 -fno-strict-aliasing -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -pg -fPIC #-DVW_LDA_NO_S -#CXX = g++ +FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -O2 -fno-strict-aliasing -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -pg -fPIC #-DVW_LDA_NO_S +CXX = g++ # for valgrind / gdb debugging #FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc index 058d7fcfdb4..b885754fb3c 100644 --- a/vowpalwabbit/search.cc +++ b/vowpalwabbit/search.cc @@ -1134,7 +1134,9 @@ namespace Search { for (size_t i=0; i<condition_on_cnt; i++) - if (cached_action_store_or_find(priv, mytag, condition_on, condition_on_names, priv.condition_on_actions.begin, condition_on_cnt, policy, learner_id, a, false)) + bool not_test = priv.all->training && !ecs[0].test_only; + + if (not_test && cached_action_store_or_find(priv, mytag, condition_on, condition_on_names, priv.condition_on_actions.begin, condition_on_cnt, policy, learner_id, a, false)) // if this succeeded, 'a' has the right action priv.total_cache_hits++; else { // we need to predict, and then cache @@ -1165,7 +1167,8 @@ namespace Search { for (size_t n=start_K; n<ec_cnt; n++) ("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in a file") ("search_allowed_transitions",po::value<string>(),"read file of allowed transitions [def: all transitions are allowed]") - ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example") + ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example, if v<0, use active learning") ("search_neighbor_features", po::value<string>(), "copy features from neighboring lines. argument looks like: '-1:a,+2' meaning copy previous line namespace a and next next line from namespace _unnamed_, where ',' separates them") ("search_rollout_num_steps", po::value<size_t>(), "how many calls of \"loss\" before we stop really predicting on rollouts and switch to oracle (def: 0 means \"infinite\")") ("search_history_length", po::value<size_t>(), "some tasks allow you to specify how much history their depend on; specify that here [def: 1]")
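The search_graph.cc hunk below is the substance of the load/save fix: when an edge feature is re-hashed against a neighbor's predicted label, the raw index must be masked into the weight table before the stride shift is undone, and masked again after it is reapplied; otherwise indices computed at training time need not match the ones recomputed after a model is loaded. A minimal sketch of the corrected arithmetic, assuming mask and ss come from the model's weight_mask and stride_shift as in the hunk (hypothetical helper, not VW's actual API):

    #include <cstdint>
    const uint32_t salt = 348919043;  // same constant the hunk uses per label copy
    uint32_t neighbor_feature_index(uint32_t fx, size_t k, uint32_t mask, uint32_t ss) {
      uint32_t base = (fx & mask) >> ss;                  // mask first, then un-shift
      return ((base + salt * (uint32_t)k) << ss) & mask;  // re-shift, stay inside the table
    }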
diff --git a/vowpalwabbit/search_graph.cc b/vowpalwabbit/search_graph.cc index 8aa0c349450..cd22cc72323 100644 --- a/vowpalwabbit/search_graph.cc +++ b/vowpalwabbit/search_graph.cc @@ -65,6 +65,7 @@ namespace GraphTask { vector pred; // predictions example*cur_node; // pointer to the current node for add_edge_features_fn float* neighbor_predictions; // prediction on this neighbor for add_edge_features_fn + weight* weight_vector; }; inline bool example_is_test(polylabel&l) { return l.cs.costs.size() == 0; } @@ -136,6 +137,7 @@ namespace GraphTask { D.mask = sch.get_vw_pointer_unsafe().reg.weight_mask; D.ss = sch.get_vw_pointer_unsafe().reg.stride_shift; + D.weight_vector = sch.get_vw_pointer_unsafe().reg.weight_vector; D.N = 0; D.E = 0; @@ -183,7 +185,8 @@ namespace GraphTask { example*node = D.cur_node; for (size_t k=0; k<=D.K; k++) { if (D.neighbor_predictions[k] == 0.) continue; - feature f = { fv * D.neighbor_predictions[k], (uint32_t) (( (fx >> D.ss) + 348919043 * k ) << D.ss) }; + feature f = { fv * D.neighbor_predictions[k], (uint32_t) ((( ((fx & D.mask) >> D.ss) + 348919043 * k ) << D.ss) & D.mask) }; + //cerr << "e: " << fx << " (:= " << ((fx & D.mask) >> D.ss) << ") / " << k << " -> " << f.weight_index << ", w=" << D.weight_vector[f.weight_index] << endl; node->atomics[neighbor_namespace].push_back(f); node->sum_feat_sq[neighbor_namespace] += f.x * f.x; } @@ -193,7 +196,7 @@ namespace GraphTask { void add_edge_features_single_fn(task_data&D, float fv, uint32_t fx) { example*node = D.cur_node; size_t k = (size_t) D.neighbor_predictions[0]; - feature f = { fv, (uint32_t) (( (fx >> D.ss) + 348919043 * k ) << D.ss) }; + feature f = { fv, (uint32_t) (( ((fx & D.mask) >> D.ss) + 348919043 * k ) << D.ss) }; node->atomics[neighbor_namespace].push_back(f); node->sum_feat_sq[neighbor_namespace] += f.x * f.x; // TODO: audit @@ -218,6 +221,7 @@ namespace GraphTask { if (pred_total == 0.) continue;
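The active.cc hunks that follow rework active learning so query_decision takes the precomputed revert weight rather than a whole example, which lets the search code further down ask the same question per timestep. The underlying rule is importance-weighted query sampling; a minimal sketch, assuming get_active_coin_bias and frand48 behave exactly as the surrounding hunks show (simplified from query_decision):

    // Returns an importance weight > 0 if this label should be queried,
    // or -1 to skip it.
    float query_importance(float k, float avg_loss, float revert_weight, float c0) {
      float bias = get_active_coin_bias(k, avg_loss, revert_weight / k, c0);
      if (frand48() < bias) return 1.f / bias;  // queried labels get weight 1/bias
      return -1.f;
    }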
float weighted_queries = (float)(a.all->initial_t + a.all->sd->weighted_examples - a.all->sd->weighted_unlabeled_examples); avg_loss = (float)(a.all->sd->sum_loss/k + sqrt((1.+0.5*log(k))/(weighted_queries+0.0001))); - bias = get_active_coin_bias(k, avg_loss, ec.revert_weight/k, a.active_c0); + bias = get_active_coin_bias(k, avg_loss, ec_revert_weight/k, a.active_c0); } if(frand48() < bias) return 1.f / bias; @@ -50,7 +46,7 @@ float get_active_coin_bias(float k, float avg_loss, float g, float c0) float k = ec.example_t - ec.l.simple.weight; ec.revert_weight = all.loss->getRevertingWeight(all.sd, ec.pred.scalar, all.eta/powf(k,all.power_t)); - float importance = query_decision(a, ec, k); + float importance = query_decision(a, ec.revert_weight, k); if(importance > 0){ all.sd->queries += 1; @@ -111,7 +107,7 @@ float get_active_coin_bias(float k, float avg_loss, float g, float c0) float ai=-1; if(ld.label == FLT_MAX) - ai=query_decision(a, ec, (float)all.sd->weighted_unlabeled_examples); + ai=query_decision(a, ec.revert_weight, (float)all.sd->weighted_unlabeled_examples); all.print(all.raw_prediction, ec.partial_prediction, -1, ec.tag); for (size_t i = 0; i<all.final_prediction_sink.size(); i++) (const pair<size_t,size_t>& a, const pair<size_t,size_t>& b) { return ((a.first == b.first) && (a.second < b.second)) || (a.first < b.first); } void get_training_timesteps(search_private& priv, v_array< pair<size_t,size_t> >& timesteps) { // timesteps are pairs of (beam elem, t) where beam elem == 0 means "default" for non-beam search timesteps.erase(); - + + // if there's active learning, we need to + if (priv.subsample_timesteps <= -1) { + for (size_t t=0; t<priv.ec_seq.size(); t++) { + float k = (float)priv.total_examples_generated; + priv.ec_seq[t]->revert_weight = priv.all->loss->getRevertingWeight(priv.all->sd, priv.ec_seq[t].pred.scalar, priv.all->eta / powf(k, priv.all->power_t)); + float importance = query_decision(active_str, *priv.ec_seq[t], k); + if (importance > 0.) + timesteps.push_back(pair<size_t,size_t>(0,t)); + } + } // if there's no subsampling to do, just return [0,T) - if (priv.subsample_timesteps <= 0) + else if (priv.subsample_timesteps <= 0) for (size_t t=0; t<priv.T; t++) timesteps.push_back(pair<size_t,size_t>(0,t)); @@ -1865,7 +1877,7 @@ namespace Search { ("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in a file") ("search_allowed_transitions",po::value<string>(),"read file of allowed transitions [def: all transitions are allowed]") - ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example, if v<0, use active learning") + ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example, if v<=-1, use active learning") ("search_neighbor_features", po::value<string>(), "copy features from neighboring lines. 
argument looks like: '-1:a,+2' meaning copy previous line namespace a and next next line from namespace _unnamed_, where ',' separates them") ("search_rollout_num_steps", po::value<size_t>(), "how many calls of \"loss\" before we stop really predicting on rollouts and switch to oracle (def: 0 means \"infinite\")") ("search_history_length", po::value<size_t>(), "some tasks allow you to specify how much history their depend on; specify that here [def: 1]") From 83eda56c1498b4e63bba79decffb8cd878a18882 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Tue, 17 Feb 2015 11:59:22 -0500 Subject: [PATCH 03/13] faster search, fewer predictions needed, much faster beam --- Makefile | 4 ++-- test/train-sets/ref/search_er.stderr | 4 ++-- vowpalwabbit/beam.h | 12 +++++----- vowpalwabbit/search.cc | 35 ++++++++++++++++++++++++---- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 8521fc08396..28a091774ab 100644 --- a/Makefile +++ b/Makefile @@ -66,8 +66,8 @@ endif FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -DNDEBUG $(BOOST_INCLUDE) -fPIC #-DVW_LDA_NO_SSE # for profiling -- note that it needs to be gcc -FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -O2 -fno-strict-aliasing -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -pg -fPIC #-DVW_LDA_NO_S -CXX = g++ +#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -O2 -fno-strict-aliasing -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -pg -fPIC #-DVW_LDA_NO_S +#CXX = g++ # for valgrind / gdb debugging #FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC diff --git a/test/train-sets/ref/search_er.stderr b/test/train-sets/ref/search_er.stderr index 27204d74ce9..a5e0767e786 100644 --- a/test/train-sets/ref/search_er.stderr +++ b/test/train-sets/ref/search_er.stderr @@ -11,7 +11,7 @@ loss last counter output prefix output prefix p 1.000000 1.000000 1 [4 ] [1 ] 0 0 1 0 1 0.000000 2.000000 3.000000 2 [2 4 2 5 10 10 ] [4 4 4 10 10 10 ] 0 0 7 0 7 0.000000 2.875000 3.750000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000 -1.437500 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 
1 0 64 0 64 0.000001 finished run number of examples per pass = 4 passes used = 3 weighted example sum = 12 weighted label sum = 0 average loss = 0.958333 -total feature number = 522 +total feature number = 1185 diff --git a/vowpalwabbit/beam.h b/vowpalwabbit/beam.h index 71f45544deb..e5a45c701c2 100644 --- a/vowpalwabbit/beam.h +++ b/vowpalwabbit/beam.h @@ -91,16 +91,16 @@ inline int compare_on_hash_then_cost(const void *void_a, const void *void_b) { if (beam_size == 1) do_kbest = false; // automatically turn of kbest } - bool insert(T*data, float cost, uint32_t hash) { // returns TRUE iff element was actually added + bool might_insert(float cost) { bool should_add = false; - if (count < beam_size) should_add = true; else if (cost < worst_cost) should_add = true; if (cost > prune_if_gt) should_add = false; - - //cerr << "insert " << ((size_t)data) << " with cost=" << cost << " wc=" << worst_cost << " count=" << count << " size=" << beam_size << " has should_add=" << should_add << endl; - - if (!should_add) return false; + return should_add; + } + + bool insert(T*data, float cost, uint32_t hash) { // returns TRUE iff element was actually added + if (!might_insert(cost)) return false; //bool we_were_worse = false; // if (is_equivalent) { diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc index 0c0767919de..ab638bb6b0f 100644 --- a/vowpalwabbit/search.cc +++ b/vowpalwabbit/search.cc @@ -38,7 +38,7 @@ namespace Search { NULL }; // must NULL terminate! const bool PRINT_UPDATE_EVERY_EXAMPLE =0; - const bool PRINT_UPDATE_EVERY_PASS =0; + const bool PRINT_UPDATE_EVERY_PASS =1; const bool PRINT_CLOCK_TIME =0; string neighbor_feature_space("neighbor"); @@ -170,6 +170,7 @@ namespace Search { v_array<action> condition_on_actions; v_array< pair<size_t,size_t> > timesteps; v_array<float> learn_losses; + v_array< pair<float,size_t> > active_uncertainty; LEARNER::base_learner* base_learner; clock_t start_clock_time; @@ -544,6 +545,8 @@ size_t i = (allowed_actions_cnt > 0) ? allowed_actions[j] : j; if (i == ret) continue; + if (! priv.beam->might_insert( alternative_costs[i] )) continue; + action_prefix* px = new action_prefix; *px = v_init<action>(); px->resize(new_len+1); @@ -709,6 +712,18 @@ priv.base_learner->predict(ec, policy); uint32_t act = ec.pred.multiclass; + if ((priv.state == INIT_TRAIN) && (priv.subsample_timesteps <= -1)) { // active learning + size_t K = cs_get_costs_size(priv.cb_learner, ec.l); + float min_cost = FLT_MAX, min_cost2 = FLT_MAX; + for (size_t k = 0; k < K; k++) { + float cost = cs_get_cost_partial_prediction(priv.cb_learner, ec.l, k); + if (cost < min_cost) { min_cost2 = min_cost; min_cost = cost; } + else if (cost < min_cost2) { min_cost2 = cost; } + } + if (min_cost2 < FLT_MAX) + priv.active_uncertainty.push_back( make_pair(min_cost2 - min_cost, priv.t) ); + } + // in beam search mode, go through alternatives and add them as back-ups if (priv.beam) { float act_cost = 0; @@ -725,6 +740,9 @@ action k_act = cs_get_cost_index(priv.cb_learner, ec.l, k); if (k_act == act) continue; // skip the taken action float delta_cost = cs_get_cost_partial_prediction(priv.cb_learner, ec.l, k) - act_cost + priv.beam_initial_cost; + + if (! priv.beam->might_insert( delta_cost )) continue; + // construct the action prefix action_prefix* px = new v_array<action>; *px = v_init<action>(); px->resize(new_len + 1); @@ -803,6 +821,7 @@ namespace Search { for (size_t k=start_K; k<K; k++) { + if (! priv.beam->might_insert( delta_cost )) continue; action_prefix* px = new v_array<action>; *px = v_init<action>(); px->resize(new_len + 1); @@ -1190,17 +1209,20 @@ namespace Search { // if there's active learning, we need to if (priv.subsample_timesteps <= -1) { - for (size_t t=0; t<priv.ec_seq.size(); t++) { + for (size_t i=0; i<priv.active_uncertainty.size(); i++) + if (frand48() > priv.active_uncertainty[i].first) + timesteps.push_back(pair<size_t,size_t>(0, priv.active_uncertainty[i].second - 1)); + /* float k = (float)priv.total_examples_generated; priv.ec_seq[t]->revert_weight = priv.all->loss->getRevertingWeight(priv.all->sd, priv.ec_seq[t].pred.scalar, priv.all->eta / powf(k, priv.all->power_t)); float importance = query_decision(active_str, *priv.ec_seq[t], k); if (importance > 0.) timesteps.push_back(pair<size_t,size_t>(0,t)); - } + */ } // if there's no subsampling to do, just return [0,T) - else if (priv.subsample_timesteps <= 0) + else + if (priv.subsample_timesteps <= 0) for (size_t t=0; t<priv.T; t++) timesteps.push_back(pair<size_t,size_t>(0,t)); @@ -1412,6 +1434,7 @@ namespace Search { reset_search_structure(priv); priv.state = INIT_TRAIN; + priv.active_uncertainty.erase(); priv.train_trajectory.erase(); // this is where we'll store the training sequence priv.task->run(sch, priv.ec_seq); @@ -1437,6 +1460,7 @@ namespace Search { priv.T = priv.t; if (priv.beam) get_training_timesteps_beam(priv, *final_beam, priv.timesteps); else get_training_timesteps(priv, priv.timesteps); + priv.learn_losses.erase(); size_t last_beam_id = 0; for (size_t tid=0; tid<priv.timesteps.size(); tid++) + priv.active_uncertainty = v_init< pair<float,size_t> >(); priv.cross_validate = false; priv.A = 1; priv.num_learners = 1;
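Much of the beam speedup in the patch above comes from asking the beam whether a candidate could possibly be accepted before allocating and copying an action prefix for it. The check is cheap because the beam already tracks its worst retained element; a minimal sketch of the pattern, using a simplified beam struct (the real beam.h, rewritten as a one-liner in the next patch, also handles equivalence classes and k-best):

    struct beam_sketch {
      size_t count, beam_size;        // current fill and capacity
      float worst_cost, prune_if_gt;  // worst retained cost, hard prune bound
      bool might_insert(float cost) const {
        return (cost <= prune_if_gt) && ((count < beam_size) || (cost < worst_cost));
      }
    };
    // Callers test might_insert(delta_cost) first, so alternatives the beam
    // would reject are skipped before any action-prefix allocation happens.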
From 09ebea2fbc9dc0b7115b6043ef0f2a094c103621 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Tue, 17 Feb 2015 15:04:35 -0500 Subject: [PATCH 04/13] code cleanup; fixed memory leak in dictionaries --- vowpalwabbit/beam.h | 8 +------- vowpalwabbit/parse_args.cc | 4 ++++ vowpalwabbit/parse_example.cc | 3 ++- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/vowpalwabbit/beam.h b/vowpalwabbit/beam.h index e5a45c701c2..7f177569e93 100644 --- a/vowpalwabbit/beam.h +++ b/vowpalwabbit/beam.h @@ -91,13 +91,7 @@ inline int compare_on_hash_then_cost(const void *void_a, const void *void_b) { if (beam_size == 1) do_kbest = false; // automatically turn of kbest } - bool might_insert(float cost) { - bool should_add = false; - if (count < beam_size) should_add = true; - else if (cost < worst_cost) should_add = true; - if (cost > prune_if_gt) should_add = false; - return should_add; - } + inline bool might_insert(float cost) { return (cost <= prune_if_gt) && ((count < beam_size) || (cost < worst_cost)); } bool insert(T*data, float cost, uint32_t hash) { // returns TRUE iff element was actually added if (!might_insert(cost)) return false; diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc index fe90da90240..3a3d984aa7d 100644 --- a/vowpalwabbit/parse_args.cc +++ b/vowpalwabbit/parse_args.cc @@ -120,6 +120,10 @@ void parse_dictionary_argument(vw&all, string str) { *arr = v_init<feature>(); push_many(*arr, ec->atomics[def].begin, ec->atomics[def].size()); map->put(ss, hash, arr); + + // clear up ec + ec->tag.erase(); ec->indices.erase(); + for (size_t i=0; i<256; i++) { ec->atomics[i].erase(); ec->audit_features[i].erase(); } } dealloc_example(all.p->lp.delete_label, *ec); free(ec); diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc index c03097ffb18..3feb5626344 100644 --- a/vowpalwabbit/parse_example.cc +++ b/vowpalwabbit/parse_example.cc @@ -41,7 +41,8 @@ class TC_parser { example* ae; uint32_t* affix_features; bool* spelling_features; - v_array<char> spelling; + v_array<char> spelling = v_init<char>(); + vector<feature_dict*>* namespace_dictionaries; ~TC_parser(){ }
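The parse_args.cc hunk in the patch above is the dictionary leak fix: the scratch example used to parse each dictionary line is emptied before it is deallocated, so nothing parsed into it outlives the dictionary build. A minimal sketch of that cleanup, as a hypothetical helper mirroring the added lines in parse_dictionary_argument:

    void clear_scratch_example(example* ec) {
      ec->tag.erase();
      ec->indices.erase();
      for (size_t i=0; i<256; i++) {   // one slot per possible namespace byte
        ec->atomics[i].erase();
        ec->audit_features[i].erase();
      }
    }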
From 54f5c0bcc0520b30bd671cd47aaa14c8f1f7fa99 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Wed, 18 Feb 2015 16:09:15 -0500 Subject: [PATCH 05/13] speed up python, minor tweak to output on beam --- python/pylibvw.cc | 67 +++++++++++++++++++++++++++++------------- python/pyvw.py | 5 ++-- vowpalwabbit/search.cc | 13 ++++---- 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/python/pylibvw.cc b/python/pylibvw.cc index 798eb44aae1..2decbe98826 100644 --- a/python/pylibvw.cc +++ b/python/pylibvw.cc @@ -178,21 +178,23 @@ void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list& else { cerr << "warning: malformed feature in list" << endl; continue; } ai = fv[0]; } - - bool got = false; - py::extract<uint32_t> get_int(ai); - if (get_int.check()) { f.weight_index = get_int(); got = true; } - else { + + if (f.x != 0.) { + bool got = false; py::extract<string> get_str(ai); if (get_str.check()) { f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash); got = true; - } else { cerr << "warning: malformed feature in list" << endl; continue; } - } - if (got && (f.x != 0.)) { - ec->atomics[ns].push_back(f); - count++; - sum_sq += f.x * f.x; + } else { + py::extract<uint32_t> get_int(ai); + if (get_int.check()) { f.weight_index = get_int(); got = true; } + else { cerr << "warning: malformed feature in list" << endl; continue; } + } + if (got) { + ec->atomics[ns].push_back(f); + count++; + sum_sq += f.x * f.x; + } } } ec->num_features += count; @@ -200,15 +202,6 @@ void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list& ec->total_sum_feat_sq += sum_sq; } -bool ex_pop_feature(example_ptr ec, unsigned char ns) { - if (ec->atomics[ns].size() == 0) return false; - feature f = ec->atomics[ns].pop(); - ec->num_features--; - ec->sum_feat_sq[ns] -= f.x * f.x; - ec->total_sum_feat_sq -= f.x * f.x; - return true; -} - void ex_push_namespace(example_ptr ec, unsigned char ns) { ec->indices.push_back(ns); } @@ -219,6 +212,39 @@ void ex_ensure_namespace_exists(example_ptr ec, unsigned char ns) { ex_push_namespace(ec, ns); } +void ex_push_dictionary(example_ptr ec, vw_ptr vw, py::dict& dict) { + py::object objectKey, objectVal; + const py::object objectKeys = dict.iterkeys(); + const py::object objectVals = dict.itervalues(); + unsigned long ulCount = boost::python::extract<unsigned long>(dict.attr("__len__")()); + for (size_t u=0; u<ulCount; u++) { + objectKey = objectKeys.attr("next")(); + objectVal = objectVals.attr("next")(); + char chCheckKey = objectKey.ptr()->ob_type->tp_name[0]; + if (chCheckKey != 's') continue; + chCheckKey = objectVal.ptr()->ob_type->tp_name[0]; + if (chCheckKey != 'l') continue; + + py::extract<string> ns_e(objectKey); + if (ns_e().length() < 1) continue; + py::extract<py::list> list_e(objectVal); + py::list list = list_e(); + char ns = ns_e()[0]; + ex_ensure_namespace_exists(ec, ns); + ex_push_feature_list(ec, vw, ns, list); + } +} + +bool ex_pop_feature(example_ptr ec, unsigned char ns) { + if (ec->atomics[ns].size() == 0) return false; + feature f = ec->atomics[ns].pop(); + ec->num_features--; + ec->sum_feat_sq[ns] -= f.x * f.x; + ec->total_sum_feat_sq -= f.x * f.x; + return true; +} + bool ex_pop_namespace(example_ptr ec) { if (ec->indices.size() == 0) return false; unsigned char ns = ec->indices.pop(); @@ -513,6 +539,7 @@ BOOST_PYTHON_MODULE(pylibvw) { .def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)") .def("push_feature_list", &ex_push_feature_list, "Add a (Python) 
list of features to a given namespace") + .def("push_feature_dict", &ex_push_dictionary, "Add a (Python) dictionary of namespace/feature-list pairs") .def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty") .def("push_namespace", &ex_push_namespace, "Add a new namespace") .def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist") diff --git a/python/pyvw.py b/python/pyvw.py index 7a68a287d18..9205e04993a 100644 --- a/python/pyvw.py +++ b/python/pyvw.py @@ -406,8 +406,9 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault): self.stride = vw.get_stride() self.finished = False self.setup_done = False - for ns_char,feats in initStringOrDict.iteritems(): - self.push_features(ns_char, feats) + #for ns_char,feats in initStringOrDict.iteritems(): + # self.push_features(ns_char, feats) + self.push_feature_dict(vw, initStringOrDict) self.setup_example() else: raise TypeError('expecting string or dict as argument for example construction') diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc index ab638bb6b0f..a6db0539371 100644 --- a/vowpalwabbit/search.cc +++ b/vowpalwabbit/search.cc @@ -1360,15 +1360,18 @@ namespace Search { for (; best != final_beam->end(); ++best) if (best->active) { new_tag.erase(); - new_tag.resize(50, true); - int len = sprintf(new_tag.begin, "%-10.6f\t", best->cost); - new_tag.end = new_tag.begin + len; + if (priv.kbest > 1) { + new_tag.resize(50, true); + int len = sprintf(new_tag.begin, "%-10.6f\t", best->cost); + new_tag.end = new_tag.begin + len; + } push_many(new_tag, priv.ec_seq[0]->tag.begin, priv.ec_seq[0]->tag.size()); for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; ++sink) all.print_text((int)*sink, best->data->second, new_tag); } - for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; ++sink) - all.print_text((int)*sink, "", priv.ec_seq[0]->tag); + if (priv.kbest > 1) + for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; ++sink) + all.print_text((int)*sink, "", priv.ec_seq[0]->tag); new_tag.delete_v(); } From 54ca541ca2ccbc774a05bd7cb2ad877fa61ce836 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Wed, 18 Feb 2015 23:43:31 -0500 Subject: [PATCH 06/13] bug fix on learner_id, minor code cleanups --- vowpalwabbit/search.cc | 5 +- vowpalwabbit/search_sequencetask.cc | 136 +++++++++++++++------------- 2 files changed, 75 insertions(+), 66 deletions(-) diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc index a6db0539371..a34ad426801 100644 --- a/vowpalwabbit/search.cc +++ b/vowpalwabbit/search.cc @@ -950,7 +950,7 @@ namespace Search { else if (losses[i] == min_loss) num_min++; if (losses[i] > max_loss) { max_loss = losses[i]; } } - + int learner = select_learner(priv, priv.current_policy, priv.learn_learner_id, true); if (!priv.is_ldf) { // not LDF @@ -1148,6 +1148,7 @@ namespace Search { a = choose_oracle_action(priv, ec_cnt, oracle_actions, oracle_actions_cnt, allowed_actions, allowed_actions_cnt, priv.beam && (priv.state != INIT_TEST)); if ((policy >= 0) || gte_here) { + priv.learn_learner_id = learner_id; int learner = select_learner(priv, policy, learner_id, false); ensure_size(priv.condition_on_actions, condition_on_cnt); @@ -2121,7 +2122,7 @@ namespace Search { learner& l = init_learner(&sch, base, search_predict_or_learn, search_predict_or_learn, - priv.total_number_of_policies); + 
priv.total_number_of_policies * priv.num_learners); l.set_finish_example(finish_example); l.set_end_examples(end_examples); l.set_finish(search_finish);
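The learner_id fix above matters because each policy now owns num_learners stacked sub-learners, and the sequence task below hands the pass number in as the learner id. A minimal sketch of the mapping, assuming the same stacked layout as select_learner (the real function, shown in a later patch of this series, also handles cross-validation):

    // policy < 0 means the oracle/optimal policy; otherwise a policy's
    // sub-learners are stored contiguously, so learner_id selects an offset.
    int select_sub_learner(int policy, size_t learner_id, size_t num_learners) {
      if (policy < 0) return policy;
      return (int)(policy * num_learners + learner_id);
    }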
diff --git a/vowpalwabbit/search_sequencetask.cc b/vowpalwabbit/search_sequencetask.cc index 8b361aefcb0..712e481f388 100644 --- a/vowpalwabbit/search_sequencetask.cc +++ b/vowpalwabbit/search_sequencetask.cc @@ -87,63 +87,66 @@ namespace SequenceSpanTask { EncodingType encoding; v_array<action> allowed_actions; v_array<action> only_two_allowed; // used for BILOU encoding + size_t multipass; }; void initialize(Search::search& sch, size_t& num_actions, po::variables_map& vm) { - task_data * my_task_data = new task_data(); + task_data * D = new task_data(); po::options_description sspan_opts("search sequencespan options"); sspan_opts.add_options()("search_span_bilou", "switch to (internal) BILOU encoding instead of BIO encoding"); + sspan_opts.add_options()("search_span_multipass", po::value<size_t>(&(D->multipass))->default_value(1), "do multiple passes"); sch.add_program_options(vm, sspan_opts); if (vm.count("search_span_bilou")) { cerr << "switching to BILOU encoding for sequence span labeling" << endl; - my_task_data->encoding = BILOU; + D->encoding = BILOU; num_actions = num_actions * 2 - 1; } else - my_task_data->encoding = BIO; - my_task_data->allowed_actions.erase(); + D->encoding = BIO; + D->allowed_actions.erase(); - if (my_task_data->encoding == BIO) { - my_task_data->allowed_actions.push_back(1); + if (D->encoding == BIO) { + D->allowed_actions.push_back(1); for (action l=2; l<num_actions; l+=2) D->allowed_actions.push_back(l); - my_task_data->allowed_actions.push_back(1); // push back an extra 1 that we can overwrite later if we want - } else if (my_task_data->encoding == BILOU) { - my_task_data->allowed_actions.push_back(1); + D->allowed_actions.push_back(1); // push back an extra 1 that we can overwrite later if we want + } else if (D->encoding == BILOU) { + D->allowed_actions.push_back(1); for (action l=2; l<num_actions; l+=4) D->allowed_actions.push_back(l); - my_task_data->allowed_actions.push_back(l+1); + D->allowed_actions.push_back(l); + D->allowed_actions.push_back(l+1); } - my_task_data->only_two_allowed.push_back(0); - my_task_data->only_two_allowed.push_back(0); + D->only_two_allowed.push_back(0); + D->only_two_allowed.push_back(0); } - sch.set_task_data(my_task_data); + sch.set_task_data(D); sch.set_options( Search::AUTO_CONDITION_FEATURES | // automatically add history features to our examples, please Search::AUTO_HAMMING_LOSS | // please just use hamming loss on individual predictions -- we won't declare loss Search::EXAMPLES_DONT_CHANGE | // we don't do any internal example munging 0); + sch.set_num_learners(D->multipass); } void finish(Search::search& sch) { - task_data * my_task_data = sch.get_task_data<task_data>(); - my_task_data->allowed_actions.delete_v(); - my_task_data->only_two_allowed.delete_v(); - delete my_task_data; + task_data* D = sch.get_task_data<task_data>(); + D->allowed_actions.delete_v(); + D->only_two_allowed.delete_v(); + delete D; } void setup(Search::search& sch, vector<example*>& ec) { - task_data * my_task_data = sch.get_task_data<task_data>(); + task_data& D = *sch.get_task_data<task_data>(); - if (my_task_data->encoding == BILOU) + if (D.encoding == BILOU) convert_bio_to_bilou(ec); } void takedown(Search::search& sch, vector<example*>& ec) { - task_data * my_task_data = sch.get_task_data<task_data>(); + task_data& D = *sch.get_task_data<task_data>(); - if (my_task_data->encoding == BILOU) + if (D.encoding == BILOU) for (size_t n=0; n<ec.size(); n++) { MULTICLASS::multiclass& ylab = ec[n]->l.multi; ylab.label = bilou_to_bio(ylab.label); @@ -151,40 +154,45 @@ namespace SequenceSpanTask { } void run(Search::search& sch, vector<example*>& ec) { - task_data * my_task_data = sch.get_task_data<task_data>(); - action last_prediction = 1; - v_array<action> * y_allowed = &(my_task_data->allowed_actions); - - for (size_t i=0; i<ec.size(); i++) { - action oracle = ec[i]->l.multi.label; - size_t len = y_allowed->size(); - Search::predictor P(sch, (ptag)i+1); - if (my_task_data->encoding == BIO) { - if (last_prediction == 1) P.set_allowed(y_allowed->begin, len-1); - else if (last_prediction % 2 == 0) { (*y_allowed)[len-1] = last_prediction+1; P.set_allowed(*y_allowed); } - else { (*y_allowed)[len-1] = last_prediction; P.set_allowed(*y_allowed); } - if ((oracle > 1) && (oracle % 2 == 1) && (last_prediction != oracle) && (last_prediction != oracle-1)) - oracle = 1; // if we are supposed to I-X, but last wasn't B-X or I-X, then say O - } else if (my_task_data->encoding == BILOU) { - if ((last_prediction == 1) || ((last_prediction-2) % 4 == 0) || ((last_prediction-2) % 4 == 3)) { // O or unit-X or last-X - P.set_allowed(my_task_data->allowed_actions); - // we cannot allow in-X or last-X next - if ((oracle > 1) && (((oracle-2) % 4 == 2) || ((oracle-2) % 4 == 3))) - oracle = 1; - } else { // begin-X or in-X - action other = ((last_prediction-2) % 4 == 1) ? (last_prediction+2) : last_prediction; - P.set_allowed(last_prediction+1); - P.add_allowed(other); - if ((oracle != last_prediction+1) && (oracle != other)) - oracle = other; + task_data& D = *sch.get_task_data<task_data>(); + v_array<action> * y_allowed = &(D.allowed_actions); + + for (size_t pass=1; pass<=D.multipass; pass++) { + action last_prediction = 1; + for (size_t i=0; i<ec.size(); i++) { + action oracle = ec[i]->l.multi.label; + size_t len = y_allowed->size(); + Search::predictor P(sch, (ptag)i+1); + P.set_learner_id(pass-1); + if (D.encoding == BIO) { + if (last_prediction == 1) P.set_allowed(y_allowed->begin, len-1); + else if (last_prediction % 2 == 0) { (*y_allowed)[len-1] = last_prediction+1; P.set_allowed(*y_allowed); } + else { (*y_allowed)[len-1] = last_prediction; P.set_allowed(*y_allowed); } + if ((oracle > 1) && (oracle % 2 == 1) && (last_prediction != oracle) && (last_prediction != oracle-1)) + oracle = 1; // if we are supposed to I-X, but last wasn't B-X or I-X, then say O + } else if (D.encoding == BILOU) { + if ((last_prediction == 1) || ((last_prediction-2) % 4 == 0) || ((last_prediction-2) % 4 == 3)) { // O or unit-X or last-X + P.set_allowed(D.allowed_actions); + // we cannot allow in-X or last-X next + if ((oracle > 1) && (((oracle-2) % 4 == 2) || ((oracle-2) % 4 == 3))) + oracle = 1; + } else { // begin-X or in-X + action other = ((last_prediction-2) % 4 == 1) ? (last_prediction+2) : last_prediction; + P.set_allowed(last_prediction+1); + P.add_allowed(other); + if ((oracle != last_prediction+1) && (oracle != other)) + oracle = other; + } } - } last_prediction = P.set_input(*ec[i]).set_condition_range((ptag)i, sch.get_history_length(), 'p').set_oracle(oracle).predict(); + P.set_input(*ec[i]); + P.set_condition_range((ptag)i, sch.get_history_length(), 'p'); + if (pass > 1) P.add_condition_range((ptag)(i+1+sch.get_history_length()), sch.get_history_length()+1, 'a'); + P.set_oracle(oracle); + last_prediction = P.predict(); - action printed_prediction = (my_task_data->encoding == BIO) ? last_prediction : bilou_to_bio(last_prediction); - - if (sch.output().good()) - sch.output() << printed_prediction << ' '; + if ((pass == D.multipass) && sch.output().good()) + sch.output() << ((D.encoding == BIO) ? 
last_prediction : bilou_to_bio(last_prediction)) << ' '; + } } } } @@ -197,20 +205,20 @@ namespace ArgmaxTask { }; void initialize(Search::search& sch, size_t& num_actions, po::variables_map& vm) { - task_data* my_task_data = new task_data(); + task_data* D = new task_data(); po::options_description argmax_opts("argmax options"); argmax_opts.add_options() - ("cost", po::value(&(my_task_data->false_negative_cost))->default_value(10.0), "False Negative Cost") - ("negative_weight", po::value(&(my_task_data->negative_weight))->default_value(1), "Relative weight of negative examples") + ("cost", po::value(&(D->false_negative_cost))->default_value(10.0), "False Negative Cost") + ("negative_weight", po::value(&(D->negative_weight))->default_value(1), "Relative weight of negative examples") ("max", "Disable structure: just predict the max"); sch.add_program_options(vm, argmax_opts); - my_task_data->predict_max = vm.count("max") > 0; + D->predict_max = vm.count("max") > 0; - sch.set_task_data(my_task_data); + sch.set_task_data(D); - if (my_task_data->predict_max) + if (D->predict_max) sch.set_options( Search::EXAMPLES_DONT_CHANGE ); // we don't do any internal example munging else sch.set_options( Search::AUTO_CONDITION_FEATURES | // automatically add history features to our examples, please @@ -218,7 +226,7 @@ namespace ArgmaxTask { } void run(Search::search& sch, vector& ec) { - task_data * my_task_data = sch.get_task_data(); + task_data& D = *sch.get_task_data(); uint32_t max_prediction = 1; uint32_t max_label = 1; @@ -227,14 +235,14 @@ namespace ArgmaxTask { for (ptag i=0; ipredict_max ? max_label : ec[i]->l.multi.label; + uint32_t oracle = D.predict_max ? max_label : ec[i]->l.multi.label; uint32_t prediction = sch.predict(*ec[i], i+1, &oracle, 1, &i, "p"); max_prediction = max(prediction, max_prediction); } float loss = 0.; if (max_label > max_prediction) - loss = my_task_data->false_negative_cost / my_task_data->negative_weight; + loss = D.false_negative_cost / D.negative_weight; else if (max_prediction > max_label) loss = 1.; sch.loss(loss); From e58e27a3e6f302e2b7434876a0fa2f4f58b79676 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Fri, 20 Feb 2015 14:47:39 -0500 Subject: [PATCH 07/13] added ability to unsetup example in python interface --- python/pylibvw.cc | 63 ++++++++++++++++++++++++++++++++++++++++++++--- python/pyvw.py | 40 ++++++++++++++++-------------- 2 files changed, 81 insertions(+), 22 deletions(-) diff --git a/python/pylibvw.cc b/python/pylibvw.cc index 2decbe98826..9b9f490284e 100644 --- a/python/pylibvw.cc +++ b/python/pylibvw.cc @@ -245,13 +245,18 @@ bool ex_pop_feature(example_ptr ec, unsigned char ns) { return true; } -bool ex_pop_namespace(example_ptr ec) { - if (ec->indices.size() == 0) return false; - unsigned char ns = ec->indices.pop(); +void ex_erase_namespace(example_ptr ec, unsigned char ns) { ec->num_features -= ec->atomics[ns].size(); ec->total_sum_feat_sq -= ec->sum_feat_sq[ns]; ec->sum_feat_sq[ns] = 0.; ec->atomics[ns].erase(); + ec->audit_features[ns].erase(); +} + +bool ex_pop_namespace(example_ptr ec) { + if (ec->indices.size() == 0) return false; + unsigned char ns = ec->indices.pop(); + ex_erase_namespace(ec, ns); return true; } @@ -259,6 +264,56 @@ void my_setup_example(vw_ptr vw, example_ptr ec) { VW::setup_example(*vw, ec.get()); } +void unsetup_example(vw_ptr vwP, example_ptr ae) { + vw&all = *vwP; + ae->partial_prediction = 0.; + ae->num_features = 0; + ae->total_sum_feat_sq = 0; + ae->loss = 0.; + + if (all.ignore_some) { + cerr << "error: 
cannot unsetup example when some namespaces are ignored!" << endl; + throw exception(); + } + + if(all.ngram_strings.size() > 0) { + cerr << "error: cannot unsetup example when ngrams are in use!" << endl; + throw exception(); + } + + if (all.add_constant) { + ae->atomics[constant_namespace].erase(); + ae->audit_features[constant_namespace].erase(); + int hit_constant = -1; + size_t N = ae->indices.size(); + for (size_t i=0; i<N; i++) { + size_t j = N-1-i; + if (ae->indices[j] == constant_namespace) { + if (hit_constant >= 0) { cerr << "error: hit constant namespace twice!" << endl; throw exception(); } + hit_constant = j; + break; + } + } + if (hit_constant >= 0) { + for (size_t i=hit_constant; i<N-1; i++) + ae->indices[i] = ae->indices[i+1]; + ae->indices.pop(); + } + } + + uint32_t multiplier = all.wpp << all.reg.stride_shift; + if(multiplier != 1) { //make room for per-feature information. + for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++) + for(feature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++) + j->weight_index /= multiplier; + if (all.audit || all.hash_inv) + for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++) + for(audit_data* j = ae->audit_features[*i].begin; j != ae->audit_features[*i].end; j++) + j->weight_index /= multiplier; + } +} + + void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) { // SPEEDUP: if it's already set properly, don't modify label_parser& old_lp = vw->p->lp; @@ -492,6 +547,7 @@ BOOST_PYTHON_MODULE(pylibvw) { .def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature") .def("finish_example", &my_finish_example, "tell VW that you're done with a given example") .def("setup_example", &my_setup_example, "given an example that you've created by hand, prepare it for learning (eg, compute quadratic feature)") + .def("unsetup_example", &unsetup_example, "reverse the process of setup, so that you can go back and modify this example") .def("num_weights", &VW::num_weights, "how many weights are we learning?") .def("get_weight", &VW::get_weight, "get the weight for a particular index") @@ -544,6 +600,7 @@ BOOST_PYTHON_MODULE(pylibvw) { .def("push_namespace", &ex_push_namespace, "Add a new namespace") .def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist") .def("pop_namespace", &ex_pop_namespace, "Remove the top namespace off; returns True iff the list was non-empty") + .def("erase_namespace", &ex_erase_namespace, "Remove all the features from a given namespace") .def("set_label_string", &ex_set_label_string, "(Re)assign the label of this example to this string")
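unsetup_example works because setup_example's transforms are invertible as long as no information was destroyed: the constant namespace can be popped back off, and the per-feature stride can be divided back out. A minimal sketch of the index round-trip, assuming the same multiplier the code above uses (wpp << stride_shift; hypothetical helpers, not VW's API):

    uint32_t setup_index(uint32_t raw, uint32_t wpp, uint32_t stride_shift) {
      return raw * (wpp << stride_shift);    // what setup does to weight_index
    }
    uint32_t unsetup_index(uint32_t cooked, uint32_t wpp, uint32_t stride_shift) {
      return cooked / (wpp << stride_shift); // exact inverse, as in unsetup_example
    }
    // This is also why ngrams and ignored namespaces make unsetup refuse to
    // run: those transforms drop features, so there is nothing to invert.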
diff --git a/python/pyvw.py b/python/pyvw.py index 9205e04993a..1be51101f5a 100644 --- a/python/pyvw.py +++ b/python/pyvw.py @@ -265,7 +265,7 @@ def push_feature(self, feature, v=1.): def pop_feature(self): """Remove the top feature from the current namespace; returns True if a feature was removed, returns False if there were no - features to pop. Fails if setup has run.""" + features to pop.""" return self.ex.pop_feature(self.ns) def push_features(self, ns, featureList): @@ -406,10 +406,8 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault): self.stride = vw.get_stride() self.finished = False self.setup_done = False - #for ns_char,feats in initStringOrDict.iteritems(): - # self.push_features(ns_char, feats) self.push_feature_dict(vw, initStringOrDict) - self.setup_example() + #self.setup_example() else: raise TypeError('expecting string or dict as argument for example construction') @@ -469,6 +467,13 @@ def setup_example(self): self.vw.setup_example(self) self.setup_done = True + def unsetup_example(self): + """If this example has been setup, reverse that process so you can continue editing the examples.""" + if not self.setup_done: + raise Exception('trying to unsetup_example that has not yet been setup') + self.vw.unsetup_example(self) + self.setup_done = False + def learn(self): """Learn on this example (and before learning, automatically call setup_example if the example hasn't yet been setup).""" @@ -502,42 +507,40 @@ def get_feature_id(self, ns, feature, ns_hash=None): def push_hashed_feature(self, ns, f, v=1.): - """Add a hashed feature to a given namespace (fails if setup - has already run on this example). Fails if setup has run.""" - if self.setup_done: raise Exception("error: modification to example after setup") + """Add a hashed feature to a given namespace.""" + if self.setup_done: self.unsetup_example(); pylibvw.example.push_hashed_feature(self, self.get_ns(ns).ord_ns, f, v) def push_feature(self, ns, feature, v=1., ns_hash=None): - """Add an unhashed feature to a given namespace (fails if - setup has already run on this example).""" + """Add an unhashed feature to a given namespace.""" f = self.get_feature_id(ns, feature, ns_hash) self.push_hashed_feature(ns, f, v) def pop_feature(self, ns): """Remove the top feature from a given namespace; returns True if a feature was removed, returns False if there were no - features to pop. Fails if setup has run.""" - if self.setup_done: raise Exception("error: modification to example after setup") + features to pop.""" + if self.setup_done: self.unsetup_example(); return pylibvw.example.pop_feature(self, self.get_ns(ns).ord_ns) def push_namespace(self, ns): """Push a new namespace onto this example. You should only do this if you're sure that this example doesn't already have the - given namespace. Fails if setup has run.""" - if self.setup_done: raise Exception("error: modification to example after setup") + given namespace.""" + if self.setup_done: self.unsetup_example(); pylibvw.example.push_namespace(self, self.get_ns(ns).ord_ns) def pop_namespace(self): """Remove the top namespace from an example; returns True if a namespace was removed, or False if there were no namespaces - left. Fails if setup has run.""" - if self.setup_done: raise Exception("error: modification to example after setup") + left.""" + if self.setup_done: self.unsetup_example(); return pylibvw.example.pop_namespace(self) def ensure_namespace_exists(self, ns): """Check to see if a namespace already exists. If it does, do nothing. 
If it doesn't, add it.""" + if self.setup_done: self.unsetup_example(); return pylibvw.example.ensure_namespace_exists(self, self.get_ns(ns).ord_ns) def push_features(self, ns, featureList): @@ -553,8 +556,7 @@ def push_features(self, ns, featureList): space_hash = vw.hash_space( 'x' ) feat_hash = vw.hash_feature( 'a', space_hash ) ex.push_features('x', [feat_hash]) # note: 'x' should match the space_hash! - - Fails if setup has run.""" + """ ns = self.get_ns(ns) self.ensure_namespace_exists(ns) self.push_feature_list(self.vw, ns.ord_ns, featureList) # much faster just to do it in C++ From e31868d2a39d82960fca3a650d3b1034f693e921 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Fri, 20 Feb 2015 14:47:58 -0500 Subject: [PATCH 08/13] added test of example modification in python --- python/test_partial_example.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 python/test_partial_example.py diff --git a/python/test_partial_example.py b/python/test_partial_example.py new file mode 100644 index 00000000000..f267cd84f01 --- /dev/null +++ b/python/test_partial_example.py @@ -0,0 +1,15 @@ +import pyvw + +vw = pyvw.vw('--audit') +full = vw.example( { 'a': ['b'], 'x': ['y'] } ) +full.learn() + +part = vw.example( {'a': ['b'] } ) +part.learn() + +part.push_features('x', ['y']) +part.learn() + +part.erase_namespace(ord('x')) +part.push_features('x', ['z']) +part.learn() From 7ce724917ee498797511965f2c078b8eb4213da1 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Sat, 28 Feb 2015 10:45:18 -0500 Subject: [PATCH 09/13] add xv to search, separate learners to search_graph --- vowpalwabbit/search.cc | 75 +++++++++++++++++++++--------------- vowpalwabbit/search_graph.cc | 13 +++++-- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc index a34ad426801..14035991b57 100644 --- a/vowpalwabbit/search.cc +++ b/vowpalwabbit/search.cc @@ -134,7 +134,7 @@ namespace Search { RollMethod rollout_method; // 0=policy, 1=oracle, 2=mix_per_state, 3=mix_per_roll RollMethod rollin_method; float subsample_timesteps; // train at every time step or just a (random) subset? - bool cross_validate; // train two separate policies -- TODO how should we deal with this at test time? really we want three but that's hard to implement ;) + bool xv; // train three separate policies -- two for providing examples to the other and a third training on the union (which will be used at test time -- TODO) bool allow_current_policy; // should the current policy be used for training? true for dagger bool adaptive_beta; // used to implement dagger-like algorithms. 
if true, beta = 1-(1-alpha)^n after n updates, and policy is mixed with oracle as \pi' = (1-beta)\pi^* + beta \pi @@ -235,12 +235,12 @@ namespace Search { // for two-fold cross validation, we double the number of learners // and send examples to one or the other depending on the xor of // (is_training) and (example_id % 2) - int select_learner(search_private& priv, int policy, size_t learner_id, bool is_training) { + int select_learner(search_private& priv, int policy, size_t learner_id, bool is_gte, bool global_xv_train) { if (policy<0) return policy; // optimal policy else { int p = (int) (policy*priv.num_learners+learner_id); - if (priv.cross_validate) - p = 2*p + ( is_training ^ (priv.all->sd->example_number % 2) ); + if (priv.xv && !global_xv_train) + p = 2*p + 1 + ( is_gte ^ (priv.all->sd->example_number % 2) ); return p; } } @@ -950,8 +950,6 @@ namespace Search { else if (losses[i] == min_loss) num_min++; if (losses[i] > max_loss) { max_loss = losses[i]; } } - - int learner = select_learner(priv, priv.current_policy, priv.learn_learner_id, true); if (!priv.is_ldf) { // not LDF // since we're not LDF, it should be the case that ec_ref_cnt == 1 @@ -971,39 +969,54 @@ namespace Search { example& ec = priv.learn_ec_ref[0]; polylabel old_label = ec.l; ec.l = labels; - ec.in_use = true; if (add_conditioning) add_example_conditioning(priv, ec, priv.learn_condition_on.begin, priv.learn_condition_on.size(), priv.learn_condition_on_names.begin, priv.learn_condition_on_act.begin); - priv.base_learner->learn(ec, learner); + for (size_t is_global_train=0; is_global_train<=priv.xv; is_global_train++) { + int learner = select_learner(priv, priv.current_policy, priv.learn_learner_id, true, is_global_train); + ec.in_use = true; + priv.base_learner->learn(ec, learner); + } if (add_conditioning) del_example_conditioning(priv, ec); ec.l = old_label; priv.total_examples_generated++; } else { // is LDF assert(losses.size() == priv.learn_ec_ref_cnt); size_t start_K = (priv.is_ldf && LabelDict::ec_is_example_header(priv.learn_ec_ref[0])) ? 1 : 0; - for (action a= (uint32_t)start_K; alearn(*priv.empty_example, learner); - cdbg << "generate_training_example called learn on empty_example" << endl; - for (action a= (uint32_t)start_K; a= 0) || gte_here) { priv.learn_learner_id = learner_id; - int learner = select_learner(priv, policy, learner_id, false); + int learner = select_learner(priv, policy, learner_id, false, priv.state == INIT_TEST); ensure_size(priv.condition_on_actions, condition_on_cnt); for (size_t i=0; i >(); - priv.cross_validate = false; + priv.xv = false; priv.A = 1; priv.num_learners = 1; priv.cb_learner = false; @@ -1914,7 +1927,7 @@ namespace Search { ("search_no_caching", "turn off the built-in caching ability (makes things slower, but technically more safe)") ("search_beam", po::value(), "use beam search (arg = beam size, default 0 = no beam)") ("search_kbest", po::value(), "size of k-best list to produce (must be <= beam size)") - ("search_crossvalidate", "train two separate policies, alternating prediction/learning") + ("search_xv", "train two separate policies, alternating prediction/learning") ; add_options(all); po::variables_map& vm = all.vm; @@ -1946,7 +1959,7 @@ namespace Search { "warning: specified --search_interpolation different than the one loaded from regressor. 
using loaded value of: ", ""); if (vm.count("search_passes_per_policy")) priv.passes_per_policy = vm["search_passes_per_policy"].as<size_t>(); - if (vm.count("search_crossvalidate")) priv.cross_validate = true; + if (vm.count("search_xv")) priv.xv = true; if (vm.count("search_alpha")) priv.alpha = vm["search_alpha" ].as<float>(); if (vm.count("search_beta")) priv.beta = vm["search_beta" ].as<float>(); diff --git a/vowpalwabbit/search_graph.cc b/vowpalwabbit/search_graph.cc index cd22cc72323..4b536ef24f7 100644 --- a/vowpalwabbit/search_graph.cc +++ b/vowpalwabbit/search_graph.cc @@ -27,8 +27,7 @@ label:weight |n features ... they are *implicitly* labeled starting at 1. (note the namespace -needn't be called n.) if weight is -omitted it is assumed to be 1.0. +needn't be called n.) if weight is omitted it is assumed to be 1.0. edge lines look like: @@ -52,6 +51,7 @@ namespace GraphTask { size_t num_loops; size_t K; // number of labels, *NOT* including the +1 for 'unlabeled' bool use_structure; + bool separate_learners; // for adding new features size_t mask; // all->reg.weight_mask @@ -75,15 +75,21 @@ namespace GraphTask { po::options_description sspan_opts("search graphtask options"); sspan_opts.add_options()("search_graph_num_loops", po::value<size_t>(), "how many loops to run [def: 2]"); sspan_opts.add_options()("search_graph_no_structure", "turn off edge features"); + sspan_opts.add_options()("search_graph_separate_learners", "use a different learner for each pass"); sch.add_program_options(vm, sspan_opts); D->num_loops = 2; D->use_structure = true; if (vm.count("search_graph_num_loops")) D->num_loops = vm["search_graph_num_loops"].as<size_t>(); if (vm.count("search_graph_no_structure")) D->use_structure = false; + if (vm.count("search_graph_separate_learners")) D->separate_learners = true; + if (D->num_loops <= 1) { D->num_loops = 1; D->separate_learners = false; } + D->K = num_actions; D->neighbor_predictions = calloc_or_die<float>(D->K+1); + + if (D->separate_learners) sch.set_num_learners(D->num_loops); sch.set_task_data(D); sch.set_options( Search::AUTO_HAMMING_LOSS ); @@ -186,7 +192,6 @@ namespace GraphTask { for (size_t k=0; k<=D.K; k++) { if (D.neighbor_predictions[k] == 0.) continue; feature f = { fv * D.neighbor_predictions[k], (uint32_t) ((( ((fx & D.mask) >> D.ss) + 348919043 * k ) << D.ss) & D.mask) }; - //cerr << "e: " << fx << " (:= " << ((fx & D.mask) >> D.ss) << ") / " << k << " -> " << f.weight_index << ", w=" << D.weight_vector[f.weight_index] << endl; node->atomics[neighbor_namespace].push_back(f); node->sum_feat_sq[neighbor_namespace] += f.x * f.x; } @@ -221,7 +226,6 @@ namespace GraphTask { if (pred_total == 0.) continue; - cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing 0.0" << endl; - return 0.; + float ret = 0.; + if (ret > sd->max_label) ret = (float)sd->max_label; + if (ret < sd->min_label) ret = (float)sd->min_label; + cerr << "NAN prediction in example " << sd->example_number + 1 << ", forcing " << ret << endl; + return ret; } if ( ret > sd->max_label ) return (float)sd->max_label; diff --git a/vowpalwabbit/loss_functions.cc b/vowpalwabbit/loss_functions.cc index ac924590b61..37c10321f99 100644 --- a/vowpalwabbit/loss_functions.cc +++ b/vowpalwabbit/loss_functions.cc @@ -6,7 +6,6 @@ license as described in the file LICENSE. #include #include #include -#include #include using namespace std;
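The NAN-handling change above stops forcing a bad prediction to a flat 0.0, which could itself lie outside the model's observed label range; the forced value is now clamped into [min_label, max_label] first. A minimal standalone sketch of the resulting behavior, assuming min_label/max_label track the same quantities as sd->min_label and sd->max_label in the hunk:

    #include <cmath>
    float finalize(float ret, double min_label, double max_label) {
      if (std::isnan(ret)) {
        ret = 0.f;                                    // forced fallback...
        if (ret > max_label) ret = (float)max_label;  // ...clamped into range
        if (ret < min_label) ret = (float)min_label;
      }
      if (ret > max_label) return (float)max_label;
      if (ret < min_label) return (float)min_label;
      return ret;
    }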
From 1272f8babc9deecdcbe773aef7ea6d1a0305c9f5 Mon Sep 17 00:00:00 2001 From: Hal Daume III Date: Mon, 2 Mar 2015 14:27:24 -0500 Subject: [PATCH 13/13] fixed clang error --- vowpalwabbit/parse_example.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc index 2f1e833933c..45bfe13da91 100644 --- a/vowpalwabbit/parse_example.cc +++ b/vowpalwabbit/parse_example.cc @@ -43,10 +43,10 @@ class TC_parser { example* ae; uint32_t* affix_features; bool* spelling_features; - v_array<char> spelling = v_init<char>(); + v_array<char> spelling; vector<feature_dict*>* namespace_dictionaries; - + ~TC_parser(){ } inline float featureValue(){ @@ -303,6 +303,7 @@ class TC_parser { } TC_parser(char* reading_head, char* endLine, vw& all, example* ae){ + spelling = v_init<char>(); if (endLine != reading_head) { this->beginLine = reading_head;