vowpalwabbit/cb_explore_adf.cc

#include "reductions.h"
#include "cb_adf.h"
#include "rand48.h"
#include "bs.h"
#include "gen_cs_example.h"
#include "cb_explore.h"
#include "explore.h"
#include <vector>


using namespace LEARNER;
using namespace ACTION_SCORE;
using namespace std;
using namespace CB_ALGS;
using namespace exploration;
//All exploration algorithms return a vector of (action id, probability) tuples, sorted in order of scores. Each probability is the probability with which the corresponding action should be moved to the top of the list.

// Exploration strategy identifiers. Exactly one of them is stored in
// cb_explore_adf::explore_type by cb_explore_adf_setup and dispatched on in
// do_actual_learning.

// tau-first (--first): uniform distribution while tau is non-zero, then all
// probability mass on the top-ranked action.
#define EXPLORE_FIRST 0
// epsilon-greedy (--epsilon); this is the strategy used when no other one is requested.
#define EPS_GREEDY 1
// bagging explorer (--bag): distribution built from the top choices of several policies.
#define BAG_EXPLORE 2
// softmax (--softmax), parameterised by --lambda.
#define SOFTMAX 3
// online cover (--cover), parameterised by --psi and --nounif.
#define COVER 4

namespace CB_EXPLORE_ADF
{

// State of the cb_explore_adf reduction: the exploration settings parsed in
// cb_explore_adf_setup plus scratch buffers that are reused across examples.
// NOTE(review): the instance is created with scoped_calloc_or_throw, so members that
// setup never assigns are presumably zero-initialised -- confirm against that helper.
struct cb_explore_adf
{
  // Scratch (action, probability) buffer filled by the bag and cover explorers
  // before the result is copied into the prediction of the first example.
  v_array<action_score> action_probs;

  // Bag only: per-action count of how many bagged policies ranked that action first.
  // Allocated in setup when --bag is given; deleted in finish().
  vector<uint32_t>* top_actions;

  // One of EXPLORE_FIRST / EPS_GREEDY / BAG_EXPLORE / SOFTMAX / COVER.
  size_t explore_type;

  // --first: number of remaining predict_or_learn_first calls that return a
  // uniform distribution; decremented on each such call.
  size_t tau;
  // --epsilon: passed to generate_epsilon_greedy, and used as the minimum-probability
  // argument by the first, bag and softmax explorers. Set to 0.05 in setup only when
  // epsilon-greedy is selected and --epsilon was not given.
  float epsilon;
  // --bag: number of bagged policies.
  size_t bag_size;
  // --cover: number of policies used by the cover explorer.
  size_t cover_size;
  // --psi: disagreement parameter for cover (option default 1.0).
  float psi;
  // --nounif: do not explore uniformly on zero-probability actions in cover.
  bool nounif;
  // --lambda: parameter for softmax (option default 1.0).
  float lambda;
  // Passed as the offset argument to every base-learner call; never assigned in this file.
  uint64_t offset;
  // --greedify: always update the first bagged policy exactly once per example.
  bool greedify;

  // Number of predict_or_learn_cover calls so far; feeds the cover minimum probability.
  size_t counter;

  // Not referenced anywhere in this file.
  bool need_to_clear;
  // Global VW state, assigned in setup; used here by BS::weight_gen.
  vw* all;
  // The cost-sensitive learner taken from all->cost_sensitive; cover trains and
  // queries its additional policies through it.
  LEARNER::multi_learner* cs_ldf_learner;

  // Holds the observed cost (known_cost), the scorer, the cb_type and pred_scores.
  GEN_CS::cb_to_cs_adf gen_cs;
  // Cost-sensitive labels generated from the contextual-bandit example (cover).
  COST_SENSITIVE::label cs_labels;
  // Buffer handed to GEN_CS::call_cs_ldf.
  v_array<CB::label> cb_labels;

  // Stash for the real label while a labelled example is only being predicted on.
  CB::label action_label;
  // Label substituted for the real one during that prediction; never assigned in this file.
  CB::label empty_label;

  // Pseudo-cost labels built for each additional cover policy.
  COST_SENSITIVE::label cs_labels_2;

  // Buffer handed to GEN_CS::call_cs_ldf; its elements' cost arrays are freed in finish().
  v_array<COST_SENSITIVE::label> prepped_cs_labels;
};

// Exchanges the values of ele1 and ele2 using one copy and two assignments.
template<class T> void swap(T& ele1, T& ele2)
{
  T saved_second = ele2;  // keep the second value while it is overwritten
  ele2 = ele1;
  ele1 = saved_second;
}

// Validates a multiline example and returns the line that carries an observed cost.
//
// Returns nullptr when no line has a known cost (a cost different from FLT_MAX).
// Throws when a line has more than one cost, when an example header appears at a
// position other than 0, or when more than one line carries a known cost.
example* test_adf_sequence(multi_ex& ec_seq)
{
  example* labeled_line = nullptr;
  uint32_t labeled_lines = 0;

  for (size_t pos = 0; pos < ec_seq.size(); ++pos)
  {
    example& line = *ec_seq[pos];
    auto& costs = line.l.cb.costs;

    if (costs.size() > 1)
      THROW("cb_adf: badly formatted example, only one cost can be known.");

    const bool has_known_cost = costs.size() == 1 && costs[0].cost != FLT_MAX;
    if (has_known_cost)
    {
      labeled_line = &line;
      ++labeled_lines;
    }

    // A shared header is only legal as the very first line.
    if (CB::ec_is_example_header(line) && pos != 0)
      THROW("warning: example headers at position " << pos << ": can only have in initial position!");
  }

  if (labeled_lines > 1)
    THROW("cb_adf: badly formatted example, only one line can have a cost");

  return labeled_line;
}

// tau-first exploration.
//
// Updates the base learner only when is_learn is set, the observed cost was logged
// with probability below 1, and the sequence carries a label; otherwise the base
// learner is only asked to predict. While data.tau is non-zero the returned
// distribution is uniform (and tau is decremented); afterwards all mass goes to the
// top-ranked action. data.epsilon is then enforced as the minimum probability.
//
// The result is written in place into examples[0]->pred.a_s.
template <bool is_learn>
void predict_or_learn_first(cb_explore_adf& data, multi_learner& base, multi_ex& examples)
{
  //Explore tau times, then act according to optimal.
  if (is_learn && data.gen_cs.known_cost.probability < 1 && test_adf_sequence(examples) != nullptr)
    multiline_learn_or_predict<true>(base, examples, data.offset);
  else
    // Predict only: this branch must not update the base learner.
    multiline_learn_or_predict<false>(base, examples, data.offset);

  v_array<action_score>& preds = examples[0]->pred.a_s;
  uint32_t num_actions = (uint32_t)preds.size();

  if (data.tau)
  {
    float prob = 1.f / (float)num_actions;
    for (size_t i = 0; i < num_actions; i++)
      preds[i].score = prob;
    data.tau--;
  }
  else if (num_actions > 0)  // nothing to write when there are no predictions
  {
    for (size_t i = 1; i < num_actions; i++)
      preds[i].score = 0.;
    preds[0].score = 1.0;
  }

  enforce_minimum_probability(data.epsilon, true, begin_scores(preds), end_scores(preds));
}

// Epsilon-greedy exploration: lets the base learner learn (labelled sequence in learn
// mode) or predict, then rewrites examples[0]->pred.a_s in place into an
// epsilon-greedy distribution with top action index 0.
template <bool is_learn>
void predict_or_learn_greedy(cb_explore_adf& data, multi_learner& base, multi_ex& examples)
{
  // The sequence is only validated when we are in learn mode.
  const bool update_base = is_learn && test_adf_sequence(examples) != nullptr;
  if (update_base)
    multiline_learn_or_predict<true>(base, examples, data.offset);
  else
    multiline_learn_or_predict<false>(base, examples, data.offset);

  // Explore uniformly at random an epsilon fraction of the time.
  action_scores& ranked = examples[0]->pred.a_s;
  generate_epsilon_greedy(data.epsilon, 0, begin_scores(ranked), end_scores(ranked));
}

// Bagging exploration: queries data.bag_size policies of the base learner, counts how
// often each action is ranked first, and turns those counts into a distribution with
// data.epsilon enforced as the minimum probability. The sorted result is written
// into examples[0]->pred.a_s.
template <bool is_learn>
void predict_or_learn_bag(cb_explore_adf& data, multi_learner& base, multi_ex& examples)
{
  v_array<action_score>& preds = examples[0]->pred.a_s;

  // One action per line, not counting a shared header line.
  const bool has_header = CB::ec_is_example_header(*examples[0]);
  const uint32_t num_actions = (uint32_t)examples.size() - (has_header ? 1u : 0u);
  if (num_actions == 0)
  {
    preds.clear();
    return;
  }

  // Start every action at probability zero.
  v_array<action_score>& bag_probs = data.action_probs;
  bag_probs.resize(num_actions);
  bag_probs.clear();
  for (uint32_t a = 0; a < num_actions; ++a)
    bag_probs.push_back({ a, 0. });

  // Reset the "ranked first" counters.
  vector<uint32_t>& wins = *data.top_actions;
  wins.assign(num_actions, 0);

  const bool unlabeled = test_adf_sequence(examples) == nullptr;
  for (uint32_t policy = 0; policy < data.bag_size; ++policy)
  {
    // Number of updates this policy receives. The random generator is only touched
    // in learn mode, and with greedify the first policy is always updated exactly once.
    uint32_t updates = 0;
    if (is_learn)
    {
      if (data.greedify && policy == 0)
        updates = 1;
      else
        updates = BS::weight_gen(*data.all);
    }

    const bool update_now = is_learn && updates > 0 && !unlabeled;
    if (update_now)
      multiline_learn_or_predict<true>(base, examples, data.offset, policy);
    else
      multiline_learn_or_predict<false>(base, examples, data.offset, policy);

    assert(preds.size() == num_actions);
    ++wins[preds[0].action];

    // Apply the remaining updates (the first one, if any, happened above).
    if (is_learn && !unlabeled)
    {
      for (uint32_t extra = 1; extra < updates; ++extra)
        multiline_learn_or_predict<true>(base, examples, data.offset, policy);
    }
  }

  // Turn the counts into a distribution over actions.
  generate_bag(begin(wins), end(wins), begin_scores(bag_probs), end_scores(bag_probs));
  enforce_minimum_probability(data.epsilon, true, begin_scores(bag_probs), end_scores(bag_probs));

  qsort((void*) bag_probs.begin(), bag_probs.size(), sizeof(action_score), reverse_order);

  for (size_t slot = 0; slot < num_actions; ++slot)
    preds[slot] = bag_probs[slot];
}

// Online-cover exploration.
//
// The first policy is the base learner itself; the remaining data.cover_size - 1
// policies are trained/queried through data.cs_ldf_learner. Every policy adds
// 1 / cover_size to the probability of its top-ranked action. In learn mode each
// additional policy is trained on pseudo-costs that reward actions the cover built
// so far puts little mass on (scaled by data.psi). Finally a minimum probability that
// depends on data.counter is enforced, the result is sorted, and it is written
// into examples[0]->pred.a_s. data.counter is incremented on every call.
template <bool is_learn>
void predict_or_learn_cover(cb_explore_adf& data, multi_learner& base, multi_ex& examples)
{
  //Randomize over predictions from a base set of predictors
  //Use cost sensitive oracle to cover actions to form distribution.
  if (is_learn)
  {
    GEN_CS::gen_cs_example<false>(data.gen_cs, examples, data.cs_labels);
    multiline_learn_or_predict<true>(base, examples, data.offset);
  }
  else
  {
    GEN_CS::gen_cs_example_ips(examples, data.cs_labels);
    multiline_learn_or_predict<false>(base, examples, data.offset);
  }

  v_array<action_score>& preds = examples[0]->pred.a_s;
  uint32_t num_actions = (uint32_t)preds.size();

  float additive_probability = 1.f / (float)data.cover_size;
  float min_prob = min(1.f / num_actions, 1.f / (float)sqrt(data.counter * num_actions));
  v_array<action_score>& probs = data.action_probs;
  probs.clear();
  for(uint32_t i = 0; i < num_actions; i++)
    probs.push_back({i,0.});

  // Contribution of the first policy (the base learner's top action).
  probs[preds[0].action].score += additive_probability;

  // Cost entries are shifted by one when the sequence starts with a shared header.
  uint32_t shared = CB::ec_is_example_header(*examples[0]) ? 1 : 0;

  // Running normaliser of the distribution after flooring each action at min_prob.
  float norm = min_prob * num_actions + (additive_probability - min_prob);
  for (size_t i = 1; i < data.cover_size; i++)
  {
    //Create costs of each action based on online cover
    if (is_learn)
    {
      data.cs_labels_2.costs.clear();
      if (shared > 0)
        data.cs_labels_2.costs.push_back(data.cs_labels.costs[0]);
      for (uint32_t j = 0; j < num_actions; j++)
      {
        float pseudo_cost = data.cs_labels.costs[j+shared].x - data.psi * min_prob / (max(probs[j].score, min_prob) / norm);
        data.cs_labels_2.costs.push_back({pseudo_cost,j,0.,0.});
      }
      GEN_CS::call_cs_ldf<true>(*(data.cs_ldf_learner), examples, data.cb_labels, data.cs_labels_2, data.prepped_cs_labels, data.offset, i+1);
    }
    else
      GEN_CS::call_cs_ldf<false>(*(data.cs_ldf_learner), examples, data.cb_labels, data.cs_labels, data.prepped_cs_labels, data.offset, i+1);

    uint32_t action = preds[0].action;
    if (probs[action].score < min_prob)
      // Only the part of the increment that lifts the action above the floor grows
      // the normaliser. The lower bound must be the float literal 0.f: an int 0
      // prevents std::max from being chosen with float arguments, so the
      // fractional value would not be clamped in floating point.
      norm += std::max(0.f, additive_probability - (min_prob - probs[action].score));
    else
      norm += additive_probability;
    probs[action].score += additive_probability;
  }

  enforce_minimum_probability(min_prob * num_actions, !data.nounif, begin_scores(probs), end_scores(probs));

  qsort((void*) probs.begin(), probs.size(), sizeof(action_score), reverse_order);
  for (size_t i = 0; i < num_actions; i++)
    preds[i] = probs[i];

  ++data.counter;
}

// Softmax exploration: lets the base learner learn (labelled sequence in learn mode)
// or predict, converts the resulting scores in place into a softmax distribution
// controlled by data.lambda, and enforces data.epsilon as the minimum probability.
template <bool is_learn>
void predict_or_learn_softmax(cb_explore_adf& data, multi_learner& base, multi_ex& examples)
{
  // The sequence is only validated when we are in learn mode.
  const bool update_base = is_learn && test_adf_sequence(examples) != nullptr;
  if (update_base)
    multiline_learn_or_predict<true>(base, examples, data.offset);
  else
    multiline_learn_or_predict<false>(base, examples, data.offset);

  v_array<action_score>& scores = examples[0]->pred.a_s;

  // Input and output ranges are the same array: scores are overwritten by probabilities.
  generate_softmax(data.lambda,
                   begin_scores(scores), end_scores(scores),
                   begin_scores(scores), end_scores(scores));

  enforce_minimum_probability(data.epsilon, true, begin_scores(scores), end_scores(scores));
}

// Releases every buffer owned by the reduction state.
void finish(cb_explore_adf& data)
{
  // Free the cost array of each prepared label before the array holding the labels.
  v_array<COST_SENSITIVE::label>& prepped = data.prepped_cs_labels;
  const size_t prepped_count = prepped.size();
  size_t idx = 0;
  while (idx < prepped_count)
  {
    prepped[idx].costs.delete_v();
    ++idx;
  }
  prepped.delete_v();

  data.gen_cs.pred_scores.costs.delete_v();
  data.cb_labels.delete_v();
  data.cs_labels_2.costs.delete_v();
  data.cs_labels.costs.delete_v();
  data.action_probs.delete_v();

  // Only allocated for the bag explorer.
  delete data.top_actions;
}


//Semantics: Currently we compute the IPS loss no matter what flags
//are specified. We print the first action and probability, based on
//ordering by scores in the final output.

// Reports one multiline example: accumulates the loss estimate over the predicted
// distribution (when a cost was observed), updates the shared statistics, writes the
// action/probability list to every prediction sink, optionally writes the raw
// per-cost partial predictions, and prints the progress line.
void output_example(vw& all, cb_explore_adf& c, multi_ex& ec_seq)
{
  if (ec_seq.size() <= 0) return;

  auto& head = *ec_seq[0];
  ACTION_SCORE::action_scores scores = head.pred.a_s;

  // Feature count over all lines except a shared header.
  size_t feature_total = 0;
  for (size_t pos = 0; pos < ec_seq.size(); ++pos)
  {
    if (CB::ec_is_example_header(*ec_seq[pos]))
      continue;
    feature_total += ec_seq[pos]->num_features;
  }

  // Without an observed cost the example counts as a test example and the loss stays 0.
  float total_loss = 0.;
  const bool is_test = !(c.gen_cs.known_cost.probability > 0);
  if (!is_test)
  {
    for (uint32_t k = 0; k < scores.size(); ++k)
    {
      float action_cost = get_unbiased_cost(&c.gen_cs.known_cost, scores[k].action);
      total_loss += action_cost*scores[k].score;
    }
  }
  all.sd->update(head.test_only, c.gen_cs.known_cost.probability > 0, total_loss, head.weight, feature_total);

  for (int sink : all.final_prediction_sink)
    print_action_score(sink, head.pred.a_s, head.tag);

  if (all.raw_prediction > 0)
  {
    stringstream raw_text;
    v_array<CB::cb_class> observed = head.l.cb.costs;

    // Space-separated "action:partial_prediction" pairs.
    for (size_t k = 0; k < observed.size(); ++k)
    {
      if (k != 0)
        raw_text << ' ';
      raw_text << observed[k].action << ':' << observed[k].partial_prediction;
    }
    all.print_text(all.raw_prediction, raw_text.str(), head.tag);
  }

  CB::print_update(all, is_test, head, &ec_seq, true);
}

// Outputs a non-empty multiline example and, when raw predictions are enabled,
// follows it with an empty raw-prediction line tagged with the first example's tag.
void output_example_seq(vw& all, cb_explore_adf& data, multi_ex& ec_seq)
{
  if (ec_seq.size() == 0)
    return;

  output_example(all, data, ec_seq);

  const bool raw_enabled = all.raw_prediction > 0;
  if (raw_enabled)
    all.print_text(all.raw_prediction, "", ec_seq[0]->tag);
}

// finish_example callback: reports a non-empty sequence followed by the global
// newline, then always clears the sequence and finishes its examples.
void finish_multiline_example(vw& all, cb_explore_adf& data, multi_ex& ec_seq)
{
  const bool has_lines = ec_seq.size() != 0;
  if (has_lines)
  {
    output_example_seq(all, data, ec_seq);
    CB_ADF::global_print_newline(all);
  }

  // Runs for empty sequences as well.
  VW::clear_seq_and_finish_examples(all, ec_seq);
}

template <bool is_learn>
void do_actual_learning(cb_explore_adf& data, multi_learner& base, multi_ex& ec_seq)
{
  example* label_example=test_adf_sequence(ec_seq);
  data.gen_cs.known_cost = CB_ADF::get_observed_cost(ec_seq);

  if (label_example == nullptr || !is_learn)
  {
    if (label_example != nullptr)//extract label
    {
      data.action_label = label_example->l.cb;
      label_example->l.cb = data.empty_label;
    }
    switch (data.explore_type)
    {
    case EXPLORE_FIRST:
      predict_or_learn_first<false>(data, base, ec_seq);
      break;
    case EPS_GREEDY:
      predict_or_learn_greedy<false>(data, base, ec_seq);
      break;
    case SOFTMAX:
      predict_or_learn_softmax<false>(data, base, ec_seq);
      break;
    case BAG_EXPLORE:
      predict_or_learn_bag<false>(data, base, ec_seq);
      break;
    case COVER:
      predict_or_learn_cover<false>(data, base, ec_seq);
      break;
    default:
      THROW("Unknown explorer type specified for contextual bandit learning: " << data.explore_type);
    }
    if (label_example != nullptr)	//restore label
      label_example->l.cb = data.action_label;
  }
  else
  {
    /*	v_array<float> temp_probs;
    temp_probs = v_init<float>();
    do_actual_learning<false>(data,base);
    for (size_t i = 0; i < data.ec_seq[0]->pred.a_s.size(); i++)
    temp_probs.push_back(data.ec_seq[0]->pred.a_s[i].score);*/

    switch (data.explore_type)
    {
    case EXPLORE_FIRST:
      predict_or_learn_first<is_learn>(data, base, ec_seq);
      break;
    case EPS_GREEDY:
      predict_or_learn_greedy<is_learn>(data, base, ec_seq);
      break;
    case SOFTMAX:
      predict_or_learn_softmax<is_learn>(data, base, ec_seq);
      break;
    case BAG_EXPLORE:
      predict_or_learn_bag<is_learn>(data, base, ec_seq);
      break;
    case COVER:
      predict_or_learn_cover<is_learn>(data, base, ec_seq);
      break;
    default:
      THROW("Unknown explorer type specified for contextual bandit learning: " << data.explore_type);
    }

    /*	for (size_t i = 0; i < temp_probs.size(); i++)
      if (temp_probs[i] != data.ec_seq[0]->pred.a_s[i].score)
        cout << "problem! " << temp_probs[i] << " != " << data.ec_seq[0]->pred.a_s[i].score << " for " << data.ec_seq[0]->pred.a_s[i].action << endl;
        temp_probs.delete_v();*/
  }
}
}

using namespace CB_EXPLORE_ADF;

// Builds the cb_explore_adf reduction.
//
// Parses the exploration options, makes sure --cb_adf is part of the reduction stack,
// selects the exploration strategy, and registers the learn/predict/finish callbacks.
// Strategy precedence when several options are given: cover, bag, first, softmax,
// and epsilon-greedy otherwise (with epsilon 0.05 unless --epsilon was supplied).
//
// Returns nullptr when the option parser reports the option group as missing;
// otherwise the new learner as a base_learner.
base_learner* cb_explore_adf_setup(arguments& arg)
{
  auto data = scoped_calloc_or_throw<cb_explore_adf>();
  if (arg.new_options("Contextual Bandit Exploration with Action Dependent Features")
      .critical("cb_explore_adf", "Online explore-exploit for a contextual bandit problem with multiline action dependent features")
      .keep("first", data->tau, "tau-first exploration")
      .keep("epsilon", data->epsilon, "epsilon-greedy exploration")
      .keep("bag", data->bag_size, "bagging-based exploration")
      .keep("cover", data->cover_size ,"Online cover based exploration")
      .keep("psi", data->psi, 1.0f, "disagreement parameter for cover")
      .keep(data->nounif, "nounif", "do not explore uniformly on zero-probability actions in cover")
      .keep("softmax", "softmax exploration")
      .keep(data->greedify, "greedify", "always update first policy once in bagging")
      .keep("lambda", data->lambda, 1.0f, "parameter for softmax").missing())
    return nullptr;

  data->all = arg.all;
  // This reduction sits on top of cb_adf; request it if the user did not.
  if (count(arg.args.begin(), arg.args.end(), "--cb_adf") == 0)
    arg.args.push_back("--cb_adf");

  arg.all->delete_prediction = delete_action_scores;

  // Number of weight-space copies requested from the base learner.
  size_t problem_multiplier = 1;

  if (arg.vm.count("cover"))
  {
    data->explore_type = COVER;
    problem_multiplier = data->cover_size+1;
  }
  else if (arg.vm.count("bag"))
  {
    data->explore_type = BAG_EXPLORE;
    problem_multiplier = data->bag_size;
    data->top_actions = new vector<uint32_t>;  // released in finish()
  }
  else if (arg.vm.count("first"))
    data->explore_type = EXPLORE_FIRST;
  else if (arg.vm["softmax"].as<bool>())
    data->explore_type = SOFTMAX;
  else
    {
      if (!arg.vm.count("epsilon")) data->epsilon = 0.05f;
      data->explore_type = EPS_GREEDY;
    }

  multi_learner* base = as_multiline(setup_base(arg));
  arg.all->p->lp = CB::cb_label;
  arg.all->label_type = label_type::cb;

  //Extract from lower level reductions.
  data->gen_cs.scorer = arg.all->scorer;
  data->cs_ldf_learner = as_multiline(arg.all->cost_sensitive);
  data->gen_cs.cb_type = CB_TYPE_IPS;
  if (arg.vm.count("cb_type"))
  {
    const std::string type_string = arg.vm["cb_type"].as<std::string>();

    if (type_string.compare("dr") == 0)
      data->gen_cs.cb_type = CB_TYPE_DR;
    else if (type_string.compare("ips") == 0)
      data->gen_cs.cb_type = CB_TYPE_IPS;
    else if (type_string.compare("mtr") == 0)
    {
      // Braced explicitly so the trailing else below cannot bind to the cover check.
      if (arg.vm.count("cover"))
      {
        arg.trace_message << "warning: cover and mtr are not simultaneously supported yet, defaulting to ips" << endl;
        data->gen_cs.cb_type = CB_TYPE_IPS;
      }
      else
        data->gen_cs.cb_type = CB_TYPE_MTR;
    }
    else
      // 'mtr' is accepted above, so it is listed among the valid values.
      arg.trace_message << "warning: cb_type must be in {'ips','dr','mtr'}; resetting to ips." << std::endl;
  }

  learner<cb_explore_adf,multi_ex>& l = init_learner(data, base,
    CB_EXPLORE_ADF::do_actual_learning<true>,
    CB_EXPLORE_ADF::do_actual_learning<false>,
    problem_multiplier,
    prediction_type::action_probs);

  l.set_finish_example(CB_EXPLORE_ADF::finish_multiline_example);
  l.set_finish(CB_EXPLORE_ADF::finish);
  l.set_test_example(CB::example_is_test);
  return make_base(l);
}