diff --git a/cs/cli/vowpalwabbit.cpp b/cs/cli/vowpalwabbit.cpp index d40cb1ad78b..38e3f96f385 100644 --- a/cs/cli/vowpalwabbit.cpp +++ b/cs/cli/vowpalwabbit.cpp @@ -62,7 +62,7 @@ void VowpalWabbit::Driver() void VowpalWabbit::RunMultiPass() { if (m_vw->numpasses > 1) { try - { adjust_used_index(*m_vw); + { m_vw->do_reset_source = true; VW::start_parser(*m_vw); LEARNER::generic_driver(*m_vw); @@ -307,7 +307,7 @@ List^ VowpalWabbit::ParseDecisionServiceJson(cli::arrayexamples->Add(ex); - v_array examples = v_init(); + v_array examples; example* native_example = ex->m_example; examples.push_back(native_example); @@ -326,9 +326,6 @@ List^ VowpalWabbit::ParseDecisionServiceJson(cli::arrayEventId = gcnew String(interaction.eventId.c_str()); header->Actions = gcnew cli::array((int)interaction.actions.size()); int index = 0; @@ -789,7 +786,7 @@ VowpalWabbitExample^ VowpalWabbit::GetOrCreateNativeExample() if (ex == nullptr) { try { auto ex = VW::alloc_examples(0, 1); - m_vw->p->lp.default_label(&ex->l); + m_vw->p->lp.default_label(ex->l); return gcnew VowpalWabbitExample(this, ex); } CATCHRETHROW @@ -797,7 +794,7 @@ VowpalWabbitExample^ VowpalWabbit::GetOrCreateNativeExample() try { VW::empty_example(*m_vw, *ex->m_example); - m_vw->p->lp.default_label(&ex->m_example->l); + m_vw->p->lp.default_label(ex->m_example->l); return ex; } diff --git a/cs/cli/vw_example.cpp b/cs/cli/vw_example.cpp index 8a73c46f74e..0dc7b24b091 100644 --- a/cs/cli/vw_example.cpp +++ b/cs/cli/vw_example.cpp @@ -97,7 +97,7 @@ void VowpalWabbitExample::Label::set(ILabel^ label) label->UpdateExample(m_owner->Native->m_vw, m_example); // we need to update the example weight as setup_example() can be called prior to this call. - m_example->weight = m_owner->Native->m_vw->p->lp.get_weight(&m_example->l); + m_example->weight = m_owner->Native->m_vw->p->lp.get_weight(m_example->l); } void VowpalWabbitExample::MakeEmpty(VowpalWabbit^ vw) @@ -280,8 +280,8 @@ System::String^ VowpalWabbitExample::Diff(VowpalWabbit^ vw, VowpalWabbitExample^ } String^ VowpalWabbitSimpleLabelComparator::Diff(VowpalWabbitExample^ ex1, VowpalWabbitExample^ ex2) -{ auto s1 = ex1->m_example->l.simple; - auto s2 = ex2->m_example->l.simple; +{ auto& s1 = ex1->m_example->l.simple(); + auto& s2 = ex2->m_example->l.simple(); if (!(FloatEqual(s1.initial, s2.initial) && FloatEqual(s1.label, s2.label) && @@ -296,8 +296,8 @@ String^ VowpalWabbitSimpleLabelComparator::Diff(VowpalWabbitExample^ ex1, Vowpal } String^ VowpalWabbitContextualBanditLabelComparator::Diff(VowpalWabbitExample^ ex1, VowpalWabbitExample^ ex2) -{ auto s1 = ex1->m_example->l.cb; - auto s2 = ex2->m_example->l.cb; +{ auto& s1 = ex1->m_example->l.cb(); + auto& s2 = ex2->m_example->l.cb(); if (s1.costs.size() != s2.costs.size()) { return System::String::Format("Cost size differ: {0} vs {1}", s1.costs.size(), s2.costs.size()); diff --git a/cs/cli/vw_prediction.cpp b/cs/cli/vw_prediction.cpp index bfce5b3a80f..b93e67a5cd3 100644 --- a/cs/cli/vw_prediction.cpp +++ b/cs/cli/vw_prediction.cpp @@ -10,7 +10,8 @@ namespace VW { void CheckExample(vw* vw, example* ex, prediction_type_t type) -{ if (vw == nullptr) +{ + if (vw == nullptr) throw gcnew ArgumentNullException("vw"); if (ex == nullptr) @@ -18,7 +19,8 @@ void CheckExample(vw* vw, example* ex, prediction_type_t type) auto ex_pred_type = vw->l->pred_type; if (ex_pred_type != type) - { auto sb = gcnew StringBuilder(); + { + auto sb = gcnew StringBuilder(); sb->Append("Prediction type must be "); sb->Append(gcnew String(to_string(type))); sb->Append(" but is 
"); @@ -29,20 +31,23 @@ void CheckExample(vw* vw, example* ex, prediction_type_t type) } float VowpalWabbitScalarPredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +{ + CheckExample(vw, ex, PredictionType); try - { return VW::get_prediction(ex); + { + return VW::get_prediction(ex); } CATCHRETHROW } - VowpalWabbitScalar VowpalWabbitScalarConfidencePredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +{ + CheckExample(vw, ex, PredictionType); try - { VowpalWabbitScalar ret; + { + VowpalWabbitScalar ret; ret.Value = VW::get_prediction(ex); ret.Confidence = ex->confidence; @@ -52,15 +57,16 @@ VowpalWabbitScalar VowpalWabbitScalarConfidencePredictionFactory::Create(vw* vw, CATCHRETHROW } -cli::array^ VowpalWabbitScalarsPredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +cli::array ^ VowpalWabbitScalarsPredictionFactory::Create(vw* vw, example* ex) +{ + CheckExample(vw, ex, PredictionType); try - { auto& scalars = ex->pred.scalars; + { + auto& scalars = ex->pred.scalars(); auto values = gcnew cli::array((int)scalars.size()); int index = 0; - for (float s : scalars) - values[index++] = s; + for (float s : scalars) values[index++] = s; return values; } @@ -68,21 +74,24 @@ cli::array^ VowpalWabbitScalarsPredictionFactory::Create(vw* vw, example* } float VowpalWabbitProbabilityPredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +{ + CheckExample(vw, ex, PredictionType); - return ex->pred.prob; + return ex->pred.prob(); } float VowpalWabbitCostSensitivePredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +{ + CheckExample(vw, ex, PredictionType); try - { return VW::get_cost_sensitive_prediction(ex); + { + return VW::get_cost_sensitive_prediction(ex); } CATCHRETHROW } -Dictionary^ VowpalWabbitMulticlassProbabilitiesPredictionFactory::Create(vw* vw, example* ex) +Dictionary ^ VowpalWabbitMulticlassProbabilitiesPredictionFactory::Create(vw* vw, example* ex) { #if _DEBUG if (ex == nullptr) @@ -91,33 +100,38 @@ Dictionary^ VowpalWabbitMulticlassProbabilitiesPredictionFactory::Cr v_array confidence_scores; try - { confidence_scores = VW::get_cost_sensitive_prediction_confidence_scores(ex); + { + confidence_scores = VW::get_cost_sensitive_prediction_confidence_scores(ex); } CATCHRETHROW auto values = gcnew Dictionary(); int i = 0; for (auto& val : confidence_scores) - { values->Add(++i, val); + { + values->Add(++i, val); } return values; } uint32_t VowpalWabbitMulticlassPredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +{ + CheckExample(vw, ex, PredictionType); - return ex->pred.multiclass; + return ex->pred.multiclass(); } -cli::array^ VowpalWabbitMultilabelPredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, prediction_type_t::multilabels); +cli::array ^ VowpalWabbitMultilabelPredictionFactory::Create(vw* vw, example* ex) +{ + CheckExample(vw, ex, prediction_type_t::multilabels); size_t length; uint32_t* labels; try - { labels = VW::get_multilabel_predictions(ex, length); + { + labels = VW::get_multilabel_predictions(ex, length); } CATCHRETHROW @@ -132,15 +146,25 @@ cli::array^ VowpalWabbitMultilabelPredictionFactory::Create(vw* vw, example return values; } -cli::array^ VowpalWabbitActionScoreBasePredictionFactory::Create(vw* vw, example* ex) -{ CheckExample(vw, ex, PredictionType); +cli::array ^ VowpalWabbitActionScoreBasePredictionFactory::Create(vw* vw, example* ex) +{ + 
CheckExample(vw, ex, PredictionType); - auto& a_s = ex->pred.a_s; - auto values = gcnew cli::array((int)a_s.size()); + ACTION_SCORE::action_scores* a_s = nullptr; + if (ex->pred.get_type() == prediction_type_t::action_scores) + { + a_s = &ex->pred.action_scores(); + } + else + { + a_s = &ex->pred.action_probs(); + } + auto values = gcnew cli::array((int)a_s->size()); auto index = 0; - for (auto& as : a_s) - { values[index].Action = as.action; + for (auto& as : *a_s) + { + values[index].Action = as.action; values[index].Score = as.score; index++; } @@ -148,22 +172,25 @@ cli::array^ VowpalWabbitActionScoreBasePredictionFactory::Create(vw return values; } -cli::array^ VowpalWabbitTopicPredictionFactory::Create(vw* vw, example* ex) -{ if (ex == nullptr) +cli::array ^ VowpalWabbitTopicPredictionFactory::Create(vw* vw, example* ex) +{ + if (ex == nullptr) throw gcnew ArgumentNullException("ex"); auto values = gcnew cli::array(vw->lda); - Marshal::Copy(IntPtr(ex->pred.scalars.begin()), values, 0, vw->lda); + Marshal::Copy(IntPtr(ex->pred.scalars().begin()), values, 0, vw->lda); return values; } -System::Object^ VowpalWabbitDynamicPredictionFactory::Create(vw* vw, example* ex) -{ if (ex == nullptr) +System::Object ^ VowpalWabbitDynamicPredictionFactory::Create(vw* vw, example* ex) +{ + if (ex == nullptr) throw gcnew ArgumentNullException("ex"); switch (vw->l->pred_type) - { case prediction_type_t::scalar: + { + case prediction_type_t::scalar: return VowpalWabbitPredictionType::Scalar->Create(vw, ex); case prediction_type_t::scalars: return VowpalWabbitPredictionType::Scalars->Create(vw, ex); @@ -180,11 +207,12 @@ System::Object^ VowpalWabbitDynamicPredictionFactory::Create(vw* vw, example* ex case prediction_type_t::multiclassprobs: return VowpalWabbitPredictionType::MultiClassProbabilities->Create(vw, ex); default: - { auto sb = gcnew StringBuilder(); + { + auto sb = gcnew StringBuilder(); sb->Append("Unsupported prediction type: "); sb->Append(gcnew String(to_string(vw->l->pred_type))); throw gcnew ArgumentException(sb->ToString()); } } } -} +} // namespace VW diff --git a/java/src/main/c++/jni_base_learner.h b/java/src/main/c++/jni_base_learner.h index 6b1c0556372..0f6e55ddbad 100644 --- a/java/src/main/c++/jni_base_learner.h +++ b/java/src/main/c++/jni_base_learner.h @@ -74,9 +74,10 @@ T base_predict(JNIEnv* env, jobjectArray example_strings, jboolean learn, jlong rethrow_cpp_exception_as_java_exception(env); } + T result = predictor(first_example, env); vwInstance->finish_example(ex_coll); - return predictor(first_example, env); + return result; } #endif // VW_BASE_LEARNER_H diff --git a/java/src/main/c++/jni_spark_vw.cc b/java/src/main/c++/jni_spark_vw.cc index 224298dcc26..49ddb788e5b 100644 --- a/java/src/main/c++/jni_spark_vw.cc +++ b/java/src/main/c++/jni_spark_vw.cc @@ -256,7 +256,7 @@ JNIEXPORT jlong JNICALL Java_org_vowpalwabbit_spark_VowpalWabbitExample_initiali try { - example* ex = VW::alloc_examples(0, 1); + example* ex = VW::alloc_examples(1); ex->interactions = &all->interactions; if (isEmpty) @@ -265,7 +265,7 @@ JNIEXPORT jlong JNICALL Java_org_vowpalwabbit_spark_VowpalWabbitExample_initiali VW::read_line(*all, ex, &empty); } else - all->p->lp.default_label(&ex->l); + all->p->lp.default_label(ex->l); return (jlong) new VowpalWabbitExampleWrapper(all, ex); } @@ -297,7 +297,7 @@ JNIEXPORT void JNICALL Java_org_vowpalwabbit_spark_VowpalWabbitExample_clear(JNI try { VW::empty_example(*all, *ex); - all->p->lp.default_label(&ex->l); + all->p->lp.default_label(ex->l); } catch (...) 
{ @@ -444,7 +444,7 @@ JNIEXPORT jobject JNICALL Java_org_vowpalwabbit_spark_VowpalWabbitExample_getPre ctr = env->GetMethodID(predClass, "", "(F)V"); CHECK_JNI_EXCEPTION(nullptr); - return env->NewObject(predClass, ctr, ex->pred.prob); + return env->NewObject(predClass, ctr, ex->pred.prob()); case prediction_type_t::multiclass: predClass = env->FindClass("java/lang/Integer"); @@ -453,7 +453,7 @@ JNIEXPORT jobject JNICALL Java_org_vowpalwabbit_spark_VowpalWabbitExample_getPre ctr = env->GetMethodID(predClass, "", "(I)V"); CHECK_JNI_EXCEPTION(nullptr); - return env->NewObject(predClass, ctr, ex->pred.multiclass); + return env->NewObject(predClass, ctr, ex->pred.multiclass()); case prediction_type_t::scalars: return scalars_predictor(ex, env); diff --git a/java/src/main/c++/vowpalWabbit_learner_VWActionProbsLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWActionProbsLearner.cc index 8434219515f..ee90a2ee592 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWActionProbsLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWActionProbsLearner.cc @@ -7,17 +7,17 @@ jobject action_probs_prediction(example *vec, JNIEnv *env) jclass action_prob_class = env->FindClass("vowpalWabbit/responses/ActionProb"); jmethodID action_prob_constructor = env->GetMethodID(action_prob_class, "", "(IF)V"); - // The action_probs prediction_type_t is just a placeholder identifying when the aciton_scores + // The action_probs prediction_type_t is just a placeholder identifying when the action_scores // should be treated as probabilities or scores. That is why this function references a_s yet returns // ActionProbs to the Java side. - ACTION_SCORE::action_scores a_s = vec->pred.a_s; + const auto& a_s = vec->pred.action_probs(); size_t num_values = a_s.size(); jobjectArray j_action_probs = env->NewObjectArray(num_values, action_prob_class, 0); jclass action_probs_class = env->FindClass("vowpalWabbit/responses/ActionProbs"); for (uint32_t i = 0; i < num_values; ++i) { - ACTION_SCORE::action_score a = a_s[i]; + const auto& a = a_s[i]; jobject j_action_prob = env->NewObject(action_prob_class, action_prob_constructor, a.action, a.score); env->SetObjectArrayElement(j_action_probs, i, j_action_prob); } diff --git a/java/src/main/c++/vowpalWabbit_learner_VWActionScoresLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWActionScoresLearner.cc index a5591383bdf..1259ce87d05 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWActionScoresLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWActionScoresLearner.cc @@ -7,14 +7,14 @@ jobject action_scores_prediction(example *vec, JNIEnv *env) jclass action_score_class = env->FindClass("vowpalWabbit/responses/ActionScore"); jmethodID action_score_constructor = env->GetMethodID(action_score_class, "", "(IF)V"); - ACTION_SCORE::action_scores a_s = vec->pred.a_s; + const auto a_s = vec->pred.action_scores(); size_t num_values = a_s.size(); jobjectArray j_action_scores = env->NewObjectArray(num_values, action_score_class, 0); jclass action_scores_class = env->FindClass("vowpalWabbit/responses/ActionScores"); for (uint32_t i = 0; i < num_values; ++i) { - ACTION_SCORE::action_score a = a_s[i]; + const auto a = a_s[i]; jobject j_action_score = env->NewObject(action_score_class, action_score_constructor, a.action, a.score); env->SetObjectArrayElement(j_action_scores, i, j_action_score); } diff --git a/java/src/main/c++/vowpalWabbit_learner_VWMulticlassLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWMulticlassLearner.cc index 4541cd1a099..cb51bb267d2 100644 --- 
a/java/src/main/c++/vowpalWabbit_learner_VWMulticlassLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWMulticlassLearner.cc @@ -2,7 +2,7 @@ #include "vw.h" #include "jni_base_learner.h" -jint multiclass_predictor(example *vec, JNIEnv *env) { return vec->pred.multiclass; } +jint multiclass_predictor(example *vec, JNIEnv *env) { return vec->pred.multiclass(); } JNIEXPORT jint JNICALL Java_vowpalWabbit_learner_VWMulticlassLearner_predict( JNIEnv *env, jobject obj, jstring example_string, jboolean learn, jlong vwPtr) diff --git a/java/src/main/c++/vowpalWabbit_learner_VWMultilabelsLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWMultilabelsLearner.cc index e73fd4327f2..7f266ac218c 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWMultilabelsLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWMultilabelsLearner.cc @@ -4,7 +4,7 @@ jobject multilabel_predictor(example *vec, JNIEnv *env) { - auto& labels = vec->pred.multilabels.label_v; + auto& labels = vec->pred.multilabels().label_v; size_t num_values = labels.size(); jintArray j_labels = env->NewIntArray(num_values); env->SetIntArrayRegion(j_labels, 0, num_values, (int *)labels.begin()); diff --git a/java/src/main/c++/vowpalWabbit_learner_VWProbLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWProbLearner.cc index 4f6fa359e0a..cf8194eea4f 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWProbLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWProbLearner.cc @@ -2,7 +2,7 @@ #include "vw.h" #include "jni_base_learner.h" -jfloat prob_predictor(example *vec, JNIEnv *env) { return vec->pred.prob; } +jfloat prob_predictor(example *vec, JNIEnv *env) { return vec->pred.prob(); } JNIEXPORT jfloat JNICALL Java_vowpalWabbit_learner_VWProbLearner_predict( JNIEnv *env, jobject obj, jstring example_string, jboolean learn, jlong vwPtr) diff --git a/java/src/main/c++/vowpalWabbit_learner_VWScalarLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWScalarLearner.cc index def69372304..acb908a8366 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWScalarLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWScalarLearner.cc @@ -2,7 +2,7 @@ #include "vw.h" #include "jni_base_learner.h" -jfloat scalar_predictor(example *vec, JNIEnv *env) { return vec->pred.scalar; } +jfloat scalar_predictor(example *vec, JNIEnv *env) { return vec->pred.scalar(); } JNIEXPORT jfloat JNICALL Java_vowpalWabbit_learner_VWScalarLearner_predict( JNIEnv *env, jobject obj, jstring example_string, jboolean learn, jlong vwPtr) diff --git a/java/src/main/c++/vowpalWabbit_learner_VWScalarsLearner.cc b/java/src/main/c++/vowpalWabbit_learner_VWScalarsLearner.cc index 44386aefd79..5f4d5f20ac4 100644 --- a/java/src/main/c++/vowpalWabbit_learner_VWScalarsLearner.cc +++ b/java/src/main/c++/vowpalWabbit_learner_VWScalarsLearner.cc @@ -4,7 +4,7 @@ jfloatArray scalars_predictor(example *vec, JNIEnv *env) { - auto& scalars = vec->pred.scalars; + auto& scalars = vec->pred.scalars(); size_t num_values = scalars.size(); jfloatArray r = env->NewFloatArray(num_values); env->SetFloatArrayRegion(r, 0, num_values, (float *)scalars.begin()); diff --git a/java/src/test/java/vowpalWabbit/learner/VWActionScoresLearnerTest.java b/java/src/test/java/vowpalWabbit/learner/VWActionScoresLearnerTest.java index 8f1e27de934..c4ce9082b8c 100644 --- a/java/src/test/java/vowpalWabbit/learner/VWActionScoresLearnerTest.java +++ b/java/src/test/java/vowpalWabbit/learner/VWActionScoresLearnerTest.java @@ -5,6 +5,7 @@ import org.junit.rules.TemporaryFolder; import 
vowpalWabbit.VWTestHelper; import vowpalWabbit.responses.ActionScores; +import vowpalWabbit.responses.ActionProbs; import java.io.IOException; @@ -85,40 +86,40 @@ private void testCBADF(boolean withRank) throws IOException { String cli = "--quiet --cb_adf -f " + model; if (withRank) cli += " --rank_all"; - VWActionScoresLearner vw = VWLearners.create(cli); - ActionScores[] trainPreds = new ActionScores[cbADFTrain.length]; + VWActionProbsLearner vw = VWLearners.create(cli); + ActionProbs[] trainPreds = new ActionProbs[cbADFTrain.length]; for (int i=0; ilearn(*vec2); - std::cerr << "p2 = " << vec2->pred.scalar << std::endl; + std::cerr << "p2 = " << vec2->pred.scalar() << std::endl; VW::finish_example(*model, *vec2); VW::primitive_feature_space features[2]; @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) example* vec3 = VW::import_example(*model, "", features, 2); model->learn(*vec3); - std::cerr << "p3 = " << vec3->pred.scalar << std::endl; + std::cerr << "p3 = " << vec3->pred.scalar() << std::endl; // TODO: this does not invoke m_vw->l->finish_example() VW::finish_example(*model, *vec3); @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) vw* model2 = VW::initialize("--hash all -q st --noconstant -i train2.vw --no_stdin"); vec2 = VW::read_example(*model2, (char*)" |s p^the_man w^the w^man |t p^un_homme w^un w^homme"); model2->learn(*vec2); - std::cerr << "p4 = " << vec2->pred.scalar << std::endl; + std::cerr << "p4 = " << vec2->pred.scalar() << std::endl; size_t len=0; VW::primitive_feature_space* pfs = VW::export_example(*model2, vec2, len); diff --git a/library/libsearch.h b/library/libsearch.h index 7b1b276c04b..182bd322505 100644 --- a/library/libsearch.h +++ b/library/libsearch.h @@ -6,21 +6,21 @@ license as described in the file LICENSE. #ifndef LIBSEARCH_HOOKTASK_H #define LIBSEARCH_HOOKTASK_H -#include "../vowpalwabbit/parser.h" -#include "../vowpalwabbit/parse_example.h" -#include "../vowpalwabbit/vw.h" -#include "../vowpalwabbit/search.h" -#include "../vowpalwabbit/search_hooktask.h" +#include "parser.h" +#include "parse_example.h" +#include "vw.h" +#include "search.h" +#include "search_hooktask.h" template class SearchTask { public: SearchTask(vw& vw_obj) : vw_obj(vw_obj), sch(*(Search::search*)vw_obj.searchstr) - { bogus_example = VW::alloc_examples(vw_obj.p->lp.label_size, 1); - VW::read_line(vw_obj, bogus_example, (char*)"1 | x"); - VW::setup_example(vw_obj, bogus_example); + { + VW::read_line(vw_obj, &bogus_example, (char*)"1 | x"); + VW::setup_example(vw_obj, &bogus_example); - trigger.push_back(bogus_example); + trigger.push_back(&bogus_example); HookTask::task_data* d = sch.get_task_data(); d->run_f = _search_run_fn; @@ -31,23 +31,23 @@ template class SearchTask d->extra_data2 = NULL; } virtual ~SearchTask() - { trigger.clear(); // the individual examples get cleaned up below - VW::dealloc_example(vw_obj.p->lp.delete_label, *bogus_example); free(bogus_example); + { + trigger.clear(); } virtual void _run(Search::search&sch, INPUT& input_example, OUTPUT& output) {} // YOU MUST DEFINE THIS FUNCTION! 
void _setup(Search::search&sch, INPUT& input_example, OUTPUT& output) {} // OPTIONAL void _takedown(Search::search&sch, INPUT& input_example, OUTPUT& output) {} // OPTIONAL - void learn(INPUT& input_example, OUTPUT& output) { bogus_example->test_only = false; call_vw(input_example, output); } - void predict(INPUT& input_example, OUTPUT& output) { bogus_example->test_only = true; call_vw(input_example, output); } + void learn(INPUT& input_example, OUTPUT& output) { bogus_example.test_only = false; call_vw(input_example, output); } + void predict(INPUT& input_example, OUTPUT& output) { bogus_example.test_only = true; call_vw(input_example, output); } protected: vw& vw_obj; Search::search& sch; private: - example* bogus_example; + example bogus_example; multi_ex trigger; void call_vw(INPUT& input_example, OUTPUT& output) diff --git a/library/recommend.cc b/library/recommend.cc index a3ab42b47ca..d5bdd15518f 100644 --- a/library/recommend.cc +++ b/library/recommend.cc @@ -230,12 +230,12 @@ int main(int argc, char* argv[]) if (pr_queue.size() < (size_t)topk) { - pr_queue.push(std::make_pair(ex->pred.scalar, str)); + pr_queue.push(std::make_pair(ex->pred.scalar(), str)); } - else if (pr_queue.top().first < ex->pred.scalar) + else if (pr_queue.top().first < ex->pred.scalar()) { pr_queue.pop(); - pr_queue.push(std::make_pair(ex->pred.scalar, str)); + pr_queue.push(std::make_pair(ex->pred.scalar(), str)); } VW::finish_example(*model, *ex); diff --git a/library/search_generate.cc b/library/search_generate.cc index 91e7d93fe81..25ccdea51a8 100644 --- a/library/search_generate.cc +++ b/library/search_generate.cc @@ -242,7 +242,7 @@ class Generator : public SearchTask Trie* cdict = dict; - v_array ref = v_init(); + v_array ref; int N = in.in.length(); out = "^"; std::vector next; diff --git a/python/pylibvw.cc b/python/pylibvw.cc index b24bdca8c37..5e923b5ff4e 100644 --- a/python/pylibvw.cc +++ b/python/pylibvw.cc @@ -145,27 +145,19 @@ size_t my_get_prediction_type(vw_ptr all) case prediction_type_t::multilabels: return pMULTILABELS; case prediction_type_t::prob: return pPROB; case prediction_type_t::multiclassprobs: return pMULTICLASSPROBS; - case prediction_type_t::decision_probs: return pDECISION_SCORES; + case prediction_type_t::decision_scores: return pDECISION_SCORES; default: THROW("unsupported prediction type used"); } } -void my_delete_example(void*voidec) -{ example* ec = (example*) voidec; - size_t labelType = ec->example_counter; - label_parser* lp = get_label_parser(NULL, labelType); - VW::dealloc_example(lp ? lp->delete_label : NULL, *ec); - free(ec); -} - example* my_empty_example0(vw_ptr vw, size_t labelType) { label_parser* lp = get_label_parser(&*vw, labelType); - example* ec = VW::alloc_examples(lp->label_size, 1); - lp->default_label(&ec->l); + example* ec = VW::alloc_examples(1); + lp->default_label(ec->l); ec->interactions = &vw->interactions; if (labelType == lCOST_SENSITIVE) { COST_SENSITIVE::wclass zero = { 0., 1, 0., 0. 
}; - ec->l.cs.costs.push_back(zero); + ec->l.cs().costs.push_back(zero); } ec->example_counter = labelType; return ec; @@ -173,7 +165,7 @@ example* my_empty_example0(vw_ptr vw, size_t labelType) example_ptr my_empty_example(vw_ptr vw, size_t labelType) { example* ec = my_empty_example0(vw, labelType); - return boost::shared_ptr(ec, my_delete_example); + return boost::shared_ptr(ec); } example_ptr my_read_example(vw_ptr all, size_t labelType, char* str) @@ -181,7 +173,7 @@ example_ptr my_read_example(vw_ptr all, size_t labelType, char* str) VW::read_line(*all, ec, str); VW::setup_example(*all, ec); ec->example_counter = labelType; - return boost::shared_ptr(ec, my_delete_example); + return boost::shared_ptr(ec); } example_ptr my_existing_example(vw_ptr all, size_t labelType, example_ptr existing_example) @@ -241,7 +233,7 @@ void predict_or_learn(vw_ptr& all, py::list& ec) py::list my_parse(vw_ptr& all, char* str) { - v_array examples = v_init(); + v_array examples; examples.push_back(&VW::get_unused_example(all.get())); all->p->text_reader(all.get(), str, strlen(str), examples); @@ -254,8 +246,6 @@ py::list my_parse(vw_ptr& all, char* str) example_collection.append( boost::shared_ptr(ex, dont_delete_me)); } - examples.clear(); - examples.delete_v(); return example_collection; } @@ -414,7 +404,7 @@ void my_setup_example(vw_ptr vw, example_ptr ec) } void unsetup_example(vw_ptr vwP, example_ptr ae) -{ vw&all = *vwP; +{ vw& all = *vwP; ae->partial_prediction = 0.; ae->num_features = 0; ae->total_sum_feat_sq = 0; @@ -469,19 +459,19 @@ void ex_set_label_string(example_ptr ec, vw_ptr vw, std::string label, size_t la vw->p->lp = old_lp; } -float ex_get_simplelabel_label(example_ptr ec) { return ec->l.simple.label; } -float ex_get_simplelabel_weight(example_ptr ec) { return ec->l.simple.weight; } -float ex_get_simplelabel_initial(example_ptr ec) { return ec->l.simple.initial; } -float ex_get_simplelabel_prediction(example_ptr ec) { return ec->pred.scalar; } -float ex_get_prob(example_ptr ec) { return ec->pred.prob; } +float ex_get_simplelabel_label(example_ptr ec) { return ec->l.simple().label; } +float ex_get_simplelabel_weight(example_ptr ec) { return ec->l.simple().weight; } +float ex_get_simplelabel_initial(example_ptr ec) { return ec->l.simple().initial; } +float ex_get_simplelabel_prediction(example_ptr ec) { return ec->pred.scalar(); } +float ex_get_prob(example_ptr ec) { return ec->pred.prob(); } -uint32_t ex_get_multiclass_label(example_ptr ec) { return ec->l.multi.label; } -float ex_get_multiclass_weight(example_ptr ec) { return ec->l.multi.weight; } -uint32_t ex_get_multiclass_prediction(example_ptr ec) { return ec->pred.multiclass; } +uint32_t ex_get_multiclass_label(example_ptr ec) { return ec->l.multi().label; } +float ex_get_multiclass_weight(example_ptr ec) { return ec->l.multi().weight; } +uint32_t ex_get_multiclass_prediction(example_ptr ec) { return ec->pred.multiclass(); } py::list ex_get_scalars(example_ptr ec) { py::list values; - const auto& scalars = ec->pred.scalars; + const auto& scalars = ec->pred.scalars(); for (float s : scalars) { values.append(s); @@ -492,7 +482,7 @@ py::list ex_get_scalars(example_ptr ec) py::list ex_get_action_scores(example_ptr ec) { py::list values; - auto const& scores = ec->pred.a_s; + auto const& scores = ec->pred.action_scores(); std::vector ordered_scores(scores.size()); for (auto const& action_score: scores) { @@ -510,7 +500,7 @@ py::list ex_get_action_scores(example_ptr ec) py::list ex_get_decision_scores(example_ptr ec) { py::list values; - 
for (auto const& scores : ec->pred.decision_scores) + for (auto const& scores : ec->pred.decision_scores()) { py::list inner_list; for (auto action_score: scores) @@ -526,7 +516,7 @@ py::list ex_get_decision_scores(example_ptr ec) py::list ex_get_multilabel_predictions(example_ptr ec) { py::list values; - MULTILABEL::labels labels = ec->pred.multilabels; + MULTILABEL::labels labels = ec->pred.multilabels(); for (uint32_t l : labels.label_v) { values.append(l); @@ -534,19 +524,19 @@ py::list ex_get_multilabel_predictions(example_ptr ec) return values; } -uint32_t ex_get_costsensitive_prediction(example_ptr ec) { return ec->pred.multiclass; } -uint32_t ex_get_costsensitive_num_costs(example_ptr ec) { return (uint32_t)ec->l.cs.costs.size(); } -float ex_get_costsensitive_cost(example_ptr ec, uint32_t i) { return ec->l.cs.costs[i].x; } -uint32_t ex_get_costsensitive_class(example_ptr ec, uint32_t i) { return ec->l.cs.costs[i].class_index; } -float ex_get_costsensitive_partial_prediction(example_ptr ec, uint32_t i) { return ec->l.cs.costs[i].partial_prediction; } -float ex_get_costsensitive_wap_value(example_ptr ec, uint32_t i) { return ec->l.cs.costs[i].wap_value; } +uint32_t ex_get_costsensitive_prediction(example_ptr ec) { return ec->pred.multiclass(); } +uint32_t ex_get_costsensitive_num_costs(example_ptr ec) { return (uint32_t)ec->l.cs().costs.size(); } +float ex_get_costsensitive_cost(example_ptr ec, uint32_t i) { return ec->l.cs().costs[i].x; } +uint32_t ex_get_costsensitive_class(example_ptr ec, uint32_t i) { return ec->l.cs().costs[i].class_index; } +float ex_get_costsensitive_partial_prediction(example_ptr ec, uint32_t i) { return ec->l.cs().costs[i].partial_prediction; } +float ex_get_costsensitive_wap_value(example_ptr ec, uint32_t i) { return ec->l.cs().costs[i].wap_value; } -uint32_t ex_get_cbandits_prediction(example_ptr ec) { return ec->pred.multiclass; } -uint32_t ex_get_cbandits_num_costs(example_ptr ec) { return (uint32_t)ec->l.cb.costs.size(); } -float ex_get_cbandits_cost(example_ptr ec, uint32_t i) { return ec->l.cb.costs[i].cost; } -uint32_t ex_get_cbandits_class(example_ptr ec, uint32_t i) { return ec->l.cb.costs[i].action; } -float ex_get_cbandits_probability(example_ptr ec, uint32_t i) { return ec->l.cb.costs[i].probability; } -float ex_get_cbandits_partial_prediction(example_ptr ec, uint32_t i) { return ec->l.cb.costs[i].partial_prediction; } +uint32_t ex_get_cbandits_prediction(example_ptr ec) { return ec->pred.multiclass(); } +uint32_t ex_get_cbandits_num_costs(example_ptr ec) { return (uint32_t)ec->l.cb().costs.size(); } +float ex_get_cbandits_cost(example_ptr ec, uint32_t i) { return ec->l.cb().costs[i].cost; } +uint32_t ex_get_cbandits_class(example_ptr ec, uint32_t i) { return ec->l.cb().costs[i].action; } +float ex_get_cbandits_probability(example_ptr ec, uint32_t i) { return ec->l.cb().costs[i].probability; } +float ex_get_cbandits_partial_prediction(example_ptr ec, uint32_t i) { return ec->l.cb().costs[i].partial_prediction; } // example_counter is being overriden by lableType! 
size_t get_example_counter(example_ptr ec) { return ec->example_counter; } diff --git a/test/unit_test/ccb_parser_test.cc b/test/unit_test/ccb_parser_test.cc index fff6f543de7..0a074dfdc67 100644 --- a/test/unit_test/ccb_parser_test.cc +++ b/test/unit_test/ccb_parser_test.cc @@ -8,112 +8,98 @@ #include #include "conditional_contextual_bandit.h" #include "parser.h" +#include "example.h" -void parse_label(label_parser& lp, parser* p, VW::string_view label, CCB::label& l) +void parse_label(label_parser& lp, parser* p, VW::string_view label, polylabel& l) { tokenize(' ', label, p->words); - lp.default_label(&l); - lp.parse_label(p, nullptr, &l, p->words); + lp.default_label(l); + lp.parse_label(p, nullptr, l, p->words); } BOOST_AUTO_TEST_CASE(ccb_parse_label) { auto lp = CCB::ccb_label_parser; parser p{8 /*ring_size*/, false /*strict parse*/}; - p.words = v_init(); - p.parse_name = v_init(); { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb shared", *label); - BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 0); - BOOST_CHECK(label->outcome == nullptr); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::shared); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 0); + BOOST_CHECK(label->ccb().outcome == nullptr); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::shared); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb action", *label.get()); - BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 0); - BOOST_CHECK(label->outcome == nullptr); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::action); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 0); + BOOST_CHECK(label->ccb().outcome == nullptr); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::action); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb slot", *label.get()); - BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 0); - BOOST_CHECK(label->outcome == nullptr); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::slot); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 0); + BOOST_CHECK(label->ccb().outcome == nullptr); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::slot); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb slot 1,3,4", *label.get()); - BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 3); - BOOST_CHECK_EQUAL(label->explicit_included_actions[0], 1); - BOOST_CHECK_EQUAL(label->explicit_included_actions[1], 3); - BOOST_CHECK_EQUAL(label->explicit_included_actions[2], 4); - BOOST_CHECK(label->outcome == nullptr); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::slot); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 3); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[0], 1); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[1], 3); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[2], 4); + BOOST_CHECK(label->ccb().outcome == nullptr); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::slot); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb slot 1:1.0:0.5 3", *label.get()); - 
BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 1); - BOOST_CHECK_EQUAL(label->explicit_included_actions[0], 3); - BOOST_CHECK_CLOSE(label->outcome->cost, 1.0f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->outcome->probabilities.size(), 1); - BOOST_CHECK_EQUAL(label->outcome->probabilities[0].action, 1); - BOOST_CHECK_CLOSE(label->outcome->probabilities[0].score, .5f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::slot); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 1); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[0], 3); + BOOST_CHECK_CLOSE(label->ccb().outcome->cost, 1.0f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities.size(), 1); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities[0].action, 1); + BOOST_CHECK_CLOSE(label->ccb().outcome->probabilities[0].score, .5f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::slot); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb slot 1:-2.0:0.5,2:0.25,3:0.25 3,4", *label.get()); - BOOST_CHECK_EQUAL(label->explicit_included_actions.size(), 2); - BOOST_CHECK_EQUAL(label->explicit_included_actions[0], 3); - BOOST_CHECK_EQUAL(label->explicit_included_actions[1], 4); - BOOST_CHECK_CLOSE(label->outcome->cost, -2.0f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->outcome->probabilities.size(), 3); - BOOST_CHECK_EQUAL(label->outcome->probabilities[0].action, 1); - BOOST_CHECK_CLOSE(label->outcome->probabilities[0].score, .5f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->outcome->probabilities[1].action, 2); - BOOST_CHECK_CLOSE(label->outcome->probabilities[1].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->outcome->probabilities[2].action, 3); - BOOST_CHECK_CLOSE(label->outcome->probabilities[2].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(label->type, CCB::example_type::slot); - lp.delete_label(label.get()); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions.size(), 2); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[0], 3); + BOOST_CHECK_EQUAL(label->ccb().explicit_included_actions[1], 4); + BOOST_CHECK_CLOSE(label->ccb().outcome->cost, -2.0f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities.size(), 3); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities[0].action, 1); + BOOST_CHECK_CLOSE(label->ccb().outcome->probabilities[0].score, .5f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities[1].action, 2); + BOOST_CHECK_CLOSE(label->ccb().outcome->probabilities[1].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().outcome->probabilities[2].action, 3); + BOOST_CHECK_CLOSE(label->ccb().outcome->probabilities[2].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(label->ccb().type, CCB::example_type::slot); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); BOOST_REQUIRE_THROW(parse_label(lp, &p, "shared", *label.get()), VW::vw_exception); - lp.delete_label(label.get()); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); BOOST_REQUIRE_THROW(parse_label(lp, &p, "other shared", *label.get()), VW::vw_exception); - lp.delete_label(label.get()); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); BOOST_REQUIRE_THROW(parse_label(lp, &p, "other", *label.get()), VW::vw_exception); - lp.delete_label(label.get()); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); 
BOOST_REQUIRE_THROW(parse_label(lp, &p, "ccb unknown", *label.get()), VW::vw_exception); - lp.delete_label(label.get()); } { - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); BOOST_REQUIRE_THROW(parse_label(lp, &p, "ccb slot 1:1.0:0.5,4:0.7", *label.get()), VW::vw_exception); - lp.delete_label(label.get()); } - p.words.delete_v(); - p.parse_name.delete_v(); } BOOST_AUTO_TEST_CASE(ccb_cache_label) @@ -122,67 +108,51 @@ BOOST_AUTO_TEST_CASE(ccb_cache_label) //io.init(); TODO: figure out and fix leak caused by double init() parser p{8 /*ring_size*/, false /*strict parse*/}; - p.words = v_init(); - p.parse_name = v_init(); auto lp = CCB::ccb_label_parser; - auto label = scoped_calloc_or_throw(); + auto label = scoped_calloc_or_throw(); parse_label(lp, &p, "ccb slot 1:-2.0:0.5,2:0.25,3:0.25 3,4", *label.get()); - lp.cache_label(label.get(), io); + lp.cache_label(*label.get(), io); io.space.end() = io.head; io.head = io.space.begin(); - auto uncached_label = scoped_calloc_or_throw(); - lp.default_label(uncached_label.get()); - lp.read_cached_label(nullptr, uncached_label.get(), io); - - BOOST_CHECK_EQUAL(uncached_label->explicit_included_actions.size(), 2); - BOOST_CHECK_EQUAL(uncached_label->explicit_included_actions[0], 3); - BOOST_CHECK_EQUAL(uncached_label->explicit_included_actions[1], 4); - BOOST_CHECK_CLOSE(uncached_label->outcome->cost, -2.0f, FLOAT_TOL); - BOOST_CHECK_EQUAL(uncached_label->outcome->probabilities.size(), 3); - BOOST_CHECK_EQUAL(uncached_label->outcome->probabilities[0].action, 1); - BOOST_CHECK_CLOSE(uncached_label->outcome->probabilities[0].score, .5f, FLOAT_TOL); - BOOST_CHECK_EQUAL(uncached_label->outcome->probabilities[1].action, 2); - BOOST_CHECK_CLOSE(uncached_label->outcome->probabilities[1].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(uncached_label->outcome->probabilities[2].action, 3); - BOOST_CHECK_CLOSE(uncached_label->outcome->probabilities[2].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(uncached_label->type, CCB::example_type::slot); - lp.delete_label(label.get()); - lp.delete_label(uncached_label.get()); - p.words.delete_v(); - p.parse_name.delete_v(); + auto uncached_label = scoped_calloc_or_throw(); + lp.read_cached_label(nullptr, *uncached_label.get(), io); + + BOOST_CHECK_EQUAL(uncached_label->ccb().explicit_included_actions.size(), 2); + BOOST_CHECK_EQUAL(uncached_label->ccb().explicit_included_actions[0], 3); + BOOST_CHECK_EQUAL(uncached_label->ccb().explicit_included_actions[1], 4); + BOOST_CHECK_CLOSE(uncached_label->ccb().outcome->cost, -2.0f, FLOAT_TOL); + BOOST_CHECK_EQUAL(uncached_label->ccb().outcome->probabilities.size(), 3); + BOOST_CHECK_EQUAL(uncached_label->ccb().outcome->probabilities[0].action, 1); + BOOST_CHECK_CLOSE(uncached_label->ccb().outcome->probabilities[0].score, .5f, FLOAT_TOL); + BOOST_CHECK_EQUAL(uncached_label->ccb().outcome->probabilities[1].action, 2); + BOOST_CHECK_CLOSE(uncached_label->ccb().outcome->probabilities[1].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(uncached_label->ccb().outcome->probabilities[2].action, 3); + BOOST_CHECK_CLOSE(uncached_label->ccb().outcome->probabilities[2].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(uncached_label->ccb().type, CCB::example_type::slot); } BOOST_AUTO_TEST_CASE(ccb_copy_label) { parser p{8 /*ring_size*/, false /*strict parse*/}; - p.words = v_init(); - p.parse_name = v_init(); auto lp = CCB::ccb_label_parser; - auto label = scoped_calloc_or_throw(); - parse_label(lp, &p, "ccb slot 1:-2.0:0.5,2:0.25,3:0.25 3,4", *label.get()); - - auto 
copied_to = scoped_calloc_or_throw(); - lp.default_label(copied_to.get()); - - lp.copy_label(copied_to.get(), label.get()); - - BOOST_CHECK_EQUAL(copied_to->explicit_included_actions.size(), 2); - BOOST_CHECK_EQUAL(copied_to->explicit_included_actions[0], 3); - BOOST_CHECK_EQUAL(copied_to->explicit_included_actions[1], 4); - BOOST_CHECK_CLOSE(copied_to->outcome->cost, -2.0f, FLOAT_TOL); - BOOST_CHECK_EQUAL(copied_to->outcome->probabilities.size(), 3); - BOOST_CHECK_EQUAL(copied_to->outcome->probabilities[0].action, 1); - BOOST_CHECK_CLOSE(copied_to->outcome->probabilities[0].score, .5f, FLOAT_TOL); - BOOST_CHECK_EQUAL(copied_to->outcome->probabilities[1].action, 2); - BOOST_CHECK_CLOSE(copied_to->outcome->probabilities[1].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(copied_to->outcome->probabilities[2].action, 3); - BOOST_CHECK_CLOSE(copied_to->outcome->probabilities[2].score, .25f, FLOAT_TOL); - BOOST_CHECK_EQUAL(copied_to->type, CCB::example_type::slot); - lp.delete_label(label.get()); - lp.delete_label(copied_to.get()); - p.words.delete_v(); - p.parse_name.delete_v(); + polylabel label; + parse_label(lp, &p, "ccb slot 1:-2.0:0.5,2:0.25,3:0.25 3,4", label); + + polylabel copied_to = label; + + BOOST_CHECK_EQUAL(copied_to.ccb().explicit_included_actions.size(), 2); + BOOST_CHECK_EQUAL(copied_to.ccb().explicit_included_actions[0], 3); + BOOST_CHECK_EQUAL(copied_to.ccb().explicit_included_actions[1], 4); + BOOST_CHECK_CLOSE(copied_to.ccb().outcome->cost, -2.0f, FLOAT_TOL); + BOOST_CHECK_EQUAL(copied_to.ccb().outcome->probabilities.size(), 3); + BOOST_CHECK_EQUAL(copied_to.ccb().outcome->probabilities[0].action, 1); + BOOST_CHECK_CLOSE(copied_to.ccb().outcome->probabilities[0].score, .5f, FLOAT_TOL); + BOOST_CHECK_EQUAL(copied_to.ccb().outcome->probabilities[1].action, 2); + BOOST_CHECK_CLOSE(copied_to.ccb().outcome->probabilities[1].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(copied_to.ccb().outcome->probabilities[2].action, 3); + BOOST_CHECK_CLOSE(copied_to.ccb().outcome->probabilities[2].score, .25f, FLOAT_TOL); + BOOST_CHECK_EQUAL(copied_to.ccb().type, CCB::example_type::slot); } diff --git a/test/unit_test/ccb_test.cc b/test/unit_test/ccb_test.cc index 0bd27e3aee0..82a2fd394ba 100644 --- a/test/unit_test/ccb_test.cc +++ b/test/unit_test/ccb_test.cc @@ -56,7 +56,7 @@ BOOST_AUTO_TEST_CASE(ccb_explicit_included_actions_no_overlap) vw.predict(examples); - auto& decision_scores = examples[0]->pred.decision_scores; + auto& decision_scores = examples[0]->pred.decision_scores(); BOOST_CHECK_EQUAL(decision_scores.size(), 3); BOOST_CHECK_EQUAL(decision_scores[0].size(), 1); diff --git a/test/unit_test/dsjson_parser_test.cc b/test/unit_test/dsjson_parser_test.cc index 1fb95b4bc03..0889c3aaa7d 100644 --- a/test/unit_test/dsjson_parser_test.cc +++ b/test/unit_test/dsjson_parser_test.cc @@ -11,7 +11,7 @@ multi_ex parse_dsjson(vw& all, std::string line) { - auto examples = v_init(); + v_array examples; examples.push_back(&VW::get_unused_example(&all)); DecisionServiceInteraction interaction; @@ -22,7 +22,6 @@ multi_ex parse_dsjson(vw& all, std::string line) for (size_t i = 0; i < examples.size(); ++i) { result.push_back(examples[i]); } - examples.delete_v(); return result; } @@ -96,18 +95,18 @@ BOOST_AUTO_TEST_CASE(parse_dsjson_cb) BOOST_CHECK_EQUAL(examples.size(), 4); // Shared example - BOOST_CHECK_EQUAL(examples[0]->l.cb.costs.size(), 1); - BOOST_CHECK_CLOSE(examples[0]->l.cb.costs[0].probability, -1.f, FLOAT_TOL); - BOOST_CHECK_CLOSE(examples[0]->l.cb.costs[0].cost, FLT_MAX, FLOAT_TOL); + 
BOOST_CHECK_EQUAL(examples[0]->l.cb().costs.size(), 1); + BOOST_CHECK_CLOSE(examples[0]->l.cb().costs[0].probability, -1.f, FLOAT_TOL); + BOOST_CHECK_CLOSE(examples[0]->l.cb().costs[0].cost, FLT_MAX, FLOAT_TOL); // Action examples - BOOST_CHECK_EQUAL(examples[1]->l.cb.costs.size(), 0); - BOOST_CHECK_EQUAL(examples[2]->l.cb.costs.size(), 1); - BOOST_CHECK_EQUAL(examples[3]->l.cb.costs.size(), 0); + BOOST_CHECK_EQUAL(examples[1]->l.cb().costs.size(), 0); + BOOST_CHECK_EQUAL(examples[2]->l.cb().costs.size(), 1); + BOOST_CHECK_EQUAL(examples[3]->l.cb().costs.size(), 0); - BOOST_CHECK_CLOSE(examples[2]->l.cb.costs[0].probability, 0.8166667, FLOAT_TOL); - BOOST_CHECK_CLOSE(examples[2]->l.cb.costs[0].cost, -1.0, FLOAT_TOL); - BOOST_CHECK_EQUAL(examples[2]->l.cb.costs[0].action, 2); + BOOST_CHECK_CLOSE(examples[2]->l.cb().costs[0].probability, 0.8166667, FLOAT_TOL); + BOOST_CHECK_CLOSE(examples[2]->l.cb().costs[0].cost, -1.0, FLOAT_TOL); + BOOST_CHECK_EQUAL(examples[2]->l.cb().costs[0].action, 2); VW::finish_example(*vw, examples); VW::finish(*vw); } @@ -167,13 +166,13 @@ BOOST_AUTO_TEST_CASE(parse_dsjson_ccb) auto examples = parse_dsjson(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 5); - BOOST_CHECK_EQUAL(examples[0]->l.conditional_contextual_bandit.type, CCB::example_type::shared); - BOOST_CHECK_EQUAL(examples[1]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[2]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[3]->l.conditional_contextual_bandit.type, CCB::example_type::slot); - BOOST_CHECK_EQUAL(examples[4]->l.conditional_contextual_bandit.type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[0]->l.ccb().type, CCB::example_type::shared); + BOOST_CHECK_EQUAL(examples[1]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[2]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[3]->l.ccb().type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[4]->l.ccb().type, CCB::example_type::slot); - auto label1 = examples[3]->l.conditional_contextual_bandit; + auto& label1 = examples[3]->l.ccb(); BOOST_CHECK_EQUAL(label1.explicit_included_actions.size(), 2); BOOST_CHECK_EQUAL(label1.explicit_included_actions[0], 1); BOOST_CHECK_EQUAL(label1.explicit_included_actions[1], 2); @@ -182,7 +181,7 @@ BOOST_AUTO_TEST_CASE(parse_dsjson_ccb) BOOST_CHECK_EQUAL(label1.outcome->probabilities[0].action, 1); BOOST_CHECK_CLOSE(label1.outcome->probabilities[0].score, .25f, .0001f); - auto label2 = examples[4]->l.conditional_contextual_bandit; + auto& label2 = examples[4]->l.ccb(); BOOST_CHECK_EQUAL(label2.explicit_included_actions.size(), 0); BOOST_CHECK_CLOSE(label2.outcome->cost, 4.f, .0001f); BOOST_CHECK_EQUAL(label2.outcome->probabilities.size(), 2); @@ -261,13 +260,13 @@ BOOST_AUTO_TEST_CASE(parse_dsjson_cb_as_ccb) auto examples = parse_dsjson(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 5); - BOOST_CHECK_EQUAL(examples[0]->l.conditional_contextual_bandit.type, CCB::example_type::shared); - BOOST_CHECK_EQUAL(examples[1]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[2]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[3]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[4]->l.conditional_contextual_bandit.type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[0]->l.ccb().type, CCB::example_type::shared); + 
BOOST_CHECK_EQUAL(examples[1]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[2]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[3]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[4]->l.ccb().type, CCB::example_type::slot); - auto label2 = examples[4]->l.conditional_contextual_bandit; + auto& label2 = examples[4]->l.ccb(); BOOST_CHECK_EQUAL(label2.explicit_included_actions.size(), 0); BOOST_CHECK_CLOSE(label2.outcome->cost, -1.f, .0001f); BOOST_CHECK_EQUAL(label2.outcome->probabilities.size(), 1); diff --git a/test/unit_test/json_parser_test.cc b/test/unit_test/json_parser_test.cc index 0a810f879e2..cb93da1329f 100644 --- a/test/unit_test/json_parser_test.cc +++ b/test/unit_test/json_parser_test.cc @@ -11,7 +11,7 @@ multi_ex parse_json(vw& all, std::string line) { - auto examples = v_init(); + v_array examples; examples.push_back(&VW::get_unused_example(&all)); VW::read_line_json( all, examples, (char*)line.c_str(), (VW::example_factory_t)&VW::get_unused_example, (void*)&all); @@ -20,7 +20,6 @@ multi_ex parse_json(vw& all, std::string line) for (size_t i = 0; i < examples.size(); ++i) { result.push_back(examples[i]); } - examples.delete_v(); return result; } @@ -42,7 +41,7 @@ BOOST_AUTO_TEST_CASE(parse_json_simple) auto examples = parse_json(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 1); - BOOST_CHECK_CLOSE(examples[0]->l.simple.label, 1.f, FLOAT_TOL); + BOOST_CHECK_CLOSE(examples[0]->l.simple().label, 1.f, FLOAT_TOL); VW::finish_example(*vw, examples); VW::finish(*vw); } @@ -81,18 +80,18 @@ BOOST_AUTO_TEST_CASE(parse_json_cb) auto examples = parse_json(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 4); - BOOST_CHECK_EQUAL(examples[0]->l.cb.costs.size(), 1); - BOOST_CHECK_CLOSE(examples[0]->l.cb.costs[0].probability, -1.f, FLOAT_TOL); - BOOST_CHECK_CLOSE(examples[0]->l.cb.costs[0].cost, FLT_MAX, FLOAT_TOL); + BOOST_CHECK_EQUAL(examples[0]->l.cb().costs.size(), 1); + BOOST_CHECK_CLOSE(examples[0]->l.cb().costs[0].probability, -1.f, FLOAT_TOL); + BOOST_CHECK_CLOSE(examples[0]->l.cb().costs[0].cost, FLT_MAX, FLOAT_TOL); // Action examples - BOOST_CHECK_EQUAL(examples[1]->l.cb.costs.size(), 1); - BOOST_CHECK_EQUAL(examples[2]->l.cb.costs.size(), 0); - BOOST_CHECK_EQUAL(examples[3]->l.cb.costs.size(), 0); + BOOST_CHECK_EQUAL(examples[1]->l.cb().costs.size(), 1); + BOOST_CHECK_EQUAL(examples[2]->l.cb().costs.size(), 0); + BOOST_CHECK_EQUAL(examples[3]->l.cb().costs.size(), 0); - BOOST_CHECK_CLOSE(examples[1]->l.cb.costs[0].probability, 0.5, FLOAT_TOL); - BOOST_CHECK_CLOSE(examples[1]->l.cb.costs[0].cost, 1.0, FLOAT_TOL); - BOOST_CHECK_EQUAL(examples[1]->l.cb.costs[0].action, 1); + BOOST_CHECK_CLOSE(examples[1]->l.cb().costs[0].probability, 0.5, FLOAT_TOL); + BOOST_CHECK_CLOSE(examples[1]->l.cb().costs[0].cost, 1.0, FLOAT_TOL); + BOOST_CHECK_EQUAL(examples[1]->l.cb().costs[0].action, 1); VW::finish_example(*vw, examples); VW::finish(*vw); } @@ -154,16 +153,16 @@ BOOST_AUTO_TEST_CASE(parse_json_ccb) auto examples = parse_json(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 8); - BOOST_CHECK_EQUAL(examples[0]->l.conditional_contextual_bandit.type, CCB::example_type::shared); - BOOST_CHECK_EQUAL(examples[1]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[2]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[3]->l.conditional_contextual_bandit.type, CCB::example_type::action); - 
BOOST_CHECK_EQUAL(examples[4]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[5]->l.conditional_contextual_bandit.type, CCB::example_type::slot); - BOOST_CHECK_EQUAL(examples[6]->l.conditional_contextual_bandit.type, CCB::example_type::slot); - BOOST_CHECK_EQUAL(examples[7]->l.conditional_contextual_bandit.type, CCB::example_type::slot); - - auto label1 = examples[5]->l.conditional_contextual_bandit; + BOOST_CHECK_EQUAL(examples[0]->l.ccb().type, CCB::example_type::shared); + BOOST_CHECK_EQUAL(examples[1]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[2]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[3]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[4]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[5]->l.ccb().type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[6]->l.ccb().type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[7]->l.ccb().type, CCB::example_type::slot); + + auto& label1 = examples[5]->l.ccb(); BOOST_CHECK_EQUAL(label1.explicit_included_actions.size(), 2); BOOST_CHECK_EQUAL(label1.explicit_included_actions[0], 1); BOOST_CHECK_EQUAL(label1.explicit_included_actions[1], 2); @@ -172,11 +171,11 @@ BOOST_AUTO_TEST_CASE(parse_json_ccb) BOOST_CHECK_EQUAL(label1.outcome->probabilities[0].action, 1); BOOST_CHECK_CLOSE(label1.outcome->probabilities[0].score, .25f, .0001f); - auto label2 = examples[6]->l.conditional_contextual_bandit; + auto& label2 = examples[6]->l.ccb(); BOOST_CHECK_EQUAL(label2.explicit_included_actions.size(), 0); BOOST_CHECK(label2.outcome == nullptr); - auto label3 = examples[7]->l.conditional_contextual_bandit; + auto& label3 = examples[7]->l.ccb(); BOOST_CHECK_EQUAL(label3.explicit_included_actions.size(), 0); BOOST_CHECK_CLOSE(label3.outcome->cost, 4.f, .0001f); BOOST_CHECK_EQUAL(label3.outcome->probabilities.size(), 2); @@ -222,13 +221,13 @@ BOOST_AUTO_TEST_CASE(parse_json_cb_as_ccb) auto examples = parse_json(*vw, json_text); BOOST_CHECK_EQUAL(examples.size(), 5); - BOOST_CHECK_EQUAL(examples[0]->l.conditional_contextual_bandit.type, CCB::example_type::shared); - BOOST_CHECK_EQUAL(examples[1]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[2]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[3]->l.conditional_contextual_bandit.type, CCB::example_type::action); - BOOST_CHECK_EQUAL(examples[4]->l.conditional_contextual_bandit.type, CCB::example_type::slot); + BOOST_CHECK_EQUAL(examples[0]->l.ccb().type, CCB::example_type::shared); + BOOST_CHECK_EQUAL(examples[1]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[2]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[3]->l.ccb().type, CCB::example_type::action); + BOOST_CHECK_EQUAL(examples[4]->l.ccb().type, CCB::example_type::slot); - auto label1 = examples[4]->l.conditional_contextual_bandit; + auto& label1 = examples[4]->l.ccb(); BOOST_CHECK_EQUAL(label1.explicit_included_actions.size(), 0); BOOST_CHECK_CLOSE(label1.outcome->cost, 1.f, .0001f); BOOST_CHECK_EQUAL(label1.outcome->probabilities.size(), 1); diff --git a/test/unit_test/prediction_test.cc b/test/unit_test/prediction_test.cc index 1a58fda3a29..d81e08c802b 100644 --- a/test/unit_test/prediction_test.cc +++ b/test/unit_test/prediction_test.cc @@ -20,7 +20,7 @@ BOOST_AUTO_TEST_CASE(predict_modifying_state) vw.learn(learn_example); 
vw.finish_example(learn_example); vw.predict(predict_example); - prediction_one = predict_example.pred.scalar; + prediction_one = predict_example.pred.scalar(); vw.finish_example(predict_example); VW::finish(vw); } @@ -35,7 +35,7 @@ BOOST_AUTO_TEST_CASE(predict_modifying_state) vw.learn(learn_example); vw.finish_example(learn_example); vw.predict(predict_example); - prediction_two = predict_example.pred.scalar; + prediction_two = predict_example.pred.scalar(); vw.finish_example(predict_example); VW::finish(vw); } diff --git a/vowpalwabbit/CMakeLists.txt b/vowpalwabbit/CMakeLists.txt index 6ebbc844138..bf083af75af 100644 --- a/vowpalwabbit/CMakeLists.txt +++ b/vowpalwabbit/CMakeLists.txt @@ -48,7 +48,7 @@ set(vw_all_sources ccb_label.cc classweight.cc comp_io.cc conditional_contextual_bandit.cc confidence.cc cost_sensitive.cc cs_active.cc csoaa.cc distributionally_robust.cc ect.cc example.cc explore_eval.cc ftrl.cc gd_mf.cc gd.cc gen_cs_example.cc global_data.cc interact.cc interactions.cc io_buf.cc kernel_svm.cc - label_dictionary.cc lda_core.cc learner.cc log_multi.cc loss_functions.cc lrq.cc lrqfa.cc + label_dictionary.cc label_parser.cc lda_core.cc learner.cc log_multi.cc loss_functions.cc lrq.cc lrqfa.cc marginal.cc memory_tree.cc mf.cc multiclass.cc multilabel_oaa.cc multilabel.cc mwt.cc network.cc nn.cc no_label.cc noop.cc oaa.cc OjaNewton.cc options_boost_po.cc options_serializer_boost_po.cc parse_args.cc parse_example.cc parse_primitives.cc parse_regressor.cc parser.cc print.cc rand48.cc diff --git a/vowpalwabbit/OjaNewton.cc b/vowpalwabbit/OjaNewton.cc index 9f5886c297a..36232f9caa5 100644 --- a/vowpalwabbit/OjaNewton.cc +++ b/vowpalwabbit/OjaNewton.cc @@ -47,7 +47,7 @@ struct OjaNewton float* vv; float* tmp; - example** buffer; + std::vector buffer; float* weight_buffer; struct update_data data; @@ -345,7 +345,6 @@ struct OjaNewton free(ev); free(b); free(D); - free(buffer); free(weight_buffer); free(zv); free(vv); @@ -368,8 +367,19 @@ struct OjaNewton } }; -void keep_example(vw& all, OjaNewton& /* ON */, example& ec) { output_and_account_example(all, ec); } +void keep_example_but_delete_after_epoch_processed(vw& all, OjaNewton& ON, example& ec) +{ + output_and_account_example(all, ec); + if (ON.cnt == ON.epoch_size) + { + ON.cnt = 0; + for (auto example_ptr : ON.buffer) + { + VW::finish_example(*ON.all, *example_ptr); + } + } +} void make_pred(update_data& data, float x, float& wref) { int m = data.ON->m; @@ -392,7 +402,7 @@ void predict(OjaNewton& ON, base_learner&, example& ec) ON.data.prediction = 0; GD::foreach_feature(*ON.all, ec, ON.data); ec.partial_prediction = (float)ON.data.prediction; - ec.pred.scalar = GD::finalize_prediction(ON.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(ON.all->sd, ec.partial_prediction); } void update_Z_and_wbar(update_data& data, float x, float& wref) @@ -454,7 +464,7 @@ void learn(OjaNewton& ON, base_learner& base, example& ec) predict(ON, base, ec); update_data& data = ON.data; - data.g = ON.all->loss->first_derivative(ON.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.l.simple.weight; + data.g = ON.all->loss->first_derivative(ON.all->sd, ec.pred.scalar(), ec.l.simple().label) * ec.l.simple().weight; data.g /= 2; // for half square loss if (ON.normalize) @@ -493,15 +503,6 @@ void learn(OjaNewton& ON, base_learner& base, example& ec) ON.update_b(); ON.check(); - - if (ON.cnt == ON.epoch_size) - { - ON.cnt = 0; - for (int k = 0; k < ON.epoch_size; k++) - { - VW::finish_example(*ON.all, *ON.buffer[k]); - } - } } 
void save_load(OjaNewton& ON, io_buf& model_file, bool read, bool text) @@ -582,7 +583,7 @@ base_learner* OjaNewton_setup(options_i& options, vw& all) ON->D[i] = 1; } - ON->buffer = calloc_or_throw(ON->epoch_size); + ON->buffer.resize(ON->epoch_size, nullptr); ON->weight_buffer = calloc_or_throw(ON->epoch_size); ON->zv = calloc_or_throw(ON->m + 1); @@ -598,6 +599,7 @@ base_learner* OjaNewton_setup(options_i& options, vw& all) learner& l = init_learner(ON, learn, predict, all.weights.stride()); l.set_save_load(save_load); - l.set_finish_example(keep_example); + l.set_finish_example(keep_example_but_delete_after_epoch_processed); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/action_score.cc b/vowpalwabbit/action_score.cc index 16121b6a8dc..06cd70d5924 100644 --- a/vowpalwabbit/action_score.cc +++ b/vowpalwabbit/action_score.cc @@ -11,7 +11,7 @@ namespace ACTION_SCORE { -void print_action_score(int f, v_array& a_s, v_array& tag) +void print_action_score(int f, const v_array& a_s, const v_array& tag) { if (f >= 0) { @@ -31,11 +31,4 @@ void print_action_score(int f, v_array& a_s, v_array& tag) std::cerr << "write error: " << strerror(errno) << std::endl; } } - -void delete_action_scores(void* v) -{ - v_array* cs = (v_array*)v; - cs->delete_v(); -} - } // namespace ACTION_SCORE diff --git a/vowpalwabbit/action_score.h b/vowpalwabbit/action_score.h index 29a6b010fda..8e052214d25 100644 --- a/vowpalwabbit/action_score.h +++ b/vowpalwabbit/action_score.h @@ -78,7 +78,6 @@ inline int score_comp(const void* p1, const void* p2) inline int reverse_order(const void* p1, const void* p2) { return score_comp(p2, p1); } -void print_action_score(int f, v_array& a_s, v_array&); +void print_action_score(int f, const v_array& a_s, const v_array&); -void delete_action_scores(void* v); } // namespace ACTION_SCORE diff --git a/vowpalwabbit/active.cc b/vowpalwabbit/active.cc index 5817ea659d6..ddee95e5c4e 100644 --- a/vowpalwabbit/active.cc +++ b/vowpalwabbit/active.cc @@ -57,7 +57,7 @@ void predict_or_learn_simulation(active& a, single_learner& base, example& ec) float k = (float)all.sd->t; float threshold = 0.f; - ec.confidence = fabsf(ec.pred.scalar - threshold) / base.sensitivity(ec); + ec.confidence = fabsf(ec.pred.scalar() - threshold) / base.sensitivity(ec); float importance = query_decision(a, ec.confidence, k); if (importance > 0) @@ -68,7 +68,7 @@ void predict_or_learn_simulation(active& a, single_learner& base, example& ec) } else { - ec.l.simple.label = FLT_MAX; + ec.l.simple().label = FLT_MAX; ec.weight = 0.f; } } @@ -82,10 +82,10 @@ void predict_or_learn_active(active& a, single_learner& base, example& ec) else base.predict(ec); - if (ec.l.simple.label == FLT_MAX) + if (ec.l.simple().label == FLT_MAX) { float threshold = (a.all->sd->max_label + a.all->sd->min_label) * 0.5f; - ec.confidence = fabsf(ec.pred.scalar - threshold) / base.sensitivity(ec); + ec.confidence = fabsf(ec.pred.scalar() - threshold) / base.sensitivity(ec); } } @@ -109,7 +109,7 @@ void active_print_result(int f, float res, float weight, v_array tag) void output_and_account_example(vw& all, active& a, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); all.sd->update(ec.test_only, ld.label != FLT_MAX, ec.loss, ec.weight, ec.num_features); if (ld.label != FLT_MAX && !ec.test_only) @@ -123,7 +123,7 @@ void output_and_account_example(vw& all, active& a, example& ec) all.print_by_ref(all.raw_prediction, ec.partial_prediction, -1, ec.tag); for (auto i : 
all.final_prediction_sink) { - active_print_result(i, ec.pred.scalar, ai, ec.tag); + active_print_result(i, ec.pred.scalar(), ai, ec.tag); } print_update(all, ec); @@ -171,5 +171,7 @@ base_learner* active_setup(options_i& options, vw& all) l->set_finish_example(return_active_example); } + l->label_type = label_type_t::simple; + return make_base(*l); } diff --git a/vowpalwabbit/active_cover.cc b/vowpalwabbit/active_cover.cc index b9de29eada6..056e5151d13 100644 --- a/vowpalwabbit/active_cover.cc +++ b/vowpalwabbit/active_cover.cc @@ -53,7 +53,7 @@ bool dis_test(vw& all, example& ec, single_learner& base, float /* prediction */ // Get loss difference float middle = 0.f; - ec.confidence = fabsf(ec.pred.scalar - middle) / base.sensitivity(ec); + ec.confidence = fabsf(ec.pred.scalar() - middle) / base.sensitivity(ec); float k = (float)all.sd->t; float loss_delta = ec.confidence / k; @@ -112,7 +112,7 @@ float query_decision(active_cover& a, single_learner& l, example& ec, float pred for (size_t i = 0; i < a.cover_size; i++) { l.predict(ec, i + 1); - q2 += ((float)(sign(ec.pred.scalar) != sign(prediction))) * (a.lambda_n[i] / a.lambda_d[i]); + q2 += ((float)(sign(ec.pred.scalar()) != sign(prediction))) * (a.lambda_n[i] / a.lambda_d[i]); } p = std::sqrt(q2) / (1 + std::sqrt(q2)); @@ -141,10 +141,10 @@ void predict_or_learn_active_cover(active_cover& a, single_learner& base, exampl { vw& all = *a.all; - float prediction = ec.pred.scalar; + float prediction = ec.pred.scalar(); float t = (float)a.all->sd->t; float ec_input_weight = ec.weight; - float ec_input_label = ec.l.simple.label; + float ec_input_label = ec.l.simple().label; // Compute threshold defining allowed set A float threshold = get_threshold((float)all.sd->sum_loss, t, a.active_c0, a.alpha); @@ -155,7 +155,7 @@ void predict_or_learn_active_cover(active_cover& a, single_learner& base, exampl // Query (or not) if (!in_dis) // Use predicted label { - ec.l.simple.label = sign(prediction); + ec.l.simple().label = sign(prediction); ec.weight = ec_input_weight; base.learn(ec, 0); } @@ -163,21 +163,21 @@ void predict_or_learn_active_cover(active_cover& a, single_learner& base, exampl { all.sd->queries += 1; ec.weight = ec_input_weight * importance; - ec.l.simple.label = ec_input_label; + ec.l.simple().label = ec_input_label; base.learn(ec, 0); } else // skipped example { // Make sure the loss computation does not include // skipped examples - ec.l.simple.label = FLT_MAX; + ec.l.simple().label = FLT_MAX; ec.weight = 0; } // Update the learners in the cover and their weights float q2 = 4.f * pmin * pmin; float p, s, cost, cost_delta = 0; - float ec_output_label = ec.l.simple.label; + float ec_output_label = ec.l.simple().label; float ec_output_weight = ec.weight; float r = 2.f * threshold * t * a.alpha / a.active_c0 / a.beta_scale; @@ -206,7 +206,7 @@ void predict_or_learn_active_cover(active_cover& a, single_learner& base, exampl // Choose min-cost label as the label // Set importance weight to be the cost difference - ec.l.simple.label = -1.f * sign(cost_delta) * sign(prediction); + ec.l.simple().label = -1.f * sign(cost_delta) * sign(prediction); ec.weight = ec_input_weight * fabs(cost_delta); // Update learner @@ -214,20 +214,20 @@ void predict_or_learn_active_cover(active_cover& a, single_learner& base, exampl base.predict(ec, i + 1); // Update numerator of lambda - a.lambda_n[i] += 2.f * ((float)(sign(ec.pred.scalar) != sign(prediction))) * cost_delta; + a.lambda_n[i] += 2.f * ((float)(sign(ec.pred.scalar()) != sign(prediction))) * 
cost_delta; a.lambda_n[i] = fmax(a.lambda_n[i], 0.f); // Update denominator of lambda - a.lambda_d[i] += ((float)(sign(ec.pred.scalar) != sign(prediction) && in_dis)) / (float)pow(q2, 1.5); + a.lambda_d[i] += ((float)(sign(ec.pred.scalar()) != sign(prediction) && in_dis)) / (float)pow(q2, 1.5); // Accumulating weights of learners in the cover - q2 += ((float)(sign(ec.pred.scalar) != sign(prediction))) * (a.lambda_n[i] / a.lambda_d[i]); + q2 += ((float)(sign(ec.pred.scalar()) != sign(prediction))) * (a.lambda_n[i] / a.lambda_d[i]); } // Restoring the weight, the label, and the prediction ec.weight = ec_output_weight; - ec.l.simple.label = ec_output_label; - ec.pred.scalar = prediction; + ec.l.simple().label = ec_output_label; + ec.pred.scalar() = prediction; } } @@ -281,6 +281,6 @@ base_learner* active_cover_setup(options_i& options, vw& all) // Create new learner learner& l = init_learner( data, base, predict_or_learn_active_cover, predict_or_learn_active_cover, data->cover_size + 1); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/audit_regressor.cc b/vowpalwabbit/audit_regressor.cc index 3478c7d63d8..905693ad507 100644 --- a/vowpalwabbit/audit_regressor.cc +++ b/vowpalwabbit/audit_regressor.cc @@ -269,12 +269,14 @@ LEARNER::base_learner* audit_regressor_setup(options_i& options, vw& all) dat->out_file = new io_buf(); dat->out_file->open_file(out_file.c_str(), all.stdin_off, io_buf::WRITE); + auto base = as_singleline(setup_base(options, all)); LEARNER::learner& ret = - LEARNER::init_learner(dat, as_singleline(setup_base(options, all)), audit_regressor, audit_regressor, 1); + LEARNER::init_learner(dat, base, audit_regressor, audit_regressor, 1); ret.set_end_examples(end_examples); ret.set_finish_example(finish_example); ret.set_finish(finish); ret.set_init_driver(init_driver); + ret.label_type = base->label_type; return LEARNER::make_base(ret); } diff --git a/vowpalwabbit/autolink.cc b/vowpalwabbit/autolink.cc index 988c34933dd..1b88c546e29 100644 --- a/vowpalwabbit/autolink.cc +++ b/vowpalwabbit/autolink.cc @@ -53,7 +53,7 @@ void VW::autolink::learn(LEARNER::single_learner& base, example& ec) void VW::autolink::prepare_example(LEARNER::single_learner& base, example& ec) { base.predict(ec); - float base_pred = ec.pred.scalar; + float base_pred = ec.pred.scalar(); // Add features of label. ec.indices.push_back(autolink_namespace); @@ -63,7 +63,7 @@ void VW::autolink::prepare_example(LEARNER::single_learner& base, example& ec) if (base_pred != 0.) 
{ fs.push_back(base_pred, AUTOCONSTANT + (i << _stride_shift)); - base_pred *= ec.pred.scalar; + base_pred *= ec.pred.scalar(); } } ec.total_sum_feat_sq += fs.sum_feat_sq; @@ -97,6 +97,9 @@ LEARNER::base_learner* autolink_setup(options_i& options, vw& all) return nullptr; auto autolink_reduction = scoped_calloc_or_throw(d, all.weights.stride_shift()); - return make_base(init_learner( - autolink_reduction, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn)); + auto base = as_singleline(setup_base(options, all)); + auto learner = make_base(init_learner( + autolink_reduction, base, predict_or_learn, predict_or_learn)); + learner->label_type = base->label_type; + return learner; } diff --git a/vowpalwabbit/baseline.cc b/vowpalwabbit/baseline.cc index 6c51d4fb864..01700d55bc0 100644 --- a/vowpalwabbit/baseline.cc +++ b/vowpalwabbit/baseline.cc @@ -72,7 +72,7 @@ struct baseline ~baseline() { if (ec) - VW::dealloc_example(simple_label.delete_label, *ec); + ec->~example(); free(ec); } }; @@ -113,7 +113,7 @@ void predict_or_learn(baseline& data, single_learner& base, example& ec) } VW::copy_example_metadata(/*audit=*/false, data.ec, &ec); base.predict(*data.ec); - ec.l.simple.initial = data.ec->pred.scalar; + ec.l.simple().initial = data.ec->pred.scalar(); base.predict(ec); } else @@ -121,10 +121,10 @@ void predict_or_learn(baseline& data, single_learner& base, example& ec) if (is_learn) { - const float pred = ec.pred.scalar; // save 'safe' prediction + const float pred = ec.pred.scalar(); // save 'safe' prediction // now learn - data.ec->l.simple = ec.l.simple; + data.ec->l.simple() = ec.l.simple(); if (!data.global_only) { // move label & constant features data over to baseline example @@ -150,7 +150,7 @@ void predict_or_learn(baseline& data, single_learner& base, example& ec) base.learn(*data.ec); // regress residual - ec.l.simple.initial = data.ec->pred.scalar; + ec.l.simple().initial = data.ec->pred.scalar(); base.learn(ec); if (!data.global_only) @@ -160,7 +160,7 @@ void predict_or_learn(baseline& data, single_learner& base, example& ec) } // return the safe prediction - ec.pred.scalar = pred; + ec.pred.scalar() = pred; } } @@ -175,15 +175,15 @@ float sensitivity(baseline& data, base_learner& base, example& ec) // sensitivity of baseline term VW::copy_example_metadata(/*audit=*/false, data.ec, &ec); - data.ec->l.simple.label = ec.l.simple.label; - data.ec->pred.scalar = ec.pred.scalar; + data.ec->l.simple().label = ec.l.simple().label; + data.ec->pred.scalar() = ec.pred.scalar(); // std::cout << "before base" << std::endl; const float baseline_sens = base.sensitivity(*data.ec); // std::cout << "base sens: " << baseline_sens << std::endl; // sensitivity of residual as_singleline(&base)->predict(*data.ec); - ec.l.simple.initial = data.ec->pred.scalar; + ec.l.simple().initial = data.ec->pred.scalar(); const float sens = base.sensitivity(ec); // std::cout << " residual sens: " << sens << std::endl; return baseline_sens + sens; @@ -213,7 +213,7 @@ base_learner* baseline_setup(options_i& options, vw& all) return nullptr; // initialize baseline example - data->ec = VW::alloc_examples(simple_label.label_size, 1); + data->ec = VW::alloc_examples(1); data->ec->interactions = &all.interactions; data->all = &all; @@ -227,6 +227,6 @@ base_learner* baseline_setup(options_i& options, vw& all) learner& l = init_learner(data, base, predict_or_learn, predict_or_learn); l.set_sensitivity(sensitivity); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git 
a/vowpalwabbit/best_constant.cc b/vowpalwabbit/best_constant.cc index 7b1216f3928..225d418acf5 100644 --- a/vowpalwabbit/best_constant.cc +++ b/vowpalwabbit/best_constant.cc @@ -33,7 +33,7 @@ bool get_best_constant(vw& all, float& best_constant, float& best_constant_loss) else return false; - if ((label1_cnt + label2_cnt) <= 0.) + if ((label1_cnt + label2_cnt) <= 0.f) return false; auto funcName = all.loss->getType(); diff --git a/vowpalwabbit/bfgs.cc b/vowpalwabbit/bfgs.cc index 354f0caa200..8efaebc449d 100644 --- a/vowpalwabbit/bfgs.cc +++ b/vowpalwabbit/bfgs.cc @@ -105,7 +105,6 @@ struct bfgs ~bfgs() { - predictions.delete_v(); free(mem); free(rho); free(alpha); @@ -143,7 +142,7 @@ void reset_state(vw& all, bfgs& b, bool zero) // w[2] = step direction // w[3] = preconditioner -constexpr bool test_example(example& ec) noexcept { return ec.l.simple.label == FLT_MAX; } +bool test_example(example& ec) noexcept { return ec.l.simple().label == FLT_MAX; } float bfgs_predict(vw& all, example& ec) { @@ -156,7 +155,7 @@ inline void add_grad(float& d, float f, float& fw) { (&fw)[W_GT] += d * f; } float predict_and_gradient(vw& all, example& ec) { float fp = bfgs_predict(all, ec); - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); all.set_minmax(all.sd, ld.label); float loss_grad = all.loss->first_derivative(all.sd, fp, ld.label) * ec.weight; @@ -169,7 +168,7 @@ inline void add_precond(float& d, float f, float& fw) { (&fw)[W_COND] += d * f * void update_preconditioner(vw& all, example& ec) { - float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight; + float curvature = all.loss->second_derivative(all.sd, ec.pred.scalar(), ec.l.simple().label) * ec.weight; GD::foreach_feature(all, ec, curvature); } @@ -177,7 +176,7 @@ inline void add_DIR(float& p, const float fx, float& fw) { p += (&fw)[W_DIR] * f float dot_with_direction(vw& all, example& ec) { - float temp = ec.l.simple.initial; + float temp = ec.l.simple().initial; GD::foreach_feature(all, ec, temp); return temp; } @@ -859,7 +858,7 @@ int process_pass(vw& all, bfgs& b) void process_example(vw& all, bfgs& b, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); if (b.first_pass) b.importance_weight_sum += ec.weight; @@ -868,10 +867,10 @@ void process_example(vw& all, bfgs& b, example& ec) /********************************************************************/ if (b.gradient_pass) { - ec.pred.scalar = predict_and_gradient(all, ec); // w[0] & w[1] - ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight; + ec.pred.scalar() = predict_and_gradient(all, ec); // w[0] & w[1] + ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar(), ld.label) * ec.weight; b.loss_sum += ec.loss; - b.predictions.push_back(ec.pred.scalar); + b.predictions.push_back(ec.pred.scalar()); } /********************************************************************/ /* II) CURVATURE CALCULATION ****************************************/ @@ -881,13 +880,13 @@ void process_example(vw& all, bfgs& b, example& ec) float d_dot_x = dot_with_direction(all, ec); // w[2] if (b.example_number >= b.predictions.size()) // Make things safe in case example source is strange. 
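// (i.e. clamp the index so the predictions[] lookups below stay in bounds)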
b.example_number = b.predictions.size() - 1; - ec.pred.scalar = b.predictions[b.example_number]; + ec.pred.scalar() = b.predictions[b.example_number]; ec.partial_prediction = b.predictions[b.example_number]; - ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight; + ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar(), ld.label) * ec.weight; float sd = all.loss->second_derivative(all.sd, b.predictions[b.example_number++], ld.label); b.curvature += ((double)d_dot_x) * d_dot_x * sd * ec.weight; } - ec.updated_prediction = ec.pred.scalar; + ec.updated_prediction = ec.pred.scalar(); if (b.preconditioner_pass) update_preconditioner(all, ec); // w[3] @@ -955,7 +954,7 @@ template void predict(bfgs& b, base_learner&, example& ec) { vw* all = b.all; - ec.pred.scalar = bfgs_predict(*all, ec); + ec.pred.scalar() = bfgs_predict(*all, ec); if (audit) GD::print_audit_features(*(b.all), ec); } @@ -1166,6 +1165,7 @@ base_learner* bfgs_setup(options_i& options, vw& all) l->set_save_load(save_load); l->set_init_driver(init_driver); l->set_end_pass(end_pass); + l->label_type = label_type_t::simple; return make_base(*l); } diff --git a/vowpalwabbit/binary.cc b/vowpalwabbit/binary.cc index c6441509c8c..f3fcbc83a90 100644 --- a/vowpalwabbit/binary.cc +++ b/vowpalwabbit/binary.cc @@ -15,16 +15,16 @@ void predict_or_learn(char&, LEARNER::single_learner& base, example& ec) else base.predict(ec); - if (ec.pred.scalar > 0) - ec.pred.scalar = 1; + if (ec.pred.scalar() > 0) + ec.pred.scalar() = 1; else - ec.pred.scalar = -1; + ec.pred.scalar() = -1; - if (ec.l.simple.label != FLT_MAX) + if (ec.l.simple().label != FLT_MAX) { - if (fabs(ec.l.simple.label) != 1.f) - std::cout << "You are using label " << ec.l.simple.label << " not -1 or 1 as loss function expects!" << std::endl; - else if (ec.l.simple.label == ec.pred.scalar) + if (fabs(ec.l.simple().label) != 1.f) + std::cout << "You are using label " << ec.l.simple().label << " not -1 or 1 as loss function expects!" 
<< std::endl; + else if (ec.l.simple().label == ec.pred.scalar()) ec.loss = 0.; else ec.loss = ec.weight; @@ -43,5 +43,6 @@ LEARNER::base_learner* binary_setup(options_i& options, vw& all) LEARNER::learner& ret = LEARNER::init_learner(as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn); + ret.label_type = label_type_t::simple; return make_base(ret); } diff --git a/vowpalwabbit/boosting.cc b/vowpalwabbit/boosting.cc index 17a33f6e9db..0f848254711 100644 --- a/vowpalwabbit/boosting.cc +++ b/vowpalwabbit/boosting.cc @@ -74,7 +74,7 @@ struct boosting template void predict_or_learn(boosting& o, LEARNER::single_learner& base, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); float final_prediction = 0; @@ -112,25 +112,25 @@ void predict_or_learn(boosting& o, LEARNER::single_learner& base, example& ec) base.predict(ec, i); - // ec.pred.scalar is now the i-th learner prediction on this example - s += ld.label * ec.pred.scalar; + // ec.pred.scalar() is now the i-th learner prediction on this example + s += ld.label * ec.pred.scalar(); - final_prediction += ec.pred.scalar; + final_prediction += ec.pred.scalar(); base.learn(ec, i); } else { base.predict(ec, i); - final_prediction += ec.pred.scalar; + final_prediction += ec.pred.scalar(); } } ec.weight = u; ec.partial_prediction = final_prediction; - ec.pred.scalar = sign(final_prediction); + ec.pred.scalar() = sign(final_prediction); - if (ld.label == ec.pred.scalar) + if (ld.label == ec.pred.scalar()) ec.loss = 0.; else ec.loss = ec.weight; @@ -142,7 +142,7 @@ void predict_or_learn(boosting& o, LEARNER::single_learner& base, example& ec) template void predict_or_learn_logistic(boosting& o, LEARNER::single_learner& base, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); float final_prediction = 0; @@ -163,13 +163,13 @@ void predict_or_learn_logistic(boosting& o, LEARNER::single_learner& base, examp base.predict(ec, i); float z; - z = ld.label * ec.pred.scalar; + z = ld.label * ec.pred.scalar(); s += z * o.alpha[i]; - // if ld.label * ec.pred.scalar < 0, learner i made a mistake + // if ld.label * ec.pred.scalar() < 0, learner i made a mistake - final_prediction += ec.pred.scalar * o.alpha[i]; + final_prediction += ec.pred.scalar() * o.alpha[i]; // update alpha o.alpha[i] += eta * z / (1 + correctedExp(s)); @@ -183,15 +183,15 @@ void predict_or_learn_logistic(boosting& o, LEARNER::single_learner& base, examp else { base.predict(ec, i); - final_prediction += ec.pred.scalar * o.alpha[i]; + final_prediction += ec.pred.scalar() * o.alpha[i]; } } ec.weight = u; ec.partial_prediction = final_prediction; - ec.pred.scalar = sign(final_prediction); + ec.pred.scalar() = sign(final_prediction); - if (ld.label == ec.pred.scalar) + if (ld.label == ec.pred.scalar()) ec.loss = 0.; else ec.loss = ec.weight; @@ -200,7 +200,7 @@ void predict_or_learn_logistic(boosting& o, LEARNER::single_learner& base, examp template void predict_or_learn_adaptive(boosting& o, LEARNER::single_learner& base, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); float final_prediction = 0, partial_prediction = 0; @@ -225,16 +225,16 @@ void predict_or_learn_adaptive(boosting& o, LEARNER::single_learner& base, examp base.predict(ec, i); float z; - z = ld.label * ec.pred.scalar; + z = ld.label * ec.pred.scalar(); s += z * o.alpha[i]; if (v_partial_sum <= stopping_point) { - final_prediction += ec.pred.scalar * o.alpha[i]; + final_prediction += ec.pred.scalar() * o.alpha[i]; } - 
partial_prediction += ec.pred.scalar * o.alpha[i]; + partial_prediction += ec.pred.scalar() * o.alpha[i]; v_partial_sum += o.v[i]; @@ -259,7 +259,7 @@ void predict_or_learn_adaptive(boosting& o, LEARNER::single_learner& base, examp base.predict(ec, i); if (v_partial_sum <= stopping_point) { - final_prediction += ec.pred.scalar * o.alpha[i]; + final_prediction += ec.pred.scalar() * o.alpha[i]; } else { @@ -282,9 +282,9 @@ void predict_or_learn_adaptive(boosting& o, LEARNER::single_learner& base, examp ec.weight = u; ec.partial_prediction = final_prediction; - ec.pred.scalar = sign(final_prediction); + ec.pred.scalar() = sign(final_prediction); - if (ld.label == ec.pred.scalar) + if (ld.label == ec.pred.scalar()) ec.loss = 0.; else ec.loss = ec.weight; @@ -448,6 +448,6 @@ LEARNER::base_learner* boosting_setup(options_i& options, vw& all) THROW("Unrecognized boosting algorithm: \'" << data->alg << "\' Bailing!"); l->set_finish_example(return_example); - + l->label_type = label_type_t::simple; return make_base(*l); } diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc index 51eb0d12807..977e5f098fd 100644 --- a/vowpalwabbit/bs.cc +++ b/vowpalwabbit/bs.cc @@ -24,18 +24,16 @@ struct bs size_t bs_type; float lb; float ub; - std::vector* pred_vec; + std::vector pred_vec; vw* all; // for raw prediction and loss std::shared_ptr _random_state; - - ~bs() { delete pred_vec; } }; void bs_predict_mean(vw& all, example& ec, std::vector& pred_vec) { - ec.pred.scalar = (float)accumulate(pred_vec.cbegin(), pred_vec.cend(), 0.0) / pred_vec.size(); - if (ec.weight > 0 && ec.l.simple.label != FLT_MAX) - ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ec.l.simple.label) * ec.weight; + ec.pred.scalar() = (float)accumulate(pred_vec.cbegin(), pred_vec.cend(), 0.0) / pred_vec.size(); + if (ec.weight > 0 && ec.l.simple().label != FLT_MAX) + ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar(), ec.l.simple().label) * ec.weight; } void bs_predict_vote(example& ec, std::vector& pred_vec) @@ -124,11 +122,11 @@ void bs_predict_vote(example& ec, std::vector& pred_vec) delete[] pred_vec_int; // ld.prediction = sum_labels/(float)counter; //replace line below for: "avg on votes" and getLoss() - ec.pred.scalar = (float)current_label; + ec.pred.scalar() = (float)current_label; // ec.loss = all.loss->getLoss(all.sd, ld.prediction, ld.label) * ec.weight; //replace line below for: "avg on votes" // and getLoss() - ec.loss = ((ec.pred.scalar == ec.l.simple.label) ? 0.f : 1.f) * ec.weight; + ec.loss = ((ec.pred.scalar() == ec.l.simple().label) ? 
0.f : 1.f) * ec.weight; } void print_result(int f, float res, v_array tag, float lb, float ub) @@ -148,7 +146,7 @@ void print_result(int f, float res, v_array tag, float lb, float ub) void output_example(vw& all, bs& d, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); all.sd->update(ec.test_only, ld.label != FLT_MAX, ec.loss, ec.weight, ec.num_features); if (ld.label != FLT_MAX && !ec.test_only) @@ -158,7 +156,7 @@ void output_example(vw& all, bs& d, example& ec) { d.lb = FLT_MAX; d.ub = -FLT_MAX; - for (double v : *d.pred_vec) + for (double v : d.pred_vec) { if (v > d.ub) d.ub = (float)v; @@ -167,7 +165,8 @@ void output_example(vw& all, bs& d, example& ec) } } - for (int sink : all.final_prediction_sink) print_result(sink, ec.pred.scalar, ec.tag, d.lb, d.ub); + for (int sink : all.final_prediction_sink) + print_result(sink, ec.pred.scalar(), ec.tag, d.lb, d.ub); print_update(all, ec); } @@ -181,7 +180,7 @@ void predict_or_learn(bs& d, single_learner& base, example& ec) float weight_temp = ec.weight; std::stringstream outputStringStream; - d.pred_vec->clear(); + d.pred_vec.clear(); for (size_t i = 1; i <= d.B; i++) { @@ -192,7 +191,7 @@ void predict_or_learn(bs& d, single_learner& base, example& ec) else base.predict(ec, i - 1); - d.pred_vec->push_back(ec.pred.scalar); + d.pred_vec.push_back(ec.pred.scalar()); if (shouldOutput) { @@ -207,10 +206,10 @@ void predict_or_learn(bs& d, single_learner& base, example& ec) switch (d.bs_type) { case BS_TYPE_MEAN: - bs_predict_mean(all, ec, *d.pred_vec); + bs_predict_mean(all, ec, d.pred_vec); break; case BS_TYPE_VOTE: - bs_predict_vote(ec, *d.pred_vec); + bs_predict_vote(ec, d.pred_vec); break; default: THROW("Unknown bs_type specified: " << d.bs_type); @@ -256,14 +255,13 @@ base_learner* bs_setup(options_i& options, vw& all) else // by default use mean data->bs_type = BS_TYPE_MEAN; - data->pred_vec = new std::vector(); - data->pred_vec->reserve(data->B); + data->pred_vec.reserve(data->B); data->all = &all; data->_random_state = all.get_random_state(); learner& l = init_learner( data, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn, data->B); l.set_finish_example(finish_example); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/cache.cc b/vowpalwabbit/cache.cc index 011b462b8bd..289e91d4422 100644 --- a/vowpalwabbit/cache.cc +++ b/vowpalwabbit/cache.cc @@ -67,7 +67,8 @@ int read_cached_features(vw* all, v_array& examples) ae->sorted = all->p->sorted_cache; io_buf* input = all->p->input; - size_t total = all->p->lp.read_cached_label(all->p->_shared_data, &ae->l, *input); + all->p->lp.default_label(ae->l); + size_t total = all->p->lp.read_cached_label(all->p->_shared_data, ae->l, *input); if (total == 0) return 0; if (read_cached_tag(*input, ae) == 0) @@ -188,7 +189,7 @@ void output_features(io_buf& cache, unsigned char index, features& fs, uint64_t *(size_t*)storage_size_loc = c - storage_size_loc - sizeof(size_t); } -void cache_tag(io_buf& cache, v_array tag) +void cache_tag(io_buf& cache, const v_array& tag) { char* c; cache.buf_write(c, sizeof(size_t) + tag.size()); diff --git a/vowpalwabbit/cache.h b/vowpalwabbit/cache.h index 696b9b42976..d05a4245956 100644 --- a/vowpalwabbit/cache.h +++ b/vowpalwabbit/cache.h @@ -11,7 +11,7 @@ char* run_len_decode(char* p, size_t& i); char* run_len_encode(char* p, size_t i); int read_cached_features(vw* all, v_array& examples); -void cache_tag(io_buf& cache, v_array tag); +void cache_tag(io_buf& cache, const 
v_array& tag); void cache_features(io_buf& cache, example* ae, uint64_t mask); void output_byte(io_buf& cache, unsigned char s); void output_features(io_buf& cache, unsigned char index, features& fs, uint64_t mask); diff --git a/vowpalwabbit/cb.cc b/vowpalwabbit/cb.cc index c26581badf7..17d8b1b58fe 100644 --- a/vowpalwabbit/cb.cc +++ b/vowpalwabbit/cb.cc @@ -13,12 +13,12 @@ using namespace LEARNER; namespace CB { -char* bufread_label(CB::label* ld, char* c, io_buf& cache) +char* bufread_label(CB::label& ld, char* c, io_buf& cache) { size_t num = *(size_t*)c; - ld->costs.clear(); + ld.costs.clear(); c += sizeof(size_t); - size_t total = sizeof(cb_class) * num + sizeof(ld->weight); + size_t total = sizeof(cb_class) * num + sizeof(ld.weight); if (cache.buf_read(c, total) < total) { std::cout << "error in demarshal of cost data" << std::endl; @@ -28,17 +28,16 @@ char* bufread_label(CB::label* ld, char* c, io_buf& cache) { cb_class temp = *(cb_class*)c; c += sizeof(cb_class); - ld->costs.push_back(temp); + ld.costs.push_back(temp); } - memcpy(&ld->weight, c, sizeof(ld->weight)); - c += sizeof(ld->weight); + memcpy(&ld.weight, c, sizeof(ld.weight)); + c += sizeof(ld.weight); return c; } -size_t read_cached_label(shared_data*, void* v, io_buf& cache) +size_t read_cached_label(shared_data*, CB::label& ld, io_buf& cache) { - CB::label* ld = (CB::label*)v; - ld->costs.clear(); + ld.costs.clear(); char* c; size_t total = sizeof(size_t); if (cache.buf_read(c, total) < total) @@ -48,71 +47,70 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) return total; } -float weight(void* v) +size_t read_cached_label(shared_data* s, polylabel& v, io_buf& cache) { - CB::label* ld = (CB::label*)v; - return ld->weight; + return CB::read_cached_label(s, v.cb(), cache); } -char* bufcache_label(CB::label* ld, char* c) +float weight(CB::label& ld) { return ld.weight; } + +float weight(polylabel& v) { return CB::weight(v.cb()); } + +char* bufcache_label(CB::label& ld, char* c) { - *(size_t*)c = ld->costs.size(); + *(size_t*)c = ld.costs.size(); c += sizeof(size_t); - for (auto const& cost : ld->costs) + for (auto const& cost : ld.costs) { *(cb_class*)c = cost; c += sizeof(cb_class); } - memcpy(c, &ld->weight, sizeof(ld->weight)); - c += sizeof(ld->weight); + memcpy(c, &ld.weight, sizeof(ld.weight)); + c += sizeof(ld.weight); return c; } -void cache_label(void* v, io_buf& cache) +void cache_label(CB::label& ld, io_buf& cache) { char* c; - CB::label* ld = (CB::label*)v; - cache.buf_write(c, sizeof(size_t) + sizeof(cb_class) * ld->costs.size() + sizeof(ld->weight)); + cache.buf_write(c, sizeof(size_t) + sizeof(cb_class) * ld.costs.size() + sizeof(ld.weight)); bufcache_label(ld, c); } -void default_label(void* v) +void cache_label(polylabel& v, io_buf& cache) { CB::cache_label(v.cb(), cache); } + +void default_label(CB::label& ld) +{ + ld.costs.clear(); + ld.weight = 1; +} + +void default_label(polylabel& v) { - CB::label* ld = (CB::label*)v; - ld->costs.clear(); - ld->weight = 1; + if (v.get_type() != label_type_t::cb) + { + v.reset(); + v.init_as_cb(); + } + CB::default_label(v.cb()); } -bool test_label(void* v) +bool test_label(CB::label& ld) { - CB::label* ld = (CB::label*)v; - if (ld->costs.empty()) + if (ld.costs.empty()) return true; - for (auto const& cost : ld->costs) + for (auto const& cost : ld.costs) if (FLT_MAX != cost.cost && cost.probability > 0.) 
return false; return true; } -void delete_label(void* v) -{ - CB::label* ld = (CB::label*)v; - ld->costs.delete_v(); -} +bool test_label(polylabel& v) { return CB::test_label(v.cb()); } -void copy_label(void* dst, void* src) +void parse_label(parser* p, shared_data*, CB::label& ld, v_array& words) { - CB::label* ldD = (CB::label*)dst; - CB::label* ldS = (CB::label*)src; - copy_array(ldD->costs, ldS->costs); - ldD->weight = ldS->weight; -} - -void parse_label(parser* p, shared_data*, void* v, v_array& words) -{ - CB::label* ld = (CB::label*)v; - ld->costs.clear(); - ld->weight = 1.0; + ld.costs.clear(); + ld.weight = 1.0; for (auto const& word : words) { @@ -159,20 +157,29 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor std::cerr << "shared feature vectors should not have costs" << std::endl; } - ld->costs.push_back(f); + ld.costs.push_back(f); } } -label_parser cb_label = {default_label, parse_label, cache_label, read_cached_label, delete_label, weight, copy_label, - test_label, sizeof(label)}; +void parse_label(parser* p, shared_data* sd, polylabel& v, v_array& words) +{ + CB::parse_label(p, sd, v.cb(), words); +} + +label_parser cb_label = {default_label, parse_label, cache_label, read_cached_label,polylabel_delete_label, + weight, polylabel_copy_label, test_label, sizeof(label)}; bool ec_is_example_header(example const& ec) // example headers just have "shared" { - const auto& costs = ec.l.cb.costs; - if (costs.size() != 1) - return false; - if (costs[0].probability == -1.f) - return true; + if (ec.l.get_type() == label_type_t::cb) + { + const auto& costs = ec.l.cb().costs; + if (costs.size() != 1) + return false; + if (costs[0].probability == -1.f) + return true; + } + return false; } @@ -182,7 +189,6 @@ void print_update(vw& all, bool is_test, example& ec, multi_ex* ec_seq, bool act { size_t num_features = ec.num_features; - size_t pred = ec.pred.multiclass; if (ec_seq != nullptr) { num_features = 0; @@ -200,94 +206,89 @@ void print_update(vw& all, bool is_test, example& ec, multi_ex* ec_seq, bool act if (action_scores) { std::ostringstream pred_buf; + const auto& a_s = ec.pred.action_probs(); pred_buf << std::setw(shared_data::col_current_predict) << std::right << std::setfill(' '); - if (!ec.pred.a_s.empty()) - pred_buf << ec.pred.a_s[0].action << ":" << ec.pred.a_s[0].score << "..."; + if (!a_s.empty()) + pred_buf << ec.pred.action_probs()[0].action << ":" << a_s[0].score << "..."; else pred_buf << "no action"; all.sd->print_update(all.holdout_set_off, all.current_pass, label_buf, pred_buf.str(), num_features, all.progress_add, all.progress_arg); } else + { + size_t pred = ec.pred.multiclass(); all.sd->print_update(all.holdout_set_off, all.current_pass, label_buf, (uint32_t)pred, num_features, all.progress_add, all.progress_arg); + } } } } // namespace CB namespace CB_EVAL { -float weight(void* v) +float weight(polylabel& v) { - CB_EVAL::label* ld = (CB_EVAL::label*)v; - return ld->event.weight; + auto& ld = v.cb_eval(); + return ld.event.weight; } -size_t read_cached_label(shared_data* sd, void* v, io_buf& cache) +size_t read_cached_label(shared_data* sd, polylabel& v, io_buf& cache) { - CB_EVAL::label* ld = (CB_EVAL::label*)v; + auto& ld = v.cb_eval(); char* c; size_t total = sizeof(uint32_t); if (cache.buf_read(c, total) < total) return 0; - ld->action = *(uint32_t*)c; + ld.action = *(uint32_t*)c; - return total + CB::read_cached_label(sd, &(ld->event), cache); + return total + CB::read_cached_label(sd, ld.event, cache); } -void cache_label(void* v, io_buf& 
cache) +void cache_label(polylabel& v, io_buf& cache) { char* c; - CB_EVAL::label* ld = (CB_EVAL::label*)v; + auto& ld = v.cb_eval(); cache.buf_write(c, sizeof(uint32_t)); - *(uint32_t*)c = ld->action; + *(uint32_t*)c = ld.action; - CB::cache_label(&(ld->event), cache); + CB::cache_label(ld.event, cache); } -void default_label(void* v) +void default_label(polylabel& v) { - CB_EVAL::label* ld = (CB_EVAL::label*)v; - CB::default_label(&(ld->event)); - ld->action = 0; -} - -bool test_label(void* v) -{ - CB_EVAL::label* ld = (CB_EVAL::label*)v; - return CB::test_label(&ld->event); -} + if (v.get_type() != label_type_t::cb_eval) + { + v.reset(); + v.init_as_cb_eval(); -void delete_label(void* v) -{ - CB_EVAL::label* ld = (CB_EVAL::label*)v; - CB::delete_label(&(ld->event)); + } + auto& ld = v.cb_eval(); + CB::default_label(ld.event); + ld.action = 0; } -void copy_label(void* dst, void* src) +bool test_label(polylabel& v) { - CB_EVAL::label* ldD = (CB_EVAL::label*)dst; - CB_EVAL::label* ldS = (CB_EVAL::label*)src; - CB::copy_label(&(ldD->event), &(ldS)->event); - ldD->action = ldS->action; + auto& ld = v.cb_eval(); + return CB::test_label(ld.event); } -void parse_label(parser* p, shared_data* sd, void* v, v_array& words) +void parse_label(parser* p, shared_data* sd, polylabel& v, v_array& words) { - CB_EVAL::label* ld = (CB_EVAL::label*)v; + auto& ld = v.cb_eval(); if (words.size() < 2) THROW("Evaluation can not happen without an action and an exploration"); - ld->action = (uint32_t)hashstring(words[0].begin(), words[0].length(), 0); + ld.action = (uint32_t)hashstring(words[0].begin(), words[0].length(), 0); words.begin()++; - CB::parse_label(p, sd, &(ld->event), words); + CB::parse_label(p, sd, ld.event, words); words.begin()--; } -label_parser cb_eval = {default_label, parse_label, cache_label, read_cached_label, delete_label, weight, copy_label, - test_label, sizeof(CB_EVAL::label)}; -} // namespace CB_EVAL +label_parser cb_eval = {default_label, parse_label, cache_label, read_cached_label, polylabel_delete_label, weight, polylabel_copy_label, + test_label, sizeof(CB_EVAL::label)};} // namespace CB_EVAL diff --git a/vowpalwabbit/cb.h b/vowpalwabbit/cb.h index 27893dff99c..a37dec96a56 100644 --- a/vowpalwabbit/cb.h +++ b/vowpalwabbit/cb.h @@ -26,6 +26,9 @@ struct label float weight; }; +bool test_label(label& ld); +void default_label(label& ld); + extern label_parser cb_label; // for learning bool ec_is_example_header(example const& ec); // example headers look like "shared" diff --git a/vowpalwabbit/cb_adf.cc b/vowpalwabbit/cb_adf.cc index 053cdfad3d4..6481b6af53f 100644 --- a/vowpalwabbit/cb_adf.cc +++ b/vowpalwabbit/cb_adf.cc @@ -69,21 +69,6 @@ struct cb_adf const VW::version_struct* get_model_file_ver() const { return _model_file_ver; } - ~cb_adf() - { - _cb_labels.delete_v(); - for (auto& prepped_cs_label : _prepped_cs_labels) prepped_cs_label.costs.delete_v(); - _prepped_cs_labels.delete_v(); - _cs_labels.costs.delete_v(); - _backup_weights.delete_v(); - _backup_nf.delete_v(); - _prob_s.delete_v(); - - _a_s.delete_v(); - _a_s_mtr_cs.delete_v(); - _gen_cs.pred_scores.costs.delete_v(); - } - private: void learn_IPS(multi_learner& base, multi_ex& examples); void learn_DR(multi_learner& base, multi_ex& examples); @@ -103,9 +88,9 @@ CB::cb_class get_observed_cost(multi_ex& examples) size_t i = 0; for (example*& ec : examples) { - if (ec->l.cb.costs.size() == 1 && ec->l.cb.costs[0].cost != FLT_MAX && ec->l.cb.costs[0].probability > 0) + if (ec->l.cb().costs.size() == 1 && 
ec->l.cb().costs[0].cost != FLT_MAX && ec->l.cb().costs[0].probability > 0) { - ld = &ec->l.cb; + ld = &ec->l.cb(); index = (int)i; } ++i; @@ -146,10 +131,10 @@ void cb_adf::learn_SM(multi_learner& base, multi_ex& examples) _a_s.clear(); _prob_s.clear(); // TODO: Check that predicted scores are always stored with the first example - for (uint32_t i = 0; i < examples[0]->pred.a_s.size(); i++) + for (uint32_t i = 0; i < examples[0]->pred.action_probs().size(); i++) { - _a_s.push_back({examples[0]->pred.a_s[i].action, examples[0]->pred.a_s[i].score}); - _prob_s.push_back({examples[0]->pred.a_s[i].action, 0.0}); + _a_s.push_back({examples[0]->pred.action_probs()[i].action, examples[0]->pred.action_probs()[i].score}); + _prob_s.push_back({examples[0]->pred.action_probs()[i].action, 0.0}); } float sign_offset = 1.0; // To account for negative rewards/costs @@ -158,7 +143,7 @@ void cb_adf::learn_SM(multi_learner& base, multi_ex& examples) for (uint32_t i = 0; i < examples.size(); i++) { - CB::label ld = examples[i]->l.cb; + CB::label ld = examples[i]->l.cb(); if (ld.costs.size() == 1 && ld.costs[0].cost != FLT_MAX) { chosen_action = i; @@ -241,7 +226,7 @@ void cb_adf::learn_MTR(multi_learner& base, multi_ex& examples) { gen_cs_example_ips(examples, _cs_labels); call_cs_ldf(base, examples, _cb_labels, _cs_labels, _prepped_cs_labels, _offset); - std::swap(examples[0]->pred.a_s, _a_s); + std::swap(examples[0]->pred.action_probs(), _a_s); } // second train on _one_ action (which requires up to 3 examples). // We must go through the cost sensitive classifier layer to get @@ -249,16 +234,16 @@ void cb_adf::learn_MTR(multi_learner& base, multi_ex& examples) gen_cs_example_mtr(_gen_cs, examples, _cs_labels); uint32_t nf = (uint32_t)examples[_gen_cs.mtr_example]->num_features; float old_weight = examples[_gen_cs.mtr_example]->weight; - const float clipped_p = std::max(examples[_gen_cs.mtr_example]->l.cb.costs[0].probability, _clip_p); + const float clipped_p = std::max(examples[_gen_cs.mtr_example]->l.cb().costs[0].probability, _clip_p); examples[_gen_cs.mtr_example]->weight *= 1.f / clipped_p * ((float)_gen_cs.event_sum / (float)_gen_cs.action_sum); - std::swap(_gen_cs.mtr_ec_seq[0]->pred.a_s, _a_s_mtr_cs); + std::swap(_gen_cs.mtr_ec_seq[0]->pred.action_probs(), _a_s_mtr_cs); // TODO!!! cb_labels are not getting properly restored (empty costs are dropped) GEN_CS::call_cs_ldf(base, _gen_cs.mtr_ec_seq, _cb_labels, _cs_labels, _prepped_cs_labels, _offset); examples[_gen_cs.mtr_example]->num_features = nf; examples[_gen_cs.mtr_example]->weight = old_weight; - std::swap(_gen_cs.mtr_ec_seq[0]->pred.a_s, _a_s_mtr_cs); - std::swap(examples[0]->pred.a_s, _a_s); + std::swap(_gen_cs.mtr_ec_seq[0]->pred.action_probs(), _a_s_mtr_cs); + std::swap(examples[0]->pred.action_probs(), _a_s); } // Validates a multiline example collection as a valid sequence for action dependent features format. @@ -272,11 +257,11 @@ example* test_adf_sequence(multi_ex& ec_seq) for (auto* ec : ec_seq) { // Check if there is more than one cost for this example. - if (ec->l.cb.costs.size() > 1) + if (ec->l.cb().costs.size() > 1) THROW("cb_adf: badly formatted example, only one cost can be known."); // Check whether the cost was initialized to a value. 
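// (an observed cost is one stored with cost != FLT_MAX; at most one example in the sequence may carry it)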
- if (ec->l.cb.costs.size() == 1 && ec->l.cb.costs[0].cost != FLT_MAX) + if (ec->l.cb().costs.size() == 1 && ec->l.cb().costs[0].cost != FLT_MAX) { ret = ec; count += 1; @@ -298,8 +283,8 @@ void cb_adf::do_actual_learning(multi_learner& base, multi_ex& ec_seq) /* v_array temp_scores; temp_scores = v_init(); do_actual_learning(data,base); - for (size_t i = 0; i < data.ec_seq[0]->pred.a_s.size(); i++) - temp_scores.push_back(data.ec_seq[0]->pred.a_s[i].score);*/ + for (size_t i = 0; i < data.ec_seq[0]->pred.action_scores().size(); i++) + temp_scores.push_back(data.ec_seq[0]->pred.action_scores()[i].score);*/ switch (_gen_cs.cb_type) { case CB_TYPE_IPS: @@ -325,9 +310,9 @@ void cb_adf::do_actual_learning(multi_learner& base, multi_ex& ec_seq) } /* for (size_t i = 0; i < temp_scores.size(); i++) - if (temp_scores[i] != data.ec_seq[0]->pred.a_s[i].score) - std::cout << "problem! " << temp_scores[i] << " != " << data.ec_seq[0]->pred.a_s[i].score << " for " << - data.ec_seq[0]->pred.a_s[i].action << std::endl; temp_scores.delete_v();*/ + if (temp_scores[i] != data.ec_seq[0]->pred.action_scores()[i].score) + std::cout << "problem! " << temp_scores[i] << " != " << data.ec_seq[0]->pred.action_scores()[i].score << " for " << + data.ec_seq[0]->pred.action_scores()[i].action << std::endl; temp_scores.delete_v();*/ } else { @@ -355,7 +340,7 @@ bool cb_adf::update_statistics(example& ec, multi_ex* ec_seq) { size_t num_features = 0; - uint32_t action = ec.pred.a_s[0].action; + uint32_t action = ec.pred.action_probs()[0].action; for (const auto& example : *ec_seq) num_features += example->num_features; float loss = 0.; @@ -380,15 +365,14 @@ void output_example(vw& all, cb_adf& c, example& ec, multi_ex* ec_seq) bool labeled_example = c.update_statistics(ec, ec_seq); - uint32_t action = ec.pred.a_s[0].action; - for (int sink : all.final_prediction_sink) - all.print_by_ref(sink, (float)action, 0, ec.tag); + uint32_t action = ec.pred.action_probs()[0].action; + for (int sink : all.final_prediction_sink) all.print_by_ref(sink, (float)action, 0, ec.tag); if (all.raw_prediction > 0) { std::string outputString; std::stringstream outputStringStream(outputString); - const auto& costs = ec.l.cb.costs; + const auto& costs = ec.l.cb().costs; for (size_t i = 0; i < costs.size(); i++) { @@ -404,14 +388,14 @@ void output_example(vw& all, cb_adf& c, example& ec, multi_ex* ec_seq) void output_rank_example(vw& all, cb_adf& c, example& ec, multi_ex* ec_seq) { - const auto& costs = ec.l.cb.costs; + const auto& costs = ec.l.cb().costs; if (example_is_newline_not_header(ec)) return; bool labeled_example = c.update_statistics(ec, ec_seq); - for (int sink : all.final_prediction_sink) print_action_score(sink, ec.pred.a_s, ec.tag); + for (int sink : all.final_prediction_sink) print_action_score(sink, ec.pred.action_probs(), ec.tag); if (all.raw_prediction > 0) { @@ -540,8 +524,6 @@ base_learner* cb_adf_setup(options_i& options, vw& all) all.trace_message << "warning: clipping probability not yet implemented for cb_type sm; p will not be clipped." << std::endl; - all.delete_prediction = ACTION_SCORE::delete_action_scores; - // Push necessary flags. 
if ((!options.was_supplied("csoaa_ldf") && !options.was_supplied("wap_ldf")) || rank_all || !options.was_supplied("csoaa_rank")) @@ -570,11 +552,12 @@ base_learner* cb_adf_setup(options_i& options, vw& all) cb_adf* bare = ld.get(); learner& l = - init_learner(ld, base, learn, predict, problem_multiplier, prediction_type_t::action_scores); + init_learner(ld, base, learn, predict, problem_multiplier, prediction_type_t::action_probs); l.set_finish_example(CB_ADF::finish_multiline_example); bare->set_scorer(all.scorer); l.set_save_load(CB_ADF::save_load); + l.label_type = label_type_t::cb; return make_base(l); } diff --git a/vowpalwabbit/cb_algs.cc b/vowpalwabbit/cb_algs.cc index 2c2d9614ea7..447433acf5a 100644 --- a/vowpalwabbit/cb_algs.cc +++ b/vowpalwabbit/cb_algs.cc @@ -21,12 +21,6 @@ struct cb { cb_to_cs cbcs; COST_SENSITIVE::label cb_cs_ld; - - ~cb() - { - cb_cs_ld.costs.delete_v(); - COST_SENSITIVE::cs_label.delete_label(&cbcs.pred_scores); - } }; bool know_all_cost_example(CB::label& ld) @@ -47,7 +41,7 @@ bool know_all_cost_example(CB::label& ld) template void predict_or_learn(cb& data, single_learner& base, example& ec) { - CB::label ld = ec.l.cb; + CB::label ld = std::move(ec.l.cb()); cb_to_cs& c = data.cbcs; c.known_cost = get_observed_cost(ld); if (c.known_cost != nullptr && (c.known_cost->action < 1 || c.known_cost->action > c.num_actions)) @@ -58,8 +52,8 @@ void predict_or_learn(cb& data, single_learner& base, example& ec) if (c.cb_type != CB_TYPE_DM) { - ec.l.cs = data.cb_cs_ld; - + ec.l.reset(); + ec.l.init_as_cs(data.cb_cs_ld); if (is_learn) base.learn(ec); else @@ -67,15 +61,16 @@ void predict_or_learn(cb& data, single_learner& base, example& ec) for (size_t i = 0; i < ld.costs.size(); i++) ld.costs[i].partial_prediction = data.cb_cs_ld.costs[i].partial_prediction; - ec.l.cb = ld; } + ec.l.reset(); + ec.l.init_as_cb(std::move(ld)); } void predict_eval(cb&, single_learner&, example&) { THROW("can not use a test label for evaluation"); } void learn_eval(cb& data, single_learner&, example& ec) { - CB_EVAL::label ld = ec.l.cb_eval; + CB_EVAL::label& ld = ec.l.cb_eval(); cb_to_cs& c = data.cbcs; c.known_cost = get_observed_cost(ld.event); @@ -84,7 +79,7 @@ void learn_eval(cb& data, single_learner&, example& ec) for (size_t i = 0; i < ld.event.costs.size(); i++) ld.event.costs[i].partial_prediction = data.cb_cs_ld.costs[i].partial_prediction; - ec.pred.multiclass = ec.l.cb_eval.action; + ec.pred.multiclass() = ec.l.cb_eval().action; } void output_example(vw& all, cb& data, example& ec, CB::label& ld) @@ -92,14 +87,13 @@ void output_example(vw& all, cb& data, example& ec, CB::label& ld) float loss = 0.; cb_to_cs& c = data.cbcs; - if (!CB::cb_label.test_label(&ld)) - loss = get_cost_estimate(c.known_cost, c.pred_scores, ec.pred.multiclass); + if (!CB::test_label(ld)) + loss = get_cost_estimate(c.known_cost, c.pred_scores, ec.pred.multiclass()); - all.sd->update(ec.test_only, !CB::cb_label.test_label(&ld), loss, 1.f, ec.num_features); + all.sd->update(ec.test_only, !CB::test_label(ld), loss, 1.f, ec.num_features); for (int sink : all.final_prediction_sink) - all.print_by_ref(sink, (float)ec.pred.multiclass, 0, ec.tag); - + all.print_by_ref(sink, (float)ec.pred.multiclass(), 0, ec.tag); if (all.raw_prediction > 0) { std::stringstream outputStringStream; @@ -113,18 +107,18 @@ void output_example(vw& all, cb& data, example& ec, CB::label& ld) all.print_text_by_ref(all.raw_prediction, outputStringStream.str(), ec.tag); } - print_update(all, CB::cb_label.test_label(&ld), ec, nullptr, 
false); + print_update(all, CB::test_label(ld), ec, nullptr, false); } void finish_example(vw& all, cb& c, example& ec) { - output_example(all, c, ec, ec.l.cb); + output_example(all, c, ec, ec.l.cb()); VW::finish_example(all, ec); } void eval_finish_example(vw& all, cb& c, example& ec) { - output_example(all, c, ec, ec.l.cb_eval.event); + output_example(all, c, ec, ec.l.cb_eval().event); VW::finish_example(all, ec); } } // namespace CB_ALGS @@ -183,28 +177,23 @@ base_learner* cb_algs_setup(options_i& options, vw& all) } auto base = as_singleline(setup_base(options, all)); - if (eval) - { - all.p->lp = CB_EVAL::cb_eval; - all.label_type = label_type_t::cb_eval; - } - else - { - all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; - } learner* l; if (eval) { l = &init_learner(data, base, learn_eval, predict_eval, problem_multiplier, prediction_type_t::multiclass); l->set_finish_example(eval_finish_example); + all.p->lp = CB_EVAL::cb_eval; + l->label_type = label_type_t::cb_eval; + } else { l = &init_learner( data, base, predict_or_learn, predict_or_learn, problem_multiplier, prediction_type_t::multiclass); l->set_finish_example(finish_example); + all.p->lp = CB::cb_label; + l->label_type = label_type_t::cb; } c.scorer = all.scorer; diff --git a/vowpalwabbit/cb_algs.h b/vowpalwabbit/cb_algs.h index 3e9f1657521..653de954667 100644 --- a/vowpalwabbit/cb_algs.h +++ b/vowpalwabbit/cb_algs.h @@ -21,7 +21,7 @@ template float get_cost_pred( LEARNER::single_learner* scorer, CB::cb_class* known_cost, example& ec, uint32_t index, uint32_t base) { - CB::label ld = ec.l.cb; + auto label = std::move(ec.l); label_data simple_temp; simple_temp.initial = 0.; @@ -32,8 +32,12 @@ float get_cost_pred( const bool baseline_enabled_old = BASELINE::baseline_enabled(&ec); BASELINE::set_baseline_enabled(&ec); - ec.l.simple = simple_temp; - polyprediction p = ec.pred; + ec.l.reset(); + ec.l.init_as_simple(simple_temp); + // Save what is in the prediction right now, and restore it before we exit the function. 
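+ // Moving out of the polyprediction leaves it without a type, so it is reset and
+ // re-initialized as a scalar before the base learner writes ec.pred.scalar().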
+ polyprediction p = std::move(ec.pred); + ec.pred.reset(); + ec.pred.init_as_scalar(); if (is_learn && known_cost != nullptr && index == known_cost->action) { float old_weight = ec.weight; @@ -46,11 +50,10 @@ float get_cost_pred( if (!baseline_enabled_old) BASELINE::reset_baseline_disabled(&ec); - float pred = ec.pred.scalar; - ec.pred = p; - - ec.l.cb = ld; + float pred = ec.pred.scalar(); + ec.pred = std::move(p); + ec.l = std::move(label); return pred; } diff --git a/vowpalwabbit/cb_dro.cc b/vowpalwabbit/cb_dro.cc index d0d03778ba5..18a2f63892d 100644 --- a/vowpalwabbit/cb_dro.cc +++ b/vowpalwabbit/cb_dro.cc @@ -36,14 +36,14 @@ struct cb_dro_data if (is_learn) { - const auto it = std::find_if(examples.begin(), examples.end(), [](example *item) { return !item->l.cb.costs.empty(); }); + const auto it = std::find_if(examples.begin(), examples.end(), [](example *item) { return !item->l.cb().costs.empty(); }); if (it != examples.end()) { - const CB::cb_class logged = (*it)->l.cb.costs[0]; + const CB::cb_class logged = (*it)->l.cb().costs[0]; const uint32_t labelled_action = std::distance(examples.begin(), it); - const auto action_scores = examples[0]->pred.a_s; + const auto& action_scores = examples[0]->pred.action_probs(); // cb_explore_adf => want maximum probability // cb_adf => first action is a greedy action @@ -151,12 +151,19 @@ base_learner *cb_dro_setup(options_i &options, vw &all) THROW("invalid cb_dro parameter values supplied"); } + auto* base = as_multiline(setup_base(options, all)); if (options.was_supplied("cb_explore_adf")) { - return make_base(init_learner(data, as_multiline(setup_base(options, all)), learn_or_predict, learn_or_predict, 1 /* weights */, prediction_type_t::action_probs)); + auto& learner = init_learner(data, base, learn_or_predict, learn_or_predict, + 1 /* weights */, prediction_type_t::action_probs); + learner.label_type = label_type_t::cb; + return make_base(learner); } else { - return make_base(init_learner(data, as_multiline(setup_base(options, all)), learn_or_predict, learn_or_predict, 1 /* weights */, prediction_type_t::action_probs)); + auto& learner = init_learner(data, base, learn_or_predict, learn_or_predict, + 1 /* weights */, prediction_type_t::action_probs); + learner.label_type = label_type_t::cb; + return make_base(learner); } } diff --git a/vowpalwabbit/cb_explore.cc b/vowpalwabbit/cb_explore.cc index 0321ff54c56..c3c1b3c5d81 100644 --- a/vowpalwabbit/cb_explore.cc +++ b/vowpalwabbit/cb_explore.cc @@ -31,7 +31,7 @@ struct cb_explore COST_SENSITIVE::label cs_label; COST_SENSITIVE::label second_cs_label; - learner* cs; + learner* cost_sensitive_learner; size_t tau; float epsilon; @@ -40,24 +40,18 @@ struct cb_explore float psi; size_t counter; - - ~cb_explore() - { - preds.delete_v(); - cover_probs.delete_v(); - COST_SENSITIVE::cs_label.delete_label(&cbcs.pred_scores); - COST_SENSITIVE::cs_label.delete_label(&cs_label); - COST_SENSITIVE::cs_label.delete_label(&second_cs_label); - } }; template void predict_or_learn_first(cb_explore& data, single_learner& base, example& ec) { // Explore tau times, then act according to optimal. 
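+ // The prediction buffer is moved out for reuse and ec.pred is re-initialized as
+ // multiclass, which is what the base cb learner produces; it is switched back to
+ // action_probs once the probabilities are filled in below.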
- action_scores probs = ec.pred.a_s; + auto probs = std::move(ec.pred.action_probs()); + probs.clear(); + ec.pred.reset(); + ec.pred.init_as_multiclass(); - if (is_learn && ec.l.cb.costs[0].probability < 1) + if (is_learn && ec.l.cb().costs[0].probability < 1) base.learn(ec); else base.predict(ec); @@ -71,22 +65,23 @@ void predict_or_learn_first(cb_explore& data, single_learner& base, example& ec) } else { - uint32_t chosen = ec.pred.multiclass - 1; + uint32_t chosen = ec.pred.multiclass() - 1; for (uint32_t i = 0; i < data.cbcs.num_actions; i++) probs.push_back({i, 0.}); probs[chosen].score = 1.0; } - ec.pred.a_s = probs; + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(probs)); } template void predict_or_learn_greedy(cb_explore& data, single_learner& base, example& ec) { // Explore uniform random an epsilon fraction of the time. - // TODO: pointers are copied here. What happens if base.learn/base.predict re-allocs? - // ec.pred.a_s = probs; will restore the than free'd memory - action_scores probs = ec.pred.a_s; + auto probs = std::move(ec.pred.action_probs()); probs.clear(); + ec.pred.reset(); + ec.pred.init_as_multiclass(); if (is_learn) base.learn(ec); @@ -95,18 +90,22 @@ void predict_or_learn_greedy(cb_explore& data, single_learner& base, example& ec // pre-allocate pdf probs.resize(data.cbcs.num_actions); - for (uint32_t i = 0; i < data.cbcs.num_actions; i++) probs.push_back({i, 0}); - generate_epsilon_greedy(data.epsilon, ec.pred.multiclass - 1, begin_scores(probs), end_scores(probs)); + for (uint32_t i = 0; i < data.cbcs.num_actions; i++) + probs.push_back({i, 0}); + generate_epsilon_greedy(data.epsilon, ec.pred.multiclass() - 1, begin_scores(probs), end_scores(probs)); - ec.pred.a_s = probs; + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(probs)); } template void predict_or_learn_bag(cb_explore& data, single_learner& base, example& ec) { // Randomize over predictions from a base set of predictors - action_scores probs = ec.pred.a_s; + auto probs = std::move(ec.pred.action_probs()); probs.clear(); + ec.pred.reset(); + ec.pred.init_as_multiclass(); for (uint32_t i = 0; i < data.cbcs.num_actions; i++) probs.push_back({i, 0.}); float prob = 1.f / (float)data.bag_size; @@ -117,13 +116,14 @@ void predict_or_learn_bag(cb_explore& data, single_learner& base, example& ec) base.learn(ec, i); else base.predict(ec, i); - uint32_t chosen = ec.pred.multiclass - 1; + uint32_t chosen = ec.pred.multiclass() - 1; probs[chosen].score += prob; if (is_learn) for (uint32_t j = 1; j < count; j++) base.learn(ec, i); } - ec.pred.a_s = probs; + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(probs)); } void get_cover_probabilities(cb_explore& data, single_learner& /* base */, example& ec, v_array& probs) @@ -131,16 +131,19 @@ void get_cover_probabilities(cb_explore& data, single_learner& /* base */, examp float additive_probability = 1.f / (float)data.cover_size; data.preds.clear(); + ec.pred.reset(); + ec.pred.init_as_multiclass(); + for (uint32_t i = 0; i < data.cbcs.num_actions; i++) probs.push_back({i, 0.}); for (size_t i = 0; i < data.cover_size; i++) { // get predicted cost-sensitive predictions if (i == 0) - data.cs->predict(ec, i); + data.cost_sensitive_learner->predict(ec, i); else - data.cs->predict(ec, i + 1); - uint32_t pred = ec.pred.multiclass; + data.cost_sensitive_learner->predict(ec, i + 1); + uint32_t pred = ec.pred.multiclass(); probs[pred - 1].score += additive_probability; data.preds.push_back((uint32_t)pred); } @@ -161,7 +164,7 @@ void 
predict_or_learn_cover(cb_explore& data, single_learner& base, example& ec) uint32_t num_actions = data.cbcs.num_actions; - action_scores probs = ec.pred.a_s; + auto probs = std::move(ec.pred.action_probs()); probs.clear(); data.cs_label.costs.clear(); @@ -176,14 +179,16 @@ void predict_or_learn_cover(cb_explore& data, single_learner& base, example& ec) float min_prob = std::min(1.f / num_actions, 1.f / (float)std::sqrt(counter * num_actions)); - data.cb_label = ec.l.cb; + data.cb_label = std::move(ec.l.cb()); - ec.l.cs = data.cs_label; + ec.l.reset(); + ec.l.init_as_cs() = std::move(data.cs_label); get_cover_probabilities(data, base, ec, probs); if (is_learn) { - ec.l.cb = data.cb_label; + ec.l.reset(); + ec.l.init_as_cb() = std::move(data.cb_label); base.learn(ec); // Now update oracles @@ -191,12 +196,17 @@ void predict_or_learn_cover(cb_explore& data, single_learner& base, example& ec) // 1. Compute loss vector data.cs_label.costs.clear(); float norm = min_prob * num_actions; - ec.l.cb = data.cb_label; - data.cbcs.known_cost = get_observed_cost(data.cb_label); - gen_cs_example(data.cbcs, ec, data.cb_label, data.cs_label); + // This should not be needed as it was done just above. + // ec.l.cb() = data.cb_label; + data.cbcs.known_cost = get_observed_cost(ec.l.cb()); + gen_cs_example(data.cbcs, ec, ec.l.cb(), data.cs_label); for (uint32_t i = 0; i < num_actions; i++) probabilities[i] = 0; - ec.l.cs = data.second_cs_label; + data.cb_label = std::move(ec.l.cb()); + ec.l.reset(); + ec.l.init_as_cs(std::move(data.second_cs_label)); + auto& second_cs_label_ref = ec.l.cs(); + // 2. Update functions for (size_t i = 0; i < cover_size; i++) { @@ -205,21 +215,24 @@ void predict_or_learn_cover(cb_explore& data, single_learner& base, example& ec) { float pseudo_cost = data.cs_label.costs[j].x - data.psi * min_prob / (std::max(probabilities[j], min_prob) / norm) + 1; - data.second_cs_label.costs[j].class_index = j + 1; - data.second_cs_label.costs[j].x = pseudo_cost; + second_cs_label_ref.costs[j].class_index = j + 1; + second_cs_label_ref.costs[j].x = pseudo_cost; } if (i != 0) - data.cs->learn(ec, i + 1); + data.cost_sensitive_learner->learn(ec, i + 1); if (probabilities[predictions[i] - 1] < min_prob) norm += std::max(0.f, additive_probability - (min_prob - probabilities[predictions[i] - 1])); else norm += additive_probability; probabilities[predictions[i] - 1] += additive_probability; } + data.second_cs_label = std::move(ec.l.cs()); } - ec.l.cb = data.cb_label; - ec.pred.a_s = probs; + ec.l.reset(); + ec.l.init_as_cb(std::move(data.cb_label)); + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(probs)); } void print_update_cb_explore(vw& all, bool is_test, example& ec, std::stringstream& pred_string) @@ -230,7 +243,7 @@ void print_update_cb_explore(vw& all, bool is_test, example& ec, std::stringstre if (is_test) label_string << " unknown"; else - label_string << ec.l.cb.costs[0].action; + label_string << ec.l.cb().costs[0].action; all.sd->print_update(all.holdout_set_off, all.current_pass, label_string.str(), pred_string.str(), ec.num_features, all.progress_add, all.progress_arg); } @@ -243,20 +256,20 @@ void output_example(vw& all, cb_explore& data, example& ec, CB::label& ld) cb_to_cs& c = data.cbcs; if ((c.known_cost = get_observed_cost(ld)) != nullptr) - for (uint32_t i = 0; i < ec.pred.a_s.size(); i++) - loss += get_cost_estimate(c.known_cost, c.pred_scores, i + 1) * ec.pred.a_s[i].score; + for (uint32_t i = 0; i < ec.pred.action_probs().size(); i++) + loss += 
get_cost_estimate(c.known_cost, c.pred_scores, i + 1) * ec.pred.action_probs()[i].score; all.sd->update(ec.test_only, get_observed_cost(ld) != nullptr, loss, 1.f, ec.num_features); std::stringstream ss; float maxprob = 0.; uint32_t maxid = 0; - for (uint32_t i = 0; i < ec.pred.a_s.size(); i++) + for (uint32_t i = 0; i < ec.pred.action_probs().size(); i++) { - ss << std::fixed << ec.pred.a_s[i].score << " "; - if (ec.pred.a_s[i].score > maxprob) + ss << std::fixed << ec.pred.action_probs()[i].score << " "; + if (ec.pred.action_probs()[i].score > maxprob) { - maxprob = ec.pred.a_s[i].score; + maxprob = ec.pred.action_probs()[i].score; maxid = i + 1; } } @@ -264,12 +277,12 @@ void output_example(vw& all, cb_explore& data, example& ec, CB::label& ld) std::stringstream sso; sso << maxid << ":" << std::fixed << maxprob; - print_update_cb_explore(all, CB::cb_label.test_label(&ld), ec, sso); + print_update_cb_explore(all, CB::test_label(ld), ec, sso); } void finish_example(vw& all, cb_explore& c, example& ec) { - output_example(all, c, ec, ec.l.cb); + output_example(all, c, ec, ec.l.cb()); VW::finish_example(all, ec); } } // namespace CB_EXPLORE @@ -303,7 +316,6 @@ base_learner* cb_explore_setup(options_i& options, vw& all) options.insert("cb", ss.str()); } - all.delete_prediction = delete_action_scores; data->cbcs.cb_type = CB_TYPE_DR; single_learner* base = as_singleline(setup_base(options, all)); @@ -312,12 +324,10 @@ base_learner* cb_explore_setup(options_i& options, vw& all) learner* l; if (options.was_supplied("cover")) { - data->cs = (learner*)(as_singleline(all.cost_sensitive)); + data->cost_sensitive_learner = reinterpret_cast*>(as_singleline(all.cost_sensitive)); data->second_cs_label.costs.resize(num_actions); data->second_cs_label.costs.end() = data->second_cs_label.costs.begin() + num_actions; - data->cover_probs = v_init(); data->cover_probs.resize(num_actions); - data->preds = v_init(); data->preds.resize(data->cover_size); l = &init_learner(data, base, predict_or_learn_cover, predict_or_learn_cover, data->cover_size + 1, prediction_type_t::action_probs); @@ -333,5 +343,6 @@ base_learner* cb_explore_setup(options_i& options, vw& all) data, base, predict_or_learn_greedy, predict_or_learn_greedy, 1, prediction_type_t::action_probs); l->set_finish_example(finish_example); + l->label_type = label_type_t::cb; return make_base(*l); } diff --git a/vowpalwabbit/cb_explore_adf_bag.cc b/vowpalwabbit/cb_explore_adf_bag.cc index eaa389e5d7d..3084f71e14f 100644 --- a/vowpalwabbit/cb_explore_adf_bag.cc +++ b/vowpalwabbit/cb_explore_adf_bag.cc @@ -41,7 +41,7 @@ struct cb_explore_adf_bag public: cb_explore_adf_bag( float epsilon, size_t bag_size, bool greedify, bool first_only, std::shared_ptr random_state); - ~cb_explore_adf_bag(); + ~cb_explore_adf_bag() = default; // Should be called through cb_explore_adf_base for pre/post-processing void predict(LEARNER::multi_learner& base, multi_ex& examples) { predict_or_learn_impl(base, examples); } @@ -62,7 +62,7 @@ template void cb_explore_adf_bag::predict_or_learn_impl(LEARNER::multi_learner& base, multi_ex& examples) { // Randomize over predictions from a base set of predictors - v_array& preds = examples[0]->pred.a_s; + auto& preds = examples[0]->pred.action_probs(); uint32_t num_actions = (uint32_t)examples.size(); if (num_actions == 0) { @@ -83,7 +83,7 @@ void cb_explore_adf_bag::predict_or_learn_impl(LEARNER::multi_learner& base, mul LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset, i); else 
LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset, i); - + auto& preds = examples[0]->pred.action_probs(); assert(preds.size() == num_actions); for (auto e : preds) _scores[e.action] += e.score; @@ -113,8 +113,6 @@ void cb_explore_adf_bag::predict_or_learn_impl(LEARNER::multi_learner& base, mul for (size_t i = 0; i < num_actions; i++) preds[i] = _action_probs[i]; } -cb_explore_adf_bag::~cb_explore_adf_bag() { _action_probs.delete_v(); } - LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) { using config::make_option; @@ -143,19 +141,16 @@ LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) options.insert("cb_adf", ""); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - size_t problem_multiplier = bag_size; LEARNER::multi_learner* base = as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = cb_explore_adf_base; auto data = scoped_calloc_or_throw(epsilon, bag_size, greedify, first_only, all.get_random_state()); LEARNER::learner& l = LEARNER::init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); - + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); return make_base(l); } diff --git a/vowpalwabbit/cb_explore_adf_common.h b/vowpalwabbit/cb_explore_adf_common.h index ea6b0efb421..1307e2c2630 100644 --- a/vowpalwabbit/cb_explore_adf_common.h +++ b/vowpalwabbit/cb_explore_adf_common.h @@ -93,8 +93,8 @@ inline void cb_explore_adf_base::predict( if (label_example != nullptr) { // predict path, replace the label example with an empty one - data._action_label = label_example->l.cb; - label_example->l.cb = data._empty_label; + data._action_label = std::move(label_example->l.cb()); + label_example->l.cb() = std::move(data._empty_label); } data.explore.predict(base, examples); @@ -102,7 +102,8 @@ inline void cb_explore_adf_base::predict( if (label_example != nullptr) { // predict path, restore label - label_example->l.cb = data._action_label; + data._empty_label = std::move(label_example->l.cb()); + label_example->l.cb() = std::move(data._action_label); } } @@ -134,7 +135,7 @@ void cb_explore_adf_base::output_example(vw& all, multi_ex& ec_seq) float loss = 0.; auto& ec = *ec_seq[0]; - const auto& preds = ec.pred.a_s; + const auto& preds = ec.pred.action_probs(); for (const auto& example : ec_seq) { @@ -158,13 +159,13 @@ void cb_explore_adf_base::output_example(vw& all, multi_ex& ec_seq) all.sd->update(holdout_example, labeled_example, loss, ec.weight, num_features); - for (auto sink : all.final_prediction_sink) ACTION_SCORE::print_action_score(sink, ec.pred.a_s, ec.tag); + for (auto sink : all.final_prediction_sink) ACTION_SCORE::print_action_score(sink, ec.pred.action_probs(), ec.tag); if (all.raw_prediction > 0) { std::string outputString; std::stringstream outputStringStream(outputString); - const auto& costs = ec.l.cb.costs; + const auto& costs = ec.l.cb().costs; for (size_t i = 0; i < costs.size(); i++) { diff --git a/vowpalwabbit/cb_explore_adf_cover.cc b/vowpalwabbit/cb_explore_adf_cover.cc index b811daa7a01..e7ef22e0fb6 100644 --- a/vowpalwabbit/cb_explore_adf_cover.cc +++ b/vowpalwabbit/cb_explore_adf_cover.cc @@ -44,7 +44,7 @@ struct cb_explore_adf_cover public: cb_explore_adf_cover(size_t cover_size, float psi, bool nounif, bool first_only, LEARNER::multi_learner* cs_ldf_learner, LEARNER::single_learner* scorer, size_t cb_type); - 
~cb_explore_adf_cover(); + ~cb_explore_adf_cover() = default; // Should be called through cb_explore_adf_base for pre/post-processing void predict(LEARNER::multi_learner& base, multi_ex& examples) { predict_or_learn_impl(base, examples); } @@ -72,6 +72,7 @@ void cb_explore_adf_cover::predict_or_learn_impl(LEARNER::multi_learner& base, m // Randomize over predictions from a base set of predictors // Use cost sensitive oracle to cover actions to form distribution. const bool is_mtr = _gen_cs.cb_type == CB_TYPE_MTR; + // swap_to_scores(examples); if (is_learn) { if (is_mtr) // use DR estimates for non-ERM policies in MTR @@ -85,7 +86,8 @@ void cb_explore_adf_cover::predict_or_learn_impl(LEARNER::multi_learner& base, m GEN_CS::gen_cs_example_ips(examples, _cs_labels); LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); } - v_array& preds = examples[0]->pred.a_s; + // swap_to_probs(examples); + auto& preds = examples[0]->pred.action_probs(); const uint32_t num_actions = (uint32_t)preds.size(); float additive_probability = 1.f / (float)_cover_size; @@ -164,17 +166,6 @@ void cb_explore_adf_cover::predict_or_learn_impl(LEARNER::multi_learner& base, m ++_counter; } -cb_explore_adf_cover::~cb_explore_adf_cover() -{ - _cb_labels.delete_v(); - for (size_t i = 0; i < _prepped_cs_labels.size(); i++) _prepped_cs_labels[i].costs.delete_v(); - _prepped_cs_labels.delete_v(); - _cs_labels_2.costs.delete_v(); - _cs_labels.costs.delete_v(); - _action_probs.delete_v(); - _gen_cs.pred_scores.costs.delete_v(); -} - LEARNER::base_learner* setup(config::options_i& options, vw& all) { using config::make_option; @@ -216,8 +207,6 @@ LEARNER::base_learner* setup(config::options_i& options, vw& all) options.insert("cb_adf", ""); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - // Set cb_type size_t cb_type_enum; if (type_string.compare("dr") == 0) @@ -242,7 +231,6 @@ LEARNER::base_learner* setup(config::options_i& options, vw& all) LEARNER::multi_learner* base = LEARNER::as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = cb_explore_adf_base; auto data = scoped_calloc_or_throw( @@ -250,6 +238,7 @@ LEARNER::base_learner* setup(config::options_i& options, vw& all) LEARNER::learner& l = init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); return make_base(l); diff --git a/vowpalwabbit/cb_explore_adf_first.cc b/vowpalwabbit/cb_explore_adf_first.cc index 2aa4c142264..4f4ff620afd 100644 --- a/vowpalwabbit/cb_explore_adf_first.cc +++ b/vowpalwabbit/cb_explore_adf_first.cc @@ -51,8 +51,7 @@ void cb_explore_adf_first::predict_or_learn_impl(LEARNER::multi_learner& base, m LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); else LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); - - v_array& preds = examples[0]->pred.a_s; + auto& preds = examples[0]->pred.action_probs(); uint32_t num_actions = (uint32_t)preds.size(); if (_tau) @@ -95,22 +94,19 @@ LEARNER::base_learner* setup(config::options_i& options, vw& all) options.insert("cb_adf", ""); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - size_t problem_multiplier = 1; LEARNER::multi_learner* base = LEARNER::as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = 
cb_explore_adf_base; auto data = scoped_calloc_or_throw(tau, epsilon); LEARNER::learner& l = LEARNER::init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); - + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); - return make_base(l); + return make_base(l); } } // namespace first } // namespace cb_explore_adf diff --git a/vowpalwabbit/cb_explore_adf_greedy.cc b/vowpalwabbit/cb_explore_adf_greedy.cc index ae8ea0c5f68..ee910fe9543 100644 --- a/vowpalwabbit/cb_explore_adf_greedy.cc +++ b/vowpalwabbit/cb_explore_adf_greedy.cc @@ -50,8 +50,7 @@ void cb_explore_adf_greedy::predict_or_learn_impl(LEARNER::multi_learner& base, { // Explore uniform random an epsilon fraction of the time. LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); - - ACTION_SCORE::action_scores& preds = examples[0]->pred.a_s; + auto& preds = examples[0]->pred.action_probs(); uint32_t num_actions = (uint32_t)preds.size(); @@ -97,8 +96,6 @@ LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) options.insert("cb_adf", ""); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - size_t problem_multiplier = 1; if (!options.was_supplied("epsilon")) @@ -106,13 +103,13 @@ LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) LEARNER::multi_learner* base = as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = cb_explore_adf_base; auto data = scoped_calloc_or_throw(epsilon, first_only); LEARNER::learner& l = LEARNER::init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); return make_base(l); diff --git a/vowpalwabbit/cb_explore_adf_regcb.cc b/vowpalwabbit/cb_explore_adf_regcb.cc index 1c282736872..2e33dc52705 100644 --- a/vowpalwabbit/cb_explore_adf_regcb.cc +++ b/vowpalwabbit/cb_explore_adf_regcb.cc @@ -95,7 +95,7 @@ float cb_explore_adf_regcb::binary_search(float fhat, float delta, float sens, f void cb_explore_adf_regcb::get_cost_ranges(float delta, LEARNER::multi_learner& base, multi_ex& examples, bool min_only) { - const size_t num_actions = examples[0]->pred.a_s.size(); + const size_t num_actions = examples[0]->pred.action_probs().size(); _min_costs.resize(num_actions); _max_costs.resize(num_actions); @@ -105,14 +105,15 @@ void cb_explore_adf_regcb::get_cost_ranges(float delta, LEARNER::multi_learner& // backup cb example data for (const auto& ex : examples) { - _ex_as.push_back(ex->pred.a_s); - _ex_costs.push_back(ex->l.cb.costs); + _ex_as.push_back(std::move(ex->pred.action_probs())); + _ex_costs.push_back(std::move(ex->l.cb().costs)); } // set regressor predictions for (const auto& as : _ex_as[0]) { - examples[as.action]->pred.scalar = as.score; + examples[as.action]->pred.reset(); + examples[as.action]->pred.init_as_scalar() = as.score; } const float cmin = _min_cb_cost; @@ -121,32 +122,33 @@ void cb_explore_adf_regcb::get_cost_ranges(float delta, LEARNER::multi_learner& for (size_t a = 0; a < num_actions; ++a) { example* ec = examples[a]; - ec->l.simple.label = cmin - 1; + ec->l.reset(); + ec->l.init_as_simple().label = cmin - 1; float sens = base.sensitivity(*ec); float w = 0; // importance weight - if (ec->pred.scalar < cmin || std::isnan(sens) || std::isinf(sens)) + if (ec->pred.scalar() < cmin || 
std::isnan(sens) || std::isinf(sens)) _min_costs[a] = cmin; else { - w = binary_search(ec->pred.scalar - cmin + 1, delta, sens); - _min_costs[a] = (std::max)(ec->pred.scalar - sens * w, cmin); + w = binary_search(ec->pred.scalar() - cmin + 1, delta, sens); + _min_costs[a] = (std::max)(ec->pred.scalar() - sens * w, cmin); if (_min_costs[a] > cmax) _min_costs[a] = cmax; } if (!min_only) { - ec->l.simple.label = cmax + 1; + ec->l.simple().label = cmax + 1; sens = base.sensitivity(*ec); - if (ec->pred.scalar > cmax || std::isnan(sens) || std::isinf(sens)) + if (ec->pred.scalar() > cmax || std::isnan(sens) || std::isinf(sens)) { _max_costs[a] = cmax; } else { - w = binary_search(cmax + 1 - ec->pred.scalar, delta, sens); - _max_costs[a] = (std::min)(ec->pred.scalar + sens * w, cmax); + w = binary_search(cmax + 1 - ec->pred.scalar(), delta, sens); + _max_costs[a] = (std::min)(ec->pred.scalar() + sens * w, cmax); if (_max_costs[a] < cmin) _max_costs[a] = cmin; } @@ -156,8 +158,11 @@ void cb_explore_adf_regcb::get_cost_ranges(float delta, LEARNER::multi_learner& // reset cb example data for (size_t i = 0; i < examples.size(); ++i) { - examples[i]->pred.a_s = _ex_as[i]; - examples[i]->l.cb.costs = _ex_costs[i]; + examples[i]->pred.reset(); + examples[i]->pred.init_as_action_probs() = std::move(_ex_as[i]); + examples[i]->l.reset(); + examples[i]->l.init_as_cb(); + examples[i]->l.cb().costs = std::move(_ex_costs[i]); } } @@ -168,7 +173,7 @@ void cb_explore_adf_regcb::predict_or_learn_impl(LEARNER::multi_learner& base, m { for (size_t i = 0; i < examples.size() - 1; ++i) { - CB::label& ld = examples[i]->l.cb; + CB::label& ld = examples[i]->l.cb(); if (ld.costs.size() == 1) ld.costs[0].probability = 1.f; // no importance weighting } @@ -178,8 +183,7 @@ void cb_explore_adf_regcb::predict_or_learn_impl(LEARNER::multi_learner& base, m } else LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); - - v_array& preds = examples[0]->pred.a_s; + auto& preds = examples[0]->pred.action_probs(); uint32_t num_actions = (uint32_t)preds.size(); const float max_range = _max_cb_cost - _min_cb_cost; @@ -272,19 +276,17 @@ LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) options.replace("cb_type", mtr); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - // Set explore_type size_t problem_multiplier = 1; LEARNER::multi_learner* base = as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = cb_explore_adf_base; auto data = scoped_calloc_or_throw(regcbopt, c0, first_only, min_cb_cost, max_cb_cost); LEARNER::learner& l = LEARNER::init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); return make_base(l); diff --git a/vowpalwabbit/cb_explore_adf_softmax.cc b/vowpalwabbit/cb_explore_adf_softmax.cc index d2b465fcac2..980e1eda01d 100644 --- a/vowpalwabbit/cb_explore_adf_softmax.cc +++ b/vowpalwabbit/cb_explore_adf_softmax.cc @@ -45,8 +45,7 @@ template void cb_explore_adf_softmax::predict_or_learn_impl(LEARNER::multi_learner& base, multi_ex& examples) { LEARNER::multiline_learn_or_predict(base, examples, examples[0]->ft_offset); - - v_array& preds = examples[0]->pred.a_s; + auto& preds = examples[0]->pred.action_probs(); exploration::generate_softmax( -_lambda, begin_scores(preds), end_scores(preds), begin_scores(preds), end_scores(preds)); @@ 
-82,19 +81,17 @@ LEARNER::base_learner* setup(VW::config::options_i& options, vw& all) options.insert("cb_adf", ""); } - all.delete_prediction = ACTION_SCORE::delete_action_scores; - // Set explore_type size_t problem_multiplier = 1; LEARNER::multi_learner* base = as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; using explore_type = cb_explore_adf_base; auto data = scoped_calloc_or_throw(epsilon, lambda); LEARNER::learner& l = LEARNER::init_learner( data, base, explore_type::learn, explore_type::predict, problem_multiplier, prediction_type_t::action_probs); + l.label_type = label_type_t::cb; l.set_finish_example(explore_type::finish_multiline_example); return make_base(l); diff --git a/vowpalwabbit/cb_sample.cc b/vowpalwabbit/cb_sample.cc index d78b04a1753..d258d44b8ae 100644 --- a/vowpalwabbit/cb_sample.cc +++ b/vowpalwabbit/cb_sample.cc @@ -26,12 +26,12 @@ struct cb_sample_data { multiline_learn_or_predict(base, examples, examples[0]->ft_offset); - auto action_scores = examples[0]->pred.a_s; + auto& action_scores = examples[0]->pred.action_probs(); uint32_t chosen_action = -1; int labelled_action = -1; // Find that chosen action in the learning case, skip the shared example. - auto it = std::find_if(examples.begin(), examples.end(), [](example *item) { return !item->l.cb.costs.empty(); }); + auto it = std::find_if(examples.begin(), examples.end(), [](example *item) { return !item->l.cb().costs.empty(); }); if (it != examples.end()) { labelled_action = std::distance(examples.begin(), it); @@ -116,6 +116,9 @@ base_learner *cb_sample_setup(options_i &options, vw &all) } auto data = scoped_calloc_or_throw(all.get_random_state()); - return make_base(init_learner(data, as_multiline(setup_base(options, all)), learn_or_predict, + auto base = as_multiline(setup_base(options, all)); + auto l = make_base(init_learner(data, base, learn_or_predict, learn_or_predict, 1 /* weights */, prediction_type_t::action_probs)); + l->label_type = label_type_t::cb; + return l; } diff --git a/vowpalwabbit/cbify.cc b/vowpalwabbit/cbify.cc index c2cb93594ba..440945b04d9 100644 --- a/vowpalwabbit/cbify.cc +++ b/vowpalwabbit/cbify.cc @@ -23,6 +23,15 @@ struct cbify_adf_data { multi_ex ecs; size_t num_actions; + + ~cbify_adf_data() + { + for (auto& ex : ecs) + { + ex->~example(); + free(ex); + } + } }; struct cbify @@ -39,26 +48,9 @@ struct cbify float loss1; // for ldf inputs - std::vector> cs_costs; - std::vector> cb_costs; + std::vector cs_labels; + std::vector cb_labels; std::vector cb_as; - - ~cbify() - { - CB::cb_label.delete_label(&cb_label); - a_s.delete_v(); - - if (use_adf) - { - for (size_t a = 0; a < adf_data.num_actions; ++a) - { - adf_data.ecs[a]->pred.a_s.delete_v(); - VW::dealloc_example(CB::cb_label.delete_label, *adf_data.ecs[a]); - free_it(adf_data.ecs[a]); - } - for (auto& as : cb_as) as.delete_v(); - } - } }; float loss(cbify& data, uint32_t label, uint32_t final_prediction) @@ -83,14 +75,14 @@ float loss_cs(cbify& data, v_array& costs, uint32_t fina return data.loss0 + (data.loss1 - data.loss0) * cost; } -float loss_csldf(cbify& data, std::vector>& cs_costs, uint32_t final_prediction) +float loss_csldf(cbify& data, std::vector& cs_labels, uint32_t final_prediction) { float cost = 0.; - for (auto costs : cs_costs) + for (auto& label : cs_labels) { - if (costs[0].class_index == final_prediction) + if (label.costs[0].class_index == final_prediction) { - cost = costs[0].x; + cost = label.costs[0].x; break; } } @@ -107,8 +99,8 @@ void 
copy_example_to_adf(cbify& data, example& ec) { auto& eca = *adf_data.ecs[a]; // clear label - auto& lab = eca.l.cb; - CB::cb_label.default_label(&lab); + auto& lab = eca.l.cb(); + CB::default_label(lab); // copy data VW::copy_example_data(false, &eca, &ec); @@ -123,7 +115,7 @@ void copy_example_to_adf(cbify& data, example& ec) } // avoid empty example by adding a tag (hacky) - if (CB_ALGS::example_is_newline_not_header(eca) && CB::cb_label.test_label(&eca.l)) + if (CB_ALGS::example_is_newline_not_header(eca) && CB::cb_label.test_label(eca.l)) { eca.tag.push_back('n'); } @@ -137,26 +129,28 @@ void predict_or_learn(cbify& data, single_learner& base, example& ec) MULTICLASS::label_t ld; COST_SENSITIVE::label csl; if (use_cs) - csl = ec.l.cs; + csl = std::move(ec.l.cs()); else - ld = ec.l.multi; + ld = std::move(ec.l.multi()); data.cb_label.costs.clear(); - ec.l.cb = data.cb_label; - ec.pred.a_s = data.a_s; + ec.l.reset(); + ec.l.init_as_cb(data.cb_label); + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(data.a_s)); // Call the cb_explore algorithm. It returns a vector of probabilities for each action base.predict(ec); - // data.probs = ec.pred.scalars; + // data.probs = ec.pred.scalars(); uint32_t chosen_action; - if (sample_after_normalizing( - data.app_seed + data.example_counter++, begin_scores(ec.pred.a_s), end_scores(ec.pred.a_s), chosen_action)) + if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(ec.pred.action_probs()), + end_scores(ec.pred.action_probs()), chosen_action)) THROW("Failed to sample from pdf"); CB::cb_class cl; cl.action = chosen_action + 1; - cl.probability = ec.pred.a_s[chosen_action].score; + cl.probability = ec.pred.action_probs()[chosen_action].score; if (!cl.action) THROW("No action with non-zero probability found!"); @@ -167,64 +161,59 @@ void predict_or_learn(cbify& data, single_learner& base, example& ec) // Create a new cb label data.cb_label.costs.push_back(cl); - ec.l.cb = data.cb_label; + ec.l.cb() = data.cb_label; if (is_learn) base.learn(ec); data.a_s.clear(); - data.a_s = ec.pred.a_s; + data.a_s = std::move(ec.pred.action_probs()); + ec.l.reset(); if (use_cs) - ec.l.cs = csl; + ec.l.init_as_cs(std::move(csl)); else - ec.l.multi = ld; + ec.l.init_as_multi(std::move(ld)); - ec.pred.multiclass = cl.action; + ec.pred.reset(); + ec.pred.init_as_multiclass() = cl.action; } +// This will call into cb_explore_adf, so it must use cb labels template void predict_or_learn_adf(cbify& data, multi_learner& base, example& ec) { - // Store the multiclass or cost-sensitive input label - MULTICLASS::label_t ld; - COST_SENSITIVE::label csl; - if (use_cs) - csl = ec.l.cs; - else - ld = ec.l.multi; - copy_example_to_adf(data, ec); base.predict(data.adf_data.ecs); auto& out_ec = *data.adf_data.ecs[0]; uint32_t chosen_action; - if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(out_ec.pred.a_s), - end_scores(out_ec.pred.a_s), chosen_action)) + if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(out_ec.pred.action_probs()), + end_scores(out_ec.pred.action_probs()), chosen_action)) THROW("Failed to sample from pdf"); CB::cb_class cl; - cl.action = out_ec.pred.a_s[chosen_action].action + 1; - cl.probability = out_ec.pred.a_s[chosen_action].score; + cl.action = out_ec.pred.action_probs()[chosen_action].action + 1; + cl.probability = out_ec.pred.action_probs()[chosen_action].score; if (!cl.action) THROW("No action with non-zero probability found!"); if (use_cs) - cl.cost = loss_cs(data, 
csl.costs, cl.action); + cl.cost = loss_cs(data, ec.l.cs().costs, cl.action); else - cl.cost = loss(data, ld.label, cl.action); + cl.cost = loss(data, ec.l.multi().label, cl.action); // add cb label to chosen action - auto& lab = data.adf_data.ecs[cl.action - 1]->l.cb; + auto& lab = data.adf_data.ecs[cl.action - 1]->l.cb(); lab.costs.clear(); lab.costs.push_back(cl); if (is_learn) base.learn(data.adf_data.ecs); - ec.pred.multiclass = cl.action; + ec.pred.multiclass() = cl.action; } void init_adf_data(cbify& data, const size_t num_actions) @@ -235,9 +224,10 @@ void init_adf_data(cbify& data, const size_t num_actions) adf_data.ecs.resize(num_actions); for (size_t a = 0; a < num_actions; ++a) { - adf_data.ecs[a] = VW::alloc_examples(CB::cb_label.label_size, 1); - auto& lab = adf_data.ecs[a]->l.cb; - CB::cb_label.default_label(&lab); + adf_data.ecs[a] = VW::alloc_examples(1); + auto& lab = adf_data.ecs[a]->l.init_as_cb(); + CB::default_label(lab); + adf_data.ecs[a]->pred.init_as_action_probs(); adf_data.ecs[a]->interactions = &data.all->interactions; } } @@ -246,68 +236,69 @@ template void do_actual_learning_ldf(cbify& data, multi_learner& base, multi_ex& ec_seq) { // change label and pred data for cb - if (data.cs_costs.size() < ec_seq.size()) - data.cs_costs.resize(ec_seq.size()); - if (data.cb_costs.size() < ec_seq.size()) - data.cb_costs.resize(ec_seq.size()); + if (data.cs_labels.size() < ec_seq.size()) + data.cs_labels.resize(ec_seq.size()); + if (data.cb_labels.size() < ec_seq.size()) + data.cb_labels.resize(ec_seq.size()); if (data.cb_as.size() < ec_seq.size()) data.cb_as.resize(ec_seq.size()); + for (size_t i = 0; i < ec_seq.size(); ++i) { auto& ec = *ec_seq[i]; - data.cs_costs[i] = ec.l.cs.costs; - data.cb_costs[i].clear(); - data.cb_as[i].clear(); - ec.l.cb.costs = data.cb_costs[i]; - ec.pred.a_s = data.cb_as[i]; + data.cs_labels[i] = std::move(ec.l.cs()); + + ec.l.reset(); + ec.l.init_as_cb(std::move(data.cb_labels[i])); + ec.pred.reset(); + ec.pred.init_as_action_probs(std::move(data.cb_as[i])); } base.predict(ec_seq); auto& out_ec = *ec_seq[0]; - uint32_t chosen_action; - if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(out_ec.pred.a_s), - end_scores(out_ec.pred.a_s), chosen_action)) + uint32_t chosen_action_index; + if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(out_ec.pred.action_probs()), + end_scores(out_ec.pred.action_probs()), chosen_action_index)) THROW("Failed to sample from pdf"); + const auto chosen_action_zero_based = out_ec.pred.action_probs()[chosen_action_index].action; + const auto chosen_action_score = out_ec.pred.action_probs()[chosen_action_index].score; + const auto chosen_action_one_based = chosen_action_zero_based + 1; + CB::cb_class cl; - cl.action = out_ec.pred.a_s[chosen_action].action + 1; - cl.probability = out_ec.pred.a_s[chosen_action].score; + cl.action = chosen_action_one_based; + cl.probability = chosen_action_score; if (!cl.action) THROW("No action with non-zero probability found!"); - cl.cost = loss_csldf(data, data.cs_costs, cl.action); - - // add cb label to chosen action - data.cb_label.costs.clear(); - data.cb_label.costs.push_back(cl); - data.cb_costs[cl.action - 1] = ec_seq[cl.action - 1]->l.cb.costs; - ec_seq[cl.action - 1]->l.cb = data.cb_label; + cl.cost = loss_csldf(data, data.cs_labels, chosen_action_one_based); + ec_seq[chosen_action_zero_based]->l.cb().costs.push_back(cl); base.learn(ec_seq); + ec_seq[chosen_action_zero_based]->l.cb().costs.clear(); - // set cs 
prediction and reset cs costs + // Return labels and predictions to be reused and restore initial labels and preds for (size_t i = 0; i < ec_seq.size(); ++i) { auto& ec = *ec_seq[i]; - data.cb_as[i] = ec.pred.a_s; // store action_score vector for later reuse. - if (i == cl.action - 1) - data.cb_label = ec.l.cb; - else - data.cb_costs[i] = ec.l.cb.costs; - ec.l.cs.costs = data.cs_costs[i]; - if (i == cl.action - 1) - ec.pred.multiclass = cl.action; - else - ec.pred.multiclass = 0; + // Store the cb label back in data to be reused. + data.cb_labels[i] = std::move(ec.l.cb()); + ec.l.reset(); + ec.l.init_as_cs(std::move(data.cs_labels[i])); + + // store action_score vector for later reuse, then set the output prediction. + data.cb_as[i] = std::move(ec.pred.action_probs()); + ec.pred.reset(); + ec.pred.init_as_multiclass() = (i == cl.action - 1) ? cl.action : 0; } } void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq) { - const auto& costs = ec.l.cs.costs; + const auto& costs = ec.l.cs().costs; if (example_is_newline(ec)) return; @@ -318,9 +309,9 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq) float loss = 0.; - uint32_t predicted_class = ec.pred.multiclass; + uint32_t predicted_class = ec.pred.multiclass(); - if (!COST_SENSITIVE::cs_label.test_label(&ec.l)) + if (!COST_SENSITIVE::cs_label.test_label(ec.l)) { for (auto const& cost : costs) { @@ -338,7 +329,7 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq) } for (int sink : all.final_prediction_sink) - all.print_by_ref(sink, (float)ec.pred.multiclass, 0, ec.tag); + all.print_by_ref(sink, (float)ec.pred.multiclass(), 0, ec.tag); if (all.raw_prediction > 0) { @@ -354,7 +345,7 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq) all.print_text_by_ref(all.raw_prediction, outputStringStream.str(), ec.tag); } - COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(&ec.l), ec, ec_seq, false, predicted_class); + COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(ec.l), ec, ec_seq, false, predicted_class); } void output_example_seq(vw& all, multi_ex& ec_seq) @@ -369,7 +360,7 @@ void output_example_seq(vw& all, multi_ex& ec_seq) if (all.raw_prediction > 0) { - v_array empty = {nullptr, nullptr, nullptr, 0}; + v_array empty; all.print_text_by_ref(all.raw_prediction, "", empty); } } @@ -405,7 +396,6 @@ base_learner* cbify_setup(options_i& options, vw& all) data->use_adf = options.was_supplied("cb_explore_adf"); data->app_seed = uniform_hash("vw", 2, 0); - data->a_s = v_init(); data->all = &all; if (data->use_adf) @@ -452,7 +442,8 @@ base_learner* cbify_setup(options_i& options, vw& all) else l = &init_multiclass_learner(data, base, predict_or_learn, predict_or_learn, all.p, 1); } - all.delete_prediction = nullptr; + + l->label_type = use_cs ? 
label_type_t::cs : label_type_t::multi; return make_base(*l); } @@ -496,7 +487,6 @@ base_learner* cbifyldf_setup(options_i& options, vw& all) l.set_finish_example(finish_multiline_example); all.p->lp = COST_SENSITIVE::cs_label; - all.delete_prediction = nullptr; - + l.label_type = label_type_t::cs; return make_base(l); } diff --git a/vowpalwabbit/ccb_label.cc b/vowpalwabbit/ccb_label.cc index cd7b0176859..d352fb5da63 100644 --- a/vowpalwabbit/ccb_label.cc +++ b/vowpalwabbit/ccb_label.cc @@ -26,28 +26,28 @@ using namespace VW::config; namespace CCB { -void default_label(void* v); +void default_label(polylabel& v); -size_t read_cached_label(shared_data*, void* v, io_buf& cache) +size_t read_cached_label(shared_data*, polylabel& v, io_buf& cache) { // Since read_cached_features doesn't default the label we must do it here. default_label(v); - CCB::label* ld = static_cast(v); + CCB::label& ld = v.ccb(); - if (ld->outcome) + if (ld.outcome) { - ld->outcome->probabilities.clear(); + ld.outcome->probabilities.clear(); } - ld->explicit_included_actions.clear(); + ld.explicit_included_actions.clear(); size_t read_count = 0; char* read_ptr; - size_t next_read_size = sizeof(ld->type); + size_t next_read_size = sizeof(ld.type); if (cache.buf_read(read_ptr, next_read_size) < next_read_size) return 0; - ld->type = *(CCB::example_type*)read_ptr; - read_count += sizeof(ld->type); + ld.type = *(CCB::example_type*)read_ptr; + read_count += sizeof(ld.type); bool is_outcome_present; next_read_size = sizeof(bool); @@ -58,14 +58,13 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) if (is_outcome_present) { - ld->outcome = new CCB::conditional_contextual_bandit_outcome(); - ld->outcome->probabilities = v_init(); + ld.outcome = new CCB::conditional_contextual_bandit_outcome(); - next_read_size = sizeof(ld->outcome->cost); + next_read_size = sizeof(ld.outcome->cost); if (cache.buf_read(read_ptr, next_read_size) < next_read_size) return 0; - ld->outcome->cost = *(float*)read_ptr; - read_count += sizeof(ld->outcome->cost); + ld.outcome->cost = *(float*)read_ptr; + read_count += sizeof(ld.outcome->cost); uint32_t size_probs; next_read_size = sizeof(size_probs); @@ -83,7 +82,7 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) a_s = *(ACTION_SCORE::action_score*)read_ptr; read_count += sizeof(a_s); - ld->outcome->probabilities.push_back(a_s); + ld.outcome->probabilities.push_back(a_s); } } @@ -102,123 +101,97 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) return 0; include = *(uint32_t*)read_ptr; read_count += sizeof(include); - ld->explicit_included_actions.push_back(include); + ld.explicit_included_actions.push_back(include); } - next_read_size = sizeof(ld->weight); + next_read_size = sizeof(ld.weight); if (cache.buf_read(read_ptr, next_read_size) < next_read_size) return 0; - ld->weight = *(float*)read_ptr; + ld.weight = *(float*)read_ptr; return read_count; } -float ccb_weight(void* v) +float ccb_weight(polylabel& v) { - CCB::label* ld = (CCB::label*)v; - return ld->weight; + CCB::label& ld = (CCB::label&)v; + return ld.weight; } -void cache_label(void* v, io_buf& cache) +void cache_label(polylabel& v, io_buf& cache) { char* c; - CCB::label* ld = static_cast(v); + CCB::label& ld = v.ccb(); size_t size = sizeof(uint8_t) // type + sizeof(bool) // outcome exists? - + (ld->outcome == nullptr ? 
0 - : sizeof(ld->outcome->cost) // cost - + sizeof(uint32_t) // probabilities size - + sizeof(ACTION_SCORE::action_score) * ld->outcome->probabilities.size()) // probabilities + + (ld.outcome == nullptr ? 0 + : sizeof(ld.outcome->cost) // cost + + sizeof(uint32_t) // probabilities size + + sizeof(ACTION_SCORE::action_score) * ld.outcome->probabilities.size()) // probabilities + sizeof(uint32_t) // explicit_included_actions size - + sizeof(uint32_t) * ld->explicit_included_actions.size() + sizeof(ld->weight); + + sizeof(uint32_t) * ld.explicit_included_actions.size() + sizeof(ld.weight); cache.buf_write(c, size); - *(uint8_t*)c = static_cast(ld->type); - c += sizeof(ld->type); + *(uint8_t*)c = static_cast(ld.type); + c += sizeof(ld.type); - *(bool*)c = ld->outcome != nullptr; + *(bool*)c = ld.outcome != nullptr; c += sizeof(bool); - if (ld->outcome != nullptr) + if (ld.outcome != nullptr) { - *(float*)c = ld->outcome->cost; - c += sizeof(ld->outcome->cost); + *(float*)c = ld.outcome->cost; + c += sizeof(ld.outcome->cost); - *(uint32_t*)c = convert(ld->outcome->probabilities.size()); + *(uint32_t*)c = convert(ld.outcome->probabilities.size()); c += sizeof(uint32_t); - for (const auto& score : ld->outcome->probabilities) + for (const auto& score : ld.outcome->probabilities) { *(ACTION_SCORE::action_score*)c = score; c += sizeof(ACTION_SCORE::action_score); } } - *(uint32_t*)c = convert(ld->explicit_included_actions.size()); + *(uint32_t*)c = convert(ld.explicit_included_actions.size()); c += sizeof(uint32_t); - for (const auto& included_action : ld->explicit_included_actions) + for (const auto& included_action : ld.explicit_included_actions) { *(uint32_t*)c = included_action; c += sizeof(included_action); } - *(float*)c = ld->weight; - c += sizeof(ld->weight); + *(float*)c = ld.weight; + c += sizeof(ld.weight); } -void default_label(void* v) +void default_label(polylabel& v) { - CCB::label* ld = static_cast(v); - - // This is tested against nullptr, so unfortunately as things are this must be deleted when not used. - if (ld->outcome) + if (v.get_type() != label_type_t::conditional_contextual_bandit) { - ld->outcome->probabilities.delete_v(); - delete ld->outcome; - ld->outcome = nullptr; - } - - ld->explicit_included_actions.clear(); - ld->type = example_type::unset; - ld->weight = 1.0; -} + v.reset(); + v.init_as_ccb(); -bool test_label(void* v) -{ - CCB::label* ld = static_cast(v); - return ld->outcome == nullptr; -} + } + CCB::label& ld = v.ccb(); -void delete_label(void* v) -{ - CCB::label* ld = static_cast(v); - if (ld->outcome) + // This is tested against nullptr, so unfortunately as things are this must be deleted when not used. 
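// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The nullptr-sentinel contract
// described in the comment above ("outcome == nullptr" means no outcome, and
// the pointer must be deleted when unused) could equivalently be expressed
// with std::unique_ptr, which makes the delete automatic. The types below are
// simplified stand-ins, not the real VW structures:
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct outcome_sketch
{
  float cost = 0.f;
  std::vector<std::pair<uint32_t, float>> probabilities;  // (action, score)
};

struct ccb_label_sketch
{
  std::unique_ptr<outcome_sketch> outcome;  // empty means "no outcome" / test label
  float weight = 1.f;
};

inline bool is_test_label(const ccb_label_sketch& l) { return l.outcome == nullptr; }
// ---------------------------------------------------------------------------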
+ if (ld.outcome) { - ld->outcome->probabilities.delete_v(); - delete ld->outcome; - ld->outcome = nullptr; + delete ld.outcome; + ld.outcome = nullptr; } - ld->explicit_included_actions.delete_v(); + + ld.explicit_included_actions.clear(); + ld.type = example_type::unset; + ld.weight = 1.0; } -void copy_label(void* dst, void* src) +bool test_label(polylabel& v) { - CCB::label* ldDst = static_cast(dst); - CCB::label* ldSrc = static_cast(src); - - if (ldSrc->outcome) - { - ldDst->outcome = new CCB::conditional_contextual_bandit_outcome(); - ldDst->outcome->probabilities = v_init(); - - ldDst->outcome->cost = ldSrc->outcome->cost; - copy_array(ldDst->outcome->probabilities, ldSrc->outcome->probabilities); - } - - copy_array(ldDst->explicit_included_actions, ldSrc->explicit_included_actions); - ldDst->type = ldSrc->type; - ldDst->weight = ldSrc->weight; + CCB::label& ld = v.ccb(); + return ld.outcome == nullptr; } ACTION_SCORE::action_score convert_to_score(const VW::string_view& action_id_str, const VW::string_view& probability_str) @@ -247,16 +220,15 @@ CCB::conditional_contextual_bandit_outcome* parse_outcome(VW::string_view& outco { auto& ccb_outcome = *(new CCB::conditional_contextual_bandit_outcome()); - auto split_commas = v_init(); + v_array split_commas; tokenize(',', outcome, split_commas); - auto split_colons = v_init(); + v_array split_colons; tokenize(':', split_commas[0], split_colons); if (split_colons.size() != 3) THROW("Malformed ccb label"); - ccb_outcome.probabilities = v_init(); ccb_outcome.probabilities.push_back(convert_to_score(split_colons[0], split_colons[2])); ccb_outcome.cost = float_of_string(split_colons[1]); @@ -273,24 +245,21 @@ CCB::conditional_contextual_bandit_outcome* parse_outcome(VW::string_view& outco ccb_outcome.probabilities.push_back(convert_to_score(split_colons[0], split_colons[1])); } - split_colons.delete_v(); - split_commas.delete_v(); - return &ccb_outcome; } -void parse_explicit_inclusions(CCB::label* ld, v_array& split_inclusions) +void parse_explicit_inclusions(CCB::label& ld, v_array& split_inclusions) { for (const auto& inclusion : split_inclusions) { - ld->explicit_included_actions.push_back(int_of_string(inclusion)); + ld.explicit_included_actions.push_back(int_of_string(inclusion)); } } -void parse_label(parser* p, shared_data*, void* v, v_array& words) +void parse_label(parser* p, shared_data*, polylabel& v, v_array& words) { - CCB::label* ld = static_cast(v); - ld->weight = 1.0; + CCB::label& ld = v.ccb(); + ld.weight = 1.0; if (words.size() < 2) THROW("ccb labels may not be empty"); @@ -304,19 +273,19 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor { if (words.size() > 2) THROW("shared labels may not have a cost"); - ld->type = CCB::example_type::shared; + ld.type = CCB::example_type::shared; } else if (type == "action") { if (words.size() > 2) THROW("action labels may not have a cost"); - ld->type = CCB::example_type::action; + ld.type = CCB::example_type::action; } else if (type == "slot") { if (words.size() > 4) THROW("ccb slot label can only have a type cost and exclude list"); - ld->type = CCB::example_type::slot; + ld.type = CCB::example_type::slot; // Skip the first two words "ccb " for (size_t i = 2; i < words.size(); i++) @@ -324,12 +293,12 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor auto is_outcome = words[i].find(':'); if (is_outcome != VW::string_view::npos) { - if (ld->outcome != nullptr) + if (ld.outcome != nullptr) { THROW("There may be only 1 outcome associated with a slot.") } 
- ld->outcome = parse_outcome(words[i]); + ld.outcome = parse_outcome(words[i]); } else { @@ -339,9 +308,9 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor } // If a full distribution has been given, check if it sums to 1, otherwise throw. - if (ld->outcome && ld->outcome->probabilities.size() > 1) + if (ld.outcome && ld.outcome->probabilities.size() > 1) { - float total_pred = std::accumulate(ld->outcome->probabilities.begin(), ld->outcome->probabilities.end(), 0.f, + float total_pred = std::accumulate(ld.outcome->probabilities.begin(), ld.outcome->probabilities.end(), 0.f, [](float result_so_far, ACTION_SCORE::action_score action_pred) { return result_so_far + action_pred.score; }); @@ -360,6 +329,6 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor } // Export the definition of this label parser. -label_parser ccb_label_parser = {default_label, parse_label, cache_label, read_cached_label, delete_label, ccb_weight, - copy_label, test_label, sizeof(CCB::label)}; +label_parser ccb_label_parser = {default_label, parse_label, cache_label, read_cached_label, polylabel_delete_label, ccb_weight, + polylabel_copy_label, test_label, sizeof(CCB::label)}; } // namespace CCB diff --git a/vowpalwabbit/ccb_label.h b/vowpalwabbit/ccb_label.h index 8b6e341c4be..d18dd44b602 100644 --- a/vowpalwabbit/ccb_label.h +++ b/vowpalwabbit/ccb_label.h @@ -33,11 +33,67 @@ enum example_type : uint8_t struct label { - example_type type; + example_type type = example_type::unset; // Outcome may be unset. - conditional_contextual_bandit_outcome* outcome; + conditional_contextual_bandit_outcome* outcome = nullptr; v_array explicit_included_actions; - float weight; + float weight = 0.f; + + label() = default; + label(example_type type, conditional_contextual_bandit_outcome* outcome, v_array& explicit_included_actions, + float weight) + : type(type), outcome(outcome), explicit_included_actions(explicit_included_actions), weight(weight) + { + } + + label(label&& other) + { + type = example_type::unset; + std::swap(type, other.type); + outcome = nullptr; + std::swap(outcome, other.outcome); + explicit_included_actions.clear(); + std::swap(explicit_included_actions, other.explicit_included_actions); + weight = 0.f; + std::swap(weight, other.weight); + } + label& operator=(label&& other) + { + type = example_type::unset; + std::swap(type, other.type); + delete outcome; + outcome = nullptr; + std::swap(outcome, other.outcome); + + explicit_included_actions.clear(); + std::swap(explicit_included_actions, other.explicit_included_actions); + + weight = 0.f; + std::swap(weight, other.weight); + + return *this; + } + + label(const label& other) + { + type = other.type; + // Deep-copy the outcome; it may legitimately be nullptr. + outcome = other.outcome == nullptr ? nullptr : new conditional_contextual_bandit_outcome(*other.outcome); + explicit_included_actions = other.explicit_included_actions; + weight = other.weight; + } + + label& operator=(const label& other) + { + type = other.type; + delete outcome; + outcome = other.outcome == nullptr ? nullptr : new conditional_contextual_bandit_outcome(*other.outcome); + explicit_included_actions = other.explicit_included_actions; + weight = other.weight; + return *this; + } + + ~label() { delete outcome; } }; extern label_parser ccb_label_parser; diff --git a/vowpalwabbit/classweight.cc b/vowpalwabbit/classweight.cc index 2728a33916b..1b0a06a757b 100644 --- a/vowpalwabbit/classweight.cc +++ b/vowpalwabbit/classweight.cc @@ -53,10 +53,10 @@ static void predict_or_learn(classweights& cweights, LEARNER::single_learner& ba switch (pred_type) { case 
prediction_type_t::scalar: - ec.weight *= cweights.get_class_weight((uint32_t)ec.l.simple.label); + ec.weight *= cweights.get_class_weight((uint32_t)ec.l.simple().label); break; case prediction_type_t::multiclass: - ec.weight *= cweights.get_class_weight(ec.l.multi.label); + ec.weight *= cweights.get_class_weight(ec.l.multi().label); break; default: // suppress the warning @@ -92,12 +92,21 @@ LEARNER::base_learner* classweight_setup(options_i& options, vw& all) LEARNER::learner* ret; if (base->pred_type == prediction_type_t::scalar) + { ret = &LEARNER::init_learner(cweights, base, predict_or_learn, predict_or_learn); + ret->label_type = label_type_t::simple; + } else if (base->pred_type == prediction_type_t::multiclass) + { ret = &LEARNER::init_learner(cweights, base, predict_or_learn, predict_or_learn); + ret->label_type = label_type_t::multi; + } else + { THROW("--classweight not implemented for this type of prediction"); + } + return make_base(*ret); } diff --git a/vowpalwabbit/comp_io.cc b/vowpalwabbit/comp_io.cc index cd784d0180f..14f93d7c1a1 100644 --- a/vowpalwabbit/comp_io.cc +++ b/vowpalwabbit/comp_io.cc @@ -6,6 +6,12 @@ #include "zlib.h" #include "comp_io.h" +// comp_io_buf needs to override this because the default destructor checks for stdin by file descriptor, and the file descriptor used by zlib collides with it. +comp_io_buf::~comp_io_buf() +{ + while (comp_io_buf::close_file()); +} + int comp_io_buf::open_file(const char* name, bool stdin_off, int flag) { gzFile fil = nullptr; diff --git a/vowpalwabbit/comp_io.h b/vowpalwabbit/comp_io.h index 45d1a17a18a..4bb7b8e3264 100644 --- a/vowpalwabbit/comp_io.h +++ b/vowpalwabbit/comp_io.h @@ -20,6 +20,8 @@ class comp_io_buf : public io_buf public: std::vector gz_files; + ~comp_io_buf() override; + int open_file(const char* name, bool stdin_off, int flag) override; void reset_file(int f) override; diff --git a/vowpalwabbit/conditional_contextual_bandit.cc b/vowpalwabbit/conditional_contextual_bandit.cc index f34618c4717..82aa98b74e1 100644 --- a/vowpalwabbit/conditional_contextual_bandit.cc +++ b/vowpalwabbit/conditional_contextual_bandit.cc @@ -12,7 +12,6 @@ #include "cb_adf.h" #include "cb_algs.h" #include "constant.h" -#include "v_array_pool.h" #include #include @@ -23,13 +22,6 @@ using namespace LEARNER; using namespace VW; using namespace VW::config; -template -void return_v_array(v_array& array, VW::v_array_pool& pool) -{ - array.clear(); - pool.return_object(array); -} - struct ccb { vw* all; @@ -51,9 +43,6 @@ struct ccb std::string id_namespace_str; size_t base_learner_stride_shift; - - VW::v_array_pool cb_label_pool; - VW::v_array_pool action_score_pool; }; namespace CCB { @@ -76,7 +65,7 @@ bool split_multi_example_and_stash_labels(const multi_ex& examples, ccb& data) { for (auto ex : examples) { - switch (ex->l.conditional_contextual_bandit.type) + switch (ex->l.ccb().type) { case example_type::shared: data.shared = ex; @@ -93,8 +82,9 @@ bool split_multi_example_and_stash_labels(const multi_ex& examples, ccb& data) } // Stash the CCB labels before rewriting them. - data.stored_labels.push_back({ex->l.conditional_contextual_bandit.type, ex->l.conditional_contextual_bandit.outcome, - ex->l.conditional_contextual_bandit.explicit_included_actions, 0.}); + data.stored_labels.push_back(std::move(ex->l.ccb())); + // Since we have just moved out of the label, we should reset it to avoid using garbage memory. 
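// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The move-then-reset discipline
// applied here, in miniature: moving out of an object leaves it in a valid
// but unspecified state, so the code resets it explicitly before anything can
// read it again. The type below is illustrative only, not VW's polylabel:
#include <utility>
#include <vector>

struct label_sketch { std::vector<int> costs; };

void stash_label(std::vector<label_sketch>& stored, label_sketch& lbl)
{
  stored.push_back(std::move(lbl));  // lbl is now in a moved-from state
  lbl = label_sketch{};              // reset so later readers never see stale data
}
// ---------------------------------------------------------------------------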
+ ex->l.reset(); } return true; @@ -112,10 +102,11 @@ bool sanity_checks(ccb& data) if (is_learn) { - for (auto slot : data.slots) + auto first_slot_index = 1 /*shared*/ + data.actions.size(); + for (size_t index = first_slot_index; index < data.stored_labels.size(); index++) { - if (slot->l.conditional_contextual_bandit.outcome != nullptr && - slot->l.conditional_contextual_bandit.outcome->probabilities.size() == 0) + const auto& slot_label = data.stored_labels[index]; + if (slot_label.outcome != nullptr && slot_label.outcome->probabilities.size() == 0) { std::cerr << "ccb_adf_explore: badly formatted example - missing label probability"; return false; @@ -128,23 +119,23 @@ bool sanity_checks(ccb& data) // create empty/default cb labels void create_cb_labels(ccb& data) { - data.shared->l.cb.costs = data.cb_label_pool.get_object(); - data.shared->l.cb.costs.push_back(data.default_cb_label); + data.shared->l.init_as_cb(); + data.shared->l.cb().costs.push_back(data.default_cb_label); for (example* action : data.actions) { - action->l.cb.costs = data.cb_label_pool.get_object(); + action->l.reset(); + action->l.init_as_cb(); } - data.shared->l.cb.weight = 1.0; + data.shared->l.cb().weight = 1.0; } // the polylabel (union) must be manually cleaned up void delete_cb_labels(ccb& data) { - return_v_array(data.shared->l.cb.costs, data.cb_label_pool); - + data.shared->l.reset(); for (example* action : data.actions) { - return_v_array(action->l.cb.costs, data.cb_label_pool); + action->l.reset(); } } @@ -157,14 +148,13 @@ void attach_label_to_example( data.cb_label.probability = outcome->probabilities[0].score; data.cb_label.cost = outcome->cost; - example->l.cb.costs.push_back(data.cb_label); + example->l.cb().costs.push_back(data.cb_label); } void save_action_scores(ccb& data, decision_scores_t& decision_scores) { - auto& pred = data.shared->pred.a_s; - decision_scores.push_back(pred); - + decision_scores.push_back(std::move(data.shared->pred.action_probs())); + auto& pred = decision_scores[decision_scores.size() - 1]; // correct indices: we want index relative to the original ccb multi-example, with no actions filtered for (auto& action_score : pred) { @@ -181,7 +171,7 @@ void clear_pred_and_label(ccb& data) // Don't need to return to pool, as that will be done when the example is output. // This just needs to be cleared as it is reused. - data.actions[data.action_with_label]->l.cb.costs.clear(); + data.actions[data.action_with_label]->l.cb().costs.clear(); } // true if there exists at least 1 action in the cb multi-example @@ -321,9 +311,9 @@ void calculate_and_insert_interactions( // build a cb example from the ccb example template -void build_cb_example(multi_ex& cb_ex, example* slot, ccb& data) +void build_cb_example(multi_ex& cb_ex, example* slot, CCB::label& slot_label, ccb& data) { - bool slot_has_label = slot->l.conditional_contextual_bandit.outcome != nullptr; + bool slot_has_label = slot_label.outcome != nullptr; // Merge the slot features with the shared example and set it in the cb multi-example // TODO is it important for total_sum_feat_sq and num_features to be correct at this point? 
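// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] What the TODO above is asking
// about: when one example's features are merged into another, the cached
// counters should stay consistent with the concatenated feature set. The
// layout below is a simplified stand-in, not VW's example type:
#include <vector>

struct feature_group_sketch
{
  std::vector<float> values;
  float sum_feat_sq = 0.f;  // cached sum of squared feature values
};

void merge_features(feature_group_sketch& dst, const feature_group_sketch& src)
{
  for (float v : src.values)
  {
    dst.values.push_back(v);
    dst.sum_feat_sq += v * v;  // keep the cache in sync with the merged set
  }
}
// ---------------------------------------------------------------------------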
@@ -331,7 +321,7 @@ void build_cb_example(multi_ex& cb_ex, example* slot, ccb& data) cb_ex.push_back(data.shared); // Retrieve the action index whitelist (if the list is empty, then all actions are white-listed) - auto& explicit_includes = slot->l.conditional_contextual_bandit.explicit_included_actions; + auto& explicit_includes = slot_label.explicit_included_actions; if (explicit_includes.size() != 0) { // First time seeing this, initialize the vector with falses so we can start setting each included action. @@ -367,17 +357,20 @@ void build_cb_example(multi_ex& cb_ex, example* slot, ccb& data) data.origin_index[index++] = (uint32_t)i; // Remember the index of the chosen action - if (is_learn && slot_has_label && i == slot->l.conditional_contextual_bandit.outcome->probabilities[0].action) + if (is_learn && slot_has_label && i == slot_label.outcome->probabilities[0].action) { // This is used to remove the label later. data.action_with_label = (uint32_t)i; - attach_label_to_example(index, data.actions[i], slot->l.conditional_contextual_bandit.outcome, data); + attach_label_to_example(index, data.actions[i], slot_label.outcome, data); } } - // Must provide a prediction that cb can write into, this will be saved into the decision scores object later. - data.shared->pred.a_s = data.action_score_pool.get_object(); - + for (auto example : cb_ex) + { + example->pred.reset(); + example->pred.init_as_action_probs(); + } + // Tag can be used for specifying the sampling seed per slot. For it to be used it must be inserted into the shared // example. std::swap(data.shared->tag, slot->tag); @@ -389,6 +382,7 @@ template void learn_or_predict(ccb& data, multi_learner& base, multi_ex& examples) { clear_all(data); + data.stored_labels.reserve(examples.size()); if (!split_multi_example_and_stash_labels(examples, data)) // split shared, actions and slots return; @@ -403,7 +397,7 @@ void learn_or_predict(ccb& data, multi_learner& base, multi_ex& examples) // Reset exclusion list for this example. data.exclude_list.assign(data.actions.size(), false); - auto decision_scores = examples[0]->pred.decision_scores; + auto decision_scores = std::move(examples[0]->pred.decision_scores()); // for each slot, re-build the cb example and call cb_explore_adf size_t slot_id = 0; @@ -420,8 +414,9 @@ void learn_or_predict(ccb& data, multi_learner& base, multi_ex& examples) ex->interactions = &data.generated_interactions; } + const auto example_index = examples.size() - data.slots.size() + slot_id; data.include_list.clear(); - build_cb_example(data.cb_ex, slot, data); + build_cb_example(data.cb_ex, slot, data.stored_labels[example_index], data); if (data.all->audit) inject_slot_id(data, data.shared, slot_id); @@ -438,7 +433,7 @@ void learn_or_predict(ccb& data, multi_learner& base, multi_ex& examples) else { // the cb example contains no action => cannot decide - decision_scores.push_back(data.action_score_pool.get_object()); + decision_scores.push_back(ACTION_SCORE::action_scores()); } data.shared->interactions = data.original_interactions; @@ -464,12 +459,13 @@ void learn_or_predict(ccb& data, multi_learner& base, multi_ex& examples) // Restore ccb labels to the example objects. 
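// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The stash/restore round trip used
// here, reduced to its essentials: labels are moved out before the inner
// learner rewrites them and moved back afterwards. Illustrative helper only,
// not VW's API:
#include <cstddef>
#include <utility>
#include <vector>

template <typename Label>
void restore_labels(std::vector<Label>& stash, std::vector<Label*>& slots)
{
  // Move each stashed label back into the slot it came from, then drop the stash.
  for (size_t i = 0; i < slots.size(); i++) *slots[i] = std::move(stash[i]);
  stash.clear();
}
// ---------------------------------------------------------------------------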
for (size_t i = 0; i < examples.size(); i++) { - examples[i]->l.conditional_contextual_bandit = { - data.stored_labels[i].type, data.stored_labels[i].outcome, data.stored_labels[i].explicit_included_actions, 0.}; + examples[i]->l.init_as_ccb(std::move(data.stored_labels[i])); } + data.stored_labels.clear(); // Save the predictions - examples[0]->pred.decision_scores = decision_scores; + examples[0]->pred.reset(); + examples[0]->pred.init_as_decision_scores(std::move(decision_scores)); } void print_decision_scores(int f, decision_scores_t& decision_scores) @@ -505,7 +501,7 @@ void print_update(vw& all, std::vector& slots, decision_scores_t& deci { counter++; - auto outcome = slot->l.conditional_contextual_bandit.outcome; + auto outcome = slot->l.ccb().outcome; if (outcome == nullptr) { label_str += delim; @@ -572,7 +568,7 @@ void output_example(vw& all, ccb& /*c*/, multi_ex& ec_seq) { num_features += ec->num_features; - if (ec->l.conditional_contextual_bandit.type == CCB::example_type::slot) + if (ec->l.ccb().type == CCB::example_type::slot) { slots.push_back(ec); } @@ -580,10 +576,10 @@ void output_example(vw& all, ccb& /*c*/, multi_ex& ec_seq) // Is it hold out? size_t num_labelled = 0; - auto preds = ec_seq[0]->pred.decision_scores; + auto& preds = ec_seq[0]->pred.decision_scores(); for (size_t i = 0; i < slots.size(); i++) { - auto outcome = slots[i]->l.conditional_contextual_bandit.outcome; + auto outcome = slots[i]->l.ccb().outcome; if (outcome != nullptr) { num_labelled++; @@ -605,7 +601,7 @@ void output_example(vw& all, ccb& /*c*/, multi_ex& ec_seq) all.sd->update(holdout_example, num_labelled > 0, loss, ec_seq[SHARED_EX_INDEX]->weight, num_features); for (auto sink : all.final_prediction_sink) - print_decision_scores(sink, ec_seq[SHARED_EX_INDEX]->pred.decision_scores); + print_decision_scores(sink, ec_seq[SHARED_EX_INDEX]->pred.decision_scores()); CCB::print_update(all, slots, preds, num_features); } @@ -618,12 +614,6 @@ void finish_multiline_example(vw& all, ccb& data, multi_ex& ec_seq) CB_ADF::global_print_newline(all.final_prediction_sink); } - for (auto& a_s : ec_seq[0]->pred.decision_scores) - { - return_v_array(a_s, data.action_score_pool); - } - ec_seq[0]->pred.decision_scores.clear(); - VW::finish_example(all, ec_seq); } @@ -663,7 +653,6 @@ base_learner* ccb_explore_adf_setup(options_i& options, vw& all) auto base = as_multiline(setup_base(options, all)); all.p->lp = CCB::ccb_label_parser; - all.label_type = label_type_t::ccb; // Stash the base learners stride_shift so we can properly add a feature later. 
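// ---------------------------------------------------------------------------
// The restore step above is a move rather than a field-wise struct copy:
// init_as_ccb takes ownership of the stashed CCB::label, and the prediction is
// handed back the same way through init_as_decision_scores. A condensed sketch
// of the stash/restore round trip, assuming stored_labels holds CCB::label by
// value as its use above implies:
//
//   data.stored_labels.push_back(std::move(examples[i]->l.ccb()));  // stash
//   examples[i]->l.reset();           // slot gets reused for temporary CB labels
//   // ... run the base cb_explore_adf learner once per slot ...
//   examples[i]->l.init_as_ccb(std::move(data.stored_labels[i]));   // re-tag and move back
// ---------------------------------------------------------------------------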
data->base_learner_stride_shift = all.weights.stride_shift(); @@ -679,13 +668,15 @@ base_learner* ccb_explore_adf_setup(options_i& options, vw& all) data->id_namespace_hash = VW::hash_space(all, data->id_namespace_str); learner& l = - init_learner(data, base, learn_or_predict, learn_or_predict, 1, prediction_type_t::decision_probs); - - all.delete_prediction = ACTION_SCORE::delete_action_scores; - + init_learner(data, base, learn_or_predict, learn_or_predict, 1, prediction_type_t::decision_scores); + l.label_type = label_type_t::conditional_contextual_bandit; l.set_finish_example(finish_multiline_example); return make_base(l); } -bool ec_is_example_header(example const& ec) { return ec.l.conditional_contextual_bandit.type == example_type::shared; } +bool ec_is_example_header(example const& ec) +{ + return ec.l.get_type() == label_type_t::conditional_contextual_bandit && + ec.l.ccb().type == example_type::shared; +} } // namespace CCB diff --git a/vowpalwabbit/confidence.cc b/vowpalwabbit/confidence.cc index 591d849b88c..831c6f47cbc 100644 --- a/vowpalwabbit/confidence.cc +++ b/vowpalwabbit/confidence.cc @@ -21,20 +21,20 @@ void predict_or_learn_with_confidence(confidence& /* c */, single_learner& base, float threshold = 0.f; float sensitivity = 0.f; - float existing_label = ec.l.simple.label; + float existing_label = ec.l.simple().label; if (existing_label == FLT_MAX) { base.predict(ec); float opposite_label = 1.f; - if (ec.pred.scalar > 0) + if (ec.pred.scalar() > 0) opposite_label = -1.f; - ec.l.simple.label = opposite_label; + ec.l.simple().label = opposite_label; } if (!is_confidence_after_training) sensitivity = base.sensitivity(ec); - ec.l.simple.label = existing_label; + ec.l.simple().label = existing_label; if (is_learn) base.learn(ec); else @@ -43,7 +43,7 @@ void predict_or_learn_with_confidence(confidence& /* c */, single_learner& base, if (is_confidence_after_training) sensitivity = base.sensitivity(ec); - ec.confidence = fabsf(ec.pred.scalar - threshold) / sensitivity; + ec.confidence = fabsf(ec.pred.scalar() - threshold) / sensitivity; } void confidence_print_result(int f, float res, float confidence, v_array tag) @@ -64,7 +64,7 @@ void confidence_print_result(int f, float res, float confidence, v_array t void output_and_account_confidence_example(vw& all, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); all.sd->update(ec.test_only, ld.label != FLT_MAX, ec.loss, ec.weight, ec.num_features); if (ld.label != FLT_MAX && !ec.test_only) @@ -75,7 +75,7 @@ void output_and_account_confidence_example(vw& all, example& ec) for (size_t i = 0; i < all.final_prediction_sink.size(); i++) { int f = (int)all.final_prediction_sink[i]; - confidence_print_result(f, ec.pred.scalar, ec.confidence, ec.tag); + confidence_print_result(f, ec.pred.scalar(), ec.confidence, ec.tag); } print_update(all, ec); @@ -130,6 +130,7 @@ base_learner* confidence_setup(options_i& options, vw& all) data, as_singleline(setup_base(options, all)), learn_with_confidence_ptr, predict_with_confidence_ptr); l.set_finish_example(return_confidence_example); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/cost_sensitive.cc b/vowpalwabbit/cost_sensitive.cc index 9df12d8f461..86afc2600ab 100644 --- a/vowpalwabbit/cost_sensitive.cc +++ b/vowpalwabbit/cost_sensitive.cc @@ -31,10 +31,10 @@ void name_value(VW::string_view& s, v_array& name, float& v) } } -char* bufread_label(label* ld, char* c, io_buf& cache) +char* bufread_label(label& ld, char* c, io_buf& 
cache) { size_t num = *(size_t*)c; - ld->costs.clear(); + ld.costs.clear(); c += sizeof(size_t); size_t total = sizeof(wclass) * num; if (cache.buf_read(c, (int)total) < total) @@ -46,16 +46,17 @@ char* bufread_label(label* ld, char* c, io_buf& cache) { wclass temp = *(wclass*)c; c += sizeof(wclass); - ld->costs.push_back(temp); + ld.costs.push_back(temp); } return c; } -size_t read_cached_label(shared_data*, void* v, io_buf& cache) +size_t read_cached_label(shared_data*, polylabel& v, io_buf& cache) { - label* ld = (label*)v; - ld->costs.clear(); + auto& ld = v.cs(); + + ld.costs.clear(); char* c; size_t total = sizeof(size_t); if (cache.buf_read(c, (int)total) < total) @@ -65,66 +66,57 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) return total; } -float weight(void*) { return 1.; } +float weight(polylabel&) { return 1.; } -char* bufcache_label(label* ld, char* c) +char* bufcache_label(label& ld, char* c) { - *(size_t*)c = ld->costs.size(); + *(size_t*)c = ld.costs.size(); c += sizeof(size_t); - for (unsigned int i = 0; i < ld->costs.size(); i++) + for (unsigned int i = 0; i < ld.costs.size(); i++) { - *(wclass*)c = ld->costs[i]; + *(wclass*)c = ld.costs[i]; c += sizeof(wclass); } return c; } -void cache_label(void* v, io_buf& cache) +void cache_label(polylabel& v, io_buf& cache) { char* c; - label* ld = (label*)v; - cache.buf_write(c, sizeof(size_t) + sizeof(wclass) * ld->costs.size()); + auto& ld = v.cs(); + cache.buf_write(c, sizeof(size_t) + sizeof(wclass) * ld.costs.size()); bufcache_label(ld, c); } -void default_label(void* v) +void default_label(label& label) { label.costs.clear(); } + +void default_label(polylabel& v) { - label* ld = (label*)v; - ld->costs.clear(); + if (v.get_type() != label_type_t::cs) + { + v.reset(); + v.init_as_cs(); + } + + auto& ld = v.cs(); + default_label(ld); } -bool test_label(void* v) +bool test_label(polylabel& v) { - label* ld = (label*)v; - if (ld->costs.size() == 0) + auto& ld = v.cs(); + if (ld.costs.size() == 0) return true; - for (unsigned int i = 0; i < ld->costs.size(); i++) - if (FLT_MAX != ld->costs[i].x) + for (unsigned int i = 0; i < ld.costs.size(); i++) + if (FLT_MAX != ld.costs[i].x) return false; return true; } -void delete_label(void* v) +void parse_label(parser* p, shared_data* sd, polylabel& v, v_array& words) { - label* ld = (label*)v; - if (ld) - ld->costs.delete_v(); -} - -void copy_label(void* dst, void* src) -{ - if (dst && src) - { - label* ldD = (label*)dst; - label* ldS = (label*)src; - copy_array(ldD->costs, ldS->costs); - } -} - -void parse_label(parser* p, shared_data* sd, void* v, v_array& words) -{ - label* ld = (label*)v; - ld->costs.clear(); + auto& ld = v.cs(); + ld.costs.clear(); // handle shared and label first if (words.size() == 1) @@ -147,7 +139,7 @@ void parse_label(parser* p, shared_data* sd, void* v, v_array& else { wclass f = {-FLT_MAX, 0, 0., 0.}; - ld->costs.push_back(f); + ld.costs.push_back(f); } } if (eq_label) @@ -157,7 +149,7 @@ void parse_label(parser* p, shared_data* sd, void* v, v_array& else { wclass f = {float_of_string(p->parse_name[1]), 0, 0., 0.}; - ld->costs.push_back(f); + ld.costs.push_back(f); } } return; @@ -183,12 +175,12 @@ void parse_label(parser* p, shared_data* sd, void* v, v_array& else THROW("malformed cost specification on '" << (p->parse_name[0]) << "'"); - ld->costs.push_back(f); + ld.costs.push_back(f); } } -label_parser cs_label = {default_label, parse_label, cache_label, read_cached_label, delete_label, weight, copy_label, - test_label, sizeof(label)}; 
+label_parser cs_label = {default_label, parse_label, cache_label, read_cached_label, polylabel_delete_label, weight, + polylabel_copy_label, test_label, sizeof(label)}; void print_update(vw& all, bool is_test, example& ec, multi_ex* ec_seq, bool action_scores, uint32_t prediction) { @@ -218,12 +210,12 @@ void print_update(vw& all, bool is_test, example& ec, multi_ex* ec_seq, bool act if (all.sd->ldict) { if (action_scores) - pred_buf << all.sd->ldict->get(ec.pred.a_s[0].action); + pred_buf << all.sd->ldict->get(ec.pred.action_scores()[0].action); else pred_buf << all.sd->ldict->get(prediction); } else - pred_buf << ec.pred.a_s[0].action; + pred_buf << ec.pred.action_scores()[0].action; if (action_scores) pred_buf << "....."; all.sd->print_update(all.holdout_set_off, all.current_pass, label_buf, pred_buf.str(), num_current_features, @@ -238,13 +230,13 @@ void print_update(vw& all, bool is_test, example& ec, multi_ex* ec_seq, bool act void output_example(vw& all, example& ec) { - label& ld = ec.l.cs; + label& ld = ec.l.cs(); float loss = 0.; - if (!test_label(&ld)) + if (!test_label(ec.l)) { // need to compute exact loss - size_t pred = (size_t)ec.pred.multiclass; + size_t pred = (size_t)ec.pred.multiclass(); float chosen_loss = FLT_MAX; float min = FLT_MAX; @@ -264,14 +256,14 @@ void output_example(vw& all, example& ec) // loss = chosen_loss; } - all.sd->update(ec.test_only, !test_label(&ld), loss, ec.weight, ec.num_features); + all.sd->update(ec.test_only, !test_label(ec.l), loss, ec.weight, ec.num_features); for (int sink : all.final_prediction_sink) if (!all.sd->ldict) - all.print_by_ref(sink, (float)ec.pred.multiclass, 0, ec.tag); + all.print_by_ref(sink, (float)ec.pred.multiclass(), 0, ec.tag); else { - VW::string_view sv_pred = all.sd->ldict->get(ec.pred.multiclass); + VW::string_view sv_pred = all.sd->ldict->get(ec.pred.multiclass()); all.print_text_by_ref(sink, sv_pred.to_string(), ec.tag); } @@ -288,7 +280,7 @@ void output_example(vw& all, example& ec) all.print_text_by_ref(all.raw_prediction, outputStringStream.str(), ec.tag); } - print_update(all, test_label(&ec.l.cs), ec, nullptr, false, ec.pred.multiclass); + print_update(all, test_label(ec.l), ec, nullptr, false, ec.pred.multiclass()); } void finish_example(vw& all, example& ec) @@ -299,7 +291,7 @@ void finish_example(vw& all, example& ec) bool example_is_test(example& ec) { - v_array costs = ec.l.cs.costs; + auto& costs = ec.l.cs().costs; if (costs.size() == 0) return true; for (size_t j = 0; j < costs.size(); j++) @@ -310,13 +302,18 @@ bool example_is_test(example& ec) bool ec_is_example_header(example const& ec) // example headers look like "shared" { - v_array costs = ec.l.cs.costs; - if (costs.size() != 1) - return false; - if (costs[0].class_index != 0) - return false; - if (costs[0].x != -FLT_MAX) - return false; - return true; + if (ec.l.get_type() == label_type_t::cs) + { + auto& costs = ec.l.cs().costs; + if (costs.size() != 1) + return false; + if (costs[0].class_index != 0) + return false; + if (costs[0].x != -FLT_MAX) + return false; + return true; + } + + return false; } } // namespace COST_SENSITIVE diff --git a/vowpalwabbit/cost_sensitive.h b/vowpalwabbit/cost_sensitive.h index bf216e6c2ea..14f1a45f71b 100644 --- a/vowpalwabbit/cost_sensitive.h +++ b/vowpalwabbit/cost_sensitive.h @@ -29,6 +29,9 @@ struct label v_array costs; }; +void delete_label(label& label); +void default_label(label& label); + void output_example(vw& all, example& ec); void finish_example(vw& all, example& ec); template diff --git 
a/vowpalwabbit/cs_active.cc b/vowpalwabbit/cs_active.cc index 804ec1c0879..786e3617285 100644 --- a/vowpalwabbit/cs_active.cc +++ b/vowpalwabbit/cs_active.cc @@ -61,8 +61,6 @@ struct cs_active size_t labels_outside_range; float distance_to_range; float range; - - ~cs_active() { examples_by_queries.delete_v(); } }; float binarySearch(float fhat, float delta, float sens, float tol) @@ -98,18 +96,18 @@ inline void inner_loop(cs_active& cs_a, single_learner& base, example& ec, uint3 if (is_learn) { vw& all = *cs_a.all; - ec.l.simple.weight = 1.; + ec.l.simple().weight = 1.; ec.weight = 1.; if (is_simulation) { // In simulation mode if (query_this_label) { - ec.l.simple.label = cost; + ec.l.simple().label = cost; all.sd->queries += 1; } else - ec.l.simple.label = FLT_MAX; + ec.l.simple().label = FLT_MAX; } else { @@ -118,16 +116,16 @@ inline void inner_loop(cs_active& cs_a, single_learner& base, example& ec, uint3 // If the cost of this label was not queried, then skip it. if (query_needed) { - ec.l.simple.label = cost; + ec.l.simple().label = cost; if ((cost < cs_a.cost_min) || (cost > cs_a.cost_max)) cerr << "warning: cost " << cost << " outside of cost range [" << cs_a.cost_min << ", " << cs_a.cost_max << "]!" << endl; } else - ec.l.simple.label = FLT_MAX; + ec.l.simple().label = FLT_MAX; } - if (ec.l.simple.label != FLT_MAX) + if (ec.l.simple().label != FLT_MAX) base.learn(ec, i - 1); } else if (!is_simulation) @@ -164,10 +162,10 @@ inline void find_cost_range(cs_active& cs_a, single_learner& base, example& ec, else { // finding max_pred and min_pred by binary search - max_pred = - std::min(ec.pred.scalar + sens * binarySearch(cs_a.cost_max - ec.pred.scalar, delta, sens, tol), cs_a.cost_max); - min_pred = - std::max(ec.pred.scalar - sens * binarySearch(ec.pred.scalar - cs_a.cost_min, delta, sens, tol), cs_a.cost_min); + max_pred = std::min( + ec.pred.scalar() + sens * binarySearch(cs_a.cost_max - ec.pred.scalar(), delta, sens, tol), cs_a.cost_max); + min_pred = std::max( + ec.pred.scalar() - sens * binarySearch(ec.pred.scalar() - cs_a.cost_min, delta, sens, tol), cs_a.cost_min); is_range_large = (max_pred - min_pred > eta); if (cs_a.print_debug_stuff) cerr << " find_cost_rangeB: i=" << i << " pp=" << ec.partial_prediction << " sens=" << sens << " eta=" << eta @@ -179,7 +177,7 @@ template void predict_or_learn(cs_active& cs_a, single_learner& base, example& ec) { // cerr << "------------- passthrough" << endl; - COST_SENSITIVE::label ld = ec.l.cs; + COST_SENSITIVE::label ld = std::move(ec.l.cs()); // cerr << "is_learn=" << is_learn << " ld.costs.size()=" << ld.costs.size() << endl; if (cs_a.all->sd->queries >= cs_a.min_labels * cs_a.num_classes) @@ -215,7 +213,10 @@ void predict_or_learn(cs_active& cs_a, single_learner& base, example& ec) uint32_t prediction = 1; float score = FLT_MAX; - ec.l.simple = {0., 0., 0.}; + ec.l.reset(); + ec.l.init_as_simple(); + ec.pred.reset(); + ec.pred.init_as_scalar(); float min_max_cost = FLT_MAX; float t = (float)cs_a.t; // ec.example_t; // current round @@ -269,7 +270,7 @@ void predict_or_learn(cs_active& cs_a, single_learner& base, example& ec) inner_loop(cs_a, base, ec, lqd.cl.class_index, lqd.cl.x, prediction, score, lqd.cl.partial_prediction, query_label, lqd.query_needed); if (lqd.query_needed) - ec.pred.multilabels.label_v.push_back(lqd.cl.class_index); + ec.pred.multilabels().label_v.push_back(lqd.cl.class_index); if (cs_a.print_debug_stuff) cerr << "label=" << lqd.cl.class_index << " x=" << lqd.cl.x << " prediction=" << prediction << " score=" << 
score << " pp=" << lqd.cl.partial_prediction << " ql=" << query_label @@ -279,7 +280,7 @@ void predict_or_learn(cs_active& cs_a, single_learner& base, example& ec) } // Need to pop metadata - cs_a.query_data.delete_v(); + cs_a.query_data.clear(); if (cs_a.all->sd->queries - queries > 0) cs_a.num_any_queries++; @@ -304,11 +305,13 @@ void predict_or_learn(cs_active& cs_a, single_learner& base, example& ec) } } - ec.pred.multiclass = prediction; - ec.l.cs = ld; + ec.pred.reset(); + ec.pred.init_as_multiclass() = prediction; + ec.l.reset(); + ec.l.init_as_cs(std::move(ld)); } -void finish_example(vw& all, cs_active& cs_a, example& ec) { CSOAA::finish_example(all, *(CSOAA::csoaa*)&cs_a, ec); } +void finish_example(vw& all, cs_active&, example& ec) { CSOAA::finish_example(all, ec); } base_learner* cs_active_setup(options_i& options, vw& all) { @@ -372,12 +375,13 @@ base_learner* cs_active_setup(options_i& options, vw& all) learner& l = simulation ? init_learner(data, as_singleline(setup_base(options, all)), predict_or_learn, - predict_or_learn, data->num_classes, prediction_type_t::multilabels) + predict_or_learn, data->num_classes, prediction_type_t::multiclass) : init_learner(data, as_singleline(setup_base(options, all)), predict_or_learn, - predict_or_learn, data->num_classes, prediction_type_t::multilabels); + predict_or_learn, data->num_classes, prediction_type_t::multiclass); l.set_finish_example(finish_example); base_learner* b = make_base(l); + l.label_type = label_type_t::cs; all.cost_sensitive = b; return b; } diff --git a/vowpalwabbit/csoaa.cc b/vowpalwabbit/csoaa.cc index a0d363c3b0b..aab6125e43f 100644 --- a/vowpalwabbit/csoaa.cc +++ b/vowpalwabbit/csoaa.cc @@ -22,8 +22,7 @@ namespace CSOAA struct csoaa { uint32_t num_classes; - polyprediction* pred; - ~csoaa() { free(pred); } + std::vector pred; }; template @@ -33,7 +32,7 @@ inline void inner_loop(single_learner& base, example& ec, uint32_t i, float cost if (is_learn) { ec.weight = (cost == FLT_MAX) ? 0.f : 1.f; - ec.l.simple.label = cost; + ec.l.simple().label = cost; base.learn(ec, i - 1); } else @@ -54,11 +53,15 @@ template void predict_or_learn(csoaa& c, single_learner& base, example& ec) { // std::cerr << "------------- passthrough" << std::endl; - COST_SENSITIVE::label ld = ec.l.cs; + COST_SENSITIVE::label ld = std::move(ec.l.cs()); uint32_t prediction = 1; float score = FLT_MAX; size_t pt_start = ec.passthrough ? 
ec.passthrough->size() : 0; - ec.l.simple = {0., 0., 0.}; + + ec.l.reset(); + ec.l.init_as_simple(); + ec.pred.reset(); + ec.pred.init_as_scalar(); if (!ld.costs.empty()) { for (auto& cl : ld.costs) @@ -67,15 +70,15 @@ void predict_or_learn(csoaa& c, single_learner& base, example& ec) } else if (DO_MULTIPREDICT && !is_learn) { - ec.l.simple = {FLT_MAX, 0.f, 0.f}; - base.multipredict(ec, 0, c.num_classes, c.pred, false); + ec.l.simple() = {FLT_MAX, 0.f, 0.f}; + base.multipredict(ec, 0, c.num_classes, c.pred.data(), false); for (uint32_t i = 1; i <= c.num_classes; i++) { - add_passthrough_feature(ec, i, c.pred[i - 1].scalar); - if (c.pred[i - 1].scalar < c.pred[prediction - 1].scalar) + add_passthrough_feature(ec, i, c.pred[i - 1].scalar()); + if (c.pred[i - 1].scalar() < c.pred[prediction - 1].scalar()) prediction = i; } - ec.partial_prediction = c.pred[prediction - 1].scalar; + ec.partial_prediction = c.pred[prediction - 1].scalar(); } else { @@ -105,11 +108,15 @@ void predict_or_learn(csoaa& c, single_learner& base, example& ec) add_passthrough_feature(ec, constant * 3, 1.); } - ec.pred.multiclass = prediction; - ec.l.cs = ld; + ec.pred.reset(); + ec.pred.init_as_multiclass() = prediction; + ec.l.reset(); + ec.l.init_as_cs(std::move(ld)); } void finish_example(vw& all, csoaa&, example& ec) { COST_SENSITIVE::finish_example(all, ec); } +void finish_example(vw& all, example& ec) { COST_SENSITIVE::finish_example(all, ec); } + base_learner* csoaa_setup(options_i& options, vw& all) { @@ -121,14 +128,18 @@ base_learner* csoaa_setup(options_i& options, vw& all) if (!options.was_supplied("csoaa")) return nullptr; - c->pred = calloc_or_throw(c->num_classes); + c->pred.resize(c->num_classes); + for (auto& pred : c->pred) + { + pred.init_as_scalar(); + } learner& l = init_learner(c, as_singleline(setup_base(*all.options, all)), predict_or_learn, predict_or_learn, c->num_classes, prediction_type_t::multiclass); all.p->lp = cs_label; - all.label_type = label_type_t::cs; l.set_finish_example(finish_example); + l.label_type = label_type_t::cs; all.cost_sensitive = make_base(l); return all.cost_sensitive; } @@ -153,12 +164,6 @@ struct ldf uint64_t ft_offset; v_array stored_preds; - - ~ldf() - { - a_s.delete_v(); - stored_preds.delete_v(); - } }; bool ec_is_label_definition(example& ec) // label defs look like "0:___" or just "label:___" @@ -167,7 +172,7 @@ bool ec_is_label_definition(example& ec) // label defs look like "0:___" or jus return false; if (ec.indices[0] != 'l') return false; - const auto& costs = ec.l.cs.costs; + const auto& costs = ec.l.cs().costs; for (auto const& cost : costs) if ((cost.class_index != 0) || (cost.x <= 0.)) return false; @@ -246,14 +251,18 @@ void unsubtract_example(example* ec) void make_single_prediction(ldf& data, single_learner& base, example& ec) { - COST_SENSITIVE::label ld = ec.l.cs; + COST_SENSITIVE::label ld = std::move(ec.l.cs()); label_data simple_label; - simple_label.initial = 0.; simple_label.label = FLT_MAX; LabelDict::add_example_namespace_from_memory(data.label_features, ec, ld.costs[0].class_index); - ec.l.simple = simple_label; + ec.l.reset(); + ec.l.init_as_simple(simple_label); + + ec.pred.reset(); + ec.pred.init_as_scalar(0.f); + uint64_t old_offset = ec.ft_offset; ec.ft_offset = data.ft_offset; base.predict(ec); // make a prediction @@ -261,7 +270,8 @@ void make_single_prediction(ldf& data, single_learner& base, example& ec) ld.costs[0].partial_prediction = ec.partial_prediction; LabelDict::del_example_namespace_from_memory(data.label_features, ec, 
ld.costs[0].class_index); - ec.l.cs = ld; + ec.l.reset(); + ec.l.init_as_cs(std::move(ld)); } bool test_ldf_sequence(ldf& data, multi_ex& ec_seq) @@ -270,13 +280,13 @@ bool test_ldf_sequence(ldf& data, multi_ex& ec_seq) if (ec_seq.empty()) isTest = true; else - isTest = COST_SENSITIVE::cs_label.test_label(&ec_seq[0]->l); + isTest = COST_SENSITIVE::cs_label.test_label(ec_seq[0]->l); for (const auto& ec : ec_seq) { // Each sub-example must have just one cost - assert(ec->l.cs.costs.size() == 1); + assert(ec->l.cs().costs.size() == 1); - if (COST_SENSITIVE::cs_label.test_label(&ec->l) != isTest) + if (COST_SENSITIVE::cs_label.test_label(ec->l) != isTest) { isTest = true; data.all->trace_message << "warning: ldf example has mix of train/test data; assuming test" << std::endl; @@ -289,18 +299,17 @@ void do_actual_learning_wap(ldf& data, single_learner& base, multi_ex& ec_seq) { size_t K = ec_seq.size(); std::vector all_costs; - for (const auto& example : ec_seq) all_costs.push_back(&example->l.cs.costs[0]); + for (const auto& example : ec_seq) all_costs.push_back(&example->l.cs().costs[0]); compute_wap_values(all_costs); for (size_t k1 = 0; k1 < K; k1++) { example* ec1 = ec_seq[k1]; - // save original variables - COST_SENSITIVE::label save_cs_label = ec1->l.cs; - label_data& simple_label = ec1->l.simple; + // Save original label. + COST_SENSITIVE::label save_cs_label(std::move(ec1->l.cs())); - v_array costs1 = save_cs_label.costs; + auto& costs1 = save_cs_label.costs; if (costs1[0].class_index == (uint32_t)-1) continue; @@ -309,7 +318,7 @@ void do_actual_learning_wap(ldf& data, single_learner& base, multi_ex& ec_seq) for (size_t k2 = k1 + 1; k2 < K; k2++) { example* ec2 = ec_seq[k2]; - v_array costs2 = ec2->l.cs.costs; + auto& costs2 = ec2->l.cs().costs; if (costs2[0].class_index == (uint32_t)-1) continue; @@ -321,8 +330,10 @@ void do_actual_learning_wap(ldf& data, single_learner& base, multi_ex& ec_seq) LabelDict::add_example_namespace_from_memory(data.label_features, *ec2, costs2[0].class_index); // learn - simple_label.initial = 0.; + ec1->l.reset(); + label_data& simple_label = ec1->l.init_as_simple(); simple_label.label = (costs1[0].x < costs2[0].x) ? -1.0f : 1.0f; + float old_weight = ec1->weight; ec1->weight = value_diff; ec1->partial_prediction = 0.; @@ -338,8 +349,9 @@ void do_actual_learning_wap(ldf& data, single_learner& base, multi_ex& ec_seq) } LabelDict::del_example_namespace_from_memory(data.label_features, *ec1, costs1[0].class_index); - // restore original cost-sensitive label, sum of importance weights - ec1->l.cs = save_cs_label; + // Restore original cost-sensitive label, sum of importance weights. + ec1->l.reset(); + ec1->l.init_as_cs(std::move(save_cs_label)); // TODO: What about partial_prediction? See do_actual_learning_oaa. 
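// ---------------------------------------------------------------------------
// do_actual_learning_wap above (and do_actual_learning_oaa below) now share a
// single idiom for calling into the scalar base: move the cost-sensitive label
// aside, re-tag the slot as a simple label, learn, then move the original
// label back. A minimal sketch using only calls that appear in this diff
// (the binary target value is a stand-in):
//
//   COST_SENSITIVE::label saved = std::move(ec->l.cs());  // take ownership
//   ec->l.reset();
//   label_data& simple = ec->l.init_as_simple();  // slot now carries a simple label
//   simple.label = 1.f;                           // stand-in target for the base learner
//   base.learn(*ec);
//   ec->l.reset();                                // drop the temporary simple label
//   ec->l.init_as_cs(std::move(saved));           // restore the CS label
// ---------------------------------------------------------------------------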
} } @@ -351,7 +363,7 @@ void do_actual_learning_oaa(ldf& data, single_learner& base, multi_ex& ec_seq) for (const auto& example : ec_seq) { - float ec_cost = example->l.cs.costs[0].x; + float ec_cost = example->l.cs().costs[0].x; if (ec_cost < min_cost) min_cost = ec_cost; if (ec_cost > max_cost) @@ -361,7 +373,7 @@ void do_actual_learning_oaa(ldf& data, single_learner& base, multi_ex& ec_seq) for (const auto& ec : ec_seq) { // save original variables - label save_cs_label = ec->l.cs; + label save_cs_label = std::move(ec->l.cs()); const auto& costs = save_cs_label.costs; // build example for the base learner @@ -384,7 +396,8 @@ void do_actual_learning_oaa(ldf& data, single_learner& base, multi_ex& ec_seq) ec->weight = old_weight * (costs[0].x - min_cost); } } - ec->l.simple = simple_label; + ec->l.reset(); + ec->l.init_as_simple(simple_label); // learn LabelDict::add_example_namespace_from_memory(data.label_features, *ec, costs[0].class_index); @@ -396,8 +409,9 @@ void do_actual_learning_oaa(ldf& data, single_learner& base, multi_ex& ec_seq) ec->weight = old_weight; // restore original cost-sensitive label, sum of importance weights and partial_prediction - ec->l.cs = save_cs_label; ec->partial_prediction = costs[0].partial_prediction; + ec->l.reset(); + ec->l.init_as_cs(std::move(save_cs_label)); } } @@ -446,7 +460,7 @@ void do_actual_learning(ldf& data, single_learner& base, multi_ex& ec_seq_all) for (uint32_t k = 0; k < K; k++) { example* ec = ec_seq[k]; - data.stored_preds.push_back(ec->pred.a_s); + data.stored_preds.push_back(std::move(ec->pred.action_scores())); make_single_prediction(data, base, *ec); action_score s; s.score = ec->partial_prediction; @@ -480,45 +494,47 @@ void do_actual_learning(ldf& data, single_learner& base, multi_ex& ec_seq_all) do_actual_learning_oaa(data, base, ec_seq); } - if (data.rank) + // Clear the existing prediction + for (auto& example : ec_seq) { - data.stored_preds[0].clear(); - for (size_t k = 0; k < K; k++) - { - ec_seq[k]->pred.a_s = data.stored_preds[k]; - ec_seq[0]->pred.a_s.push_back(data.a_s[k]); - } + example->pred.reset(); } - else + + // Set the prediction. 
+ if (data.rank) { - // Mark the predicted subexample with its class_index, all other with 0 + data.stored_preds[0].clear(); for (size_t k = 0; k < K; k++) { - if (k == predicted_K) - ec_seq[k]->pred.multiclass = ec_seq[k]->l.cs.costs[0].class_index; - else - ec_seq[k]->pred.multiclass = 0; + ec_seq[k]->pred.init_as_action_scores() = std::move(data.stored_preds[k]); + ec_seq[0]->pred.action_scores().push_back(data.a_s[k]); } } - - ////////////////////// compute probabilities - if (data.is_probabilities) + else if (data.is_probabilities) { float sum_prob = 0; - for (const auto& example : ec_seq) + for (auto& example : ec_seq) { // probability(correct_class) = 1 / (1+exp(-score)), where score is higher for better classes, // but partial_prediction is lower for better classes (we are predicting the cost), // so we need to take score = -partial_prediction, // thus probability(correct_class) = 1 / (1+exp(-(-partial_prediction))) float prob = 1.f / (1.f + correctedExp(example->partial_prediction)); - example->pred.prob = prob; + example->pred.init_as_prob() = prob; sum_prob += prob; } // make sure that the probabilities sum up (exactly) to one - for (const auto& example : ec_seq) + for (auto& example : ec_seq) + { + example->pred.prob() /= sum_prob; + } + } + else + { + // Mark the predicted subexample with its class_index, all other with 0 + for (size_t k = 0; k < K; k++) { - example->pred.prob /= sum_prob; + ec_seq[k]->pred.init_as_multiclass() = k == predicted_K ? ec_seq[k]->l.cs().costs[0].class_index : 0; } } } @@ -538,8 +554,8 @@ void global_print_newline(vw& all) void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq, ldf& data) { - label& ld = ec.l.cs; - v_array costs = ld.costs; + label& ld = ec.l.cs(); + v_array& costs = ld.costs; if (example_is_newline(ec)) return; @@ -554,7 +570,7 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq, ldf& if (data.is_probabilities) { // predicted_K was already computed in do_actual_learning(), - // but we cannot store it in ec.pred union because we store ec.pred.prob there. + // but we cannot store it in ec.pred union because we store ec.pred.prob() there. // So we must compute it again. uint32_t predicted_K = 0; float min_score = FLT_MAX; @@ -567,12 +583,12 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq, ldf& predicted_K = (uint32_t)k; } } - predicted_class = (*ec_seq)[predicted_K]->l.cs.costs[0].class_index; + predicted_class = (*ec_seq)[predicted_K]->l.cs().costs[0].class_index; } else - predicted_class = ec.pred.multiclass; + predicted_class = ec.pred.multiclass(); - if (!COST_SENSITIVE::cs_label.test_label(&ec.l)) + if (!COST_SENSITIVE::cs_label.test_label(ec.l)) { for (auto const& cost : costs) { @@ -590,7 +606,7 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq, ldf& } for (int sink : all.final_prediction_sink) - all.print_by_ref(sink, data.is_probabilities ? ec.pred.prob : (float)ec.pred.multiclass, 0, ec.tag); + all.print_by_ref(sink, data.is_probabilities ? 
ec.pred.prob() : (float)ec.pred.multiclass(), 0, ec.tag); if (all.raw_prediction > 0) { @@ -606,12 +622,12 @@ void output_example(vw& all, example& ec, bool& hit_loss, multi_ex* ec_seq, ldf& all.print_text_by_ref(all.raw_prediction, outputStringStream.str(), ec.tag); } - COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(&ec.l), ec, ec_seq, false, predicted_class); + COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(ec.l), ec, ec_seq, false, predicted_class); } void output_rank_example(vw& all, example& head_ec, bool& hit_loss, multi_ex* ec_seq) { - const auto& costs = head_ec.l.cs.costs; + const auto& costs = head_ec.l.cs().costs; if (example_is_newline(head_ec)) return; @@ -621,9 +637,9 @@ void output_rank_example(vw& all, example& head_ec, bool& hit_loss, multi_ex* ec all.sd->total_features += head_ec.num_features; float loss = 0.; - v_array& preds = head_ec.pred.a_s; + v_array& preds = head_ec.pred.action_scores(); - if (!COST_SENSITIVE::cs_label.test_label(&head_ec.l)) + if (!COST_SENSITIVE::cs_label.test_label(head_ec.l)) { size_t idx = 0; for (example* ex : *ec_seq) @@ -632,7 +648,7 @@ void output_rank_example(vw& all, example& head_ec, bool& hit_loss, multi_ex* ec break; if (preds[0].action == idx) { - loss = ex->l.cs.costs[0].x; + loss = ex->l.cs().costs[0].x; hit_loss = true; } idx++; @@ -642,7 +658,7 @@ void output_rank_example(vw& all, example& head_ec, bool& hit_loss, multi_ex* ec assert(loss >= 0); } - for (int sink : all.final_prediction_sink) print_action_score(sink, head_ec.pred.a_s, head_ec.tag); + for (int sink : all.final_prediction_sink) print_action_score(sink, head_ec.pred.action_scores(), head_ec.tag); if (all.raw_prediction > 0) { @@ -658,7 +674,7 @@ void output_rank_example(vw& all, example& head_ec, bool& hit_loss, multi_ex* ec all.print_text_by_ref(all.raw_prediction, outputStringStream.str(), head_ec.tag); } - COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(&head_ec.l), head_ec, ec_seq, true, 0); + COST_SENSITIVE::print_update(all, COST_SENSITIVE::cs_label.test_label(head_ec.l), head_ec, ec_seq, true, 0); } void output_example_seq(vw& all, ldf& data, multi_ex& ec_seq) @@ -680,7 +696,7 @@ void output_example_seq(vw& all, ldf& data, multi_ex& ec_seq) if (all.raw_prediction > 0) { - v_array empty = {nullptr, nullptr, nullptr, 0}; + v_array empty; all.print_text_by_ref(all.raw_prediction, "", empty); } @@ -691,7 +707,7 @@ void output_example_seq(vw& all, ldf& data, multi_ex& ec_seq) for (size_t k = 0; k < K; k++) { - float ec_cost = ec_seq[k]->l.cs.costs[0].x; + float ec_cost = ec_seq[k]->l.cs().costs[0].x; if (ec_cost < min_cost) { min_cost = ec_cost; @@ -700,7 +716,7 @@ void output_example_seq(vw& all, ldf& data, multi_ex& ec_seq) } float multiclass_log_loss = 999; // -log(0) = plus infinity - float correct_class_prob = ec_seq[correct_class_k]->pred.prob; + float correct_class_prob = ec_seq[correct_class_k]->pred.prob(); if (correct_class_prob > 0) multiclass_log_loss = -log(correct_class_prob); @@ -736,8 +752,8 @@ void finish_multiline_example(vw& all, ldf& data, multi_ex& ec_seq) */ void inline process_label(ldf& data, example* ec) { - //auto new_fs = ec->feature_space[ec->indices[0]]; - auto& costs = ec->l.cs.costs; + // auto new_fs = ec->feature_space[ec->indices[0]]; + auto& costs = ec->l.cs().costs; for (auto const& cost : costs) { const auto lab = (size_t)cost.x; @@ -829,11 +845,8 @@ base_learner* csldf_setup(options_i& options, vw& all) } if (options.was_supplied("ldf_override")) ldf_arg = 
ldf_override; - if (ld->rank) - all.delete_prediction = delete_action_scores; all.p->lp = COST_SENSITIVE::cs_label; - all.label_type = label_type_t::cs; ld->treat_as_classifier = false; if (ldf_arg == "multiline" || ldf_arg == "m") @@ -866,6 +879,9 @@ base_learner* csldf_setup(options_i& options, vw& all) ld->label_features.reserve(256); prediction_type_t pred_type; + if (ld->rank && ld->is_probabilities) + THROW("Cannot specify both csoaa_rank and probabilities at the same time."); + if (ld->rank) pred_type = prediction_type_t::action_scores; else if (ld->is_probabilities) @@ -878,6 +894,7 @@ base_learner* csldf_setup(options_i& options, vw& all) do_actual_learning, 1, pred_type); l.set_finish_example(finish_multiline_example); l.set_end_pass(end_pass); + l.label_type = label_type_t::cs; all.cost_sensitive = make_base(l); return all.cost_sensitive; } diff --git a/vowpalwabbit/csoaa.h b/vowpalwabbit/csoaa.h index 8ef69503045..078477591d6 100644 --- a/vowpalwabbit/csoaa.h +++ b/vowpalwabbit/csoaa.h @@ -11,4 +11,5 @@ LEARNER::base_learner* csoaa_setup(VW::config::options_i& options, vw& all); LEARNER::base_learner* csldf_setup(VW::config::options_i& options, vw& all); struct csoaa; void finish_example(vw& all, csoaa&, example& ec); +void finish_example(vw& all, example& ec); } // namespace CSOAA diff --git a/vowpalwabbit/ect.cc b/vowpalwabbit/ect.cc index cbfc786b36a..dcd702f25b2 100644 --- a/vowpalwabbit/ect.cc +++ b/vowpalwabbit/ect.cc @@ -47,24 +47,9 @@ struct ect uint32_t last_pair; v_array tournaments_won; - - ~ect() - { - for (auto& all_level : all_levels) - { - for (auto& t : all_level) t.delete_v(); - all_level.delete_v(); - } - all_levels.delete_v(); - final_nodes.delete_v(); - up_directions.delete_v(); - directions.delete_v(); - down_directions.delete_v(); - tournaments_won.delete_v(); - } }; -bool exists(v_array db) +bool exists(const v_array& db) { for (unsigned long i : db) if (i != 0) @@ -104,8 +89,8 @@ size_t create_circuit(ect& e, uint64_t max_label, uint64_t eliminations) if (max_label == 1) return 0; - v_array> tournaments = v_init>(); - v_array t = v_init(); + v_array> tournaments; + v_array t; for (uint32_t i = 0; i < max_label; i++) { @@ -114,11 +99,11 @@ size_t create_circuit(ect& e, uint64_t max_label, uint64_t eliminations) e.directions.push_back(d); } - tournaments.push_back(t); + tournaments.push_back(std::move(t)); for (size_t i = 0; i < eliminations - 1; i++) tournaments.push_back(v_array()); - e.all_levels.push_back(tournaments); + e.all_levels.push_back(std::move(tournaments)); size_t level = 0; @@ -126,22 +111,21 @@ size_t create_circuit(ect& e, uint64_t max_label, uint64_t eliminations) while (not_empty(e.all_levels[level])) { - v_array> new_tournaments = v_init>(); - tournaments = e.all_levels[level]; + v_array> new_tournaments; + auto& current_tournaments = e.all_levels[level]; - for (size_t t = 0; t < tournaments.size(); t++) + for (size_t t = 0; t < current_tournaments.size(); t++) { - v_array empty = v_init(); - new_tournaments.push_back(empty); + new_tournaments.push_back(v_array()); } - for (size_t t = 0; t < tournaments.size(); t++) + for (size_t t = 0; t < current_tournaments.size(); t++) { - for (size_t j = 0; j < tournaments[t].size() / 2; j++) + for (size_t j = 0; j < current_tournaments[t].size() / 2; j++) { uint32_t id = node++; - uint32_t left = tournaments[t][2 * j]; - uint32_t right = tournaments[t][2 * j + 1]; + uint32_t left = current_tournaments[t][2 * j]; + uint32_t right = current_tournaments[t][2 * j + 1]; direction d = {id, t, 0, 0, 
left, right, false}; e.directions.push_back(d); @@ -157,10 +141,10 @@ size_t create_circuit(ect& e, uint64_t max_label, uint64_t eliminations) if (e.directions[left].last) e.directions[left].winner = direction_index; - if (tournaments[t].size() == 2 && (t == 0 || tournaments[t - 1].empty())) + if (current_tournaments[t].size() == 2 && (t == 0 || current_tournaments[t - 1].empty())) { e.directions[direction_index].last = true; - if (t + 1 < tournaments.size()) + if (t + 1 < current_tournaments.size()) new_tournaments[t + 1].push_back(id); else // winner eliminated. e.directions[direction_index].winner = 0; @@ -168,15 +152,15 @@ size_t create_circuit(ect& e, uint64_t max_label, uint64_t eliminations) } else new_tournaments[t].push_back(id); - if (t + 1 < tournaments.size()) + if (t + 1 < current_tournaments.size()) new_tournaments[t + 1].push_back(id); else // loser eliminated. e.directions[direction_index].loser = 0; } - if (tournaments[t].size() % 2 == 1) - new_tournaments[t].push_back(tournaments[t].last()); + if (current_tournaments[t].size() % 2 == 1) + new_tournaments[t].push_back(current_tournaments[t].last()); } - e.all_levels.push_back(new_tournaments); + e.all_levels.push_back(std::move(new_tournaments)); level++; } @@ -196,7 +180,10 @@ uint32_t ect_predict(ect& e, single_learner& base, example& ec) uint32_t finals_winner = 0; // Binary final elimination tournament first - ec.l.simple = {FLT_MAX, 0., 0.}; + ec.l.reset(); + ec.l.init_as_simple(FLT_MAX, 0.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(); for (size_t i = e.tree_height - 1; i != (size_t)0 - 1; i--) { @@ -207,7 +194,7 @@ uint32_t ect_predict(ect& e, single_learner& base, example& ec) base.learn(ec, problem_number); - if (ec.pred.scalar > e.class_boundary) + if (ec.pred.scalar() > e.class_boundary) finals_winner = finals_winner | (((size_t)1) << i); } } @@ -217,7 +204,7 @@ uint32_t ect_predict(ect& e, single_learner& base, example& ec) { base.learn(ec, id - e.k); - if (ec.pred.scalar > e.class_boundary) + if (ec.pred.scalar() > e.class_boundary) id = e.directions[id].right; else id = e.directions[id].left; @@ -229,7 +216,7 @@ void ect_train(ect& e, single_learner& base, example& ec) { if (e.k == 1) // nothing to do return; - MULTICLASS::label_t mc = ec.l.multi; + MULTICLASS::label_t mc = ec.l.multi(); label_data simple_temp; @@ -246,14 +233,17 @@ void ect_train(ect& e, single_learner& base, example& ec) else simple_temp.label = 1; - ec.l.simple = simple_temp; + ec.l.reset(); + ec.l.init_as_simple(simple_temp); + ec.pred.reset(); + ec.pred.init_as_scalar(); base.learn(ec, id - e.k); float old_weight = ec.weight; ec.weight = 0.; base.learn(ec, id - e.k); // inefficient, we should extract final prediction exactly. 
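// ---------------------------------------------------------------------------
// ect applies the same discipline to the prediction slot: it is tagged as
// scalar before the base learner runs, and re-tagged as multiclass when the
// reduction reports its own result. A sketch assuming the accessors used
// above (problem_number and winner are stand-ins):
//
//   ec.pred.reset();
//   ec.pred.init_as_scalar();               // the base learner writes a float here
//   base.learn(ec, problem_number);
//   bool went_right = ec.pred.scalar() > e.class_boundary;
//   // ... tournament bookkeeping decides the winning label ...
//   ec.pred.reset();
//   ec.pred.init_as_multiclass() = winner;  // this reduction outputs a class index
// ---------------------------------------------------------------------------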
ec.weight = old_weight; - bool won = (ec.pred.scalar - e.class_boundary) * simple_temp.label > 0; + bool won = (ec.pred.scalar() - e.class_boundary) * simple_temp.label > 0; if (won) { @@ -296,13 +286,14 @@ void ect_train(ect& e, single_learner& base, example& ec) else simple_temp.label = 1; simple_temp.weight = (float)(1 << (e.tree_height - i - 1)); - ec.l.simple = simple_temp; + ec.l.reset(); + ec.l.init_as_simple(simple_temp); uint32_t problem_number = e.last_pair + j * (1 << (i + 1)) + (1 << i) - 1; base.learn(ec, problem_number); - if (ec.pred.scalar > e.class_boundary) + if (ec.pred.scalar() > e.class_boundary) e.tournaments_won[j] = right; else e.tournaments_won[j] = left; @@ -316,23 +307,32 @@ void ect_train(ect& e, single_learner& base, example& ec) void predict(ect& e, single_learner& base, example& ec) { - MULTICLASS::label_t mc = ec.l.multi; + MULTICLASS::label_t mc = ec.l.multi(); if (mc.label == 0 || (mc.label > e.k && mc.label != (uint32_t)-1)) std::cout << "label " << mc.label << " is not in {1," << e.k << "} This won't work right." << std::endl; - ec.pred.multiclass = ect_predict(e, base, ec); - ec.l.multi = mc; + + auto pred = ect_predict(e, base, ec); + ec.pred.reset(); + ec.pred.init_as_multiclass() = pred; + + ec.l.reset(); + ec.l.init_as_multi(mc); } void learn(ect& e, single_learner& base, example& ec) { - MULTICLASS::label_t mc = ec.l.multi; + MULTICLASS::label_t mc = ec.l.multi(); predict(e, base, ec); - uint32_t pred = ec.pred.multiclass; + uint32_t pred = ec.pred.multiclass(); if (mc.label != (uint32_t)-1) ect_train(e, base, ec); - ec.l.multi = mc; - ec.pred.multiclass = pred; + + ec.l.reset(); + ec.l.init_as_multi(mc); + + ec.pred.reset(); + ec.pred.init_as_multiclass() = pred; } base_learner* ect_setup(options_i& options, vw& all) @@ -359,6 +359,6 @@ base_learner* ect_setup(options_i& options, vw& all) data->class_boundary = 0.5; // as --link=logistic maps predictions in [0;1] learner& l = init_multiclass_learner(data, as_singleline(base), learn, predict, all.p, wpp); - + l.label_type = label_type_t::multi; return make_base(l); } diff --git a/vowpalwabbit/example.cc b/vowpalwabbit/example.cc index a66e8783ada..9db124e6675 100644 --- a/vowpalwabbit/example.cc +++ b/vowpalwabbit/example.cc @@ -35,17 +35,16 @@ float collision_cleanup(features& fs) namespace VW { -void copy_example_label(example* dst, example* src, size_t, void (*copy_label)(void*, void*)) + +VW_DEPRECATED("Copy the label object directly.") +void copy_example_label(example* dst, example* src, size_t, void (* /*copy_label*/)(polylabel&, polylabel&)) { - if (copy_label) - copy_label(&dst->l, &src->l); // TODO: we really need to delete_label on dst :( - else - dst->l = src->l; + dst->l = src->l; } void copy_example_metadata(bool /* audit */, example* dst, example* src) { - copy_array(dst->tag, src->tag); + dst->tag = src->tag; dst->example_counter = src->example_counter; dst->ft_offset = src->ft_offset; @@ -55,8 +54,7 @@ void copy_example_metadata(bool /* audit */, example* dst, example* src) dst->passthrough = nullptr; else { - dst->passthrough = new features; - dst->passthrough->deep_copy_from(*src->passthrough); + dst->passthrough = new features(*src->passthrough); } dst->loss = src->loss; dst->weight = src->weight; @@ -72,18 +70,25 @@ void copy_example_data(bool audit, example* dst, example* src) copy_example_metadata(audit, dst, src); // copy feature data - copy_array(dst->indices, src->indices); - for (namespace_index c : src->indices) dst->feature_space[c].deep_copy_from(src->feature_space[c]); 
+ dst->indices = src->indices; + for (namespace_index c : src->indices) + { + // Performs deep copy of namespace + dst->feature_space[c] = src->feature_space[c]; + } // copy_array(dst->atomics[i], src->atomics[i]); dst->num_features = src->num_features; dst->total_sum_feat_sq = src->total_sum_feat_sq; + + // Shallow copy dst->interactions = src->interactions; } -void copy_example_data(bool audit, example* dst, example* src, size_t label_size, void (*copy_label)(void*, void*)) +void copy_example_data( + bool audit, example* dst, example* src, size_t /*label_size*/, void (* /*copy_label*/)(polylabel&, polylabel&)) { copy_example_data(audit, dst, src); - copy_example_label(dst, src, label_size, copy_label); + dst->l = src->l; } void move_feature_namespace(example* dst, example* src, namespace_index c) @@ -124,7 +129,6 @@ feature* get_features(vw& all, example* ec, size_t& feature_map_len) features_and_source fs; fs.stride_shift = all.weights.stride_shift(); fs.mask = (uint64_t)all.weights.mask() >> all.weights.stride_shift(); - fs.feature_map = v_init(); GD::foreach_feature(all, *ec, fs); feature_map_len = fs.feature_map.size(); @@ -150,7 +154,7 @@ flat_example* flatten_example(vw& all, example* ec) { flat_example& fec = calloc_or_throw(); fec.l = ec->l; - fec.l.simple.weight = ec->weight; + fec.l.simple().weight = ec->weight; fec.tag_len = ec->tag.size(); if (fec.tag_len > 0) @@ -184,50 +188,37 @@ flat_example* flatten_sort_example(vw& all, example* ec) return fec; } +VW_DEPRECATED("") void free_flatten_example(flat_example* fec) { - // note: The label memory should be freed by by freeing the original example. if (fec) { - fec->fs.~features(); - if (fec->tag_len > 0) - free(fec->tag); - free(fec); + fec->~flat_example(); } } namespace VW { -example* alloc_examples(size_t, size_t count = 1) +example* alloc_examples(size_t count = 1) { example* ec = calloc_or_throw(count); if (ec == nullptr) return nullptr; for (size_t i = 0; i < count; i++) { - ec[i].ft_offset = 0; - // std::cerr << " alloc_example.indices.begin()=" << ec->indices.begin() << " end=" << ec->indices.end() << " // - // ld = " << ec->ld << "\t|| me = " << ec << std::endl; + new (&ec[i]) example(); } return ec; } -void dealloc_example(void (*delete_label)(void*), example& ec, void (*delete_prediction)(void*)) +example* alloc_examples(size_t, size_t count) { - if (delete_label) - delete_label(&ec.l); - - if (delete_prediction) - delete_prediction(&ec.pred); - - ec.tag.delete_v(); - - if (ec.passthrough) - { - delete ec.passthrough; - } + return alloc_examples(count); +} - ec.indices.delete_v(); +VW_DEPRECATED("You can just use the example destructor when deallocating now") +void dealloc_example(void (* /*delete_label*/)(polylabel&), example& ec, void (* /*delete_prediction*/)(void*)) +{ ec.~example(); } diff --git a/vowpalwabbit/example.h b/vowpalwabbit/example.h index e8573516c5c..08593f68acc 100644 --- a/vowpalwabbit/example.h +++ b/vowpalwabbit/example.h @@ -19,35 +19,9 @@ #include "conditional_contextual_bandit.h" #include "ccb_label.h" #include - -typedef union -{ - no_label::no_label empty; - label_data simple; - MULTICLASS::label_t multi; - COST_SENSITIVE::label cs; - CB::label cb; - CCB::label conditional_contextual_bandit; - CB_EVAL::label cb_eval; - MULTILABEL::labels multilabels; -} polylabel; - -inline void delete_scalars(void* v) -{ - v_array* preds = (v_array*)v; - preds->delete_v(); -} - -typedef union -{ - float scalar; - v_array scalars; // a sequence of scalar predictions - ACTION_SCORE::action_scores a_s; // a 
sequence of classes with scores. Also used for probabilities.
-  CCB::decision_scores_t decision_scores;
-  uint32_t multiclass;
-  MULTILABEL::labels multilabels;
-  float prob;  // for --probabilities --csoaa_ldf=mc
-} polyprediction;
+#include "vw_exception.h"
+#include "label.h"
+#include "prediction.h"

 IGNORE_DEPRECATED_USAGE_START
 struct example : public example_predict  // core example datatype.
@@ -74,10 +48,17 @@ struct example : public example_predict  // core example datatype.
   bool test_only;
   bool end_pass;  // special example indicating end of pass.
-  bool sorted;    // Are the features sorted or not?
-
+  bool sorted;  // Are the features sorted or not?
   VW_DEPRECATED("in_use has been removed, examples taken from the pool are assumed to be in use if there is a reference to them. Standalone examples are by definition always in use.")
   bool in_use = true;
+
+  ~example()
+  {
+    if (passthrough)
+    {
+      delete passthrough;
+    }
+  }
 };
 IGNORE_DEPRECATED_USAGE_END

@@ -88,7 +69,7 @@ struct flat_example
   polylabel l;

   size_t tag_len;
-  char* tag;  // An identifier for the example.
+  char* tag = nullptr;  // An identifier for the example.

   size_t example_counter;
   uint64_t ft_offset;
@@ -97,6 +78,81 @@ struct flat_example
   size_t num_features;      // precomputed, cause it's fast&easy.
   float total_sum_feat_sq;  // precomputed, cause it's kind of fast & easy.
   features fs;              // all the features
+
+  ~flat_example()
+  {
+    if (tag_len > 0)
+      free(tag);
+  }
+
+  flat_example(const flat_example& other)
+  {
+    l = other.l;
+    tag_len = other.tag_len;
+    if (tag_len > 0)
+    {
+      // Allocate before copying; the source buffer is owned by other.
+      tag = (char*)malloc(tag_len);
+      memcpy(tag, other.tag, tag_len);
+    }
+    example_counter = other.example_counter;
+    ft_offset = other.ft_offset;
+    global_weight = other.global_weight;
+    num_features = other.num_features;
+    total_sum_feat_sq = other.total_sum_feat_sq;
+    fs = other.fs;
+  }
+
+  flat_example& operator=(const flat_example& other)
+  {
+    if (this == &other)
+      return *this;
+    l = other.l;
+    tag_len = other.tag_len;
+    if (tag != nullptr)
+    {
+      free(tag);
+      tag = nullptr;
+    }
+    if (tag_len > 0)
+    {
+      tag = (char*)malloc(tag_len);
+      memcpy(tag, other.tag, tag_len);
+    }
+    example_counter = other.example_counter;
+    ft_offset = other.ft_offset;
+    global_weight = other.global_weight;
+    num_features = other.num_features;
+    total_sum_feat_sq = other.total_sum_feat_sq;
+    fs = other.fs;
+    return *this;
+  }
+
+  flat_example(flat_example&& other)
+  {
+    l = std::move(other.l);
+    tag_len = other.tag_len;
+    tag = other.tag;
+    // Leave other destructible; otherwise both objects would free the same buffer.
+    other.tag = nullptr;
+    other.tag_len = 0;
+    example_counter = other.example_counter;
+    ft_offset = other.ft_offset;
+    global_weight = other.global_weight;
+    num_features = other.num_features;
+    total_sum_feat_sq = other.total_sum_feat_sq;
+    fs = std::move(other.fs);
+  }
+
+  flat_example& operator=(flat_example&& other)
+  {
+    if (this == &other)
+      return *this;
+    l = std::move(other.l);
+    if (tag != nullptr)
+    {
+      free(tag);
+    }
+    tag_len = other.tag_len;
+    tag = other.tag;
+    other.tag = nullptr;
+    other.tag_len = 0;
+    example_counter = other.example_counter;
+    ft_offset = other.ft_offset;
+    global_weight = other.global_weight;
+    num_features = other.num_features;
+    total_sum_feat_sq = other.total_sum_feat_sq;
+    fs = std::move(other.fs);
+    return *this;
+  }
 };

 flat_example* flatten_example(vw& all, example* ec);
diff --git a/vowpalwabbit/example_predict.cc b/vowpalwabbit/example_predict.cc
index 0031a46b4e0..69853cb05c0 100644
--- a/vowpalwabbit/example_predict.cc
+++ b/vowpalwabbit/example_predict.cc
@@ -6,14 +6,10 @@
 safe_example_predict::safe_example_predict()
 {
-  indices = v_init<namespace_index>();
-  ft_offset = 0;
-  // feature_space is initialized through constructors
 }

 safe_example_predict::~safe_example_predict()
 {
-  indices.delete_v();
 }

 void safe_example_predict::clear()
diff 
--git a/vowpalwabbit/example_predict.h b/vowpalwabbit/example_predict.h index d167127bad1..2fbf5584174 100644 --- a/vowpalwabbit/example_predict.h +++ b/vowpalwabbit/example_predict.h @@ -37,7 +37,7 @@ struct example_predict v_array indices; std::array feature_space; // Groups of feature values. - uint64_t ft_offset; // An offset for all feature values. + uint64_t ft_offset = 0; // An offset for all feature values. // Interactions are specified by this vector of strings, where each string is an interaction and each char is a // namespace. @@ -48,7 +48,9 @@ struct example_predict }; // make sure we have an exception safe version of example_predict -class safe_example_predict : public example_predict +class +VW_DEPRECATED("example now uses C++ lifecycle functions. Please migrate to that instead for RAII needs.") +safe_example_predict : public example_predict { public: safe_example_predict(); diff --git a/vowpalwabbit/explore_eval.cc b/vowpalwabbit/explore_eval.cc index d03e3def73f..6bbf4a73d78 100644 --- a/vowpalwabbit/explore_eval.cc +++ b/vowpalwabbit/explore_eval.cc @@ -61,7 +61,7 @@ void output_example(vw& all, explore_eval& c, example& ec, multi_ex* ec_seq) size_t num_features = 0; float loss = 0.; - ACTION_SCORE::action_scores preds = (*ec_seq)[0]->pred.a_s; + const auto& preds = (*ec_seq)[0]->pred.action_probs(); for (size_t i = 0; i < (*ec_seq).size(); i++) if (!CB::ec_is_example_header(*(*ec_seq)[i])) @@ -84,13 +84,13 @@ void output_example(vw& all, explore_eval& c, example& ec, multi_ex* ec_seq) all.sd->update(holdout_example, labeled_example, loss, ec.weight, num_features); - for (int sink : all.final_prediction_sink) print_action_score(sink, ec.pred.a_s, ec.tag); + for (int sink : all.final_prediction_sink) print_action_score(sink, ec.pred.action_probs(), ec.tag); if (all.raw_prediction > 0) { std::string outputString; std::stringstream outputStringStream(outputString); - const auto& costs = ec.l.cb.costs; + const auto& costs = ec.l.cb().costs; for (size_t i = 0; i < costs.size(); i++) { @@ -131,18 +131,18 @@ void do_actual_learning(explore_eval& data, multi_learner& base, multi_ex& ec_se if (label_example != nullptr) // extract label { - data.action_label = label_example->l.cb; - label_example->l.cb = data.empty_label; + data.action_label = label_example->l.cb(); + label_example->l.cb() = data.empty_label; } multiline_learn_or_predict(base, ec_seq, data.offset); if (label_example != nullptr) // restore label - label_example->l.cb = data.action_label; + label_example->l.cb() = data.action_label; data.known_cost = CB_ADF::get_observed_cost(ec_seq); if (label_example != nullptr && is_learn) { - ACTION_SCORE::action_scores& a_s = ec_seq[0]->pred.a_s; + auto& a_s = ec_seq[0]->pred.action_probs(); float action_probability = 0; for (size_t i = 0; i < a_s.size(); i++) @@ -164,12 +164,12 @@ void do_actual_learning(explore_eval& data, multi_learner& base, multi_ex& ec_se example* ec_found = nullptr; for (example*& ec : ec_seq) { - if (ec->l.cb.costs.size() == 1 && ec->l.cb.costs[0].cost != FLT_MAX && ec->l.cb.costs[0].probability > 0) + if (ec->l.cb().costs.size() == 1 && ec->l.cb().costs[0].cost != FLT_MAX && ec->l.cb().costs[0].probability > 0) ec_found = ec; if (threshold > 1) ec->weight *= threshold; } - ec_found->l.cb.costs[0].probability = action_probability; + ec_found->l.cb().costs[0].probability = action_probability; multiline_learn_or_predict(base, ec_seq, data.offset); @@ -178,7 +178,7 @@ void do_actual_learning(explore_eval& data, multi_learner& base, multi_ex& ec_se float 
inv_threshold = 1.f / threshold; for (auto& ec : ec_seq) ec->weight *= inv_threshold; } - ec_found->l.cb.costs[0].probability = data.known_cost.probability; + ec_found->l.cb().costs[0].probability = data.known_cost.probability; data.update_count++; } } @@ -211,16 +211,14 @@ base_learner* explore_eval_setup(options_i& options, vw& all) if (!options.was_supplied("cb_explore_adf")) options.insert("cb_explore_adf", ""); - all.delete_prediction = nullptr; - multi_learner* base = as_multiline(setup_base(options, all)); all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; learner& l = init_learner(data, base, do_actual_learning, do_actual_learning, 1, prediction_type_t::action_probs); l.set_finish_example(finish_multiline_example); l.set_finish(finish); + l.label_type = label_type_t::cb; return make_base(l); } diff --git a/vowpalwabbit/expreplay.h b/vowpalwabbit/expreplay.h index 78ed788bf23..eb55d40bfb5 100644 --- a/vowpalwabbit/expreplay.h +++ b/vowpalwabbit/expreplay.h @@ -8,6 +8,7 @@ #include "parse_args.h" #include "rand48.h" #include +#include namespace ExpReplay { @@ -16,23 +17,13 @@ struct expreplay { vw* all; std::shared_ptr _random_state; - size_t N; // how big is the buffer? - example* buf; // the deep copies of examples (N of them) - bool* filled; // which of buf[] is filled + size_t N; // how big is the buffer? + std::vector buf; // the deep copies of examples (N of them) + + std::vector filled; // which of buf[] is filled size_t replay_count; // each time er.learn() is called, how many times do we call base.learn()? default=1 (in which // case we're just permuting) LEARNER::single_learner* base; - - ~expreplay() - { - for (size_t n = 0; n < N; n++) - { - lp.delete_label(&buf[n].l); - VW::dealloc_example(NULL, buf[n], NULL); // TODO: need to free label - } - free(buf); - free(filled); - } }; template @@ -40,7 +31,7 @@ void predict_or_learn(expreplay& er, LEARNER::single_learner& base, example& { // regardless of what happens, we must predict base.predict(ec); // if we're not learning, that's all that has to happen - if (!is_learn || lp.get_weight(&ec.l) == 0.) + if (!is_learn || lp.get_weight(ec.l) == 0.) return; for (size_t replay = 1; replay < er.replay_count; replay++) @@ -56,10 +47,11 @@ void predict_or_learn(expreplay& er, LEARNER::single_learner& base, example& er.filled[n] = true; VW::copy_example_data(er.all->audit, &er.buf[n], &ec); // don't copy the label - if (lp.copy_label) - lp.copy_label(&er.buf[n].l, &ec.l); - else - er.buf[n].l = ec.l; + + // By copying these, we don't need to know the type and it can be generic. + er.buf[n].l = ec.l; + // Technically we don't need to copy here, but this allows us to set the type of pred correctly. + er.buf[n].pred = ec.pred; } template @@ -81,6 +73,7 @@ void end_pass(expreplay& er) } } +// TODO Only lp dependency is on weight - which should be able to be removed once weight is an example concept. 
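// ---------------------------------------------------------------------------
// Because polylabel and polyprediction are now self-contained, copyable types,
// expreplay needs neither the per-type copy_label hook nor a hand-written
// destructor: standard containers own the buffer, and plain assignment copies
// whichever label kind is present. A condensed sketch of the new buffer
// management, using only the calls that appear in this diff:
//
//   // in expreplay_setup: RAII buffer, no calloc/free pairing
//   er->buf.resize(er->N);            // std::vector<example> runs destructors for us
//   er->filled.resize(er->N, false);
//
//   // in predict_or_learn: store a deep copy of the incoming example in slot n
//   VW::copy_example_data(er.all->audit, &er.buf[n], &ec);  // features and metadata
//   er.buf[n].l = ec.l;               // generic copy; no copy_label needed
//   er.buf[n].pred = ec.pred;         // keeps the stored prediction correctly typed
// ---------------------------------------------------------------------------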
template LEARNER::base_learner* expreplay_setup(VW::config::options_i& options, vw& all) { @@ -106,23 +99,25 @@ LEARNER::base_learner* expreplay_setup(VW::config::options_i& options, vw& all) er->all = &all; er->_random_state = all.get_random_state(); - er->buf = VW::alloc_examples(1, er->N); - er->buf->interactions = &all.interactions; - - if (er_level == 'c') - for (size_t n = 0; n < er->N; n++) er->buf[n].l.cs.costs = v_init(); + er->buf.resize(er->N); + for (auto& ex : er->buf) + { + ex.interactions = &all.interactions; + } - er->filled = calloc_or_throw(er->N); + er->filled.resize(er->N, false); if (!all.quiet) std::cerr << "experience replay level=" << er_level << ", buffer=" << er->N << ", replay count=" << er->replay_count << std::endl; - er->base = LEARNER::as_singleline(setup_base(options, all)); + // er is a unique ptr and after calling init_learner it is reset. So that we can reference base after init_learner we need to store it here. + auto base = LEARNER::as_singleline(setup_base(options, all)); + er->base = base; LEARNER::learner, example>* l = &init_learner(er, er->base, predict_or_learn, predict_or_learn); l->set_end_pass(end_pass); - + l->label_type = base->label_type; return make_base(*l); } } // namespace ExpReplay diff --git a/vowpalwabbit/ezexample.h b/vowpalwabbit/ezexample.h index d11966e666e..079f8a8965b 100644 --- a/vowpalwabbit/ezexample.h +++ b/vowpalwabbit/ezexample.h @@ -42,11 +42,12 @@ class ezexample example* get_new_example() { - example* new_ec = VW::new_unused_example(*vw_par_ref); - vw_par_ref->p->lp.default_label(&new_ec->l); + auto new_ec = VW::new_unused_example(*vw_par_ref); + vw_par_ref->p->lp.default_label(new_ec->l); new_ec->tag.clear(); new_ec->indices.clear(); - for (auto& i : new_ec->feature_space) i.clear(); + for (auto& i : new_ec->feature_space) + i.clear(); new_ec->ft_offset = 0; new_ec->num_features = 0; @@ -73,7 +74,8 @@ class ezexample quadratic_features_num = 0; quadratic_features_sqr = 0.; - for (bool& ns_exist : ns_exists) ns_exist = false; + for (bool& ns_exist : ns_exists) + ns_exist = false; example_changed_since_prediction = true; } @@ -97,7 +99,7 @@ class ezexample ezexample(vw* this_vw, bool multiline = false, vw* this_vw_parser = nullptr) { setup_new_ezexample(this_vw, multiline, this_vw_parser); - example_copies = v_init(); + example_copies.clear(); ec = get_new_example(); we_create_ec = true; @@ -115,7 +117,8 @@ class ezexample ec = this_ec; we_create_ec = false; - for (auto ns : ec->indices) ns_exists[ns] = true; + for (auto ns : ec->indices) + ns_exists[ns] = true; if (current_ns != 0) { str[0] = current_ns; @@ -131,7 +134,6 @@ class ezexample if (VW::is_ring_example(*vw_par_ref, ec)) VW::finish_example(*vw_par_ref, *ecc); example_copies.clear(); - free(example_copies.begin()); } bool ensure_ns_exists(char c) // returns TRUE iff we should ignore it :) @@ -230,7 +232,7 @@ class ezexample void mini_setup_example() { ec->partial_prediction = 0.; - ec->weight = vw_par_ref->p->lp.get_weight(&ec->l); + ec->weight = vw_par_ref->p->lp.get_weight(ec->l); ec->num_features -= quadratic_features_num; ec->total_sum_feat_sq -= quadratic_features_sqr; @@ -260,7 +262,7 @@ class ezexample float predict() { setup_for_predict(); - return ec->pred.scalar; + return ec->pred.scalar(); } float predict_partial() @@ -284,7 +286,7 @@ class ezexample else // is multiline { // we need to make a copy example* copy = get_new_example(); - VW::copy_example_data(vw_ref->audit, copy, ec, vw_par_ref->p->lp.label_size, vw_par_ref->p->lp.copy_label); + *copy = 
*ec; vw_ref->learn(*copy); example_copies.push_back(copy); } diff --git a/vowpalwabbit/feature_group.h b/vowpalwabbit/feature_group.h index f6ed984020e..4fcebc69f2d 100644 --- a/vowpalwabbit/feature_group.h +++ b/vowpalwabbit/feature_group.h @@ -273,68 +273,12 @@ struct features iterator_all end() { return iterator_all(_outer->values.end(), _outer->indicies.end(), _outer->space_names.end()); } }; - features() - { - values = v_init(); - indicies = v_init(); - space_names = v_init(); - sum_feat_sq = 0.f; - } + features() { sum_feat_sq = 0.f; } - ~features() { - values.delete_v(); - indicies.delete_v(); - space_names.delete_v(); - } - features(const features&) = delete; - features & operator=( const features& ) = delete; - - - // custom move operators required since we need to leave the old value in - // a null state to prevent freeing of shallow copied v_arrays - features(features&& other) : - values(std::move(other.values)), - indicies(std::move(other.indicies)), - space_names(std::move(other.space_names)), - sum_feat_sq(other.sum_feat_sq) - { - // We need to null out all the v_arrays to prevent double freeing during moves - auto & v = other.values; - v._begin = nullptr; - v._end = nullptr; - v.end_array = nullptr; - auto & i = other.indicies; - i._begin = nullptr; - i._end = nullptr; - i.end_array = nullptr; - auto & s = other.space_names; - s._begin = nullptr; - s._end = nullptr; - s.end_array = nullptr; - other.sum_feat_sq = 0; - } - features & operator=(features&& other) - { - values = std::move(other.values); - indicies = std::move(other.indicies); - space_names = std::move(other.space_names); - sum_feat_sq = other.sum_feat_sq; - // We need to null out all the v_arrays to prevent double freeing during moves - auto & v = other.values; - v._begin = nullptr; - v._end = nullptr; - v.end_array = nullptr; - auto & i = other.indicies; - i._begin = nullptr; - i._end = nullptr; - i.end_array = nullptr; - auto & s = other.space_names; - s._begin = nullptr; - s._end = nullptr; - s.end_array = nullptr; - other.sum_feat_sq = 0; - return *this; - } + features(const features&) = default; + features& operator=(const features&) = default; + features(features&& other) = default; + features& operator=(features&& other) = default; inline size_t size() const { return values.size(); } @@ -441,6 +385,7 @@ struct features return true; } + VW_DEPRECATED("Use copy constructor") void deep_copy_from(const features& src) { copy_array(values, src.values); diff --git a/vowpalwabbit/ftrl.cc b/vowpalwabbit/ftrl.cc index 79fc3958e07..1a393547a7c 100644 --- a/vowpalwabbit/ftrl.cc +++ b/vowpalwabbit/ftrl.cc @@ -77,7 +77,7 @@ template void predict(ftrl& b, single_learner&, example& ec) { ec.partial_prediction = GD::inline_predict(*b.all, ec); - ec.pred.scalar = GD::finalize_prediction(b.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(b.all->sd, ec.partial_prediction); if (audit) GD::print_audit_features(*(b.all), ec); } @@ -87,7 +87,7 @@ void multipredict( ftrl& b, base_learner&, example& ec, size_t count, size_t step, polyprediction* pred, bool finalize_predictions) { vw& all = *b.all; - for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial; + for (size_t c = 0; c < count; c++) pred[c].scalar() = ec.l.simple().initial; if (b.all->weights.sparse) { GD::multipredict_info mp = { @@ -100,14 +100,14 @@ void multipredict( GD::foreach_feature, uint64_t, GD::vec_add_multipredict>(all, ec, mp); } if (all.sd->contraction != 1.) 
- for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction; + for (size_t c = 0; c < count; c++) pred[c].scalar() *= (float)all.sd->contraction; if (finalize_predictions) - for (size_t c = 0; c < count; c++) pred[c].scalar = GD::finalize_prediction(all.sd, pred[c].scalar); + for (size_t c = 0; c < count; c++) pred[c].scalar() = GD::finalize_prediction(all.sd, pred[c].scalar()); if (audit) { for (size_t c = 0; c < count; c++) { - ec.pred.scalar = pred[c].scalar; + ec.pred.scalar() = pred[c].scalar(); GD::print_audit_features(all, ec); ec.ft_offset += (uint64_t)step; } @@ -229,7 +229,7 @@ void update_state_and_predict_cb(ftrl& b, single_learner&, example& ec) ec.partial_prediction = b.data.predict / ((float)((b.all->normalized_sum_norm_x + 1e-6) / b.total_weight)); - ec.pred.scalar = GD::finalize_prediction(b.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(b.all->sd, ec.partial_prediction); } void update_state_and_predict_pistol(ftrl& b, single_learner&, example& ec) @@ -238,26 +238,26 @@ void update_state_and_predict_pistol(ftrl& b, single_learner&, example& ec) GD::foreach_feature(*b.all, ec, b.data); ec.partial_prediction = b.data.predict; - ec.pred.scalar = GD::finalize_prediction(b.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(b.all->sd, ec.partial_prediction); } void update_after_prediction_proximal(ftrl& b, example& ec) { - b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight; + b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar(), ec.l.simple().label) * ec.weight; GD::foreach_feature(*b.all, ec, b.data); } void update_after_prediction_pistol(ftrl& b, example& ec) { - b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight; + b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar(), ec.l.simple().label) * ec.weight; GD::foreach_feature(*b.all, ec, b.data); } void update_after_prediction_cb(ftrl& b, example& ec) { - b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight; + b.data.update = b.all->loss->first_derivative(b.all->sd, ec.pred.scalar(), ec.l.simple().label) * ec.weight; GD::foreach_feature(*b.all, ec, b.data); } @@ -425,5 +425,6 @@ base_learner* ftrl_setup(options_i& options, vw& all) l->set_multipredict(multipredict); l->set_save_load(save_load); l->set_end_pass(end_pass); + l->label_type = label_type_t::simple; return make_base(*l); } diff --git a/vowpalwabbit/gd.cc b/vowpalwabbit/gd.cc index 3a79e2c1f94..a028ccfb71c 100644 --- a/vowpalwabbit/gd.cc +++ b/vowpalwabbit/gd.cc @@ -323,7 +323,7 @@ void print_features(vw& all, example& ec) void print_audit_features(vw& all, example& ec) { if (all.audit) - print_result_by_ref(all.stdout_fileno, ec.pred.scalar, -1, ec.tag); + print_result_by_ref(all.stdout_fileno, ec.pred.scalar(), -1, ec.tag); fflush(stdout); print_features(all, ec); } @@ -356,7 +356,7 @@ inline void vec_add_trunc(trunc_data& p, const float fx, float& fw) inline float trunc_predict(vw& all, example& ec, double gravity) { - trunc_data temp = {ec.l.simple.initial, (float)gravity}; + trunc_data temp = {ec.l.simple().initial, (float)gravity}; foreach_feature(all, ec, temp); return temp.prediction; } @@ -377,7 +377,13 @@ void predict(gd& g, base_learner&, example& ec) ec.partial_prediction = inline_predict(all, ec); ec.partial_prediction *= (float)all.sd->contraction; - ec.pred.scalar = 
finalize_prediction(all.sd, ec.partial_prediction); + + if (ec.pred.get_type() != prediction_type_t::unset) + { + ec.pred.reset(); + } + + ec.pred.init_as_scalar() = finalize_prediction(all.sd, ec.partial_prediction); if (audit) print_audit_features(all, ec); } @@ -387,7 +393,7 @@ inline void vec_add_trunc_multipredict(multipredict_info& mp, const float fx, { size_t index = fi; for (size_t c = 0; c < mp.count; c++, index += mp.step) - mp.pred[c].scalar += fx * trunc_weight(mp.weights[index], mp.gravity); + mp.pred[c].scalar() += fx * trunc_weight(mp.weights[index], mp.gravity); } template @@ -395,7 +401,7 @@ void multipredict( gd& g, base_learner&, example& ec, size_t count, size_t step, polyprediction* pred, bool finalize_predictions) { vw& all = *g.all; - for (size_t c = 0; c < count; c++) pred[c].scalar = ec.l.simple.initial; + for (size_t c = 0; c < count; c++) pred[c].scalar() = ec.l.simple().initial; if (g.all->weights.sparse) { multipredict_info mp = { @@ -414,14 +420,14 @@ void multipredict( foreach_feature, uint64_t, vec_add_multipredict>(all, ec, mp); } if (all.sd->contraction != 1.) - for (size_t c = 0; c < count; c++) pred[c].scalar *= (float)all.sd->contraction; + for (size_t c = 0; c < count; c++) pred[c].scalar() *= (float)all.sd->contraction; if (finalize_predictions) - for (size_t c = 0; c < count; c++) pred[c].scalar = finalize_prediction(all.sd, pred[c].scalar); + for (size_t c = 0; c < count; c++) pred[c].scalar() = finalize_prediction(all.sd, pred[c].scalar()); if (audit) { for (size_t c = 0; c < count; c++) { - ec.pred.scalar = pred[c].scalar; + ec.pred.scalar() = pred[c].scalar(); print_audit_features(all, ec); ec.ft_offset += (uint64_t)step; } @@ -533,12 +539,12 @@ template getSquareGrad(ec.pred.scalar, ld.label); + grad_squared *= all.loss->getSquareGrad(ec.pred.scalar(), ld.label); if (grad_squared == 0 && !stateless) return 1.; @@ -601,25 +607,25 @@ template 0 - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); vw& all = *g.all; float update = 0.; - ec.updated_prediction = ec.pred.scalar; - if (all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) > 0.) + ec.updated_prediction = ec.pred.scalar(); + if (all.loss->getLoss(all.sd, ec.pred.scalar(), ld.label) > 0.) { float pred_per_update = sensitivity(g, ec); float update_scale = get_scale(g, ec, ec.weight); if (invariant) - update = all.loss->getUpdate(ec.pred.scalar, ld.label, update_scale, pred_per_update); + update = all.loss->getUpdate(ec.pred.scalar(), ld.label, update_scale, pred_per_update); else - update = all.loss->getUnsafeUpdate(ec.pred.scalar, ld.label, update_scale); + update = all.loss->getUnsafeUpdate(ec.pred.scalar(), ld.label, update_scale); // changed from ec.partial_prediction to ld.prediction ec.updated_prediction += pred_per_update * update; if (all.reg_mode && fabs(update) > 1e-8) { - double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar, ld.label); + double dev1 = all.loss->first_derivative(all.sd, ec.pred.scalar(), ld.label); double eta_bar = (fabs(dev1) > 1e-8) ? (-update / dev1) : 0.0; if (fabs(dev1) > 1e-8) all.sd->contraction *= (1. 
- all.l2_lambda * eta_bar); @@ -629,7 +635,7 @@ float compute_update(gd& g, example& ec) } if (sparse_l2) - update -= g.sparse_l2 * ec.pred.scalar; + update -= g.sparse_l2 * ec.pred.scalar(); return update; } @@ -653,7 +659,7 @@ template 0 - assert(ec.l.simple.label != FLT_MAX); + assert(ec.l.simple().label != FLT_MAX); assert(ec.weight > 0.); g.predict(g, base, ec); update(g, base, ec); @@ -1242,6 +1248,7 @@ base_learner* setup(options_i& options, vw& all) ret.set_update(bare->update); ret.set_save_load(save_load); ret.set_end_pass(end_pass); + ret.label_type = label_type_t::simple; return make_base(ret); } diff --git a/vowpalwabbit/gd.h b/vowpalwabbit/gd.h index f0ae9896c64..82ef317d3c4 100644 --- a/vowpalwabbit/gd.h +++ b/vowpalwabbit/gd.h @@ -46,14 +46,14 @@ inline void vec_add_multipredict(multipredict_info& mp, const float fx, uint6 { i += fi; for (; i <= top; i += mp.step, ++p) - p->scalar += + p->scalar() += fx * mp.weights[i]; // TODO: figure out how to use weight_parameters::iterator (not using change_begin()) } else // TODO: this could be faster by unrolling into two loops for (size_t c = 0; c < mp.count; ++c, fi += (uint64_t)mp.step, ++p) { fi &= mask; - p->scalar += fx * mp.weights[fi]; + p->scalar() += fx * mp.weights[fi]; } } @@ -94,9 +94,9 @@ inline void foreach_feature(vw& all, example& ec, R& dat) inline float inline_predict(vw& all, example& ec) { return all.weights.sparse ? inline_predict(all.weights.sparse_weights, all.ignore_some_linear, - all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial) + all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple().initial) : inline_predict(all.weights.dense_weights, all.ignore_some_linear, - all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple.initial); + all.ignore_linear, *ec.interactions, all.permutations, ec, ec.l.simple().initial); } inline float sign(float w) diff --git a/vowpalwabbit/gd_mf.cc b/vowpalwabbit/gd_mf.cc index 662554c7995..ea3be16770f 100644 --- a/vowpalwabbit/gd_mf.cc +++ b/vowpalwabbit/gd_mf.cc @@ -27,7 +27,6 @@ struct gdmf uint32_t rank; size_t no_win_counter; uint64_t early_stop_thres; - ~gdmf() { scalars.delete_v(); } }; void mf_print_offset_features(gdmf& d, example& ec, size_t offset) @@ -77,7 +76,7 @@ void mf_print_offset_features(gdmf& d, example& ec, size_t offset) void mf_print_audit_features(gdmf& d, example& ec, size_t offset) { - print_result_by_ref(d.all->stdout_fileno, ec.pred.scalar, -1, ec.tag); + print_result_by_ref(d.all->stdout_fileno, ec.pred.scalar(), -1, ec.tag); mf_print_offset_features(d, ec, offset); } @@ -93,7 +92,7 @@ template float mf_predict(gdmf& d, example& ec, T& weights) { vw& all = *d.all; - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); float prediction = ld.initial; for (std::string& i : d.all->pairs) @@ -153,15 +152,15 @@ float mf_predict(gdmf& d, example& ec, T& weights) all.set_minmax(all.sd, ld.label); - ec.pred.scalar = GD::finalize_prediction(all.sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(all.sd, ec.partial_prediction); if (ld.label != FLT_MAX) - ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar, ld.label) * ec.weight; + ec.loss = all.loss->getLoss(all.sd, ec.pred.scalar(), ld.label) * ec.weight; if (all.audit) mf_print_audit_features(d, ec, 0); - return ec.pred.scalar; + return ec.pred.scalar(); } float mf_predict(gdmf& d, example& ec) @@ -184,12 +183,12 @@ template void mf_train(gdmf& d, example& ec, T& weights) { vw& all = *d.all; - label_data& ld = 
ec.l.simple; + label_data& ld = ec.l.simple(); // use final prediction to get update size // update = eta_t*(y-y_hat) where eta_t = eta/(3*t^p) * importance weight float eta_t = all.eta / powf((float)all.sd->t + ec.weight, (float)all.power_t) / 3.f * ec.weight; - float update = all.loss->getUpdate(ec.pred.scalar, ld.label, eta_t, 1.); // ec.total_sum_feat_sq); + float update = all.loss->getUpdate(ec.pred.scalar(), ld.label, eta_t, 1.); // ec.total_sum_feat_sq); float regularization = eta_t * all.l2_lambda; @@ -317,7 +316,7 @@ void learn(gdmf& d, single_learner&, example& ec) vw& all = *d.all; mf_predict(d, ec); - if (all.training && ec.l.simple.label != FLT_MAX) + if (all.training && ec.l.simple().label != FLT_MAX) mf_train(d, ec); } @@ -377,6 +376,6 @@ base_learner* gd_mf_setup(options_i& options, vw& all) learner& l = init_learner(data, learn, predict, (UINT64_ONE << all.weights.stride_shift())); l.set_save_load(save_load); l.set_end_pass(end_pass); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/gen_cs_example.cc b/vowpalwabbit/gen_cs_example.cc index 6df5c39360d..2f57a7bc272 100644 --- a/vowpalwabbit/gen_cs_example.cc +++ b/vowpalwabbit/gen_cs_example.cc @@ -46,7 +46,7 @@ void gen_cs_example_ips(multi_ex& examples, COST_SENSITIVE::label& cs_labels, fl cs_labels.costs.clear(); for (uint32_t i = 0; i < examples.size(); i++) { - CB::label ld = examples[i]->l.cb; + CB::label& ld = examples[i]->l.cb(); COST_SENSITIVE::wclass wc = {0., i, 0., 0.}; if (ld.costs.size() == 1 && ld.costs[0].cost != FLT_MAX) @@ -61,7 +61,7 @@ void gen_cs_example_dm(multi_ex& examples, COST_SENSITIVE::label& cs_labels) cs_labels.costs.clear(); for (uint32_t i = 0; i < examples.size(); i++) { - CB::label ld = examples[i]->l.cb; + CB::label& ld = examples[i]->l.cb(); COST_SENSITIVE::wclass wc = {0., i, 0., 0.}; if (ld.costs.size() == 1 && ld.costs[0].cost != FLT_MAX) @@ -145,7 +145,7 @@ void gen_cs_example_mtr(cb_to_cs_adf& c, multi_ex& ec_seq, COST_SENSITIVE::label cs_labels.costs.clear(); for (size_t i = 0; i < ec_seq.size(); i++) { - CB::label ld = ec_seq[i]->l.cb; + CB::label& ld = ec_seq[i]->l.cb(); COST_SENSITIVE::wclass wc = {0, 0, 0, 0}; diff --git a/vowpalwabbit/gen_cs_example.h b/vowpalwabbit/gen_cs_example.h index a4e1656009c..53f88fb4945 100644 --- a/vowpalwabbit/gen_cs_example.h +++ b/vowpalwabbit/gen_cs_example.h @@ -8,6 +8,7 @@ #include "reductions.h" #include "cb_algs.h" #include "vw_exception.h" +#include "util.h" namespace GEN_CS { @@ -50,7 +51,7 @@ void gen_cs_example_ips(cb_to_cs& c, CB::label& ld, COST_SENSITIVE::label& cs_ld template void gen_cs_example_dm(cb_to_cs& c, example& ec, COST_SENSITIVE::label& cs_ld) { // this implements the direct estimation method, where costs are directly specified by the learned regressor. 
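// The gen_cs_example_* hunks above switch from copying CB::label by value to binding
// it by reference. With labels owning their cost arrays through RAII, a by-value copy
// would deep-copy the costs for every example in the sequence. A hedged sketch of the
// difference using stand-in types (not the VW definitions):
#include <vector>
struct demo_cb_class { float cost; unsigned action; float probability; };
struct demo_cb_label { std::vector<demo_cb_class> costs; };
inline float first_cost(demo_cb_label& stored)
{
  demo_cb_label& ld = stored;  // reference: reads the costs in place, no allocation
  // demo_cb_label ld = stored;  // by value: would copy the whole costs vector per call
  return ld.costs.empty() ? 0.f : ld.costs[0].cost;
}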
- CB::label ld = ec.l.cb; + CB::label& ld = ec.l.cb(); float min = FLT_MAX; uint32_t argmin = 1; @@ -115,7 +116,7 @@ void gen_cs_example_dm(cb_to_cs& c, example& ec, COST_SENSITIVE::label& cs_ld) } } - ec.pred.multiclass = argmin; + ec.pred.multiclass() = argmin; } template @@ -262,26 +263,31 @@ void call_cs_ldf(LEARNER::multi_learner& base, multi_ex& examples, v_arrayl.cb); + cb_labels.push_back(std::move(ec->l.cb())); prepped_cs_labels[index].costs.clear(); prepped_cs_labels[index].costs.push_back(cs_labels.costs[index]); - ec->l.cs = prepped_cs_labels[index++]; + ec->l.reset(); + ec->l.init_as_cs(std::move(prepped_cs_labels[index++])); ec->ft_offset = offset; } + swap_to_scores(examples); // 2nd: predict for each ex // // call base.predict for all examples if (is_learn) base.learn(examples, (int32_t)id); else base.predict(examples, (int32_t)id); + swap_to_probs(examples); // 3rd: restore cb_label for each example - // (**ec).l.cb = array.element. + // (**ec).l.cb() = array.element. // and restore offsets for (size_t i = 0; i < examples.size(); ++i) { - examples[i]->l.cb = cb_labels[i]; + prepped_cs_labels[i].costs = std::move(examples[i]->l.cs().costs); + examples[i]->l.reset(); + examples[i]->l.init_as_cb(std::move(cb_labels[i])); examples[i]->ft_offset = saved_offset; } } diff --git a/vowpalwabbit/global_data.cc b/vowpalwabbit/global_data.cc index dce672af3b2..4cdbb82b832 100644 --- a/vowpalwabbit/global_data.cc +++ b/vowpalwabbit/global_data.cc @@ -312,6 +312,11 @@ vw_ostream::vw_ostream() : std::ostream(&buf), buf(*this), trace_context(nullptr trace_listener = trace_listener_cerr; } +void delete_polyprediction(polyprediction& pred) +{ + pred.reset(); +} + IGNORE_DEPRECATED_USAGE_START vw::vw() { @@ -323,8 +328,6 @@ vw::vw() sd->max_label = 0; sd->min_label = 0; - label_type = label_type_t::simple; - l = nullptr; scorer = nullptr; cost_sensitive = nullptr; @@ -335,7 +338,7 @@ vw::vw() current_pass = 0; data_filename = ""; - delete_prediction = nullptr; + delete_prediction = &delete_polyprediction; bfgs = false; no_bias = false; diff --git a/vowpalwabbit/global_data.h b/vowpalwabbit/global_data.h index d08ebb77894..897d1ba8297 100644 --- a/vowpalwabbit/global_data.h +++ b/vowpalwabbit/global_data.h @@ -319,18 +319,6 @@ enum AllReduceType class AllReduce; - -enum class label_type_t -{ - simple, - cb, // contextual-bandit - cb_eval, // contextual-bandit evaluation - cs, // cost-sensitive - multi, - mc, - ccb // conditional contextual-bandit -}; - struct rand_state { private: @@ -372,6 +360,11 @@ struct vw void (*set_minmax)(shared_data* sd, float label); + label_type_t get_label_type() const + { + return l->label_type; + } + uint64_t current_pass; uint32_t num_bits; // log_2 of the number of features. @@ -462,8 +455,9 @@ struct vw // This array is required to be value initialized so that the std::vectors are constructed. std::array>, NUM_NAMESPACES> namespace_dictionaries{}; // each namespace has a list of dictionaries attached to it - - void (*delete_prediction)(void*); + + VW_DEPRECATED("Use the polyprediciton destructor") + void (*delete_prediction)(polyprediction&); bool audit; // should I print lots of debugging information? bool quiet; // Should I suppress progress-printing of updates? bool training; // Should I train if lable data is available? @@ -537,13 +531,13 @@ struct vw vw(); std::shared_ptr get_random_state() { return _random_state_sp; } - vw(const vw&) = delete; - vw& operator=(const vw&) = delete; - - // vw object cannot be moved as many objects hold a pointer to it. 
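// The call_cs_ldf hunk above follows a save/convert/restore discipline around the base
// call: move the CB label out, construct a CS label in place, run the base reduction,
// then put the CB label back. Sketched against the polylabel API that label.h (later in
// this diff) introduces; the learner plumbing is a stand-in callback, not the VW API:
#include <utility>
template <typename BaseCall>
void with_cs_label(example& ec, COST_SENSITIVE::label prepped, BaseCall call_base)
{
  CB::label saved = std::move(ec.l.cb());  // move the CB label out of the union
  ec.l.reset();                            // return the union to the unset state
  ec.l.init_as_cs(std::move(prepped));     // construct the CS label in place
  call_base();                             // base reduction sees a cost-sensitive example
  ec.l.reset();
  ec.l.init_as_cb(std::move(saved));       // restore the original CB label
}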
- // That pointer would be invalidated if it were to be moved. - vw(const vw&&) = delete; - vw& operator=(const vw&&) = delete; + vw(const vw&) = delete; + vw& operator=(const vw&) = delete; + + // vw object cannot be moved as many objects hold a pointer to it. + // That pointer would be invalidated if it were to be moved. + vw(const vw&&) = delete; + vw& operator=(const vw&&) = delete; }; VW_DEPRECATED("Use print_result_by_ref instead") diff --git a/vowpalwabbit/interact.cc b/vowpalwabbit/interact.cc index 3d9786cf6e9..7438caa1c0d 100644 --- a/vowpalwabbit/interact.cc +++ b/vowpalwabbit/interact.cc @@ -112,7 +112,8 @@ void predict_or_learn(interact& in, LEARNER::single_learner& base, example& ec) ec.num_features -= f1.size(); ec.num_features -= f2.size(); - in.feat_store.deep_copy_from(f1); + // Deep copy of features + in.feat_store = f1; multiply(f1, f2, in); ec.total_sum_feat_sq += f1.sum_feat_sq; @@ -144,7 +145,9 @@ void predict_or_learn(interact& in, LEARNER::single_learner& base, example& ec) memmove(&ec.indices[n2_i + 1], &ec.indices[n2_i], sizeof(unsigned char) * (ec.indices.size() - n2_i - 1)); ec.indices[n2_i] = in.n2; - f1.deep_copy_from(in.feat_store); + // Deep copy of features + f1 = in.feat_store; + ec.total_sum_feat_sq = in.total_sum_feat_sq; ec.num_features = in.num_features; } @@ -174,9 +177,9 @@ LEARNER::base_learner* interact_setup(options_i& options, vw& all) std::cerr << "Interacting namespaces " << data->n1 << " and " << data->n2 << std::endl; data->all = &all; - LEARNER::learner* l; - l = &LEARNER::init_learner( - data, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn, 1); - - return make_base(*l); + auto base = as_singleline(setup_base(options, all)); + auto& l = LEARNER::init_learner( + data, base, predict_or_learn, predict_or_learn, 1); + l.label_type = base->label_type; + return make_base(l); } diff --git a/vowpalwabbit/interactions.cc b/vowpalwabbit/interactions.cc index 80ff61f672e..a786bab9f2d 100644 --- a/vowpalwabbit/interactions.cc +++ b/vowpalwabbit/interactions.cc @@ -234,7 +234,7 @@ void eval_count_of_generated_ft(vw& all, example& ec, size_t& new_features_cnt, new_features_cnt = 0; new_features_value = 0.; - v_array results = v_init(); + v_array results; if (all.permutations) { @@ -388,8 +388,6 @@ void eval_count_of_generated_ft(vw& all, example& ec, size_t& new_features_cnt, << correct_features_value << std::endl; #endif } - - results.delete_v(); } } // namespace INTERACTIONS diff --git a/vowpalwabbit/interactions_predict.h b/vowpalwabbit/interactions_predict.h index 1638761eefa..71943c7f2c4 100644 --- a/vowpalwabbit/interactions_predict.h +++ b/vowpalwabbit/interactions_predict.h @@ -106,7 +106,7 @@ inline void generate_interactions(std::vector& interactions, bool p // const uint64_t stride_shift = all.stride_shift; // it seems we don't need stride shift in FTRL-like hash // statedata for generic non-recursive iteration - v_array state_data = v_init(); + v_array state_data; feature_gen_data empty_ns_data; // micro-optimization. don't want to call its constructor each time in loop. 
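// With the special member functions of features defaulted (feature_group.h above),
// deep_copy_from reduces to plain assignment, which is what interact.cc now uses.
// A one-line equivalence, assuming v_array itself carries owning copy semantics on
// this branch:
inline void copy_features(const features& src, features& dst)
{
  dst = src;  // deep copy: copies values, indicies, space_names and sum_feat_sq
  // dst.deep_copy_from(src);  // deprecated spelling of the same operation
}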
empty_ns_data.loop_idx = 0; @@ -371,7 +371,5 @@ inline void generate_interactions(std::vector<std::string>& interactions, bool p } // while do_it } } // foreach interaction in all.interactions - - state_data.delete_v(); } } // namespace INTERACTIONS diff --git a/vowpalwabbit/io_buf.h b/vowpalwabbit/io_buf.h index 745fb6701a7..2ed57064bb4 100644 --- a/vowpalwabbit/io_buf.h +++ b/vowpalwabbit/io_buf.h @@ -69,6 +69,27 @@ class io_buf static constexpr int READ = 1; static constexpr int WRITE = 2; + io_buf(io_buf& other) = delete; + io_buf& operator=(io_buf& other) = delete; + io_buf(io_buf&& other) = delete; + io_buf& operator=(io_buf&& other) = delete; + + virtual ~io_buf() + { +#ifdef _WIN32 + int f = _fileno(stdin); +#else + int f = fileno(stdin); +#endif + + while (!files.empty() && files.last() == f) + files.pop(); + + // Calling a virtual function in a constructor or destructor will actually result + // in calling this class's implementation. Make it explicit so it is less confusing. + while (io_buf::close_file()); + } + void verify_hash(bool verify) { _verify_hash = verify; @@ -144,20 +165,10 @@ class io_buf io_buf() : _verify_hash{false}, _hash{0}, count{0}, current{0} { - space = v_init(); - files = v_init(); - currentname = v_init(); - finalname = v_init(); space.resize(INITIAL_BUFF_SIZE); head = space.begin(); } - virtual ~io_buf() - { - files.delete_v(); - space.delete_v(); - } - void set(char* p) { head = p; } virtual size_t num_files() { return files.size(); } @@ -213,12 +224,6 @@ class io_buf static void close_file_or_socket(int f); - void close_files() - { - while (close_file()) - ; - } - static bool is_socket(int f); void buf_write(char*& pointer, size_t n); diff --git a/vowpalwabbit/kernel_svm.cc b/vowpalwabbit/kernel_svm.cc index 5a369a4273f..c3f46d238ac 100644 --- a/vowpalwabbit/kernel_svm.cc +++ b/vowpalwabbit/kernel_svm.cc @@ -41,7 +41,6 @@ struct svm_example v_array<float> krow; flat_example ex; - ~svm_example(); void init_svm_example(flat_example* fec); int compute_kernels(svm_params& params); int clear_kernels(); @@ -73,9 +72,7 @@ void free_svm_model(svm_model* model) model->support_vec[i] = 0; } - model->support_vec.delete_v(); - model->alpha.delete_v(); - model->delta.delete_v(); + model->~svm_model(); free(model); } @@ -144,17 +141,6 @@ void svm_example::init_svm_example(flat_example* fec) free(fec); } -svm_example::~svm_example() -{ - krow.delete_v(); - // free flatten example contents - //flat_example* fec = &calloc_or_throw<flat_example>(); - //*fec = ex; - //free_flatten_example(fec); // free contents of flat example and frees fec. 
- if (ex.tag_len > 0) - free(ex.tag); -} - float kernel_function(const flat_example* fec1, const flat_example* fec2, void* params, size_t kernel_type); int svm_example::compute_kernels(svm_params& params) @@ -273,7 +259,6 @@ int save_load_flat_example(io_buf& model_file, bool read, flat_example*& fec) { features& fs = fec->fs; size_t len = fs.size(); - fs.values = v_init(); fs.values.resize(len); brw = model_file.bin_read_fixed((char*)fs.values.begin(), len * sizeof(feature_value), ""); if (!brw) @@ -281,7 +266,7 @@ int save_load_flat_example(io_buf& model_file, bool read, flat_example*& fec) fs.values.end() = fs.values.begin() + len; len = fs.indicies.size(); - fs.indicies = v_init(); + fs.indicies.clear(); fs.indicies.resize(len); brw = model_file.bin_read_fixed((char*)fs.indicies.begin(), len * sizeof(feature_index), ""); if (!brw) @@ -471,7 +456,7 @@ void predict(svm_params& params, single_learner&, example& ec) sec->init_svm_example(fec); float score; predict(params, &sec, &score, 1); - ec.pred.scalar = score; + ec.pred.scalar() = score; sec->~svm_example(); free(sec); } @@ -484,9 +469,9 @@ size_t suboptimality(svm_model* model, double* subopt) double max_val = 0; for (size_t i = 0; i < model->num_support; i++) { - float tmp = model->alpha[i] * model->support_vec[i]->ex.l.simple.label; + float tmp = model->alpha[i] * model->support_vec[i]->ex.l.simple().label; - if ((tmp < model->support_vec[i]->ex.l.simple.weight && model->delta[i] < 0) || (tmp > 0 && model->delta[i] > 0)) + if ((tmp < model->support_vec[i]->ex.l.simple().weight && model->delta[i] < 0) || (tmp > 0 && model->delta[i] > 0)) subopt[i] = fabs(model->delta[i]); else subopt[i] = 0; @@ -555,7 +540,7 @@ bool update(svm_params& params, size_t pos) bool overshoot = false; // params.all->opts_n_args.trace_message<<"Updating model "<num_support<<" "; svm_example* fec = model->support_vec[pos]; - label_data& ld = fec->ex.l.simple; + label_data& ld = fec->ex.l.simple(); fec->compute_kernels(params); float* inprods = fec->krow.begin(); float alphaKi = dense_dot(inprods, model->alpha, model->num_support); @@ -569,8 +554,8 @@ bool update(svm_params& params, size_t pos) // std::cout<num_support<<" "<delta[pos]<<" " << ai<<" "< fec->ex.l.simple.weight) - ai = fec->ex.l.simple.weight; + if (ai > fec->ex.l.simple().weight) + ai = fec->ex.l.simple().weight; else if (ai < 0) ai = 0; @@ -589,7 +574,7 @@ bool update(svm_params& params, size_t pos) for (size_t i = 0; i < model->num_support; i++) { - label_data& ldi = model->support_vec[i]->ex.l.simple; + label_data& ldi = model->support_vec[i]->ex.l.simple(); model->delta[i] += diff * inprods[i] * ldi.label / params.lambda; } @@ -646,7 +631,7 @@ void sync_queries(vw& all, svm_params& params, bool* train_pool) { queries = calloc_or_throw(total_sum); memcpy(queries + prev_sum, b->space.begin(), b->head - b->space.begin()); - b->space.delete_v(); + b->space.clear(); all_reduce(all, queries, total_sum); b->space.begin() = queries; @@ -667,7 +652,6 @@ void sync_queries(vw& all, svm_params& params, bool* train_pool) // for(int j = 0;j < fec->feature_map_len;j++) // params.all->opts_n_args.trace_message<feature_map[j].weight_index<<":"<feature_map[j].x<<" "; // params.all->opts_n_args.trace_message<< endl; - // params.pool[i]->in_use = true; // params.current_t += ((label_data*) params.pool[i]->ld)->weight; // params.pool[i]->example_t = params.current_t; } @@ -731,7 +715,7 @@ void train(svm_params& params) if (params._random_state->get_and_update_random() < queryp) { svm_example* fec = 
params.pool[i]; - fec->ex.l.simple.weight *= 1 / queryp; + fec->ex.l.simple().weight *= 1 / queryp; train_pool[i] = 1; } } @@ -833,9 +817,9 @@ void learn(svm_params& params, single_learner&, example& ec) sec->init_svm_example(fec); float score = 0; predict(params, &sec, &score, 1); - ec.pred.scalar = score; + ec.pred.scalar() = score; // std::cout<<"Score = "<<score<<std::endl; if (params.all->training && ec.example_counter % 100 == 0) trim_cache(params); @@ -942,5 +926,6 @@ LEARNER::base_learner* kernel_svm_setup(options_i& options, vw& all) learner<svm_params, example>& l = init_learner(params, learn, predict, 1); l.set_save_load(save_load); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/label.h b/vowpalwabbit/label.h new file mode 100644 index 00000000000..8e518f4f0b2 --- /dev/null +++ b/vowpalwabbit/label.h @@ -0,0 +1,428 @@ +#pragma once + +/* +When a new label type needs to be added the following actions must be taken: +- LABEL_TYPE is the type that will be used +- LABEL_NAME is the name to identify this label type +Steps: + 1. Add a new variant to label_type_t called LABEL_NAME + 2. Add the corresponding row to to_string: + TO_STRING_CASE(label_type_t::LABEL_NAME) + 3. Add the new type to the union: + LABEL_TYPE _LABEL_NAME; + 4. Add the corresponding row to polylabel::copy_from + case (label_type_t::LABEL_NAME): + init_as_LABEL_NAME(other._LABEL_NAME); + break; + 5. Add the corresponding row to polylabel::move_from + case (label_type_t::LABEL_NAME): + init_as_LABEL_NAME(std::move(other._LABEL_NAME)); + break; + 6. Add the corresponding row to polylabel::reset + case (label_type_t::LABEL_NAME): + destruct(_LABEL_NAME); + break; + 7. Add the three methods that correspond to the new type according to this template + template <typename... Args> + LABEL_TYPE& init_as_LABEL_NAME(Args&&... 
args) + { + ensure_is_type(label_type_t::unset); + new (&_LABEL_NAME) LABEL_TYPE(std::forward(args)...); + _tag = label_type_t::LABEL_NAME; + return _LABEL_NAME; + } + + const LABEL_TYPE& LABEL_NAME() const + { + ensure_is_type(label_type_t::LABEL_NAME); + return _LABEL_NAME; + } + + LABEL_TYPE& LABEL_NAME() + { + ensure_is_type(label_type_t::LABEL_NAME); + return _LABEL_NAME; + } +*/ + +#include "no_label.h" +#include "simple_label.h" +#include "multiclass.h" +#include "multilabel.h" +#include "cost_sensitive.h" +#include "cb.h" +#include "example_predict.h" +#include "ccb_label.h" + +#define TO_STRING_CASE(enum_type) \ + case enum_type: \ + return #enum_type; + +enum class label_type_t +{ + unset, + empty, + simple, + multi, + cs, + cb, + conditional_contextual_bandit, + cb_eval, + multilabels +}; + +inline const char* to_string(label_type_t label_type) +{ + switch (label_type) + { + TO_STRING_CASE(label_type_t::unset) + TO_STRING_CASE(label_type_t::empty) + TO_STRING_CASE(label_type_t::simple) + TO_STRING_CASE(label_type_t::multi) + TO_STRING_CASE(label_type_t::cs) + TO_STRING_CASE(label_type_t::cb) + TO_STRING_CASE(label_type_t::conditional_contextual_bandit) + TO_STRING_CASE(label_type_t::cb_eval) + TO_STRING_CASE(label_type_t::multilabels) + default: + return ""; + } +} + +struct polylabel +{ + private: + union { + no_label::no_label _empty; + label_data _simple; + MULTICLASS::label_t _multi; + COST_SENSITIVE::label _cs; + CB::label _cb; + CCB::label _conditional_contextual_bandit; + CB_EVAL::label _cb_eval; + MULTILABEL::labels _multilabels; + }; + label_type_t _tag; + + inline void ensure_is_type(label_type_t type) const + { +#ifndef NDEBUG + if (_tag != type) + { + THROW("Expected type: " << to_string(type) << ", but found: " << to_string(_tag)); + } +#else + _UNUSED(type); +#endif + } + + template + void destruct(T& item) + { + item.~T(); + } + + // These two functions only differ by parameter + void copy_from(const polylabel& other) + { + switch (other._tag) + { + case (label_type_t::unset): + break; + case (label_type_t::empty): + init_as_empty(other._empty); + break; + case (label_type_t::simple): + init_as_simple(other._simple); + break; + case (label_type_t::multi): + init_as_multi(other._multi); + break; + case (label_type_t::cs): + init_as_cs(other._cs); + break; + case (label_type_t::cb): + init_as_cb(other._cb); + break; + case (label_type_t::conditional_contextual_bandit): + init_as_ccb(other._conditional_contextual_bandit); + break; + case (label_type_t::cb_eval): + init_as_cb_eval(other._cb_eval); + break; + case (label_type_t::multilabels): + init_as_multilabels(other._multilabels); + break; + default:; + } + } + + void move_from(polylabel&& other) + { + switch (other._tag) + { + case (label_type_t::unset): + break; + case (label_type_t::empty): + init_as_empty(std::move(other._empty)); + break; + case (label_type_t::simple): + init_as_simple(std::move(other._simple)); + break; + case (label_type_t::multi): + init_as_multi(std::move(other._multi)); + break; + case (label_type_t::cs): + init_as_cs(std::move(other._cs)); + break; + case (label_type_t::cb): + init_as_cb(std::move(other._cb)); + break; + case (label_type_t::conditional_contextual_bandit): + init_as_ccb(std::move(other._conditional_contextual_bandit)); + break; + case (label_type_t::cb_eval): + init_as_cb_eval(std::move(other._cb_eval)); + break; + case (label_type_t::multilabels): + init_as_multilabels(std::move(other._multilabels)); + break; + default:; + } + } + + public: + polylabel() { _tag = 
label_type_t::unset; // Perhaps we should memset here? + }; + ~polylabel() { reset(); } + + polylabel(polylabel&& other) + { + _tag = label_type_t::unset; + move_from(std::move(other)); + } + + polylabel& operator=(polylabel&& other) + { + reset(); + move_from(std::move(other)); + return *this; + } + + polylabel(const polylabel& other) { + _tag = label_type_t::unset; + copy_from(other); + } + + polylabel& operator=(const polylabel& other) { + reset(); + copy_from(other); + return *this; + } + + label_type_t get_type() const { return _tag; } + + void reset() + { + switch (_tag) + { + case (label_type_t::unset): + // Nothing to do! Whatever was in here has already been destroyed. + return; + case (label_type_t::empty): + destruct(_empty); + break; + case (label_type_t::simple): + destruct(_simple); + break; + case (label_type_t::multi): + destruct(_multi); + break; + case (label_type_t::cs): + destruct(_cs); + break; + case (label_type_t::cb): + destruct(_cb); + break; + case (label_type_t::conditional_contextual_bandit): + destruct(_conditional_contextual_bandit); + break; + case (label_type_t::cb_eval): + destruct(_cb_eval); + break; + case (label_type_t::multilabels): + destruct(_multilabels); + break; + default:; + } + + _tag = label_type_t::unset; + } + + template + no_label::no_label& init_as_empty(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_empty) no_label::no_label(std::forward(args)...); + _tag = label_type_t::empty; + return _empty; + } + + const no_label::no_label& empty() const + { + ensure_is_type(label_type_t::empty); + return _empty; + } + + no_label::no_label& empty() + { + ensure_is_type(label_type_t::empty); + return _empty; + } + + template + label_data& init_as_simple(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_simple) label_data(std::forward(args)...); + _tag = label_type_t::simple; + return _simple; + } + + const label_data& simple() const + { + ensure_is_type(label_type_t::simple); + return _simple; + } + + label_data& simple() + { + ensure_is_type(label_type_t::simple); + return _simple; + } + + template + MULTICLASS::label_t& init_as_multi(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_multi) MULTICLASS::label_t(std::forward(args)...); + _tag = label_type_t::multi; + return _multi; + } + + const MULTICLASS::label_t& multi() const + { + ensure_is_type(label_type_t::multi); + return _multi; + } + + MULTICLASS::label_t& multi() + { + ensure_is_type(label_type_t::multi); + return _multi; + } + + template + COST_SENSITIVE::label& init_as_cs(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_cs) COST_SENSITIVE::label(std::forward(args)...); + _tag = label_type_t::cs; + return _cs; + } + + const COST_SENSITIVE::label& cs() const + { + ensure_is_type(label_type_t::cs); + return _cs; + } + + COST_SENSITIVE::label& cs() + { + ensure_is_type(label_type_t::cs); + return _cs; + } + + template + CB::label& init_as_cb(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_cb) CB::label(std::forward(args)...); + _tag = label_type_t::cb; + return _cb; + } + const CB::label& cb() const + { + ensure_is_type(label_type_t::cb); + return _cb; + } + + CB::label& cb() + { + ensure_is_type(label_type_t::cb); + return _cb; + } + + template + CCB::label& init_as_ccb(Args&&... 
args) + { + ensure_is_type(label_type_t::unset); + new (&_conditional_contextual_bandit) CCB::label(std::forward(args)...); + _tag = label_type_t::conditional_contextual_bandit; + return _conditional_contextual_bandit; + } + + const CCB::label& ccb() const + { + ensure_is_type(label_type_t::conditional_contextual_bandit); + return _conditional_contextual_bandit; + } + + CCB::label& ccb() + { + ensure_is_type(label_type_t::conditional_contextual_bandit); + return _conditional_contextual_bandit; + } + + template + CB_EVAL::label& init_as_cb_eval(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_cb_eval) CB_EVAL::label(std::forward(args)...); + _tag = label_type_t::cb_eval; + return _cb_eval; + } + + const CB_EVAL::label& cb_eval() const + { + ensure_is_type(label_type_t::cb_eval); + return _cb_eval; + } + + CB_EVAL::label& cb_eval() + { + ensure_is_type(label_type_t::cb_eval); + return _cb_eval; + } + + template + MULTILABEL::labels& init_as_multilabels(Args&&... args) + { + ensure_is_type(label_type_t::unset); + new (&_multilabels) MULTILABEL::labels(std::forward(args)...); + _tag = label_type_t::multilabels; + return _multilabels; + } + + const MULTILABEL::labels& multilabels() const + { + ensure_is_type(label_type_t::multilabels); + return _multilabels; + } + + MULTILABEL::labels& multilabels() + { + ensure_is_type(label_type_t::multilabels); + return _multilabels; + } +}; diff --git a/vowpalwabbit/label_dictionary.cc b/vowpalwabbit/label_dictionary.cc index 3eeeb79f01c..c1f19aabda3 100644 --- a/vowpalwabbit/label_dictionary.cc +++ b/vowpalwabbit/label_dictionary.cc @@ -95,9 +95,7 @@ void set_label_features(label_feature_map& lfm, size_t lab, features& fs) { if (lfm.find(lab) == lfm.end()) return; - features tmp_features; - tmp_features.deep_copy_from(fs); - lfm.emplace(lab, std::move(tmp_features)); + lfm.emplace(lab, fs); } } // namespace LabelDict diff --git a/vowpalwabbit/label_parser.cc b/vowpalwabbit/label_parser.cc new file mode 100644 index 00000000000..79bddd8c5c5 --- /dev/null +++ b/vowpalwabbit/label_parser.cc @@ -0,0 +1,12 @@ +#include "label_parser.h" +#include "label.h" + +void polylabel_copy_label(polylabel& left, polylabel& right) +{ + left = right; +} + +void polylabel_delete_label(polylabel& label) +{ + label.reset(); +} \ No newline at end of file diff --git a/vowpalwabbit/label_parser.h b/vowpalwabbit/label_parser.h index 3a8fa533070..1ab18049f4a 100644 --- a/vowpalwabbit/label_parser.h +++ b/vowpalwabbit/label_parser.h @@ -12,19 +12,26 @@ struct parser; struct shared_data; +struct polylabel; + +void polylabel_copy_label(polylabel& left, polylabel& right); +void polylabel_delete_label(polylabel& label); struct label_parser { - void (*default_label)(void*); - void (*parse_label)(parser*, shared_data*, void*, v_array&); - void (*cache_label)(void*, io_buf& cache); - size_t (*read_cached_label)(shared_data*, void*, io_buf& cache); - void (*delete_label)(void*); - float (*get_weight)(void*); - void (*copy_label)(void*, void*); // copy_label(dst,src) performs a DEEP copy of src into dst (dst is allocated + void (*default_label)(polylabel&); + void (*parse_label)(parser*, shared_data*, polylabel&, v_array&); + void (*cache_label)(polylabel&, io_buf& cache); + size_t (*read_cached_label)(shared_data*, polylabel&, io_buf& cache); + VW_DEPRECATED("Removed") + void (*delete_label)(polylabel&); + float (*get_weight)(polylabel&); + VW_DEPRECATED("Removed") + void (*copy_label)(polylabel&, polylabel&); // copy_label(dst,src) performs a DEEP copy of src into dst 
(dst is allocated // correctly). if this function is nullptr, then we assume that a memcpy of size // label_size is sufficient, so you need only specify this function if your label - // constains, for instance, pointers (otherwise you'll get double-free errors) - bool (*test_label)(void*); + // constains, for instance, pointers (otherwise you'll get double-free errors) size_t label_size; + bool (*test_label)(polylabel&); + VW_DEPRECATED("Removed") size_t label_size; }; diff --git a/vowpalwabbit/lda_core.cc b/vowpalwabbit/lda_core.cc index a857ebef068..a8aaaf47f5b 100644 --- a/vowpalwabbit/lda_core.cc +++ b/vowpalwabbit/lda_core.cc @@ -87,18 +87,6 @@ struct lda inline float powf(float x, float p); inline void expdigammify(vw &all, float *gamma); inline void expdigammify_2(vw &all, float *gamma, float *norm); - - ~lda() - { - Elogtheta.delete_v(); - decay_levels.delete_v(); - total_new.delete_v(); - examples.delete_v(); - total_lambda.delete_v(); - doc_lengths.delete_v(); - digammas.delete_v(); - v.delete_v(); - } }; // #define VW_NO_INLINE_SIMD @@ -677,8 +665,9 @@ static inline float find_cw(lda &l, float *u_for_w, float *v) namespace { // Effectively, these are static and not visible outside the compilation unit. -v_array new_gamma = v_init(); -v_array old_gamma = v_init(); +// TODO: Make these non global as it makes this code non threadsafe +v_array new_gamma; +v_array old_gamma; } // namespace // Returns an estimate of the part of the variational bound that @@ -731,10 +720,10 @@ float lda_loop(lda &l, v_array &Elogtheta, float *v, example *ec, float) for (size_t k = 0; k < l.topics; k++) new_gamma[k] = new_gamma[k] * v[k] + l.lda_alpha; } while (average_diff(*l.all, old_gamma.begin(), new_gamma.begin()) > l.lda_epsilon); - ec->pred.scalars.clear(); - ec->pred.scalars.resize(l.topics); - memcpy(ec->pred.scalars.begin(), new_gamma.begin(), l.topics * sizeof(float)); - ec->pred.scalars.end() = ec->pred.scalars.begin() + l.topics; + ec->pred.scalars().clear(); + ec->pred.scalars().resize(l.topics); + memcpy(ec->pred.scalars().begin(), new_gamma.begin(), l.topics * sizeof(float)); + ec->pred.scalars().end() = ec->pred.scalars().begin() + l.topics; score += theta_kl(l, Elogtheta, new_gamma.begin()); @@ -846,7 +835,7 @@ void save_load(lda &l, io_buf &model_file, bool read, bool text) void return_example(vw &all, example &ec) { all.sd->update(ec.test_only, true, ec.loss, ec.weight, ec.num_features); - for (int f : all.final_prediction_sink) MWT::print_scalars(f, ec.pred.scalars, ec.tag); + for (int f : all.final_prediction_sink) MWT::print_scalars(f, ec.pred.scalars(), ec.tag); if (all.sd->weighted_examples() >= all.sd->dump_interval && !all.quiet) all.sd->print_update( @@ -866,12 +855,12 @@ void learn_batch(lda &l) // do in this case, we just return. 
for (size_t d = 0; d < l.examples.size(); d++) { - l.examples[d]->pred.scalars.clear(); - l.examples[d]->pred.scalars.resize(l.topics); - memset(l.examples[d]->pred.scalars.begin(), 0, l.topics * sizeof(float)); - l.examples[d]->pred.scalars.end() = l.examples[d]->pred.scalars.begin() + l.topics; + l.examples[d]->pred.scalars().clear(); + l.examples[d]->pred.scalars().resize(l.topics); + memset(l.examples[d]->pred.scalars().begin(), 0, l.topics * sizeof(float)); + l.examples[d]->pred.scalars().end() = l.examples[d]->pred.scalars().begin() + l.topics; - l.examples[d]->pred.scalars.clear(); + l.examples[d]->pred.scalars().clear(); return_example(*l.all, *l.examples[d]); } l.examples.clear(); @@ -994,6 +983,18 @@ void learn(lda &l, LEARNER::single_learner &, example &ec) uint32_t num_ex = (uint32_t)l.examples.size(); l.examples.push_back(&ec); l.doc_lengths.push_back(0); + + // The contract of a reduction is that prediction and label must be valid on the way in and out. + // In the LDA batch, examples are cleared and so it breaks this contract. Copying them here only + // for the final example allows us to support that. This is not great either and should be revisited. + polylabel pl; + polyprediction pp; + if (num_ex + 1 == l.minibatch) + { + pl = ec.l; + pp = ec.pred; + } + for (features &fs : ec) { for (features::iterator &f : fs) @@ -1003,8 +1004,12 @@ void learn(lda &l, LEARNER::single_learner &, example &ec) l.doc_lengths[num_ex] += (int)f.value(); } } - if (++num_ex == l.minibatch) + if (num_ex + 1 == l.minibatch) + { learn_batch(l); + ec.l = std::move(pl); + ec.pred = std::move(pp); + } } void learn_with_metrics(lda &l, LEARNER::single_learner &base, example &ec) @@ -1315,7 +1320,6 @@ LEARNER::base_learner *lda_setup(options_i &options, vw &all) return nullptr; all.lda = (uint32_t)ld->topics; - all.delete_prediction = delete_scalars; ld->sorted_features = std::vector(); ld->total_lambda_init = false; ld->all = &all; @@ -1361,6 +1365,6 @@ LEARNER::base_learner *lda_setup(options_i &options, vw &all) l.set_finish_example(finish_example); l.set_end_examples(end_examples); l.set_end_pass(end_pass); - + l.label_type = label_type_t::empty; return make_base(l); } diff --git a/vowpalwabbit/learner.cc b/vowpalwabbit/learner.cc index f1b5e9a25a2..d7f3e97c66a 100644 --- a/vowpalwabbit/learner.cc +++ b/vowpalwabbit/learner.cc @@ -8,28 +8,6 @@ #include "parse_regressor.h" #include "parse_dispatch_loop.h" - -#define CASE(type) \ - case type: \ - return #type; - -const char* to_string(prediction_type_t prediction_type) -{ - switch (prediction_type) - { - CASE(prediction_type_t::scalar) - CASE(prediction_type_t::scalars) - CASE(prediction_type_t::action_scores) - CASE(prediction_type_t::action_probs) - CASE(prediction_type_t::multiclass) - CASE(prediction_type_t::multilabels) - CASE(prediction_type_t::prob) - CASE(prediction_type_t::multiclassprobs) - default: - return ""; - } -} - namespace LEARNER { void learn_ex(example& ec, vw& all) @@ -72,7 +50,7 @@ inline bool example_is_newline_not_header(example& ec, vw& all) { // If we are using CCB, test against CCB implementation otherwise fallback to previous behavior. 
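// The lda_core.cc change above snapshots the final example's label and prediction
// before learn_batch() clears the minibatch, then restores them so the example leaves
// learn() with valid contents. The pattern, sketched with the polylabel and
// polyprediction value types this diff introduces (batch body elided):
#include <utility>
template <typename RunBatch>
void learn_last_in_batch(example& ec, RunBatch run_batch)
{
  polylabel saved_label = ec.l;     // deep copy via copy assignment
  polyprediction saved_pred = ec.pred;
  run_batch();                      // may clear or overwrite ec.l and ec.pred
  ec.l = std::move(saved_label);    // restore: the reduction contract holds on exit
  ec.pred = std::move(saved_pred);
}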
bool is_header = false; - if (all.label_type == label_type_t::ccb) + if (all.get_label_type() == label_type_t::conditional_contextual_bandit) { is_header = CCB::ec_is_example_header(ec); } @@ -168,7 +146,7 @@ class multi_example_handler bool complete_multi_ex(example* ec) { auto& master = _context.get_master(); - const bool is_test_ec = master.p->lp.test_label(&ec->l); + const bool is_test_ec = master.p->lp.test_label(ec->l); const bool is_newline = (example_is_newline_not_header(*ec, master) && is_test_ec); if (!is_newline) { diff --git a/vowpalwabbit/learner.h b/vowpalwabbit/learner.h index 7959f5b25b0..29c6492c8e8 100644 --- a/vowpalwabbit/learner.h +++ b/vowpalwabbit/learner.h @@ -8,24 +8,10 @@ #include "multiclass.h" #include "simple_label.h" #include "parser.h" +#include "prediction.h" #include "future_compat.h" -#include - -enum class prediction_type_t -{ - scalar, - scalars, - action_scores, - action_probs, - multiclass, - multilabels, - prob, - multiclassprobs, - decision_probs -}; - -const char* to_string(prediction_type_t prediction_type); +#include namespace LEARNER { @@ -56,8 +42,8 @@ inline func_data tuple_dbf(void* data, base_learner* base, void (*func)(void*)) struct learn_data { using fn = void (*)(void* data, base_learner& base, void* ex); - using multi_fn = void (*)(void* data, base_learner& base, void* ex, size_t count, size_t step, polyprediction* pred, - bool finalize_predictions); + using multi_fn = void (*)(void* data, base_learner& base, void* ex, size_t count, size_t step, + polyprediction* pred, bool finalize_predictions); void* data; base_learner* base; @@ -128,6 +114,52 @@ inline void decrement_offset(multi_ex& ec_seq, const size_t increment, const siz } } +template +void check_prediction_state(T& example_obj, prediction_type_t pred_type) = delete; + +template <> +inline void check_prediction_state(example& example_obj, prediction_type_t pred_type) +{ + // The compiler sees these as unused as the only place they are used in an assert statement. + _UNUSED(pred_type); + _UNUSED(example_obj); + assert(example_obj.pred.get_type() == pred_type); +} + +template <> +inline void check_prediction_state(multi_ex& example_obj, prediction_type_t pred_type) +{ + _UNUSED(pred_type); + _UNUSED(example_obj); + if (example_obj.size() > 0) + { + assert(example_obj[0]->pred.get_type() == pred_type); + } +} + +template +void check_label_state(T& example_obj, label_type_t label_type) = delete; + +template <> +inline void check_label_state(example& example_obj, label_type_t label_type) +{ + // The compiler sees these as unused as the only place they are used in an assert statement. + _UNUSED(label_type); + _UNUSED(example_obj); + assert(example_obj.l.get_type() == label_type); +} + +template <> +inline void check_label_state(multi_ex& example_obj, label_type_t label_type) +{ + _UNUSED(label_type); + _UNUSED(example_obj); + if (example_obj.size() > 0) + { + assert(example_obj[0]->l.get_type() == label_type); + } +} + template struct learner { @@ -145,6 +177,7 @@ struct learner learner(){}; // Should only be able to construct a learner through init_learner function public: prediction_type_t pred_type; + label_type_t label_type; size_t weights; // this stores the number of "weight vectors" required by the learner. size_t increment; bool is_multiline; // Is this a single-line or multi-line reduction? 
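// check_prediction_state and check_label_state above assert the reduction contract in
// debug builds: an example must enter and leave learn()/predict() carrying the label
// and prediction types the learner declared. The shape of the check, shown with
// stand-in tagged types (the real helpers compile away under NDEBUG):
#include <cassert>
enum class demo_kind { unset, scalar, multiclass };
struct demo_example { demo_kind label_kind = demo_kind::unset; demo_kind pred_kind = demo_kind::unset; };
inline void check_state(const demo_example& ec, demo_kind expected_label, demo_kind expected_pred)
{
  assert(ec.label_kind == expected_label);  // fires if a reduction left the wrong label type
  assert(ec.pred_kind == expected_pred);    // fires if a reduction left the wrong prediction type
}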
@@ -157,18 +190,30 @@ struct learner { assert((is_multiline && std::is_same::value) || (!is_multiline && std::is_same::value)); // sanity check under debug compile + check_prediction_state(ec, pred_type); + check_label_state(ec, label_type); + increment_offset(ec, increment, i); learn_fd.learn_f(learn_fd.data, *learn_fd.base, (void*)&ec); decrement_offset(ec, increment, i); + + check_prediction_state(ec, pred_type); + check_label_state(ec, label_type); } inline void predict(E& ec, size_t i = 0) { assert((is_multiline && std::is_same::value) || (!is_multiline && std::is_same::value)); // sanity check under debug compile + check_prediction_state(ec, pred_type); + check_label_state(ec, label_type); + increment_offset(ec, increment, i); learn_fd.predict_f(learn_fd.data, *learn_fd.base, (void*)&ec); decrement_offset(ec, increment, i); + + check_prediction_state(ec, pred_type); + check_label_state(ec, label_type); } inline void multipredict(E& ec, size_t lo, size_t count, polyprediction* pred, bool finalize_predictions) @@ -184,7 +229,7 @@ struct learner if (finalize_predictions) pred[c] = ec.pred; // TODO: this breaks for complex labels because = doesn't do deep copy! else - pred[c].scalar = ec.partial_prediction; + pred[c].scalar() = ec.partial_prediction; // pred[c].scalar = finalize_prediction ec.partial_prediction; // TODO: this breaks for complex labels because = // doesn't do deep copy! // note works if ec.partial_prediction, but only if finalize_prediction is run???? increment_offset(ec, increment, 1); @@ -307,8 +352,8 @@ struct learner } template - static learner& init_learner(T* dat, L* base, void (*learn)(T&, L&, E&), void (*predict)(T&, L&, E&), size_t ws, - prediction_type_t pred_type) + static learner& init_learner( + T* dat, L* base, void (*learn)(T&, L&, E&), void (*predict)(T&, L&, E&), size_t ws, prediction_type_t pred_type) { learner& ret = calloc_or_throw >(); @@ -355,6 +400,7 @@ struct learner ret.learn_fd.predict_f = (learn_data::fn)predict; ret.learn_fd.multipredict_f = nullptr; ret.pred_type = pred_type; + ret.label_type = label_type_t::unset; ret.is_multiline = std::is_same::value; return ret; @@ -376,8 +422,8 @@ template learner& init_learner( free_ptr& dat, void (*learn)(T&, L&, E&), void (*predict)(T&, L&, E&), size_t params_per_weight) { - auto ret = - &learner::init_learner(dat.get(), (L*)nullptr, learn, predict, params_per_weight, prediction_type_t::scalar); + auto ret = &learner::init_learner( + dat.get(), (L*)nullptr, learn, predict, params_per_weight, prediction_type_t::scalar); dat.release(); return *ret; @@ -431,8 +477,7 @@ learner& init_learner(L* base, void (*learn)(T&, L&, E&), void (*predict)( // multiclass reduction template learner& init_multiclass_learner(free_ptr& dat, L* base, void (*learn)(T&, L&, E&), - void (*predict)(T&, L&, E&), parser* p, size_t ws, - prediction_type_t pred_type = prediction_type_t::multiclass) + void (*predict)(T&, L&, E&), parser* p, size_t ws, prediction_type_t pred_type = prediction_type_t::multiclass) { learner& l = learner::init_learner(dat.get(), base, learn, predict, ws, pred_type); @@ -444,8 +489,7 @@ learner& init_multiclass_learner(free_ptr& dat, L* base, void (*learn)( template learner& init_cost_sensitive_learner(free_ptr& dat, L* base, void (*learn)(T&, L&, E&), - void (*predict)(T&, L&, E&), parser* p, size_t ws, - prediction_type_t pred_type = prediction_type_t::multiclass) + void (*predict)(T&, L&, E&), parser* p, size_t ws, prediction_type_t pred_type = prediction_type_t::multiclass) { learner& l = 
learner::init_learner(dat.get(), base, learn, predict, ws, pred_type); dat.release(); diff --git a/vowpalwabbit/log_multi.cc b/vowpalwabbit/log_multi.cc index 6d5e63d10fd..dee18b127cf 100644 --- a/vowpalwabbit/log_multi.cc +++ b/vowpalwabbit/log_multi.cc @@ -47,7 +47,7 @@ class node_pred } }; -typedef struct +struct node { // everyone has uint32_t parent; // the parent node @@ -68,7 +68,7 @@ typedef struct // leaf has uint32_t max_count; // the number of samples of the most common label uint32_t max_count_label; // the most common label -} node; +}; struct log_multi { @@ -83,13 +83,6 @@ struct log_multi uint32_t swap_resist; uint32_t nbofswaps; - - ~log_multi() - { - // save_node_stats(b); - for (auto& node : nodes) node.preds.delete_v(); - nodes.delete_v(); - } }; inline void init_leaf(node& n) @@ -112,7 +105,6 @@ inline node init_node() node.parent = 0; node.min_count = 0; - node.preds = v_init(); init_leaf(node); return node; @@ -251,13 +243,13 @@ void train_node( log_multi& b, single_learner& base, example& ec, uint32_t& current, uint32_t& class_index, uint32_t /* depth */) { if (b.nodes[current].norm_Eh > b.nodes[current].preds[class_index].norm_Ehk) - ec.l.simple.label = -1.f; + ec.l.simple().label = -1.f; else - ec.l.simple.label = 1.f; + ec.l.simple().label = 1.f; base.learn(ec, b.nodes[current].base_predictor); // depth - ec.l.simple.label = FLT_MAX; + ec.l.simple().label = FLT_MAX; base.predict(ec, b.nodes[current].base_predictor); // depth b.nodes[current].Eh += (double)ec.partial_prediction; @@ -302,47 +294,58 @@ inline uint32_t descend(node& n, float prediction) void predict(log_multi& b, single_learner& base, example& ec) { - MULTICLASS::label_t mc = ec.l.multi; + MULTICLASS::label_t mc = ec.l.multi(); + + ec.l.reset(); + ec.l.init_as_simple(FLT_MAX, 0.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(); - ec.l.simple = {FLT_MAX, 0.f, 0.f}; uint32_t cn = 0; uint32_t depth = 0; while (b.nodes[cn].internal) { base.predict(ec, b.nodes[cn].base_predictor); // depth - cn = descend(b.nodes[cn], ec.pred.scalar); + cn = descend(b.nodes[cn], ec.pred.scalar()); depth++; } - ec.pred.multiclass = b.nodes[cn].max_count_label; - ec.l.multi = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = b.nodes[cn].max_count_label; + ec.l.reset(); + ec.l.init_as_multi() = mc; } void learn(log_multi& b, single_learner& base, example& ec) { // verify_min_dfs(b, b.nodes[0]); - if (ec.l.multi.label == (uint32_t)-1 || b.progress) + if (ec.l.multi().label == (uint32_t)-1 || b.progress) predict(b, base, ec); - if (ec.l.multi.label != (uint32_t)-1) // if training the tree + if (ec.l.multi().label != (uint32_t)-1) // if training the tree { - MULTICLASS::label_t mc = ec.l.multi; - uint32_t start_pred = ec.pred.multiclass; + MULTICLASS::label_t mc = ec.l.multi(); + uint32_t start_pred = ec.pred.multiclass(); uint32_t class_index = 0; - ec.l.simple = {FLT_MAX, 0.f, 0.f}; + ec.l.reset(); + ec.l.init_as_simple(FLT_MAX, 0.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(); uint32_t cn = 0; uint32_t depth = 0; while (children(b, cn, class_index, mc.label)) { train_node(b, base, ec, cn, class_index, depth); - cn = descend(b.nodes[cn], ec.pred.scalar); + cn = descend(b.nodes[cn], ec.pred.scalar()); depth++; } b.nodes[cn].min_count++; update_min_count(b, cn); - ec.pred.multiclass = start_pred; - ec.l.multi = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = start_pred; + ec.l.reset(); + ec.l.init_as_multi() = mc; } } @@ -520,6 +523,6 @@ base_learner* log_multi_setup(options_i& options, vw& all) // 
learner setup learner& l = init_multiclass_learner( data, as_singleline(setup_base(options, all)), learn, predict, all.p, data->max_predictors); l.set_save_load(save_load_tree); - + l.label_type = label_type_t::multi; return make_base(l); } diff --git a/vowpalwabbit/lrq.cc b/vowpalwabbit/lrq.cc index ab9b617f891..9f3a7d0185f 100644 --- a/vowpalwabbit/lrq.cc +++ b/vowpalwabbit/lrq.cc @@ -40,8 +40,7 @@ inline float cheesyrand(uint64_t x) return merand48(seed); } - -constexpr inline bool example_is_test(example& ec) { return ec.l.simple.label == FLT_MAX; } +inline bool example_is_test(example& ec) { return ec.l.simple().label == FLT_MAX; } void reset_seed(LRQstate& lrq) { @@ -140,13 +139,13 @@ void predict_or_learn(LRQstate& lrq, single_learner& base, example& ec) // Restore example if (iter == 0) { - first_prediction = ec.pred.scalar; + first_prediction = ec.pred.scalar(); first_loss = ec.loss; first_uncertainty = ec.confidence; } else { - ec.pred.scalar = first_prediction; + ec.pred.scalar() = first_prediction; ec.loss = first_loss; ec.confidence = first_uncertainty; } @@ -213,7 +212,7 @@ base_learner* lrq_setup(options_i& options, vw& all) learner& l = init_learner( lrq, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn, 1 + maxk); l.set_end_pass(reset_seed); - + l.label_type = label_type_t::simple; // TODO: leaks memory ? return make_base(l); } diff --git a/vowpalwabbit/lrqfa.cc b/vowpalwabbit/lrqfa.cc index 3df6e8ac2d6..af29a8dbd90 100644 --- a/vowpalwabbit/lrqfa.cc +++ b/vowpalwabbit/lrqfa.cc @@ -26,7 +26,7 @@ inline float cheesyrand(uint64_t x) return merand48(seed); } -constexpr inline bool example_is_test(example& ec) { return ec.l.simple.label == FLT_MAX; } +inline bool example_is_test(example& ec) { return ec.l.simple().label == FLT_MAX; } template void predict_or_learn(LRQFAstate& lrq, single_learner& base, example& ec) @@ -109,12 +109,12 @@ void predict_or_learn(LRQFAstate& lrq, single_learner& base, example& ec) // Restore example if (iter == 0) { - first_prediction = ec.pred.scalar; + first_prediction = ec.pred.scalar(); first_loss = ec.loss; } else { - ec.pred.scalar = first_prediction; + ec.pred.scalar() = first_prediction; ec.loss = first_loss; } @@ -158,6 +158,6 @@ LEARNER::base_learner* lrqfa_setup(options_i& options, vw& all) all.wpp = all.wpp * (uint64_t)(1 + lrq->k); learner& l = init_learner(lrq, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn, 1 + lrq->field_name.size() * lrq->k); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/marginal.cc b/vowpalwabbit/marginal.cc index c0e7550548c..a0f7c5702fa 100644 --- a/vowpalwabbit/marginal.cc +++ b/vowpalwabbit/marginal.cc @@ -56,7 +56,7 @@ template void make_marginal(data& sm, example& ec) { uint64_t mask = sm.all->weights.mask(); - float label = ec.l.simple.label; + float label = ec.l.simple().label; vw& all = *sm.all; sm.alg_loss = 0.; sm.net_weight = 0.; @@ -132,7 +132,7 @@ void compute_expert_loss(data& sm, example& ec) { vw& all = *sm.all; // add in the feature-based expert and normalize, - float label = ec.l.simple.label; + float label = ec.l.simple().label; if (sm.net_weight + sm.net_feature_weight > 0.) 
sm.average_pred += sm.net_feature_weight * sm.feature_pred; @@ -143,7 +143,7 @@ void compute_expert_loss(data& sm, example& ec) } float inv_weight = 1.0f / (sm.net_weight + sm.net_feature_weight); sm.average_pred *= inv_weight; - ec.pred.scalar = sm.average_pred; + ec.pred.scalar() = sm.average_pred; ec.partial_prediction = sm.average_pred; if (is_learn) @@ -157,7 +157,7 @@ void update_marginal(data& sm, example& ec) { vw& all = *sm.all; uint64_t mask = sm.all->weights.mask(); - float label = ec.l.simple.label; + float label = ec.l.simple().label; float weight = ec.weight; if (sm.unweighted_marginals) weight = 1.; @@ -189,7 +189,7 @@ void update_marginal(data& sm, example& ec) e.second.weight = get_adanormalhedge_weights(e.second.regret, e.second.abs_regret); } - m.first = m.first * (1. - sm.decay) + ec.l.simple.label * weight; + m.first = m.first * (1. - sm.decay) + ec.l.simple().label * weight; m.second = m.second * (1. - sm.decay) + weight; } } @@ -203,7 +203,7 @@ void predict_or_learn(data& sm, LEARNER::single_learner& base, example& ec) if (sm.update_before_learn) { base.predict(ec); - float pred = ec.pred.scalar; + float pred = ec.pred.scalar(); if (sm.compete) { sm.feature_pred = pred; @@ -213,14 +213,14 @@ void predict_or_learn(data& sm, LEARNER::single_learner& base, example& ec) update_marginal(sm, ec); // update features before learning. make_marginal(sm, ec); base.learn(ec); - ec.pred.scalar = pred; + ec.pred.scalar() = pred; } else { base.learn(ec); if (sm.compete) { - sm.feature_pred = ec.pred.scalar; + sm.feature_pred = ec.pred.scalar(); compute_expert_loss(sm, ec); } update_marginal(sm, ec); @@ -228,7 +228,7 @@ void predict_or_learn(data& sm, LEARNER::single_learner& base, example& ec) else { base.predict(ec); - float pred = ec.pred.scalar; + float pred = ec.pred.scalar(); if (sm.compete) { sm.feature_pred = pred; @@ -381,6 +381,6 @@ LEARNER::base_learner* marginal_setup(options_i& options, vw& all) LEARNER::learner& ret = init_learner(d, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn); ret.set_save_load(save_load); - + ret.label_type = label_type_t::simple; return make_base(ret); } diff --git a/vowpalwabbit/memory_tree.cc b/vowpalwabbit/memory_tree.cc index 5438800d666..2ae53a60d8b 100644 --- a/vowpalwabbit/memory_tree.cc +++ b/vowpalwabbit/memory_tree.cc @@ -44,24 +44,16 @@ void remove_at_index(v_array& array, uint32_t index) return; } -void copy_example_data(example* dst, example* src, bool oas = false) // copy example data. +void copy_example_data(example* dst, example* src) { - if (oas == false) - { - dst->l = src->l; - dst->l.multi.label = src->l.multi.label; - } - else - { - dst->l.multilabels.label_v.delete_v(); - copy_array(dst->l.multilabels.label_v, src->l.multilabels.label_v); - } + dst->l = src->l; + dst->pred = src->pred; VW::copy_example_data(false, dst, src); } inline void free_example(example* ec) { - VW::dealloc_example(nullptr, *ec); + ec->~example(); free(ec); } @@ -72,7 +64,7 @@ void diag_kronecker_prod_fs_test( features& f1, features& f2, features& prod_f, float& total_sum_feat_sq, float norm_sq1, float norm_sq2) { // originally called delete_v, but that doesn't seem right. 
Clearing instead - //prod_f.~features(); + // prod_f.~features(); prod_f.clear(); if (f2.indicies.size() == 0) return; @@ -102,11 +94,11 @@ int cmpfunc(const void* a, const void* b) { return *(char*)a - *(char*)b; } -void diag_kronecker_product_test(example& ec1, example& ec2, example& ec, bool oas = false) +void diag_kronecker_product_test(example& ec1, example& ec2, example& ec) { // copy_example_data(&ec, &ec1, oas); //no_feat false, oas: true - VW::dealloc_example(nullptr, ec, nullptr); // clear ec - copy_example_data(&ec, &ec1, oas); + // VW::dealloc_example(nullptr, ec, nullptr); // clear ec + copy_example_data(&ec, &ec1); ec.total_sum_feat_sq = 0.0; // sort namespaces. pass indices array into sort...template (leave this to the end) @@ -167,7 +159,6 @@ struct node right = 0; nl = 0.001; // initilze to 1, as we need to do nl/nr. nr = 0.001; - examples_index = v_init<uint32_t>(); } }; @@ -219,8 +210,6 @@ struct memory_tree memory_tree() { - nodes = v_init<node>(); - examples = v_init<example*>(); alpha = 0.5; routers_used = 0; iter = 0; @@ -235,10 +224,6 @@ struct memory_tree ~memory_tree() { - for (auto& node : nodes) node.examples_index.delete_v(); - nodes.delete_v(); - for (auto ex : examples) free_example(ex); - examples.delete_v(); if (kprod_ec) free_example(kprod_ec); } @@ -273,12 +258,28 @@ float linear_kernel(const flat_example* fec1, const flat_example* fec2) float normalized_linear_prod(memory_tree& b, example* ec1, example* ec2) { + + auto l1 = std::move(ec1->l); + auto l2 = std::move(ec2->l); + ec1->l.reset(); + ec1->l.init_as_simple(); + ec2->l.reset(); + ec2->l.init_as_simple(); + flat_example* fec1 = flatten_sort_example(*b.all, ec1); flat_example* fec2 = flatten_sort_example(*b.all, ec2); float norm_sqrt = std::pow(fec1->total_sum_feat_sq * fec2->total_sum_feat_sq, 0.5f); float linear_prod = linear_kernel(fec1, fec2); - // fec1->fs.delete_v(); - // fec2->fs.delete_v(); + + // This function can be called with ec1 and ec2 pointing to the same thing. In this case, only restore ec1. + ec1->l.reset(); + ec1->l = std::move(l1); + if (ec1 != ec2) + { + ec2->l.reset(); + ec2->l = std::move(l2); + } + free_flatten_example(fec1); free_flatten_example(fec2); return linear_prod / norm_sqrt; @@ -308,7 +309,7 @@ void init_tree(memory_tree& b) b.total_num_queries = 0; b.max_routers = b.max_nodes; - std::cout << "tree initiazliation is done...." << std::endl + std::cout << "tree initialization is done..." << std::endl << "max nodes " << b.max_nodes << std::endl << "tree size: " << b.nodes.size() << std::endl << "max number of unique labels: " << b.max_num_labels << std::endl @@ -390,42 +391,47 @@ float train_node(memory_tree& b, single_learner& base, example& ec, const uint64 MULTICLASS::label_t mc; uint32_t save_multi_pred = 0; MULTILABEL::labels multilabels; MULTILABEL::labels preds; if (b.oas == false) { - mc = ec.l.multi; - save_multi_pred = ec.pred.multiclass; + mc = ec.l.multi(); + save_multi_pred = ec.pred.multiclass(); } else { - multilabels = ec.l.multilabels; - preds = ec.pred.multilabels; + multilabels = std::move(ec.l.multilabels()); + preds = std::move(ec.pred.multilabels()); } - ec.l.simple = {1.f, 1.f, 0.}; + ec.l.reset(); + ec.l.init_as_simple() = {1.f, 1.f, 0.}; + ec.pred.reset(); + ec.pred.init_as_scalar(); base.predict(ec, b.nodes[cn].base_router); - float prediction = ec.pred.scalar; + float prediction = ec.pred.scalar(); // float imp_weight = 1.f; //no importance weight. float weighted_value = (float)((1. - b.alpha) * log(b.nodes[cn].nl / (b.nodes[cn].nr + 1e-1)) / log(2.) + b.alpha * prediction); float route_label = weighted_value < 0.f ?
-1.f : 1.f; - // ec.l.simple = {route_label, imp_weight, 0.f}; + // ec.l.simple() = {route_label, imp_weight, 0.f}; float ec_input_weight = ec.weight; ec.weight = 1.f; - ec.l.simple = {route_label, 1., 0.f}; + ec.l.simple() = {route_label, 1., 0.f}; base.learn(ec, b.nodes[cn].base_router); // update the router according to the new example. base.predict(ec, b.nodes[cn].base_router); - float save_binary_scalar = ec.pred.scalar; + float save_binary_scalar = ec.pred.scalar(); + ec.l.reset(); + ec.pred.reset(); if (b.oas == false) { - ec.l.multi = mc; - ec.pred.multiclass = save_multi_pred; + ec.l.init_as_multi() = mc; + ec.pred.init_as_multiclass() = save_multi_pred; } else { - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.init_as_multilabels() = std::move(preds); + ec.l.init_as_multilabels() = std::move(multilabels); } ec.weight = ec_input_weight; @@ -467,51 +473,60 @@ void split_leaf(memory_tree& b, single_learner& base, const uint64_t cn) for (size_t ec_id = 0; ec_id < b.nodes[cn].examples_index.size(); ec_id++) // scan all examples stored in the cn { uint32_t ec_pos = b.nodes[cn].examples_index[ec_id]; + auto& current_ex = *b.examples[ec_pos]; MULTICLASS::label_t mc; uint32_t save_multi_pred = 0; MULTILABEL::labels multilabels; MULTILABEL::labels preds; if (b.oas == false) { - mc = b.examples[ec_pos]->l.multi; - save_multi_pred = b.examples[ec_pos]->pred.multiclass; + mc = current_ex.l.multi(); + save_multi_pred = current_ex.pred.multiclass(); } else { - multilabels = b.examples[ec_pos]->l.multilabels; - preds = b.examples[ec_pos]->pred.multilabels; + multilabels = std::move(current_ex.l.multilabels()); + preds = std::move(current_ex.pred.multilabels()); } - b.examples[ec_pos]->l.simple = {1.f, 1.f, 0.f}; - base.predict(*b.examples[ec_pos], b.nodes[cn].base_router); // re-predict - float scalar = b.examples[ec_pos]->pred.scalar; // this is spliting the leaf. - if (scalar < 0) + current_ex.l.reset(); + current_ex.l.init_as_simple() = {1.f, 1.f, 0.f}; + current_ex.pred.reset(); + current_ex.pred.init_as_scalar(); + base.predict(current_ex, b.nodes[cn].base_router); // re-predict + float scalar = current_ex.pred.scalar(); // this is splitting the leaf. + + current_ex.l.reset(); + current_ex.pred.reset(); + if (b.oas == false) { - b.nodes[left_child].examples_index.push_back(ec_pos); - float leaf_pred = train_node(b, base, *b.examples[ec_pos], left_child); - insert_descent(b.nodes[left_child], leaf_pred); // fake descent, only for update nl and nr + current_ex.l.init_as_multi() = mc; + current_ex.pred.init_as_multiclass() = save_multi_pred; } else { - b.nodes[right_child].examples_index.push_back(ec_pos); - float leaf_pred = train_node(b, base, *b.examples[ec_pos], right_child); - insert_descent(b.nodes[right_child], leaf_pred); // fake descent.
for update nr and nl + current_ex.pred.init_as_multilabels() = preds; + current_ex.l.init_as_multilabels() = multilabels; } - if (b.oas == false) + if (scalar < 0) { - b.examples[ec_pos]->l.multi = mc; - b.examples[ec_pos]->pred.multiclass = save_multi_pred; + b.nodes[left_child].examples_index.push_back(ec_pos); + float leaf_pred = train_node(b, base, current_ex, left_child); + insert_descent(b.nodes[left_child], leaf_pred); // fake descent, only for update nl and nr } else { - b.examples[ec_pos]->pred.multilabels = preds; - b.examples[ec_pos]->l.multilabels = multilabels; + b.nodes[right_child].examples_index.push_back(ec_pos); + float leaf_pred = train_node(b, base, current_ex, right_child); + insert_descent(b.nodes[right_child], leaf_pred); // fake descent. for update nr and nl } } - b.nodes[cn].examples_index.delete_v(); // empty the cn's example list - b.nodes[cn].nl = std::max(double(b.nodes[left_child].examples_index.size()), 0.001); // avoid to set nl to zero - b.nodes[cn].nr = std::max(double(b.nodes[right_child].examples_index.size()), 0.001); // avoid to set nr to zero + b.nodes[cn].examples_index.clear(); // empty the cn's example list + b.nodes[cn].nl = + std::max(static_cast<double>(b.nodes[left_child].examples_index.size()), 0.001); // avoid to set nl to zero + b.nodes[cn].nr = + std::max(static_cast<double>(b.nodes[right_child].examples_index.size()), 0.001); // avoid to set nr to zero if (std::max(b.nodes[cn].nl, b.nodes[cn].nr) > b.max_ex_in_leaf) { @@ -565,9 +580,9 @@ void collect_labels_from_leaf(memory_tree& b, const uint64_t cn, v_array<uint32_t>& leaf_labs) - for (uint32_t lab : b.examples[loc]->l.multilabels.label_v) + for (uint32_t lab : b.examples[loc]->l.multilabels().label_v) { // scan through each label: - if (v_array_contains(leaf_labs, lab) == false) + if (std::find(leaf_labs.cbegin(), leaf_labs.cend(), lab) == leaf_labs.cend()) leaf_labs.push_back(lab); } } @@ -575,42 +590,42 @@ void train_one_against_some_at_leaf(memory_tree& b, single_learner& base, const { // training one-against-some classifiers at a leaf using the examples stored in the leaf - v_array<uint32_t> leaf_labs = v_init<uint32_t>(); + v_array<uint32_t> leaf_labs; collect_labels_from_leaf(b, cn, leaf_labs); // unique labels from the leaf. - MULTILABEL::labels multilabels = ec.l.multilabels; - MULTILABEL::labels preds = ec.pred.multilabels; - ec.l.simple = {FLT_MAX, 1.f, 0.f}; + MULTILABEL::labels& multilabels = ec.l.multilabels(); + MULTILABEL::labels& preds = ec.pred.multilabels(); + ec.l.simple() = {FLT_MAX, 1.f, 0.f}; for (size_t i = 0; i < leaf_labs.size(); i++) { - ec.l.simple.label = -1.f; - if (v_array_contains(multilabels.label_v, leaf_labs[i])) - ec.l.simple.label = 1.f; + ec.l.simple().label = -1.f; + if (std::find(multilabels.label_v.cbegin(), multilabels.label_v.cend(), leaf_labs[i]) != multilabels.label_v.cend()) + ec.l.simple().label = 1.f; base.learn(ec, b.max_routers + 1 + leaf_labs[i]); } - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.multilabels() = preds; + ec.l.multilabels() = multilabels; } inline uint32_t compute_hamming_loss_via_oas( memory_tree& b, single_learner& base, const uint64_t cn, example& ec, v_array<uint32_t>& selected_labs) { - selected_labs.delete_v(); - v_array<uint32_t> leaf_labs = v_init<uint32_t>(); + selected_labs.clear(); + v_array<uint32_t> leaf_labs; collect_labels_from_leaf(b, cn, leaf_labs); // unique labels stored in the leaf.
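One porting pitfall worth flagging in the hunks above: v_array_contains(xs, x) being true corresponds to std::find being *unequal* to cend(), so the condition silently inverts if == is written by mistake. A standalone check of the equivalence (std::vector standing in for v_array):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<uint32_t> labels{1, 3, 5};
  // contains(labels, x)  <=>  std::find(...) != labels.cend()
  assert(std::find(labels.cbegin(), labels.cend(), 3u) != labels.cend());  // present
  assert(std::find(labels.cbegin(), labels.cend(), 4u) == labels.cend());  // absent
  return 0;
}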
- MULTILABEL::labels multilabels = ec.l.multilabels; - MULTILABEL::labels preds = ec.pred.multilabels; - ec.l.simple = {FLT_MAX, 1.f, 0.f}; + MULTILABEL::labels& multilabels = ec.l.multilabels(); + MULTILABEL::labels& preds = ec.pred.multilabels(); + ec.l.simple() = {FLT_MAX, 1.f, 0.f}; for (size_t i = 0; i < leaf_labs.size(); i++) { base.predict(ec, b.max_routers + 1 + leaf_labs[i]); - float score = ec.pred.scalar; + float score = ec.pred.scalar(); if (score > 0) selected_labs.push_back(leaf_labs[i]); } - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.multilabels() = preds; + ec.l.multilabels() = multilabels; - return hamming_loss(ec.l.multilabels.label_v, selected_labs); + return hamming_loss(ec.l.multilabels().label_v, selected_labs); } // pick up the "closest" example in the leaf using the score function. @@ -630,8 +645,11 @@ int64_t pick_nearest(memory_tree& b, single_learner& base, const uint64_t cn, ex if (b.learn_at_leaf == true && b.current_pass >= 1) { float tmp_s = normalized_linear_prod(b, &ec, b.examples[loc]); - diag_kronecker_product_test(ec, *b.examples[loc], *b.kprod_ec, b.oas); - b.kprod_ec->l.simple = {FLT_MAX, 0., tmp_s}; + diag_kronecker_product_test(ec, *b.examples[loc], *b.kprod_ec); + b.kprod_ec->l.reset(); + b.kprod_ec->l.init_as_simple() = {FLT_MAX, 0., tmp_s}; + b.kprod_ec->pred.reset(); + b.kprod_ec->pred.init_as_scalar(); base.predict(*b.kprod_ec, b.max_routers); score = b.kprod_ec->partial_prediction; } @@ -653,15 +671,15 @@ int64_t pick_nearest(memory_tree& b, single_learner& base, const uint64_t cn, ex // for any two examples, use number of overlap labels to indicate the similarity between these two examples. float get_overlap_from_two_examples(example& ec1, example& ec2) { - return (float)over_lap(ec1.l.multilabels.label_v, ec2.l.multilabels.label_v); + return (float)over_lap(ec1.l.multilabels().label_v, ec2.l.multilabels().label_v); } // we use F1 score as the reward signal float F1_score_for_two_examples(example& ec1, example& ec2) { float num_overlaps = get_overlap_from_two_examples(ec1, ec2); - float v1 = (float)(num_overlaps / (1e-7 + ec1.l.multilabels.label_v.size() * 1.)); - float v2 = (float)(num_overlaps / (1e-7 + ec2.l.multilabels.label_v.size() * 1.)); + float v1 = (float)(num_overlaps / (1e-7 + ec1.l.multilabels().label_v.size() * 1.)); + float v2 = (float)(num_overlaps / (1e-7 + ec2.l.multilabels().label_v.size() * 1.)); if (num_overlaps == 0.f) return 0.f; else @@ -677,33 +695,39 @@ void predict(memory_tree& b, single_learner& base, example& ec) MULTILABEL::labels preds; if (b.oas == false) { - mc = ec.l.multi; - save_multi_pred = ec.pred.multiclass; + mc = ec.l.multi(); + save_multi_pred = ec.pred.multiclass(); } else { - multilabels = ec.l.multilabels; - preds = ec.pred.multilabels; + multilabels = std::move(ec.l.multilabels()); + preds = std::move(ec.pred.multilabels()); } uint64_t cn = 0; - ec.l.simple = {-1.f, 1.f, 0.}; + ec.l.reset(); + ec.l.init_as_simple() = {-1.f, 1.f, 0.}; + ec.pred.reset(); + ec.pred.init_as_scalar(); while (b.nodes[cn].internal == 1) { // if it's internal{ base.predict(ec, b.nodes[cn].base_router); - uint64_t newcn = ec.pred.scalar < 0 ? b.nodes[cn].left : b.nodes[cn].right; // do not need to increment nl and nr. + uint64_t newcn = + ec.pred.scalar() < 0 ? b.nodes[cn].left : b.nodes[cn].right; // do not need to increment nl and nr. 
cn = newcn; } + ec.l.reset(); + ec.pred.reset(); if (b.oas == false) { - ec.l.multi = mc; - ec.pred.multiclass = save_multi_pred; + ec.l.init_as_multi() = mc; + ec.pred.init_as_multiclass() = save_multi_pred; } else { - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.init_as_multilabels() = std::move(preds); + ec.l.init_as_multilabels() = std::move(multilabels); } int64_t closest_ec = 0; @@ -711,11 +735,11 @@ void predict(memory_tree& b, single_learner& base, example& ec) { closest_ec = pick_nearest(b, base, cn, ec); if (closest_ec != -1) - ec.pred.multiclass = b.examples[closest_ec]->l.multi.label; + ec.pred.multiclass() = b.examples[closest_ec]->l.multi().label; else - ec.pred.multiclass = 0; + ec.pred.multiclass() = 0; - if (ec.l.multi.label != ec.pred.multiclass) + if (ec.l.multi().label != ec.pred.multiclass()) { ec.loss = ec.weight; b.num_mistakes++; @@ -730,7 +754,7 @@ void predict(memory_tree& b, single_learner& base, example& ec) reward = F1_score_for_two_examples(ec, *b.examples[closest_ec]); b.F1_score += reward; } - v_array selected_labs = v_init(); + v_array selected_labs; ec.loss = (float)compute_hamming_loss_via_oas(b, base, cn, ec, selected_labs); b.hamming_loss += ec.loss; } @@ -745,31 +769,35 @@ float return_reward_from_node(memory_tree& b, single_learner& base, uint64_t cn, MULTILABEL::labels preds; if (b.oas == false) { - mc = ec.l.multi; - save_multi_pred = ec.pred.multiclass; + mc = ec.l.multi(); + save_multi_pred = ec.pred.multiclass(); } else { - multilabels = ec.l.multilabels; - preds = ec.pred.multilabels; + multilabels = std::move(ec.l.multilabels()); + preds = std::move(ec.pred.multilabels()); } - ec.l.simple = {FLT_MAX, 1., 0.0}; + ec.l.reset(); + ec.l.init_as_simple() = {FLT_MAX, 1., 0.0}; + ec.pred.reset(); + ec.pred.init_as_scalar(); while (b.nodes[cn].internal != -1) { base.predict(ec, b.nodes[cn].base_router); - float prediction = ec.pred.scalar; + float prediction = ec.pred.scalar(); cn = prediction < 0 ? b.nodes[cn].left : b.nodes[cn].right; } - + ec.l.reset(); + ec.pred.reset(); if (b.oas == false) { - ec.l.multi = mc; - ec.pred.multiclass = save_multi_pred; + ec.l.init_as_multi() = mc; + ec.pred.init_as_multiclass() = save_multi_pred; } else { - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.init_as_multilabels() = preds; + ec.l.init_as_multilabels() = multilabels; } // get to leaf now: @@ -778,7 +806,7 @@ float return_reward_from_node(memory_tree& b, single_learner& base, uint64_t cn, closest_ec = pick_nearest(b, base, cn, ec); // no randomness for picking example. 
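The predict path above follows the same choreography as train_node and split_leaf: move the expensive multiclass/multilabel state out of the example, re-initialize the label and prediction as simple/scalar for the base reductions, then move the original state back. A compact sketch of that round trip, using std::variant as an assumed stand-in for polylabel (C++17; not VW's real type):

#include <cstdint>
#include <utility>
#include <variant>
#include <vector>

// monostate ~ unset, float ~ simple label, vector ~ multilabels.
using toy_label = std::variant<std::monostate, float, std::vector<uint32_t>>;

void base_predict(toy_label& l) { std::get<float>(l) = 0.25f; }  // pretend base learner

int main()
{
  toy_label l = std::vector<uint32_t>{2, 7};

  // 1. save: move the heap-backed member out so nothing is copied
  auto saved = std::move(std::get<std::vector<uint32_t>>(l));

  // 2. re-type as "simple" while the base reduction runs
  l = 0.f;
  base_predict(l);

  // 3. restore the original type and contents
  l = std::move(saved);
  return 0;
}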
if (b.oas == false) { - if ((closest_ec != -1) && (b.examples[closest_ec]->l.multi.label == ec.l.multi.label)) + if ((closest_ec != -1) && (b.examples[closest_ec]->l.multi().label == ec.l.multi().label)) reward = 1.f; } else @@ -791,8 +819,11 @@ float return_reward_from_node(memory_tree& b, single_learner& base, uint64_t cn, if (b.learn_at_leaf == true && closest_ec != -1) { float score = normalized_linear_prod(b, &ec, b.examples[closest_ec]); - diag_kronecker_product_test(ec, *b.examples[closest_ec], *b.kprod_ec, b.oas); - b.kprod_ec->l.simple = {reward, 1.f, -score}; + diag_kronecker_product_test(ec, *b.examples[closest_ec], *b.kprod_ec); + b.kprod_ec->l.reset(); + b.kprod_ec->l.init_as_simple() = {reward, 1.f, -score}; + b.kprod_ec->pred.reset(); + b.kprod_ec->pred.init_as_scalar(); b.kprod_ec->weight = weight; base.learn(*b.kprod_ec, b.max_routers); } @@ -816,11 +847,14 @@ void learn_at_leaf_random( } if (ec_id != -1) { - if (b.examples[ec_id]->l.multi.label == ec.l.multi.label) + if (b.examples[ec_id]->l.multi().label == ec.l.multi().label) reward = 1.f; float score = normalized_linear_prod(b, &ec, b.examples[ec_id]); - diag_kronecker_product_test(ec, *b.examples[ec_id], *b.kprod_ec, b.oas); - b.kprod_ec->l.simple = {reward, 1.f, -score}; + diag_kronecker_product_test(ec, *b.examples[ec_id], *b.kprod_ec); + b.kprod_ec->l.reset(); + b.kprod_ec->l.init_as_simple() = {reward, 1.f, -score}; + b.kprod_ec->pred.reset(); + b.kprod_ec->pred.init_as_scalar(); b.kprod_ec->weight = weight; //* b.nodes[leaf_id].examples_index.size(); base.learn(*b.kprod_ec, b.max_routers); } @@ -838,22 +872,25 @@ void route_to_leaf(memory_tree& b, single_learner& base, const uint32_t& ec_arra MULTILABEL::labels preds; if (b.oas == false) { - mc = ec.l.multi; - save_multi_pred = ec.pred.multiclass; + mc = ec.l.multi(); + save_multi_pred = ec.pred.multiclass(); } else { - multilabels = ec.l.multilabels; - preds = ec.pred.multilabels; + multilabels = std::move(ec.l.multilabels()); + preds = std::move(ec.pred.multilabels()); } path.clear(); - ec.l.simple = {FLT_MAX, 1.0, 0.0}; + ec.l.reset(); + ec.l.init_as_simple() = {FLT_MAX, 1.0, 0.0}; + ec.pred.reset(); + ec.pred.init_as_scalar(); while (b.nodes[cn].internal != -1) { path.push_back(cn); // path stores node id from the root to the leaf base.predict(ec, b.nodes[cn].base_router); - float prediction = ec.pred.scalar; + float prediction = ec.pred.scalar(); if (insertion == false) cn = prediction < 0 ? b.nodes[cn].left : b.nodes[cn].right; else @@ -861,15 +898,17 @@ void route_to_leaf(memory_tree& b, single_learner& base, const uint32_t& ec_arra } path.push_back(cn); // push back the leaf + ec.l.reset(); + ec.pred.reset(); if (b.oas == false) { - ec.l.multi = mc; - ec.pred.multiclass = save_multi_pred; + ec.l.init_as_multi() = mc; + ec.pred.init_as_multiclass() = save_multi_pred; } else { - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.init_as_multilabels() = std::move(preds); + ec.l.init_as_multilabels() = std::move(multilabels); } // std::cout<<"at route to leaf: "< path_to_leaf = v_init(); + v_array path_to_leaf; route_to_leaf(b, base, ec_array_index, 0, path_to_leaf, false); // no insertion happens here. 
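For the oas branch, the reward above comes from F1_score_for_two_examples; its return statement falls outside the hunk, but from the v1/v2 terms it is presumably the harmonic mean 2*v1*v2/(v1+v2). Restated as a self-contained function (sorted label vectors assumed, std::vector standing in for v_array):

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

float f1_reward(const std::vector<uint32_t>& a, const std::vector<uint32_t>& b)
{
  // overlap = size of the intersection of the two sorted label sets
  std::vector<uint32_t> common;
  std::set_intersection(a.cbegin(), a.cend(), b.cbegin(), b.cend(), std::back_inserter(common));
  const float num_overlaps = static_cast<float>(common.size());
  if (num_overlaps == 0.f) return 0.f;
  const float v1 = num_overlaps / (1e-7f + a.size());  // precision-like term
  const float v2 = num_overlaps / (1e-7f + b.size());  // recall-like term
  return 2.f * v1 * v2 / (v1 + v2);                    // harmonic mean, in [0, 1]
}

int main() { return f1_reward({1, 2}, {2, 3}) > 0.f ? 0 : 1; }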
if (path_to_leaf.size() > 1) @@ -915,14 +954,18 @@ void single_query_and_learn(memory_tree& b, single_learner& base, const uint32_t float ec_input_weight = ec.weight; MULTICLASS::label_t mc; + uint32_t save_multi_pred = 0; MULTILABEL::labels multilabels; MULTILABEL::labels preds; if (b.oas == false) - mc = ec.l.multi; + { + mc = ec.l.multi(); + save_multi_pred = ec.pred.multiclass(); + } else { - multilabels = ec.l.multilabels; - preds = ec.pred.multilabels; + multilabels = std::move(ec.l.multilabels()); + preds = std::move(ec.pred.multilabels()); } ec.weight = fabs(objective); @@ -930,15 +973,23 @@ void single_query_and_learn(memory_tree& b, single_learner& base, const uint32_t ec.weight = 100.f; else if (ec.weight < .01f) ec.weight = 0.01f; - ec.l.simple = {objective < 0. ? -1.f : 1.f, 1.f, 0.}; + ec.l.reset(); + ec.l.init_as_simple() = {objective < 0. ? -1.f : 1.f, 1.f, 0.}; + ec.pred.reset(); + ec.pred.init_as_scalar(); base.learn(ec, b.nodes[cn].base_router); + ec.l.reset(); + ec.pred.reset(); if (b.oas == false) - ec.l.multi = mc; + { + ec.l.init_as_multi() = mc; + ec.pred.init_as_multiclass() = save_multi_pred; + } else { - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.init_as_multilabels() = std::move(preds); + ec.l.init_as_multilabels() = std::move(multilabels); } ec.weight = ec_input_weight; // restore the original weight } @@ -953,7 +1004,6 @@ void single_query_and_learn(memory_tree& b, single_learner& base, const uint32_t train_one_against_some_at_leaf(b, base, cn, ec); } } - path_to_leaf.delete_v(); } // using reward signals @@ -1009,9 +1059,8 @@ void experience_replay(memory_tree& b, single_learner& base) { if (b.dream_at_update == false) { - v_array tmp_path = v_init(); + v_array tmp_path; route_to_leaf(b, base, ec_id, 0, tmp_path, true); - tmp_path.delete_v(); } else { @@ -1045,7 +1094,7 @@ void learn(memory_tree& b, single_learner& base, example& ec) if (b.current_pass < 1) { // in the first pass, we need to store the memory: example* new_ec = &calloc_or_throw(); - copy_example_data(new_ec, &ec, b.oas); + copy_example_data(new_ec, &ec); b.examples.push_back(new_ec); if (b.online == true) update_rew(b, base, (uint32_t)(b.examples.size() - 1), *b.examples[b.examples.size() - 1]); // query and learn @@ -1097,18 +1146,18 @@ void save_load_example(example* ec, io_buf& model_file, bool& read, bool& text, writeit(ec->ft_offset, "ft_offset"); if (oas == false) { // multi-class - writeit(ec->l.multi.label, "multiclass_label"); - writeit(ec->l.multi.weight, "multiclass_weight"); + writeit(ec->l.multi().label, "multiclass_label"); + writeit(ec->l.multi().weight, "multiclass_weight"); } else { // multi-label - writeitvar(ec->l.multilabels.label_v.size(), "label_size", label_size); + writeitvar(ec->l.multilabels().label_v.size(), "label_size", label_size); if (read) { - ec->l.multilabels.label_v.clear(); - for (uint32_t i = 0; i < label_size; i++) ec->l.multilabels.label_v.push_back(0); + ec->l.multilabels().label_v.clear(); + for (uint32_t i = 0; i < label_size; i++) ec->l.multilabels().label_v.push_back(0); } - for (uint32_t i = 0; i < label_size; i++) writeit(ec->l.multilabels.label_v[i], "ec_label"); + for (uint32_t i = 0; i < label_size; i++) writeit(ec->l.multilabels().label_v[i], "ec_label"); } writeitvar(ec->tag.size(), "tags", tag_number); @@ -1123,7 +1172,7 @@ void save_load_example(example* ec, io_buf& model_file, bool& read, bool& text, writeitvar(ec->indices.size(), "namespaces", namespace_size); if (read) { - ec->indices.delete_v(); + 
ec->indices.clear(); for (uint32_t i = 0; i < namespace_size; i++) { ec->indices.push_back('\0'); @@ -1139,8 +1188,6 @@ void save_load_example(example* ec, io_buf& model_file, bool& read, bool& text, if (read) { fs->clear(); - fs->values = v_init(); - fs->indicies = v_init(); for (uint32_t f_i = 0; f_i < feat_size; f_i++) { fs->push_back(0, 0); @@ -1249,7 +1296,7 @@ base_learner* memory_tree_setup(options_i& options, vw& all) .help("number of dream operations per example (default = 1)")) .add(make_option("top_K", tree->top_K).default_value(1).help("top K prediction error (default 1)")) .add(make_option("learn_at_leaf", tree->learn_at_leaf).help("whether or not learn at leaf (defualt = True)")) - .add(make_option("oas", tree->oas).help("use oas at the leaf")) + .add(make_option("oas", tree->oas).help("use oas (one against some) at the leaf")) .add(make_option("dream_at_update", tree->dream_at_update) .default_value(0) .help("turn on dream operations at reward based update as well")) @@ -1289,6 +1336,7 @@ base_learner* memory_tree_setup(options_i& options, vw& all) // srand(time(0)); l.set_save_load(save_load_memory_tree); l.set_end_pass(end_pass); + l.label_type = label_type_t::multi; return make_base(l); } // multi-label classification @@ -1299,16 +1347,13 @@ base_learner* memory_tree_setup(options_i& options, vw& all) tree, as_singleline(setup_base(options, all)), learn, predict, num_learners, prediction_type_t::multilabels); // all.p->lp = MULTILABEL::multilabel; - // all.label_type = label_type_t::multi; - // all.delete_prediction = MULTILABEL::multilabel.delete_label; // srand(time(0)); l.set_end_pass(end_pass); l.set_save_load(save_load_memory_tree); // l.set_end_pass(end_pass); all.p->lp = MULTILABEL::multilabel; - all.label_type = label_type_t::multi; - all.delete_prediction = MULTILABEL::multilabel.delete_label; + l.label_type = label_type_t::multi; return make_base(l); } diff --git a/vowpalwabbit/mf.cc b/vowpalwabbit/mf.cc index 4d84de7e3a5..5fb2bdc34dc 100644 --- a/vowpalwabbit/mf.cc +++ b/vowpalwabbit/mf.cc @@ -37,13 +37,6 @@ struct mf features temp_features; vw* all; // for pairs? 
and finalize - - ~mf() - { - // clean up local v_arrays - indices.delete_v(); - sub_predictions.delete_v(); - } }; template @@ -105,18 +98,18 @@ void predict(mf& data, single_learner& base, example& ec) // finalize prediction ec.partial_prediction = prediction; - ec.pred.scalar = GD::finalize_prediction(data.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(data.all->sd, ec.partial_prediction); } void learn(mf& data, single_learner& base, example& ec) { // predict with current weights predict(data, base, ec); - float predicted = ec.pred.scalar; + float predicted = ec.pred.scalar(); // update linear weights base.update(ec); - ec.pred.scalar = ec.updated_prediction; + ec.pred.scalar() = ec.updated_prediction; // store namespace indices copy_array(data.indices, ec.indices); @@ -138,7 +131,7 @@ void learn(mf& data, single_learner& base, example& ec) ec.indices[0] = left_ns; // store feature values in left namespace - data.temp_features.deep_copy_from(ec.feature_space[left_ns]); + data.temp_features = ec.feature_space[left_ns]; for (size_t k = 1; k <= data.rank; k++) { @@ -150,19 +143,19 @@ void learn(mf& data, single_learner& base, example& ec) base.update(ec, k); // restore left namespace features (undoing multiply) - fs.deep_copy_from(data.temp_features); + fs = data.temp_features; // compute new l_k * x_l scaling factors // base.predict(ec, k); // data.sub_predictions[2*k-1] = ec.partial_prediction; - // ec.pred.scalar = ec.updated_prediction; + // ec.pred.scalar() = ec.updated_prediction; } // set example to right namespace only ec.indices[0] = right_ns; // store feature values for right namespace - data.temp_features.deep_copy_from(ec.feature_space[right_ns]); + data.temp_features = ec.feature_space[right_ns]; for (size_t k = 1; k <= data.rank; k++) { @@ -172,18 +165,18 @@ void learn(mf& data, single_learner& base, example& ec) // update r^k using base learner base.update(ec, k + data.rank); - ec.pred.scalar = ec.updated_prediction; + ec.pred.scalar() = ec.updated_prediction; // restore right namespace features - fs.deep_copy_from(data.temp_features); + fs = data.temp_features; } } } // restore namespace indices - copy_array(ec.indices, data.indices); + ec.indices = data.indices; // restore original prediction - ec.pred.scalar = predicted; + ec.pred.scalar() = predicted; } void finish(mf& o) @@ -210,8 +203,11 @@ base_learner* mf_setup(options_i& options, vw& all) all.random_positive_weights = true; + auto base = as_singleline(setup_base(options, all)); learner& l = - init_learner(data, as_singleline(setup_base(options, all)), learn, predict, 2 * data->rank + 1); + init_learner(data, base, learn, predict, 2 * data->rank + 1); l.set_finish(finish); + l.label_type = base->label_type; + return make_base(l); } diff --git a/vowpalwabbit/multiclass.cc b/vowpalwabbit/multiclass.cc index d31d4ddc931..6f8e6c45e1e 100644 --- a/vowpalwabbit/multiclass.cc +++ b/vowpalwabbit/multiclass.cc @@ -11,20 +11,20 @@ namespace MULTICLASS { -char* bufread_label(label_t* ld, char* c) +char* bufread_label(label_t& ld, char* c) { - memcpy(&ld->label, c, sizeof(ld->label)); - c += sizeof(ld->label); - memcpy(&ld->weight, c, sizeof(ld->weight)); - c += sizeof(ld->weight); + memcpy(&ld.label, c, sizeof(ld.label)); + c += sizeof(ld.label); + memcpy(&ld.weight, c, sizeof(ld.weight)); + c += sizeof(ld.weight); return c; } -size_t read_cached_label(shared_data*, void* v, io_buf& cache) +size_t read_cached_label(shared_data*, polylabel& v, io_buf& cache) { - label_t* ld = (label_t*)v; + auto& ld 
= v.multi(); char* c; - size_t total = sizeof(ld->label) + sizeof(ld->weight); + size_t total = sizeof(ld.label) + sizeof(ld.weight); if (cache.buf_read(c, total) < total) return 0; bufread_label(ld, c); @@ -32,75 +32,77 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) return total; } -float weight(void* v) +float weight(polylabel& v) { - label_t* ld = (label_t*)v; - return (ld->weight > 0) ? ld->weight : 0.f; + auto& ld = v.multi(); + return (ld.weight > 0) ? ld.weight : 0.f; } -char* bufcache_label(label_t* ld, char* c) +char* bufcache_label(label_t& ld, char* c) { - memcpy(c, &ld->label, sizeof(ld->label)); - c += sizeof(ld->label); - memcpy(c, &ld->weight, sizeof(ld->weight)); - c += sizeof(ld->weight); + memcpy(c, &ld.label, sizeof(ld.label)); + c += sizeof(ld.label); + memcpy(c, &ld.weight, sizeof(ld.weight)); + c += sizeof(ld.weight); return c; } -void cache_label(void* v, io_buf& cache) +void cache_label(polylabel& v, io_buf& cache) { char* c; - label_t* ld = (label_t*)v; - cache.buf_write(c, sizeof(ld->label) + sizeof(ld->weight)); + auto& ld = v.multi(); + cache.buf_write(c, sizeof(ld.label) + sizeof(ld.weight)); bufcache_label(ld, c); } -void default_label(void* v) +void default_label(polylabel& v) { - label_t* ld = (label_t*)v; - ld->label = (uint32_t)-1; - ld->weight = 1.; + if (v.get_type() != label_type_t::multi) + { + v.reset(); + v.init_as_multi(); + } + auto& ld = v.multi(); + ld.label = (uint32_t)-1; + ld.weight = 1.; } -bool test_label(void* v) +bool test_label(polylabel& v) { - label_t* ld = (label_t*)v; - return ld->label == (uint32_t)-1; + auto& ld = v.multi(); + return ld.label == (uint32_t)-1; } -void delete_label(void*) {} - -void parse_label(parser*, shared_data* sd, void* v, v_array& words) +void parse_label(parser*, shared_data* sd, polylabel& v, v_array& words) { - label_t* ld = (label_t*)v; + auto& ld = v.multi(); switch (words.size()) { case 0: break; case 1: - ld->label = sd->ldict ? (uint32_t)sd->ldict->get(words[0]) : int_of_string(words[0]); - ld->weight = 1.0; + ld.label = sd->ldict ? (uint32_t)sd->ldict->get(words[0]) : int_of_string(words[0]); + ld.weight = 1.0; break; case 2: - ld->label = sd->ldict ? (uint32_t)sd->ldict->get(words[0]) : int_of_string(words[0]); - ld->weight = float_of_string(words[1]); + ld.label = sd->ldict ? (uint32_t)sd->ldict->get(words[0]) : int_of_string(words[0]); + ld.weight = float_of_string(words[1]); break; default: std::cerr << "malformed example!\n"; std::cerr << "words.size() = " << words.size() << std::endl; } - if (ld->label == 0) + if (ld.label == 0) THROW("label 0 is not allowed for multiclass. Valid labels are {1,k}" << (sd->ldict ? "\nthis likely happened because you specified an invalid label with named labels" : "")); } -label_parser mc_label = {default_label, parse_label, cache_label, read_cached_label, delete_label, weight, nullptr, - test_label, sizeof(label_t)}; +label_parser mc_label = {default_label, parse_label, cache_label, read_cached_label, polylabel_delete_label, weight, polylabel_copy_label, test_label, sizeof(label_t)}; void print_label_pred(vw& all, example& ec, uint32_t prediction) { - VW::string_view sv_label = all.sd->ldict->get(ec.l.multi.label); + VW::string_view sv_label = all.sd->ldict->get(ec.l.multi().label); VW::string_view sv_pred = all.sd->ldict->get(prediction); all.sd->print_update(all.holdout_set_off, all.current_pass, sv_label.empty() ? 
"unknown" : sv_label.to_string(), @@ -112,10 +114,10 @@ void print_probability(vw& all, example& ec, uint32_t prediction) { std::stringstream pred_ss; pred_ss << prediction << "(" << std::setw(2) << std::setprecision(0) << std::fixed - << 100 * ec.pred.scalars[prediction - 1] << "%)"; + << 100 * ec.pred.scalars()[prediction - 1] << "%)"; std::stringstream label_ss; - label_ss << ec.l.multi.label; + label_ss << ec.l.multi().label; all.sd->print_update(all.holdout_set_off, all.current_pass, label_ss.str(), pred_ss.str(), ec.num_features, all.progress_add, all.progress_arg); @@ -127,7 +129,7 @@ void print_score(vw& all, example& ec, uint32_t prediction) pred_ss << prediction; std::stringstream label_ss; - label_ss << ec.l.multi.label; + label_ss << ec.l.multi().label; all.sd->print_update(all.holdout_set_off, all.current_pass, label_ss.str(), pred_ss.str(), ec.num_features, all.progress_add, all.progress_arg); @@ -135,7 +137,7 @@ void print_score(vw& all, example& ec, uint32_t prediction) void direct_print_update(vw& all, example& ec, uint32_t prediction) { - all.sd->print_update(all.holdout_set_off, all.current_pass, ec.l.multi.label, prediction, ec.num_features, + all.sd->print_update(all.holdout_set_off, all.current_pass, ec.l.multi().label, prediction, ec.num_features, all.progress_add, all.progress_arg); } @@ -147,7 +149,7 @@ void print_update(vw& all, example& ec, uint32_t prediction) if (!all.sd->ldict) T(all, ec, prediction); else - print_label_pred(all, ec, ec.pred.multiclass); + print_label_pred(all, ec, ec.pred.multiclass()); } } @@ -160,21 +162,21 @@ void print_update_with_score(vw& all, example& ec, uint32_t pred) { print_update void finish_example(vw& all, example& ec, bool update_loss) { float loss = 0; - if (ec.l.multi.label != (uint32_t)ec.pred.multiclass && ec.l.multi.label != (uint32_t)-1) + if (ec.l.multi().label != (uint32_t)ec.pred.multiclass() && ec.l.multi().label != (uint32_t)-1) loss = ec.weight; - all.sd->update(ec.test_only, update_loss && (ec.l.multi.label != (uint32_t)-1), loss, ec.weight, ec.num_features); + all.sd->update(ec.test_only, update_loss && (ec.l.multi().label != (uint32_t)-1), loss, ec.weight, ec.num_features); for (int sink : all.final_prediction_sink) if (!all.sd->ldict) - all.print_by_ref(sink, (float)ec.pred.multiclass, 0, ec.tag); + all.print_by_ref(sink, (float)ec.pred.multiclass(), 0, ec.tag); else { - VW::string_view sv_pred = all.sd->ldict->get(ec.pred.multiclass); + VW::string_view sv_pred = all.sd->ldict->get(ec.pred.multiclass()); all.print_text_by_ref(sink, sv_pred.to_string(), ec.tag); } - MULTICLASS::print_update(all, ec, ec.pred.multiclass); + MULTICLASS::print_update(all, ec, ec.pred.multiclass()); VW::finish_example(all, ec); } } // namespace MULTICLASS diff --git a/vowpalwabbit/multilabel.cc b/vowpalwabbit/multilabel.cc index e573458a3d6..214af61d536 100644 --- a/vowpalwabbit/multilabel.cc +++ b/vowpalwabbit/multilabel.cc @@ -8,10 +8,10 @@ namespace MULTILABEL { -char* bufread_label(labels* ld, char* c, io_buf& cache) +char* bufread_label(labels& ld, char* c, io_buf& cache) { size_t num = *(size_t*)c; - ld->label_v.clear(); + ld.label_v.clear(); c += sizeof(size_t); size_t total = sizeof(uint32_t) * num; if (cache.buf_read(c, (int)total) < total) @@ -23,16 +23,16 @@ char* bufread_label(labels* ld, char* c, io_buf& cache) { uint32_t temp = *(uint32_t*)c; c += sizeof(uint32_t); - ld->label_v.push_back(temp); + ld.label_v.push_back(temp); } return c; } -size_t read_cached_label(shared_data*, void* v, io_buf& cache) +size_t 
read_cached_label(shared_data*, polylabel& v, io_buf& cache) { - labels* ld = (labels*)v; - ld->label_v.clear(); + auto& ld = v.multilabels(); + ld.label_v.clear(); char* c; size_t total = sizeof(size_t); if (cache.buf_read(c, (int)total) < total) @@ -42,62 +42,50 @@ size_t read_cached_label(shared_data*, void* v, io_buf& cache) return total; } -float weight(void*) { return 1.; } +float weight(polylabel&) { return 1.; } -char* bufcache_label(labels* ld, char* c) +char* bufcache_label(labels& ld, char* c) { - *(size_t*)c = ld->label_v.size(); + *(size_t*)c = ld.label_v.size(); c += sizeof(size_t); - for (unsigned int i = 0; i < ld->label_v.size(); i++) + for (unsigned int i = 0; i < ld.label_v.size(); i++) { - *(uint32_t*)c = ld->label_v[i]; + *(uint32_t*)c = ld.label_v[i]; c += sizeof(uint32_t); } return c; } -void cache_label(void* v, io_buf& cache) +void cache_label(polylabel& v, io_buf& cache) { char* c; - labels* ld = (labels*)v; - cache.buf_write(c, sizeof(size_t) + sizeof(uint32_t) * ld->label_v.size()); + auto& ld = v.multilabels(); + cache.buf_write(c, sizeof(size_t) + sizeof(uint32_t) * ld.label_v.size()); bufcache_label(ld, c); } -void default_label(void* v) +void default_label(polylabel& v) { - labels* ld = (labels*)v; - ld->label_v.clear(); -} - -bool test_label(void* v) -{ - labels* ld = (labels*)v; - return ld->label_v.size() == 0; -} - -void delete_label(void* v) -{ - labels* ld = (labels*)v; - if (ld) - ld->label_v.delete_v(); + if (v.get_type() != label_type_t::multilabels) + { + v.reset(); + v.init_as_multilabels(); + } + auto& ld = v.multilabels(); + ld.label_v.clear(); } -void copy_label(void* dst, void* src) +bool test_label(polylabel& v) { - if (dst && src) - { - labels* ldD = (labels*)dst; - labels* ldS = (labels*)src; - copy_array(ldD->label_v, ldS->label_v); - } + auto& ld = v.multilabels(); + return ld.label_v.size() == 0; } -void parse_label(parser* p, shared_data*, void* v, v_array& words) +void parse_label(parser* p, shared_data*, polylabel& v, v_array& words) { - labels* ld = (labels*)v; + auto& ld = v.multilabels(); - ld->label_v.clear(); + ld.label_v.clear(); switch (words.size()) { case 0: @@ -108,7 +96,7 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor for (const auto & parse_name : p->parse_name) { uint32_t n = int_of_string(parse_name); - ld->label_v.push_back(n); + ld.label_v.push_back(n); } break; default: @@ -118,8 +106,7 @@ void parse_label(parser* p, shared_data*, void* v, v_array& wor } } -label_parser multilabel = {default_label, parse_label, cache_label, read_cached_label, delete_label, weight, copy_label, - test_label, sizeof(labels)}; +label_parser multilabel = {default_label, parse_label, cache_label, read_cached_label, polylabel_delete_label, weight, polylabel_copy_label, test_label, sizeof(labels)}; void print_update(vw& all, bool is_test, example& ec) { @@ -129,11 +116,11 @@ void print_update(vw& all, bool is_test, example& ec) if (is_test) label_string << " unknown"; else - for (size_t i = 0; i < ec.l.multilabels.label_v.size(); i++) label_string << " " << ec.l.multilabels.label_v[i]; + for (size_t i = 0; i < ec.l.multilabels().label_v.size(); i++) label_string << " " << ec.l.multilabels().label_v[i]; std::stringstream pred_string; - for (size_t i = 0; i < ec.pred.multilabels.label_v.size(); i++) - pred_string << " " << ec.pred.multilabels.label_v[i]; + for (size_t i = 0; i < ec.pred.multilabels().label_v.size(); i++) + pred_string << " " << ec.pred.multilabels().label_v[i]; all.sd->print_update(all.holdout_set_off, 
all.current_pass, label_string.str(), pred_string.str(), ec.num_features, all.progress_add, all.progress_arg); @@ -142,14 +129,12 @@ void print_update(vw& all, bool is_test, example& ec) void output_example(vw& all, example& ec) { - labels& ld = ec.l.multilabels; - float loss = 0.; - if (!test_label(&ld)) + if (!test_label(ec.l)) { // need to compute exact loss - labels preds = ec.pred.multilabels; - labels given = ec.l.multilabels; + labels& preds = ec.pred.multilabels(); + labels& given = ec.l.multilabels(); uint32_t preds_index = 0; uint32_t given_index = 0; @@ -176,23 +161,23 @@ void output_example(vw& all, example& ec) loss += preds.label_v.size() - preds_index; } - all.sd->update(ec.test_only, !test_label(&ld), loss, 1.f, ec.num_features); + all.sd->update(ec.test_only, !test_label(ec.l), loss, 1.f, ec.num_features); for (int sink : all.final_prediction_sink) if (sink >= 0) { std::stringstream ss; - for (size_t i = 0; i < ec.pred.multilabels.label_v.size(); i++) + for (size_t i = 0; i < ec.pred.multilabels().label_v.size(); i++) { if (i > 0) ss << ','; - ss << ec.pred.multilabels.label_v[i]; + ss << ec.pred.multilabels().label_v[i]; } ss << ' '; all.print_text_by_ref(sink, ss.str(), ec.tag); } - print_update(all, test_label(&ec.l.multilabels), ec); + print_update(all, test_label(ec.l), ec); } } // namespace MULTILABEL diff --git a/vowpalwabbit/multilabel_oaa.cc b/vowpalwabbit/multilabel_oaa.cc index 04bcafec7fc..20a8474df0b 100644 --- a/vowpalwabbit/multilabel_oaa.cc +++ b/vowpalwabbit/multilabel_oaa.cc @@ -16,35 +16,41 @@ struct multi_oaa template void predict_or_learn(multi_oaa& o, LEARNER::single_learner& base, example& ec) { - MULTILABEL::labels multilabels = ec.l.multilabels; - MULTILABEL::labels preds = ec.pred.multilabels; + MULTILABEL::labels multilabels = std::move(ec.l.multilabels()); + MULTILABEL::labels preds = std::move(ec.pred.multilabels()); preds.label_v.clear(); - ec.l.simple = {FLT_MAX, 1.f, 0.f}; + ec.l.reset(); + ec.l.init_as_simple(FLT_MAX, 1.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(); + uint32_t multilabel_index = 0; for (uint32_t i = 0; i < o.k; i++) { if (is_learn) { - ec.l.simple.label = -1.f; + ec.l.simple().label = -1.f; if (multilabels.label_v.size() > multilabel_index && multilabels.label_v[multilabel_index] == i) { - ec.l.simple.label = 1.f; + ec.l.simple().label = 1.f; multilabel_index++; } base.learn(ec, i); } else base.predict(ec, i); - if (ec.pred.scalar > 0.) + if (ec.pred.scalar() > 0.) preds.label_v.push_back(i); } if (is_learn && multilabel_index < multilabels.label_v.size()) std::cout << "label " << multilabels.label_v[multilabel_index] << " is not in {0," << o.k - 1 << "} This won't work right." 
<< std::endl; - ec.pred.multilabels = preds; - ec.l.multilabels = multilabels; + ec.pred.reset(); + ec.pred.init_as_multilabels() = std::move(preds); + ec.l.reset(); + ec.l.init_as_multilabels() = std::move(multilabels); } void finish_example(vw& all, multi_oaa&, example& ec) @@ -67,8 +73,6 @@ LEARNER::base_learner* multilabel_oaa_setup(options_i& options, vw& all) predict_or_learn, predict_or_learn, data->k, prediction_type_t::multilabels); l.set_finish_example(finish_example); all.p->lp = MULTILABEL::multilabel; - all.label_type = label_type_t::multi; - all.delete_prediction = MULTILABEL::multilabel.delete_label; - + l.label_type = label_type_t::multilabels; return make_base(l); } diff --git a/vowpalwabbit/mwt.cc b/vowpalwabbit/mwt.cc index 5f63f925332..12eed319584 100644 --- a/vowpalwabbit/mwt.cc +++ b/vowpalwabbit/mwt.cc @@ -33,13 +33,6 @@ struct mwt v_array indices; // excluded namespaces features feature_space[256]; vw* all; - - ~mwt() - { - evals.delete_v(); - policies.delete_v(); - indices.delete_v(); - } }; inline bool observed_cost(CB::cb_class* cl) @@ -79,7 +72,7 @@ void value_policy(mwt& c, float val, uint64_t index) // estimate the value of a template void predict_or_learn(mwt& c, single_learner& base, example& ec) { - c.observation = get_observed_cost(ec.l.cb); + c.observation = get_observed_cost(ec.l.cb()); if (c.observation != nullptr) { @@ -117,7 +110,11 @@ void predict_or_learn(mwt& c, single_learner& base, example& ec) } // modify the predictions to use a vector with a score for each evaluated feature. - v_array preds = ec.pred.scalars; + v_array preds = std::move(ec.pred.scalars()); + + // TODO Confirm that this type is correct + ec.pred.reset(); + ec.pred.init_as_multiclass(); if (learn) { @@ -137,10 +134,11 @@ void predict_or_learn(mwt& c, single_learner& base, example& ec) // modify the predictions to use a vector with a score for each evaluated feature. 
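Stepping back to the multilabel_oaa hunk above: the reduction visits classes 0..k-1, presents each as a +/-1 binary problem, and relies on the example's multilabel vector being sorted so a single index can track membership. A schematic of that loop with a stand-in binary learner (the learner and all names are illustrative, not VW's API):

#include <cstdint>
#include <vector>

struct toy_binary_learner
{
  std::vector<float> w;  // one score per class
  void learn(uint32_t i, float label) { w[i] += 0.1f * label; }
  float predict(uint32_t i) const { return w[i]; }
};

// truth must be sorted ascending, mirroring the multilabel_index walk above.
std::vector<uint32_t> oaa_learn_and_predict(
    toy_binary_learner& base, uint32_t k, const std::vector<uint32_t>& truth)
{
  std::vector<uint32_t> preds;
  uint32_t truth_index = 0;
  for (uint32_t i = 0; i < k; i++)
  {
    float label = -1.f;  // absent from the label set -> negative example
    if (truth_index < truth.size() && truth[truth_index] == i)
    {
      label = 1.f;  // present -> positive example
      truth_index++;
    }
    base.learn(i, label);
    if (base.predict(i) > 0.f) preds.push_back(i);  // positive score -> predicted label
  }
  return preds;
}

int main()
{
  toy_binary_learner base{std::vector<float>(5, 0.f)};
  const auto preds = oaa_learn_and_predict(base, 5, {1, 3});
  return preds.size() <= 5 ? 0 : 1;
}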
preds.clear(); if (learn) - preds.push_back((float)ec.pred.multiclass); + preds.push_back((float)ec.pred.multiclass()); for (uint64_t index : c.policies) preds.push_back((float)c.evals[index].cost / (float)c.total); - ec.pred.scalars = preds; + ec.pred.reset(); + ec.pred.init_as_scalars(std::move(preds)); } void print_scalars(int f, v_array& scalars, v_array& tag) @@ -174,17 +172,19 @@ void finish_example(vw& all, mwt& c, example& ec) float loss = 0.; if (c.learn) if (c.observation != nullptr) - loss = get_cost_estimate(c.observation, (uint32_t)ec.pred.scalars[0]); + loss = get_cost_estimate(c.observation, (uint32_t)ec.pred.scalars()[0]); all.sd->update(ec.test_only, c.observation != nullptr, loss, 1.f, ec.num_features); - for (int sink : all.final_prediction_sink) print_scalars(sink, ec.pred.scalars, ec.tag); + for (int sink : all.final_prediction_sink) print_scalars(sink, ec.pred.scalars(), ec.tag); if (c.learn) { - v_array temp = ec.pred.scalars; - ec.pred.multiclass = (uint32_t)temp[0]; + v_array temp = std::move(ec.pred.scalars()); + ec.pred.reset(); + ec.pred.init_as_multiclass() = (uint32_t)temp[0]; CB::print_update(all, c.observation != nullptr, ec, nullptr, false); - ec.pred.scalars = temp; + ec.pred.reset(); + ec.pred.init_as_scalars(std::move(temp)); } VW::finish_example(all, ec); } @@ -250,9 +250,7 @@ base_learner* mwt_setup(options_i& options, vw& all) calloc_reserve(c->evals, all.length()); c->evals.end() = c->evals.begin() + all.length(); - all.delete_prediction = delete_scalars; all.p->lp = CB::cb_label; - all.label_type = label_type_t::cb; if (c->num_classes > 0) { @@ -280,5 +278,6 @@ base_learner* mwt_setup(options_i& options, vw& all) l->set_save_load(save_load); l->set_finish_example(finish_example); + l->label_type = label_type_t::cb; return make_base(*l); } diff --git a/vowpalwabbit/mwt.h b/vowpalwabbit/mwt.h index 39738d56aef..189c0a3768d 100644 --- a/vowpalwabbit/mwt.h +++ b/vowpalwabbit/mwt.h @@ -8,6 +8,5 @@ LEARNER::base_learner* mwt_setup(VW::config::options_i& options, vw& all); namespace MWT { -void delete_scalars(void* v); void print_scalars(int f, v_array& scalars, v_array& tag); } // namespace MWT diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc index ede044b1847..ab904b2ab68 100644 --- a/vowpalwabbit/nn.cc +++ b/vowpalwabbit/nn.cc @@ -38,8 +38,8 @@ struct nn float* hidden_units; bool* dropped_out; - polyprediction* hidden_units_pred; - polyprediction* hiddenbias_pred; + std::vector hidden_units_pred; + std::vector hiddenbias_pred; vw* all; // many things std::shared_ptr _random_state; @@ -49,11 +49,6 @@ struct nn delete squared_loss; free(hidden_units); free(dropped_out); - free(hidden_units_pred); - free(hiddenbias_pred); - VW::dealloc_example(nullptr, output_layer); - VW::dealloc_example(nullptr, hiddenbias); - VW::dealloc_example(nullptr, outputweight); } }; @@ -83,8 +78,7 @@ static inline float fastpow2(float p) float clipp = (p < -126) ? 
-126.0f : p; int w = (int)clipp; float z = clipp - w + offset; - union - { + union { uint32_t i; float f; } v = {cast_uint32_t((1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z))}; @@ -100,9 +94,11 @@ void finish_setup(nn& n, vw& all) { // TODO: output_layer audit - memset(&n.output_layer, 0, sizeof(n.output_layer)); + // TODO: This memset is very dangerous especially now that example has destructor etc + memset(&n.output_layer, 0, sizeof(n.output_layer)); n.output_layer.interactions = &all.interactions; n.output_layer.indices.push_back(nn_output_namespace); + n.output_layer.pred.init_as_scalar(); uint64_t nn_index = nn_constant << all.weights.stride_shift(); features& fs = n.output_layer.feature_space[nn_output_namespace]; @@ -136,7 +132,8 @@ void finish_setup(nn& n, vw& all) n.hiddenbias.feature_space[constant_namespace].space_names.push_back( audit_strings_ptr(new audit_strings("", "HiddenBias"))); n.hiddenbias.total_sum_feat_sq++; - n.hiddenbias.l.simple.label = FLT_MAX; + n.hiddenbias.l.init_as_simple().label = FLT_MAX; + n.hiddenbias.pred.init_as_scalar(); n.hiddenbias.weight = 1; memset(&n.outputweight, 0, sizeof(n.outputweight)); n.outputweight.interactions = &all.interactions; @@ -148,7 +145,8 @@ void finish_setup(nn& n, vw& all) audit_strings_ptr(new audit_strings("", "OutputWeight"))); n.outputweight.feature_space[nn_output_namespace].values[0] = 1; n.outputweight.total_sum_feat_sq++; - n.outputweight.l.simple.label = FLT_MAX; + n.outputweight.l.init_as_simple().label = FLT_MAX; + n.outputweight.pred.init_as_scalar(); n.outputweight.weight = 1; n.finished_setup = true; @@ -163,7 +161,7 @@ void end_pass(nn& n) template void predict_or_learn_multi(nn& n, single_learner& base, example& ec) { - bool shouldOutput = n.all->raw_prediction > 0; + const bool shouldOutput = n.all->raw_prediction > 0; if (!n.finished_setup) finish_setup(n, *(n.all)); shared_data sd; @@ -171,15 +169,15 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) { sd_guard(n.all, &sd); - label_data ld = ec.l.simple; + label_data ld = ec.l.simple(); void (*save_set_minmax)(shared_data*, float) = n.all->set_minmax; float save_min_label; float save_max_label; float dropscale = n.dropout ?
2.0f : 1.0f; loss_function* save_loss = n.all->loss; - polyprediction* hidden_units = n.hidden_units_pred; - polyprediction* hiddenbias_pred = n.hiddenbias_pred; + polyprediction* hidden_units = n.hidden_units_pred.data(); + polyprediction* hiddenbias_pred = n.hiddenbias_pred.data(); bool* dropped_out = n.dropped_out; std::ostringstream outputStringStream; @@ -204,11 +202,11 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) for (unsigned int i = 0; i < n.k; ++i) // avoid saddle point at 0 - if (hiddenbias_pred[i].scalar == 0) + if (hiddenbias_pred[i].scalar() == 0) { - n.hiddenbias.l.simple.label = (float)(n._random_state->get_and_update_random() - 0.5); + n.hiddenbias.l.simple().label = (float)(n._random_state->get_and_update_random() - 0.5); base.learn(n.hiddenbias, i); - n.hiddenbias.l.simple.label = FLT_MAX; + n.hiddenbias.l.simple().label = FLT_MAX; } base.multipredict(ec, 0, n.k, hidden_units, true); @@ -218,8 +216,8 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) if (ec.passthrough) for (unsigned int i = 0; i < n.k; ++i) { - add_passthrough_feature(ec, i * 2, hiddenbias_pred[i].scalar); - add_passthrough_feature(ec, i * 2 + 1, hidden_units[i].scalar); + add_passthrough_feature(ec, i * 2, hiddenbias_pred[i].scalar()); + add_passthrough_feature(ec, i * 2 + 1, hidden_units[i].scalar()); } } @@ -228,8 +226,8 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) { if (i > 0) outputStringStream << ' '; - outputStringStream << i << ':' << hidden_units[i].scalar << ',' - << fasttanh(hidden_units[i].scalar); // TODO: huh, what was going on here? + outputStringStream << i << ':' << hidden_units[i].scalar() << ',' + << fasttanh(hidden_units[i].scalar()); // TODO: huh, what was going on here? } n.all->loss = save_loss; @@ -243,7 +241,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) float save_final_prediction = 0; float save_ec_loss = 0; -CONVERSE: // That's right, I'm using goto. So sue me. + CONVERSE: // That's right, I'm using goto. So sue me. n.output_layer.total_sum_feat_sq = 1; n.output_layer.feature_space[nn_output_namespace].sum_feat_sq = 1; @@ -259,7 +257,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) for (unsigned int i = 0; i < n.k; ++i) { - float sigmah = (dropped_out[i]) ? 0.0f : dropscale * fasttanh(hidden_units[i].scalar); + float sigmah = (dropped_out[i]) ? 
0.0f : dropscale * fasttanh(hidden_units[i].scalar()); features& out_fs = n.output_layer.feature_space[nn_output_namespace]; out_fs.values[i] = sigmah; @@ -268,15 +266,15 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) n.outputweight.feature_space[nn_output_namespace].indicies[0] = out_fs.indicies[i]; base.predict(n.outputweight, n.k); - float wf = n.outputweight.pred.scalar; + float wf = n.outputweight.pred.scalar(); // avoid saddle point at 0 if (wf == 0) { float sqrtk = std::sqrt((float)n.k); - n.outputweight.l.simple.label = (float)(n._random_state->get_and_update_random() - 0.5) / sqrtk; + n.outputweight.l.simple().label = (float)(n._random_state->get_and_update_random() - 0.5) / sqrtk; base.update(n.outputweight, n.k); - n.outputweight.l.simple.label = FLT_MAX; + n.outputweight.l.simple().label = FLT_MAX; } } @@ -300,10 +298,10 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) * ec.feature_space[] is reverted to its original value * save_nn_output_namespace contains the COPIED value * save_nn_output_namespace is destroyed - */ + */ features save_nn_output_namespace = std::move(ec.feature_space[nn_output_namespace]); auto tmp_sum_feat_sq = n.output_layer.feature_space[nn_output_namespace].sum_feat_sq; - ec.feature_space[nn_output_namespace].deep_copy_from(n.output_layer.feature_space[nn_output_namespace]); + ec.feature_space[nn_output_namespace] = n.output_layer.feature_space[nn_output_namespace]; ec.total_sum_feat_sq += tmp_sum_feat_sq; if (is_learn) @@ -364,12 +362,12 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) n.outputweight.feature_space[nn_output_namespace].indicies[0] = n.output_layer.feature_space[nn_output_namespace].indicies[i]; base.predict(n.outputweight, n.k); - float nu = n.outputweight.pred.scalar; + float nu = n.outputweight.pred.scalar(); float gradhw = 0.5f * nu * gradient * sigmahprime; - ec.l.simple.label = GD::finalize_prediction(n.all->sd, hidden_units[i].scalar - gradhw); - ec.pred.scalar = hidden_units[i].scalar; - if (ec.l.simple.label != hidden_units[i].scalar) + ec.l.simple().label = GD::finalize_prediction(n.all->sd, hidden_units[i].scalar() - gradhw); + ec.pred.scalar() = hidden_units[i].scalar(); + if (ec.l.simple().label != hidden_units[i].scalar()) base.update(ec, i); } } @@ -382,7 +380,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) } } - ec.l.simple.label = ld.label; + ec.l.simple().label = ld.label; if (!converse) { @@ -403,7 +401,7 @@ void predict_or_learn_multi(nn& n, single_learner& base, example& ec) } ec.partial_prediction = save_partial_prediction; - ec.pred.scalar = save_final_prediction; + ec.pred.scalar() = save_final_prediction; ec.loss = save_ec_loss; } n.all->set_minmax(n.all->sd, sd.min_label); @@ -422,7 +420,7 @@ void multipredict(nn& n, single_learner& base, example& ec, size_t count, size_t if (finalize_predictions) pred[c] = ec.pred; else - pred[c].scalar = ec.partial_prediction; + pred[c].scalar() = ec.partial_prediction; ec.ft_offset += (uint64_t)step; } ec.ft_offset -= (uint64_t)(step * count); @@ -432,7 +430,7 @@ void finish_example(vw& all, nn&, example& ec) { int save_raw_prediction = all.raw_prediction; all.raw_prediction = -1; - return_simple_example(all, nullptr, ec); + return_simple_example_explicit(all, ec); all.raw_prediction = save_raw_prediction; } @@ -481,8 +479,17 @@ base_learner* nn_setup(options_i& options, vw& all) n->hidden_units = calloc_or_throw(n->k); n->dropped_out = calloc_or_throw(n->k); - 
n->hidden_units_pred = calloc_or_throw(n->k); - n->hiddenbias_pred = calloc_or_throw(n->k); + n->hidden_units_pred.resize(n->k); + for (auto& pred : n->hidden_units_pred) + { + pred.init_as_scalar(); + } + n->hiddenbias_pred.resize(n->k); + for (auto& pred : n->hiddenbias_pred) + { + pred.init_as_scalar(); + } + n->output_layer.pred.init_as_scalar(); auto base = as_singleline(setup_base(options, all)); n->increment = base->increment; // Indexing of output layer is odd. @@ -493,6 +500,7 @@ base_learner* nn_setup(options_i& options, vw& all) l.set_multipredict(multipredict); l.set_finish_example(finish_example); l.set_end_pass(end_pass); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/no_label.cc b/vowpalwabbit/no_label.cc index 539a6feb756..6f7f24f89eb 100644 --- a/vowpalwabbit/no_label.cc +++ b/vowpalwabbit/no_label.cc @@ -6,6 +6,7 @@ #include #include #include +#include "vw_string_view.h" #include "cache.h" #include "accumulate.h" @@ -14,23 +15,33 @@ namespace no_label { -char* bufread_no_label(shared_data*, label_data*, char* c) { return c; } - -size_t read_cached_no_label(shared_data*, void*, io_buf&) { return 1; } - -float get_weight(void*) { return 1.; } - -char* bufcache_no_label(label_data*, char* c) { return c; } +size_t read_cached_no_label(shared_data*, polylabel& label, io_buf&) +{ + if (label.get_type() != label_type_t::empty) + { + label.reset(); + label.init_as_empty(); + } + return 1; +} -void cache_no_label(void*, io_buf&) {} +float get_weight(polylabel&) { return 1.; } -void default_no_label(void*) {} +void cache_no_label(polylabel&, io_buf&) {} -bool test_label(void*) { return false; } +// This is wasted work, ideally empty and unset should be the same thing. +void default_no_label(polylabel& label) +{ + if (label.get_type() != label_type_t::empty) + { + label.reset(); + label.init_as_empty(); + } +} -void delete_no_label(void*) {} +bool test_label(polylabel&) { return false; } -void parse_no_label(parser*, shared_data*, void*, v_array& words) +void parse_no_label(parser*, shared_data*, polylabel&, v_array& words) { switch (words.size()) { @@ -43,15 +54,15 @@ void parse_no_label(parser*, shared_data*, void*, v_array& word } } -label_parser no_label_parser = {default_no_label, parse_no_label, cache_no_label, read_cached_no_label, delete_no_label, - get_weight, nullptr, test_label, sizeof(nullptr)}; +label_parser no_label_parser = {default_no_label, parse_no_label, cache_no_label, read_cached_no_label, polylabel_delete_label, + get_weight, polylabel_copy_label, test_label, sizeof(nullptr)}; void print_no_label_update(vw& all, example& ec) { if (all.sd->weighted_labeled_examples + all.sd->weighted_unlabeled_examples >= all.sd->dump_interval && !all.quiet && !all.bfgs) { - all.sd->print_update(all.holdout_set_off, all.current_pass, 0.f, ec.pred.scalar, ec.num_features, all.progress_add, + all.sd->print_update(all.holdout_set_off, all.current_pass, 0.f, ec.pred.scalar(), ec.num_features, all.progress_add, all.progress_arg); } } @@ -64,13 +75,13 @@ void output_and_account_no_label_example(vw& all, example& ec) for (size_t i = 0; i < all.final_prediction_sink.size(); i++) { int f = (int)all.final_prediction_sink[i]; - all.print_by_ref(f, ec.pred.scalar, 0, ec.tag); + all.print_by_ref(f, ec.pred.scalar(), 0, ec.tag); } print_no_label_update(all, ec); } -void return_no_label_example(vw& all, void*, example& ec) +void return_no_label_example(vw& all, polylabel&, example& ec) {
output_and_account_example(all, ec); VW::finish_example(all, ec); diff --git a/vowpalwabbit/oaa.cc b/vowpalwabbit/oaa.cc index 65e23bb4dda..f47b1f589e9 100644 --- a/vowpalwabbit/oaa.cc +++ b/vowpalwabbit/oaa.cc @@ -9,38 +9,40 @@ #include "rand48.h" #include "vw_exception.h" #include "vw.h" +#include using namespace VW::config; struct oaa { uint64_t k; - vw* all; // for raw - polyprediction* pred; // for multipredict - uint64_t num_subsample; // for randomized subsampling, how many negatives to draw? - uint32_t* subsample_order; // for randomized subsampling, in what order should we touch classes - size_t subsample_id; // for randomized subsampling, where do we live in the list - - ~oaa() - { - free(pred); - free(subsample_order); - } + vw* all; // for raw + std::vector pred; // for multipredict + uint64_t num_subsample; // for randomized subsampling, how many negatives to draw? + std::vector subsample_order; // for randomized subsampling, in what order should we touch classes + size_t subsample_id; // for randomized subsampling, where do we live in the list }; void learn_randomized(oaa& o, LEARNER::single_learner& base, example& ec) { - MULTICLASS::label_t ld = ec.l.multi; + MULTICLASS::label_t ld = ec.l.multi(); if (ld.label == 0 || (ld.label > o.k && ld.label != (uint32_t)-1)) + { std::cout << "label " << ld.label << " is not in {1," << o.k << "} This won't work right." << std::endl; + } + + // Prepare for next reduction. + ec.pred.reset(); + ec.pred.init_as_scalar(); + ec.l.reset(); + ec.l.init_as_simple(1.f, 0.f, 0.f); // truth - ec.l.simple = {1., 0.f, 0.f}; // truth base.learn(ec, ld.label - 1); size_t prediction = ld.label; float best_partial_prediction = ec.partial_prediction; - ec.l.simple.label = -1.; + ec.l.simple().label = -1.; float weight_temp = ec.weight; ec.weight *= ((float)o.k) / (float)o.num_subsample; size_t p = o.subsample_id; @@ -61,72 +63,96 @@ void learn_randomized(oaa& o, LEARNER::single_learner& base, example& ec) } o.subsample_id = p; - ec.pred.multiclass = (uint32_t)prediction; - ec.l.multi = ld; + // Ensure the example is in the correct state upon exiting. + ec.pred.reset(); + ec.pred.init_as_multiclass(static_cast(prediction)); + ec.l.reset(); + ec.l.init_as_multi(ld); ec.weight = weight_temp; } +// Prediction type is scalars when scores is true and multiclass when scores is false. template void predict_or_learn(oaa& o, LEARNER::single_learner& base, example& ec) { - MULTICLASS::label_t mc_label_data = ec.l.multi; + MULTICLASS::label_t mc_label_data = ec.l.multi(); if (mc_label_data.label == 0 || (mc_label_data.label > o.k && mc_label_data.label != (uint32_t)-1)) + { std::cout << "label " << mc_label_data.label << " is not in {1," << o.k << "} This won't work right."
<< std::endl; + } - std::stringstream outputStringStream; - uint32_t prediction = 1; - v_array scores_array; - if (scores) - scores_array = ec.pred.scalars; + ec.l.reset(); + ec.l.init_as_simple(FLT_MAX, 0.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(); + base.multipredict(ec, 0, o.k, o.pred.data(), true); - ec.l.simple = {FLT_MAX, 0.f, 0.f}; - base.multipredict(ec, 0, o.k, o.pred, true); + uint32_t prediction = 1; for (uint32_t i = 2; i <= o.k; i++) - if (o.pred[i - 1].scalar > o.pred[prediction - 1].scalar) + { + if (o.pred[i - 1].scalar() > o.pred[prediction - 1].scalar()) + { prediction = i; + } + } if (ec.passthrough) - for (uint32_t i = 1; i <= o.k; i++) add_passthrough_feature(ec, i, o.pred[i - 1].scalar); + { + for (uint32_t i = 1; i <= o.k; i++) + { + add_passthrough_feature(ec, i, o.pred[i - 1].scalar()); + } + } if (is_learn) { for (uint32_t i = 1; i <= o.k; i++) { - ec.l.simple = {(mc_label_data.label == i) ? 1.f : -1.f, 0.f, 0.f}; - ec.pred.scalar = o.pred[i - 1].scalar; + ec.l.reset(); + ec.l.init_as_simple((mc_label_data.label == i) ? 1.f : -1.f, 0.f, 0.f); + ec.pred.reset(); + ec.pred.init_as_scalar(o.pred[i - 1].scalar()); base.update(ec, i - 1); } } if (print_all) { - outputStringStream << "1:" << o.pred[0].scalar; - for (uint32_t i = 2; i <= o.k; i++) outputStringStream << ' ' << i << ':' << o.pred[i - 1].scalar; + std::stringstream outputStringStream; + outputStringStream << "1:" << o.pred[0].scalar(); + for (uint32_t i = 2; i <= o.k; i++) + outputStringStream << ' ' << i << ':' << o.pred[i - 1].scalar(); o.all->print_text_by_ref(o.all->raw_prediction, outputStringStream.str(), ec.tag); } if (scores) { - scores_array.clear(); - for (uint32_t i = 0; i < o.k; i++) scores_array.push_back(o.pred[i].scalar); - ec.pred.scalars = scores_array; + v_array scores_array; + for (uint32_t i = 0; i < o.k; i++) scores_array.push_back(o.pred[i].scalar()); + + ec.pred.reset(); + ec.pred.init_as_scalars(std::move(scores_array)); if (probabilities) { - float sum_prob = 0; + float sum_prob = 0.f; for (uint32_t i = 0; i < o.k; i++) { - ec.pred.scalars[i] = 1.f / (1.f + correctedExp(-o.pred[i].scalar)); - sum_prob += ec.pred.scalars[i]; + ec.pred.scalars()[i] = 1.f / (1.f + correctedExp(-o.pred[i].scalar())); + sum_prob += ec.pred.scalars()[i]; } - float inv_sum_prob = 1.f / sum_prob; - for (uint32_t i = 0; i < o.k; i++) ec.pred.scalars[i] *= inv_sum_prob; + const float inv_sum_prob = 1.f / sum_prob; + for (uint32_t i = 0; i < o.k; i++) ec.pred.scalars()[i] *= inv_sum_prob; } } else - ec.pred.multiclass = prediction; + { + ec.pred.reset(); + ec.pred.init_as_multiclass(prediction); + } - ec.l.multi = mc_label_data; + ec.l.reset(); + ec.l.init_as_multi(mc_label_data); } // TODO: partial code duplication with multiclass.cc:finish_example @@ -144,8 +170,8 @@ void finish_example_scores(vw& all, oaa& o, example& ec) float correct_class_prob = 0; if (probabilities) { - if (ec.l.multi.label <= o.k) // prevent segmentation fault if labeĺ==(uint32_t)-1 - correct_class_prob = ec.pred.scalars[ec.l.multi.label - 1]; + if (ec.l.multi().label <= o.k) // prevent segmentation fault if labeĺ==(uint32_t)-1 + correct_class_prob = ec.pred.scalars()[ec.l.multi().label - 1]; if (correct_class_prob > 0) multiclass_log_loss = -log(correct_class_prob) * ec.weight; if (ec.test_only) @@ -158,11 +184,11 @@ void finish_example_scores(vw& all, oaa& o, example& ec) // but we cannot store it in ec.pred union because we store ec.pred.probs there. 
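// Note on the scalars consumed by the argmax below: with --probabilities,
// predict_or_learn squashes each per-class raw score through a logistic link
// and renormalizes so the k outputs sum to 1. A minimal standalone sketch of
// that transform (hypothetical helper, not part of this patch; correctedExp is
// approximated here by std::exp):
//
//   #include <cmath>
//   #include <vector>
//
//   std::vector<float> to_probabilities(const std::vector<float>& raw)
//   {
//     std::vector<float> probs(raw.size());
//     float sum = 0.f;
//     for (size_t i = 0; i < raw.size(); i++)
//     {
//       probs[i] = 1.f / (1.f + std::exp(-raw[i]));  // per-class sigmoid
//       sum += probs[i];
//     }
//     for (auto& p : probs) p /= sum;  // renormalize so the outputs sum to 1
//     return probs;
//   }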
uint32_t prediction = 0; for (uint32_t i = 1; i < o.k; i++) - if (ec.pred.scalars[i] > ec.pred.scalars[prediction]) + if (ec.pred.scalars()[i] > ec.pred.scalars()[prediction]) prediction = i; prediction++; // prediction is 1-based index (not 0-based) float zero_one_loss = 0; - if (ec.l.multi.label != prediction) + if (ec.l.multi().label != prediction) zero_one_loss = ec.weight; // === Print probabilities for all classes @@ -177,12 +203,12 @@ void finish_example_scores(vw& all, oaa& o, example& ec) } else outputStringStream << i + 1; - outputStringStream << ':' << ec.pred.scalars[i]; + outputStringStream << ':' << ec.pred.scalars()[i]; } for (int sink : all.final_prediction_sink) all.print_text_by_ref(sink, outputStringStream.str(), ec.tag); // === Report updates using zero-one loss - all.sd->update(ec.test_only, ec.l.multi.label != (uint32_t)-1, zero_one_loss, ec.weight, ec.num_features); + all.sd->update(ec.test_only, ec.l.multi().label != (uint32_t)-1, zero_one_loss, ec.weight, ec.num_features); // Alternatively, we could report multiclass_log_loss. // all.sd->update(ec.test_only, multiclass_log_loss, ec.weight, ec.num_features); // Even better would be to report both losses, but this would mean to increase @@ -217,8 +243,12 @@ LEARNER::base_learner* oaa_setup(options_i& options, vw& all) THROW("error: you have " << all.sd->ldict->getK() << " named labels; use that as the argument to oaa") data->all = &all; - data->pred = calloc_or_throw(data->k); - data->subsample_order = nullptr; + data->pred.resize(data->k); + for (auto& pred : data->pred) + { + pred.init_as_scalar(); + } + data->subsample_id = 0; if (data->num_subsample > 0) { @@ -229,14 +259,15 @@ LEARNER::base_learner* oaa_setup(options_i& options, vw& all) } else { - data->subsample_order = calloc_or_throw(data->k); - for (size_t i = 0; i < data->k; i++) data->subsample_order[i] = (uint32_t)i; + // Fills the vector with the ascending values 0,1,2,...,K-1.
+ data->subsample_order.resize(data->k); + std::iota(std::begin(data->subsample_order), std::end(data->subsample_order), 0); + for (size_t i = 0; i < data->k; i++) { - size_t j = (size_t)(all.get_random_state()->get_and_update_random() * (float)(data->k - i)) + i; - uint32_t tmp = data->subsample_order[i]; - data->subsample_order[i] = data->subsample_order[j]; - data->subsample_order[j] = tmp; + const auto j = + static_cast(all.get_random_state()->get_and_update_random() * static_cast(data->k - i)) + i; + std::swap(data->subsample_order[i], data->subsample_order[j]); } } } @@ -246,37 +277,45 @@ LEARNER::base_learner* oaa_setup(options_i& options, vw& all) auto base = as_singleline(setup_base(options, all)); if (probabilities || scores) { - all.delete_prediction = delete_scalars; if (probabilities) { - auto loss_function_type = all.loss->getType(); + const auto loss_function_type = all.loss->getType(); if (loss_function_type != "logistic") + { all.trace_message << "WARNING: --probabilities should be used only with --loss_function=logistic" << std::endl; - // the three boolean template parameters are: is_learn, print_all and scores - l = &LEARNER::init_multiclass_learner(data, base, predict_or_learn, - predict_or_learn, all.p, data->k, prediction_type_t::scalars); + } + l = &LEARNER::init_multiclass_learner(data, base, + predict_or_learn, + predict_or_learn, all.p, + data->k, prediction_type_t::scalars); all.sd->report_multiclass_log_loss = true; - l->set_finish_example(finish_example_scores); + l->set_finish_example(finish_example_scores); } else { - l = &LEARNER::init_multiclass_learner(data, base, predict_or_learn, - predict_or_learn, all.p, data->k, prediction_type_t::scalars); - l->set_finish_example(finish_example_scores); + l = &LEARNER::init_multiclass_learner(data, base, + predict_or_learn, + predict_or_learn, all.p, + data->k, prediction_type_t::scalars); + l->set_finish_example(finish_example_scores); } } else if (all.raw_prediction > 0) - l = &LEARNER::init_multiclass_learner(data, base, predict_or_learn, - predict_or_learn, all.p, data->k, prediction_type_t::multiclass); + l = &LEARNER::init_multiclass_learner(data, base, + predict_or_learn, + predict_or_learn, all.p, + data->k, prediction_type_t::multiclass); else - l = &LEARNER::init_multiclass_learner(data, base, predict_or_learn, - predict_or_learn, all.p, data->k, prediction_type_t::multiclass); + l = &LEARNER::init_multiclass_learner(data, base, + predict_or_learn, + predict_or_learn, all.p, + data->k, prediction_type_t::multiclass); if (data_ptr->num_subsample > 0) { l->set_learn(learn_randomized); l->set_finish_example(MULTICLASS::finish_example_without_loss); } - + l->label_type = label_type_t::multi; return make_base(*l); } diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc index f2b79d1a242..817d29aee60 100644 --- a/vowpalwabbit/parse_args.cc +++ b/vowpalwabbit/parse_args.cc @@ -208,7 +208,7 @@ void parse_dictionary_argument(vw& all, std::string str) // mimicking old v_hashmap behavior for load factor.
// A smaller factor will generally use more memory but have faster access map->max_load_factor(0.25); - example* ec = VW::alloc_examples(all.p->lp.label_size, 1); + example* ec = VW::alloc_examples(1); size_t def = (size_t)' '; @@ -230,7 +230,7 @@ void parse_dictionary_argument(vw& all, std::string str) if (new_buffer == nullptr) { free(buffer); - VW::dealloc_example(all.p->lp.delete_label, *ec); + ec->~example(); free(ec); io->close_file(); delete io; @@ -264,8 +264,7 @@ void parse_dictionary_argument(vw& all, std::string str) { continue; } - std::unique_ptr arr(new features); - arr->deep_copy_from(ec->feature_space[def]); + std::unique_ptr arr(new features(ec->feature_space[def])); map->emplace(word, std::move(arr)); // clear up ec @@ -279,7 +278,7 @@ void parse_dictionary_argument(vw& all, std::string str) free(buffer); io->close_file(); delete io; - VW::dealloc_example(all.p->lp.delete_label, *ec); + ec->~example(); free(ec); if (!all.quiet) @@ -1227,7 +1226,11 @@ LEARNER::base_learner* setup_base(options_i& options, vw& all) if (base == nullptr) return setup_base(options, all); else + { + assert(base->label_type != label_type_t::unset); + assert(base->pred_type != prediction_type_t::unset); return base; + } } void parse_reductions(options_i& options, vw& all) @@ -1658,7 +1661,7 @@ vw* initialize( options_i& options, io_buf* model, bool skipModelLoad, trace_message_t trace_listener, void* trace_context) { vw& all = parse_args(options, trace_listener, trace_context); - + try { // if user doesn't pass in a model, read from options @@ -1896,15 +1899,7 @@ void finish(vw& all, bool delete_all) if (all.should_delete_options) delete all.options; - // TODO: migrate all finalization into parser destructor - if (all.p != nullptr) - { - free_parser(all); - finalize_source(all.p); - all.p->parse_name.clear(); - all.p->parse_name.delete_v(); - delete all.p; - } + delete all.p; bool seeded; if (all.weights.seeded() > 0) @@ -1923,7 +1918,6 @@ void finish(vw& all, bool delete_all) for (size_t i = 0; i < all.final_prediction_sink.size(); i++) if (all.final_prediction_sink[i] != 1) io_buf::close_file_or_socket(all.final_prediction_sink[i]); - all.final_prediction_sink.delete_v(); all.loaded_dictionaries.clear(); // TODO: should we be clearing the namespace dictionaries? diff --git a/vowpalwabbit/parse_dispatch_loop.h b/vowpalwabbit/parse_dispatch_loop.h index 7177d9f1966..1bd244bf124 100644 --- a/vowpalwabbit/parse_dispatch_loop.h +++ b/vowpalwabbit/parse_dispatch_loop.h @@ -10,7 +10,7 @@ using dispatch_fptr = std::function&)>; inline void parse_dispatch(vw& all, dispatch_fptr dispatch) { - v_array examples = v_init(); + v_array examples; size_t example_number = 0; // for variable-size batch learning algorithms try @@ -32,7 +32,8 @@ inline void parse_dispatch(vw& all, dispatch_fptr dispatch) all.passes_complete++; // setup an end_pass example - all.p->lp.default_label(&examples[0]->l); + examples[0]->l.reset(); + all.p->lp.default_label(examples[0]->l); examples[0]->end_pass = true; all.p->in_pass_counter = 0; @@ -66,5 +67,4 @@ inline void parse_dispatch(vw& all, dispatch_fptr dispatch) all.p->exc_ptr = std::current_exception(); } lock_done(*all.p); - examples.delete_v(); } diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc index b26842e9630..6c9ed535e99 100644 --- a/vowpalwabbit/parse_example.cc +++ b/vowpalwabbit/parse_example.cc @@ -3,7 +3,7 @@ // license as described in the file LICENSE. 
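// Note: the dictionary code above replaces features::deep_copy_from with plain
// copy construction. A minimal sketch of the new idiom, assuming features now
// has full value semantics:
//
//   features src;
//   src.push_back(1.f, 42);  // (value, index)
//   features dst(src);       // deep copy via copy constructor
//   features dst2;
//   dst2 = src;              // or via copy assignment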
#include -#include +#include "vw_string_view.h" #include #include "parse_example.h" #include "hash.h" @@ -187,7 +187,7 @@ class TC_parser affix_fs.push_back(_v, word_hash); if (audit) { - v_array affix_v = v_init(); + v_array affix_v; if (_index != ' ') affix_v.push_back(_index); affix_v.push_back(is_prefix ? '+' : '-'); @@ -229,7 +229,7 @@ class TC_parser spell_fs.push_back(_v, word_hash); if (audit) { - v_array spelling_v = v_init(); + v_array spelling_v; if (_index != ' ') { spelling_v.push_back(_index); @@ -260,7 +260,7 @@ class TC_parser if (audit) for (const auto& id : feats->indicies) { - std::stringstream ss; + std::stringstream ss; ss << _index << '_'; ss << feature_name; ss << '=' << id; @@ -402,7 +402,6 @@ class TC_parser TC_parser(VW::string_view line, vw& all, example* ae) : _line(line) { - _spelling = v_init(); if (!_line.empty()) { this->_read_idx = 0; @@ -423,7 +422,7 @@ class TC_parser void substring_to_example(vw* all, example* ae, VW::string_view example) { - all->p->lp.default_label(&ae->l); + all->p->lp.default_label(ae->l); size_t bar_idx = example.find('|'); @@ -457,7 +456,7 @@ void substring_to_example(vw* all, example* ae, VW::string_view example) } if (!all->p->words.empty()) - all->p->lp.parse_label(all->p, all->p->_shared_data, &ae->l, all->p->words); + all->p->lp.parse_label(all->p, all->p->_shared_data, ae->l, all->p->words); if (bar_idx != VW::string_view::npos) { diff --git a/vowpalwabbit/parse_example.h b/vowpalwabbit/parse_example.h index 691c02f953c..461612e221a 100644 --- a/vowpalwabbit/parse_example.h +++ b/vowpalwabbit/parse_example.h @@ -3,6 +3,7 @@ // license as described in the file LICENSE. #pragma once #include +#include "vw_string_view.h" #include "parse_primitives.h" #include "example.h" #include "vw.h" diff --git a/vowpalwabbit/parse_example_json.h b/vowpalwabbit/parse_example_json.h index 077d35cda0a..e03b53ed3c7 100644 --- a/vowpalwabbit/parse_example_json.h +++ b/vowpalwabbit/parse_example_json.h @@ -175,7 +175,7 @@ class LabelObjectState : public BaseState BaseState* StartObject(Context& ctx) override { - ctx.all->p->lp.default_label(&ctx.ex->l); + ctx.all->p->lp.default_label(ctx.ex->l); // don't allow { { { } } } if (ctx.previous_state == this) @@ -202,17 +202,17 @@ class LabelObjectState : public BaseState // simple if (!_stricmp(ctx.key, "Label")) { - ctx.ex->l.simple.label = v; + ctx.ex->l.simple().label = v; found = true; } else if (!_stricmp(ctx.key, "Initial")) { - ctx.ex->l.simple.initial = v; + ctx.ex->l.simple().initial = v; found = true; } else if (!_stricmp(ctx.key, "Weight")) { - ctx.ex->l.simple.weight = v; + ctx.ex->l.simple().weight = v; found = true; } // CB @@ -244,13 +244,13 @@ class LabelObjectState : public BaseState BaseState* EndObject(Context& ctx, rapidjson::SizeType) override { - if (ctx.all->label_type == label_type_t::ccb) + if (ctx.all->get_label_type() == label_type_t::conditional_contextual_bandit) { - auto ld = (CCB::label*)&ctx.ex->l; + auto& ld = ctx.ex->l.ccb(); for (auto id : inc) { - ld->explicit_included_actions.push_back(id); + ld.explicit_included_actions.push_back(id); } inc.clear(); @@ -270,21 +270,21 @@ class LabelObjectState : public BaseState actions.clear(); probs.clear(); - ld->outcome = outcome; + ld.outcome = outcome; cb_label = {0., 0, 0., 0.}; } } else if (found_cb) { - CB::label* ld = (CB::label*)&ctx.ex->l; - ld->costs.push_back(cb_label); + auto& ld = ctx.ex->l.cb(); + ld.costs.push_back(cb_label); found_cb = false; cb_label = {0., 0, 0., 0.}; } else if (found) { - 
count_label(ctx.all->sd, ctx.ex->l.simple.label); + count_label(ctx.all->sd, ctx.ex->l.simple().label); found = false; } @@ -357,14 +357,14 @@ struct LabelState : BaseState BaseState* Float(Context& ctx, float v) override { // TODO: once we introduce label types, check here - ctx.ex->l.simple.label = v; + ctx.ex->l.simple().label = v; return ctx.previous_state; } BaseState* Uint(Context& ctx, unsigned v) override { // TODO: once we introduce label types, check here - ctx.ex->l.simple.label = (float)v; + ctx.ex->l.simple().label = (float)v; return ctx.previous_state; } }; @@ -432,9 +432,9 @@ struct MultiState : BaseState BaseState* StartArray(Context& ctx) override { // mark shared example - if (ctx.all->label_type == label_type_t::cb) + if (ctx.all->get_label_type() == label_type_t::cb) { - CB::label* ld = &ctx.ex->l.cb; + CB::label* ld = &ctx.ex->l.cb(); CB::cb_class f; f.partial_prediction = 0.; @@ -444,9 +444,9 @@ struct MultiState : BaseState ld->costs.push_back(f); } - else if (ctx.all->label_type == label_type_t::ccb) + else if (ctx.all->get_label_type() == label_type_t::conditional_contextual_bandit) { - CCB::label* ld = &ctx.ex->l.conditional_contextual_bandit; + CCB::label* ld = &ctx.ex->l.ccb(); ld->type = CCB::example_type::shared; } else @@ -459,10 +459,10 @@ struct MultiState : BaseState { // allocate new example ctx.ex = &(*ctx.example_factory)(ctx.example_factory_context); - ctx.all->p->lp.default_label(&ctx.ex->l); - if (ctx.all->label_type == label_type_t::ccb) + ctx.all->p->lp.default_label(ctx.ex->l); + if (ctx.all->get_label_type() == label_type_t::conditional_contextual_bandit) { - ctx.ex->l.conditional_contextual_bandit.type = CCB::example_type::action; + ctx.ex->l.ccb().type = CCB::example_type::action; } ctx.examples->push_back(ctx.ex); @@ -504,8 +504,8 @@ struct SlotsState : BaseState { // allocate new example ctx.ex = &(*ctx.example_factory)(ctx.example_factory_context); - ctx.all->p->lp.default_label(&ctx.ex->l); - ctx.ex->l.conditional_contextual_bandit.type = CCB::example_type::slot; + ctx.all->p->lp.default_label(ctx.ex->l); + ctx.ex->l.ccb().type = CCB::example_type::slot; ctx.examples->push_back(ctx.ex); @@ -825,22 +825,22 @@ class DefaultState : public BaseState // If we are in CCB mode and there have been no slots. Check label cost, prob and action were passed. In that // case this is CB, so generate a single slot with this info. 
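// Note: the branch below backfills a single CCB slot when a Decision Service
// JSON payload carried a plain CB label (cost/probability/action) but declared
// no slot examples. Illustrative event shape (hedged: field names assumed from
// the usual DSJSON schema, context payload abbreviated):
//
//   { "_label_cost": -1.0, "_label_probability": 0.8, "_labelIndex": 0,
//     "a": [2, 1], "c": { ... }, "p": [0.8, 0.2] }
//
// From such an event one slot example is appended whose CCB outcome stores the
// labeled action and its probability.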
- if (ctx.all->label_type == label_type_t::ccb) + if (ctx.all->get_label_type() == label_type_t::conditional_contextual_bandit) { auto num_slots = std::count_if(ctx.examples->begin(), ctx.examples->end(), - [](example* ex) { return ex->l.conditional_contextual_bandit.type == CCB::example_type::slot; }); + [](example* ex) { return ex->l.ccb().type == CCB::example_type::slot; }); if (num_slots == 0 && ctx.label_object_state.found_cb) { ctx.ex = &(*ctx.example_factory)(ctx.example_factory_context); - ctx.all->p->lp.default_label(&ctx.ex->l); - ctx.ex->l.conditional_contextual_bandit.type = CCB::example_type::slot; + ctx.all->p->lp.default_label(ctx.ex->l); + ctx.ex->l.ccb().type = CCB::example_type::slot; ctx.examples->push_back(ctx.ex); auto outcome = new CCB::conditional_contextual_bandit_outcome(); outcome->cost = ctx.label_object_state.cb_label.cost; outcome->probabilities.push_back( {ctx.label_object_state.cb_label.action, ctx.label_object_state.cb_label.probability}); - ctx.ex->l.conditional_contextual_bandit.outcome = outcome; + ctx.ex->l.ccb().outcome = outcome; } } } @@ -1022,7 +1022,7 @@ class CCBOutcomeList : public BaseState // Find start index of slot objects by iterating until we find the first slot example. for (auto ex : *ctx.examples) { - if (ex->l.conditional_contextual_bandit.type != CCB::example_type::slot) + if (ex->l.ccb().type != CCB::example_type::slot) { slot_object_index++; } @@ -1058,12 +1058,12 @@ class CCBOutcomeList : public BaseState // DSJson requires the interaction object to be filled. After reading all slot outcomes fill out the top actions. for (auto ex : *ctx.examples) { - if (ex->l.conditional_contextual_bandit.type == CCB::example_type::slot) + if (ex->l.ccb().type == CCB::example_type::slot) { - if (ex->l.conditional_contextual_bandit.outcome) + if (ex->l.ccb().outcome) { - interactions->actions.push_back(ex->l.conditional_contextual_bandit.outcome->probabilities[0].action); - interactions->probabilities.push_back(ex->l.conditional_contextual_bandit.outcome->probabilities[0].score); + interactions->actions.push_back(ex->l.ccb().outcome->probabilities[0].action); + interactions->probabilities.push_back(ex->l.ccb().outcome->probabilities[0].score); } } } @@ -1294,7 +1294,7 @@ struct VWReaderHandler : public rapidjson::BaseReaderHandler, ctx.init(all); ctx.examples = examples; ctx.ex = (*examples)[0]; - all->p->lp.default_label(&ctx.ex->l); + all->p->lp.default_label(ctx.ex->l); ctx.stream = stream; ctx.stream_end = stream_end; @@ -1372,18 +1372,17 @@ void read_line_json( inline void apply_pdrop(vw& all, float pdrop, v_array& examples) { - if (all.label_type == label_type_t::cb) + if (all.get_label_type() == label_type_t::cb) { - for (auto& e : examples) + for (auto& e: examples) { - e->l.cb.weight = 1 - pdrop; + e->l.cb().weight = 1 - pdrop; } - } - else if (all.label_type == label_type_t::ccb) + } else if (all.get_label_type() == label_type_t::conditional_contextual_bandit) { - for (auto& e : examples) + for (auto& e: examples) { - e->l.conditional_contextual_bandit.weight = 1 - pdrop; + e->l.ccb().weight = 1 - pdrop; } } } diff --git a/vowpalwabbit/parse_primitives.h b/vowpalwabbit/parse_primitives.h index cbb1bc4ef2c..c7d07a45f39 100644 --- a/vowpalwabbit/parse_primitives.h +++ b/vowpalwabbit/parse_primitives.h @@ -123,7 +123,7 @@ inline float parseFloat(const char* p, size_t& end_idx, const char* endLine = nu { // can't use stod because that throws an exception. Use strtod instead. 
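// Note: std::strtod reports success through its end pointer: when nothing can
// be parsed it stores the original start pointer, so failure is detectable
// without exceptions. A minimal sketch of that contract:
//
//   #include <cstdlib>
//
//   const char* s = "3.14abc";
//   char* end = nullptr;
//   double d = std::strtod(s, &end);  // d == 3.14, end points at "abc"
//   bool parsed = (end != s);         // false only if nothing was consumed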
char* end = nullptr; - auto ret = strtod(start, &end); + auto ret = std::strtod(start, &end); if (end >= start) { end_idx = end - start; diff --git a/vowpalwabbit/parser.cc b/vowpalwabbit/parser.cc index 830a35e681e..d19396d597a 100644 --- a/vowpalwabbit/parser.cc +++ b/vowpalwabbit/parser.cc @@ -79,7 +79,6 @@ bool is_test_only(uint32_t counter, uint32_t period, uint32_t after, bool holdou void set_compressed(parser* par) { - finalize_source(par); delete par->input; par->input = new comp_io_buf; delete par->output; @@ -207,21 +206,8 @@ IGNORE_DEPRECATED_USAGE_END } } -void finalize_source(parser* p) +void finalize_source(parser*) { -#ifdef _WIN32 - int f = _fileno(stdin); -#else - int f = fileno(stdin); -#endif - while (!p->input->files.empty() && p->input->files.last() == f) p->input->files.pop(); - p->input->close_files(); - - delete p->input; - p->input = nullptr; - p->output->close_files(); - delete p->output; - p->output = nullptr; } void make_write_cache(vw& all, std::string& newname, bool quiet) @@ -304,7 +290,7 @@ void parse_cache(vw& all, std::vector cache_files, bool kill_cache, { if (!quiet) all.trace_message << "using no cache" << endl; - all.p->output->space.delete_v(); + all.p->output->space.clear(); } } @@ -419,7 +405,7 @@ void enable_sources(vw& all, bool quiet, size_t passes, input_options& input_opt // create children size_t num_children = all.num_children; - v_array children = v_init(); + v_array children; children.resize(num_children); for (size_t i = 0; i < num_children; i++) { @@ -657,7 +643,7 @@ void generateGrams(vw& all, example*& ex) void end_pass_example(vw& all, example* ae) { - all.p->lp.default_label(&ae->l); + all.p->lp.default_label(ae->l); ae->end_pass = true; all.p->in_pass_counter = 0; } @@ -685,7 +671,8 @@ example& get_unused_example(vw* all) void setup_examples(vw& all, v_array& examples) { - for (example* ae : examples) setup_example(all, ae); + for (example* ae : examples) + setup_example(all, ae); } void setup_example(vw& all, example* ae) @@ -695,7 +682,7 @@ void setup_example(vw& all, example* ae) if (all.p->write_cache) { - all.p->lp.cache_label(&ae->l, *(all.p->output)); + all.p->lp.cache_label(ae->l, *(all.p->output)); cache_features(*(all.p->output), ae, all.parse_mask); } @@ -712,12 +699,12 @@ void setup_example(vw& all, example* ae) ae->test_only = is_test_only(all.p->in_pass_counter, all.holdout_period, all.holdout_after, all.holdout_set_off, all.p->emptylines_separate_examples ? (all.holdout_period - 1) : 0); // If this example has a test only label then it is true regardless. - ae->test_only |= all.p->lp.test_label(&ae->l); + ae->test_only |= all.p->lp.test_label(ae->l); if (all.p->emptylines_separate_examples && example_is_newline(*ae)) all.p->in_pass_counter++; - ae->weight = all.p->lp.get_weight(&ae->l); + ae->weight = all.p->lp.get_weight(ae->l); if (all.ignore_some) for (unsigned char* i = ae->indices.begin(); i != ae->indices.end(); i++) @@ -760,6 +747,44 @@ void setup_example(vw& all, example* ae) INTERACTIONS::eval_count_of_generated_ft(all, *ae, new_features_cnt, new_features_sum_feat_sq); ae->num_features += new_features_cnt; ae->total_sum_feat_sq += new_features_sum_feat_sq; + + // Prediction type should be preinitialized for the given reductions expected type. 
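// Note: polyprediction (added in vowpalwabbit/prediction.h below) is a tagged
// union: init_as_* is only legal while the tag is unset, reset() must run
// before re-initializing as a different type, and the typed accessors THROW on
// a tag mismatch in debug builds. A minimal usage sketch:
//
//   polyprediction pred;         // starts as prediction_type_t::unset
//   pred.init_as_scalar(0.5f);   // tag becomes scalar
//   float v = pred.scalar();     // checked accessor
//   pred.reset();                // destroys the payload, back to unset
//   pred.init_as_multiclass(3);  // now holds a multiclass prediction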
+ if (ae->pred.get_type() != all.l->pred_type) + { + ae->pred.reset(); + switch (all.l->pred_type) + { + case (prediction_type_t::scalar): + ae->pred.init_as_scalar(); + break; + case (prediction_type_t::scalars): + ae->pred.init_as_scalars(); + break; + case (prediction_type_t::action_scores): + ae->pred.init_as_action_scores(); + break; + case (prediction_type_t::action_probs): + ae->pred.init_as_action_probs(); + break; + case (prediction_type_t::decision_scores): + ae->pred.init_as_decision_scores(); + break; + case (prediction_type_t::multiclass): + ae->pred.init_as_multiclass(); + break; + case (prediction_type_t::multilabels): + ae->pred.init_as_multilabels(); + break; + case (prediction_type_t::prob): + ae->pred.init_as_prob(); + break; + case (prediction_type_t::multiclassprobs): + ae->pred.init_as_multiclassprobs(); + break; + default: + THROW(to_string(all.l->pred_type) << " is not supported here"); + } + } } } // namespace VW @@ -768,7 +793,7 @@ namespace VW example* new_unused_example(vw& all) { example* ec = &get_unused_example(&all); - all.p->lp.default_label(&ec->l); + all.p->lp.default_label(ec->l); all.p->begin_parsed_examples++; ec->example_counter = (size_t)all.p->begin_parsed_examples.load(); return ec; } @@ -798,15 +823,15 @@ void add_constant_feature(vw& vw, example* ec) void add_label(example* ec, float label, float weight, float base) { - ec->l.simple.label = label; - ec->l.simple.initial = base; + ec->l.simple().label = label; + ec->l.simple().initial = base; ec->weight = weight; } example* import_example(vw& all, const std::string& label, primitive_feature_space* features, size_t len) { example* ret = &get_unused_example(&all); - all.p->lp.default_label(&ret->l); + all.p->lp.default_label(ret->l); if (label.length() > 0) parse_example_label(all, *ret, label); @@ -860,17 +885,16 @@ void releaseFeatureSpace(primitive_feature_space* features, size_t len) void parse_example_label(vw& all, example& ec, std::string label) { - v_array words = v_init(); + v_array words; tokenize(' ', label, words); - all.p->lp.parse_label(all.p, all.p->_shared_data, &ec.l, words); - words.clear(); - words.delete_v(); + all.p->lp.parse_label(all.p, all.p->_shared_data, ec.l, words); } void empty_example(vw& /*all*/, example& ec) { - for (features& fs : ec) fs.clear(); + for (features& fs : ec) + fs.clear(); ec.indices.clear(); ec.tag.clear(); @@ -920,30 +944,30 @@ namespace VW { example* get_example(parser* p) { return p->ready_parsed_examples.pop(); } -float get_topic_prediction(example* ec, size_t i) { return ec->pred.scalars[i]; } +float get_topic_prediction(example* ec, size_t i) { return ec->pred.scalars()[i]; } -float get_label(example* ec) { return ec->l.simple.label; } +float get_label(example* ec) { return ec->l.simple().label; } float get_importance(example* ec) { return ec->weight; } -float get_initial(example* ec) { return ec->l.simple.initial; } +float get_initial(example* ec) { return ec->l.simple().initial; } -float get_prediction(example* ec) { return ec->pred.scalar; } +float get_prediction(example* ec) { return ec->pred.scalar(); } -float get_cost_sensitive_prediction(example* ec) { return (float)ec->pred.multiclass; } +float get_cost_sensitive_prediction(example* ec) { return (float)ec->pred.multiclass(); } -v_array& get_cost_sensitive_prediction_confidence_scores(example* ec) { return ec->pred.scalars; } +v_array& get_cost_sensitive_prediction_confidence_scores(example* ec) { return ec->pred.scalars(); } uint32_t* get_multilabel_predictions(example* ec, size_t& len) {
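// Note: the accessors above now route through the checked polyprediction
// getters, so e.g. get_prediction() on an example whose pred slot is not a
// scalar throws in debug builds instead of silently reading the wrong union
// member. A hedged usage sketch against the simple-label path:
//
//   example* ec = VW::read_example(all, "1 |f a b");
//   all.learn(*ec);
//   float y = VW::get_prediction(ec);  // requires the pred tag to be scalar
//   VW::finish_example(all, *ec);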
MULTILABEL::labels labels = ec->pred.multilabels; + MULTILABEL::labels labels = ec->pred.multilabels(); len = labels.label_v.size(); return labels.label_v.begin(); } float get_action_score(example* ec, size_t i) { - ACTION_SCORE::action_scores scores = ec->pred.a_s; + ACTION_SCORE::action_scores scores = ec->pred.action_scores(); if (i < scores.size()) { @@ -955,7 +979,7 @@ float get_action_score(example* ec, size_t i) } } -size_t get_action_score_length(example* ec) { return ec->pred.a_s.size(); } +size_t get_action_score_length(example* ec) { return ec->pred.action_scores().size(); } size_t get_tag_length(example* ec) { return ec->tag.size(); } @@ -968,13 +992,12 @@ float get_confidence(example* ec) { return ec->confidence; } example* example_initializer::operator()(example* ex) { - memset(&ex->l, 0, sizeof(polylabel)); - ex->passthrough = nullptr; - ex->tag = v_init(); - ex->indices = v_init(); + new (&ex->l) polylabel(); + new (&ex->pred) polyprediction(); IGNORE_DEPRECATED_USAGE_START ex->in_use = true; IGNORE_DEPRECATED_USAGE_END + ex->passthrough = nullptr; memset(ex->feature_space.data(), 0, ex->feature_space.size() * sizeof(ex->feature_space[0])); return ex; } @@ -988,52 +1011,9 @@ namespace VW void start_parser(vw& all) { all.parse_thread = std::thread(main_parse_loop, &all); } } // namespace VW -// a copy of dealloc_example except that this does not call the example destructor -// Work to remove this is currently in progress -void cleanup_example(void(*delete_label)(void*), example& ec, void(*delete_prediction)(void*)) +VW_DEPRECATED("No longer needed. Use destructor.") +void free_parser(vw& /*all*/) { - if (delete_label) - delete_label(&ec.l); - - if (delete_prediction) - delete_prediction(&ec.pred); - - ec.tag.delete_v(); - - if (ec.passthrough) - { - delete ec.passthrough; - } - - ec.indices.delete_v(); -} - -void free_parser(vw& all) -{ - all.p->words.delete_v(); - - if (!all.ngram_strings.empty()) - all.p->gram_mask.delete_v(); - - io_buf* output = all.p->output; - if (output != nullptr) - { - output->finalname.delete_v(); - output->currentname.delete_v(); - } - - while (!all.p->example_pool.empty()) - { - example* temp = all.p->example_pool.get_object(); - cleanup_example(all.p->lp.delete_label, *temp, all.delete_prediction); - } - - while (all.p->ready_parsed_examples.size() != 0) - { - example* temp = all.p->ready_parsed_examples.pop(); - cleanup_example(all.p->lp.delete_label, *temp, all.delete_prediction); - } - all.p->counts.delete_v(); } namespace VW diff --git a/vowpalwabbit/parser.h b/vowpalwabbit/parser.h index ca374fc6020..63ed1aa8112 100644 --- a/vowpalwabbit/parser.h +++ b/vowpalwabbit/parser.h @@ -48,13 +48,6 @@ struct parser this->input = new io_buf{}; this->output = new io_buf{}; this->lp = simple_label; - - // Free parser must still be used for the following fields. - this->words = v_init(); - this->parse_name = v_init(); - this->gram_mask = v_init(); - this->ids = v_init(); - this->counts = v_init(); } ~parser() @@ -128,6 +121,9 @@ void set_done(vw& all); // source control functions void reset_source(vw& all, size_t numbits); +VW_DEPRECATED("no longer needed") void finalize_source(parser* source); void set_compressed(parser* par); + +VW_DEPRECATED("no longer needed. 
Use destructor") void free_parser(vw& all); diff --git a/vowpalwabbit/prediction.h b/vowpalwabbit/prediction.h new file mode 100644 index 00000000000..c810e82a8d0 --- /dev/null +++ b/vowpalwabbit/prediction.h @@ -0,0 +1,472 @@ +#pragma once + +/* +When a new prediction type needs to be added, the following actions must be taken: +- PREDICTION_TYPE is the type that will be used +- PREDICTION_NAME is the name to identify this prediction type +Steps: + 1. Add a new variant to prediction_type_t called PREDICTION_NAME + 2. Add the corresponding row to to_string: + TO_STRING_CASE(prediction_type_t::PREDICTION_NAME) + 3. Add the new type to the union: + PREDICTION_TYPE _PREDICTION_NAME; + 4. Add the corresponding row to polyprediction::copy_from + case (prediction_type_t::PREDICTION_NAME): + init_as_PREDICTION_NAME(other._PREDICTION_NAME); + break; + 5. Add the corresponding row to polyprediction::move_from + case (prediction_type_t::PREDICTION_NAME): + init_as_PREDICTION_NAME(std::move(other._PREDICTION_NAME)); + break; + 6. Add the corresponding row to polyprediction::reset + case (prediction_type_t::PREDICTION_NAME): + destruct(_PREDICTION_NAME); + break; + 7. Add another three methods that correspond to the new type according to this template + template + PREDICTION_TYPE& init_as_PREDICTION_NAME(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_PREDICTION_NAME) PREDICTION_TYPE(std::forward(args)...); + _tag = prediction_type_t::PREDICTION_NAME; + return _PREDICTION_NAME; + } + + const PREDICTION_TYPE& PREDICTION_NAME() const + { + ensure_is_type(prediction_type_t::PREDICTION_NAME); + return _PREDICTION_NAME; + } + + PREDICTION_TYPE& PREDICTION_NAME() + { + ensure_is_type(prediction_type_t::PREDICTION_NAME); + return _PREDICTION_NAME; + } +*/ + +enum class prediction_type_t : int +{ + unset, + scalar, + scalars, + action_scores, + multiclassprobs, + multiclass, + multilabels, + prob, + decision_scores, + action_probs, +}; + +#define TO_STRING_CASE(enum_type) \ + case enum_type: \ + return #enum_type; + +inline const char* to_string(prediction_type_t prediction_type) +{ + switch (prediction_type) + { + TO_STRING_CASE(prediction_type_t::unset) + TO_STRING_CASE(prediction_type_t::scalar) + TO_STRING_CASE(prediction_type_t::scalars) + TO_STRING_CASE(prediction_type_t::action_scores) + TO_STRING_CASE(prediction_type_t::action_probs) + TO_STRING_CASE(prediction_type_t::decision_scores) + TO_STRING_CASE(prediction_type_t::multiclass) + TO_STRING_CASE(prediction_type_t::multilabels) + TO_STRING_CASE(prediction_type_t::prob) + TO_STRING_CASE(prediction_type_t::multiclassprobs) + default: + return ""; + } +} + +struct polyprediction +{ + private: + union { + float _scalar; + v_array _scalars; // a sequence of scalar predictions + ACTION_SCORE::action_scores _action_scores; // a sequence of classes with scores. + ACTION_SCORE::action_scores _action_probs; // a sequence of classes with probs.
+ CCB::decision_scores_t _decision_scores; + uint32_t _multiclass; + MULTILABEL::labels _multilabels; + float _prob; // for --probabilities --csoaa_ldf=mc + v_array _multiclassprobs; + + }; + prediction_type_t _tag; + + inline void ensure_is_type(prediction_type_t type) const + { +#ifndef NDEBUG + if (_tag != type) + { + THROW("Expected type: " << to_string(type) << ", but found: " << to_string(_tag)); + } +#else + _UNUSED(type); +#endif + } + + template + void destruct(T& item) + { + item.~T(); + } + + // These two functions only differ by parameter + void copy_from(const polyprediction& other) + { + switch (other._tag) + { + case (prediction_type_t::unset): + break; + case (prediction_type_t::scalar): + init_as_scalar(other._scalar); + break; + case (prediction_type_t::scalars): + init_as_scalars(other._scalars); + break; + case (prediction_type_t::action_scores): + init_as_action_scores(other._action_scores); + break; + case (prediction_type_t::action_probs): + init_as_action_probs(other._action_probs); + break; + case (prediction_type_t::decision_scores): + init_as_decision_scores(other._decision_scores); + break; + case (prediction_type_t::multiclass): + init_as_multiclass(other._multiclass); + break; + case (prediction_type_t::multilabels): + init_as_multilabels(other._multilabels); + break; + case (prediction_type_t::prob): + init_as_prob(other._prob); + break; + case (prediction_type_t::multiclassprobs): + init_as_multiclassprobs(other._multiclassprobs); + break; + default:; + } + } + + void move_from(polyprediction&& other) + { + switch (other._tag) + { + case (prediction_type_t::unset): + break; + case (prediction_type_t::scalar): + init_as_scalar(std::move(other._scalar)); + break; + case (prediction_type_t::scalars): + init_as_scalars(std::move(other._scalars)); + break; + case (prediction_type_t::action_scores): + init_as_action_scores(std::move(other._action_scores)); + break; + case (prediction_type_t::action_probs): + init_as_action_probs(std::move(other._action_probs)); + break; + case (prediction_type_t::decision_scores): + init_as_decision_scores(std::move(other._decision_scores)); + break; + case (prediction_type_t::multiclass): + init_as_multiclass(std::move(other._multiclass)); + break; + case (prediction_type_t::multilabels): + init_as_multilabels(std::move(other._multilabels)); + break; + case (prediction_type_t::prob): + init_as_prob(std::move(other._prob)); + break; + case (prediction_type_t::multiclassprobs): + init_as_multiclassprobs(std::move(other._multiclassprobs)); + break; + default:; + } + } + + public: + polyprediction() { _tag = prediction_type_t::unset; // Perhaps we should memset here? + }; + ~polyprediction() { reset(); } + + polyprediction(polyprediction&& other) + { + _tag = prediction_type_t::unset; + move_from(std::move(other)); + other.reset(); + } + + polyprediction& operator=(polyprediction&& other) + { + reset(); + move_from(std::move(other)); + other.reset(); + return *this; + } + + polyprediction(const polyprediction& other) { + _tag = prediction_type_t::unset; + copy_from(other); + } + + polyprediction& operator=(const polyprediction& other) { + reset(); + copy_from(other); + return *this; + } + + prediction_type_t get_type() const { return _tag; } + + void reset() + { + switch (_tag) + { + case (prediction_type_t::unset): + // Nothing to do! Whatever was in here has already been destroyed. 
+ return; + case (prediction_type_t::scalar): + destruct(_scalar); + break; + case (prediction_type_t::scalars): + destruct(_scalars); + break; + case (prediction_type_t::action_scores): + destruct(_action_scores); + break; + case (prediction_type_t::action_probs): + destruct(_action_probs); + break; + case (prediction_type_t::decision_scores): + destruct(_decision_scores); + break; + case (prediction_type_t::multiclass): + destruct(_multiclass); + break; + case (prediction_type_t::multilabels): + destruct(_multilabels); + break; + case (prediction_type_t::prob): + destruct(_prob); + break; + case (prediction_type_t::multiclassprobs): + destruct(_multiclassprobs); + break; + default:; + } + + _tag = prediction_type_t::unset; + } + + template + float& init_as_scalar(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_scalar) float(std::forward(args)...); + _tag = prediction_type_t::scalar; + return _scalar; + } + + const float& scalar() const + { + ensure_is_type(prediction_type_t::scalar); + return _scalar; + } + + float& scalar() + { + ensure_is_type(prediction_type_t::scalar); + return _scalar; + } + + template + v_array& init_as_scalars(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_scalars) v_array(std::forward(args)...); + _tag = prediction_type_t::scalars; + return _scalars; + } + + const v_array& scalars() const + { + ensure_is_type(prediction_type_t::scalars); + return _scalars; + } + + v_array& scalars() + { + ensure_is_type(prediction_type_t::scalars); + return _scalars; + } + + template + ACTION_SCORE::action_scores& init_as_action_scores(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_action_scores) ACTION_SCORE::action_scores(std::forward(args)...); + _tag = prediction_type_t::action_scores; + return _action_scores; + } + + const ACTION_SCORE::action_scores& action_scores() const + { + ensure_is_type(prediction_type_t::action_scores); + return _action_scores; + } + + ACTION_SCORE::action_scores& action_scores() + { + ensure_is_type(prediction_type_t::action_scores); + return _action_scores; + } + + template + ACTION_SCORE::action_scores& init_as_action_probs(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_action_probs) ACTION_SCORE::action_scores(std::forward(args)...); + _tag = prediction_type_t::action_probs; + return _action_probs; + } + + const ACTION_SCORE::action_scores& action_probs() const + { + ensure_is_type(prediction_type_t::action_probs); + return _action_probs; + } + + ACTION_SCORE::action_scores& action_probs() + { + ensure_is_type(prediction_type_t::action_probs); + return _action_probs; + } + + template + CCB::decision_scores_t& init_as_decision_scores(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_decision_scores) CCB::decision_scores_t(std::forward(args)...); + _tag = prediction_type_t::decision_scores; + return _decision_scores; + } + + const CCB::decision_scores_t& decision_scores() const + { + ensure_is_type(prediction_type_t::decision_scores); + return _decision_scores; + } + + CCB::decision_scores_t& decision_scores() + { + ensure_is_type(prediction_type_t::decision_scores); + return _decision_scores; + } + + template + uint32_t& init_as_multiclass(Args&&... 
args) + { + ensure_is_type(prediction_type_t::unset); + new (&_multiclass) uint32_t(std::forward(args)...); + _tag = prediction_type_t::multiclass; + return _multiclass; + } + + const uint32_t& multiclass() const + { + ensure_is_type(prediction_type_t::multiclass); + return _multiclass; + } + + uint32_t& multiclass() + { + ensure_is_type(prediction_type_t::multiclass); + return _multiclass; + } + + template + MULTILABEL::labels& init_as_multilabels(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_multilabels) MULTILABEL::labels(std::forward(args)...); + _tag = prediction_type_t::multilabels; + return _multilabels; + } + + const MULTILABEL::labels& multilabels() const + { + ensure_is_type(prediction_type_t::multilabels); + return _multilabels; + } + + MULTILABEL::labels& multilabels() + { + ensure_is_type(prediction_type_t::multilabels); + return _multilabels; + } + + template + float& init_as_prob(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_prob) float(std::forward(args)...); + _tag = prediction_type_t::prob; + return _prob; + } + + const float& prob() const + { + ensure_is_type(prediction_type_t::prob); + return _prob; + } + + float& prob() + { + ensure_is_type(prediction_type_t::prob); + return _prob; + } + + template + v_array& init_as_multiclassprobs(Args&&... args) + { + ensure_is_type(prediction_type_t::unset); + new (&_multiclassprobs) v_array(std::forward(args)...); + _tag = prediction_type_t::multiclassprobs; + return _multiclassprobs; + } + + const v_array& multiclassprobs() const + { + ensure_is_type(prediction_type_t::multiclassprobs); + return _multiclassprobs; + } + + v_array& multiclassprobs() + { + ensure_is_type(prediction_type_t::multiclassprobs); + return _multiclassprobs; + } + + // TODO: make this more generic through traits and type comparisons. + void reinterpret(prediction_type_t type) + { + // Currently the only valid reinterpret is between action scores and probs, or itself. + if((type == prediction_type_t::action_probs && _tag == prediction_type_t::action_scores) + || (type == prediction_type_t::action_scores && _tag == prediction_type_t::action_probs) + || type == _tag) + { + _tag = type; + } + else + { + THROW("Illegal reinterpret. 
Tried to reinterpret as " << to_string(type) << ", but contains: " << to_string(_tag)); + } + } +}; diff --git a/vowpalwabbit/print.cc b/vowpalwabbit/print.cc index ec5163d189b..aec953e3f5d 100644 --- a/vowpalwabbit/print.cc +++ b/vowpalwabbit/print.cc @@ -24,7 +24,7 @@ void print_feature(vw& /* all */, float value, uint64_t index) void learn(print& p, LEARNER::base_learner&, example& ec) { - label_data& ld = ec.l.simple; + label_data& ld = ec.l.simple(); if (ld.label != FLT_MAX) { cout << ld.label << " "; @@ -61,5 +61,6 @@ LEARNER::base_learner* print_setup(options_i& options, vw& all) all.weights.stride_shift(0); LEARNER::learner& ret = init_learner(p, learn, learn, 1); + ret.label_type = label_type_t::simple; return make_base(ret); } diff --git a/vowpalwabbit/recall_tree.cc b/vowpalwabbit/recall_tree.cc index 431ada1eea8..d6ce4cfd191 100644 --- a/vowpalwabbit/recall_tree.cc +++ b/vowpalwabbit/recall_tree.cc @@ -52,7 +52,6 @@ struct node , n(0) , entropy(0) , passes(1) - , preds(v_init()) { } }; @@ -72,12 +71,6 @@ struct recall_tree float bern_hyper; bool randomized_routing; - - ~recall_tree() - { - for (auto& node : nodes) node.preds.delete_v(); - nodes.delete_v(); - } }; float to_prob(float x) @@ -121,11 +114,12 @@ void init_tree(recall_tree& b) b.max_routers = routers_used; } +// TODO replace with std::find node_pred* find(recall_tree& b, uint32_t cn, example& ec) { node_pred* ls; - for (ls = b.nodes[cn].preds.begin(); ls != b.nodes[cn].preds.end() && ls->label != ec.l.multi.label; ++ls) + for (ls = b.nodes[cn].preds.begin(); ls != b.nodes[cn].preds.end() && ls->label != ec.l.multi().label; ++ls) ; return ls; @@ -137,7 +131,7 @@ node_pred* find_or_create(recall_tree& b, uint32_t cn, example& ec) if (ls == b.nodes[cn].preds.end()) { - node_pred newls(ec.l.multi.label); + node_pred newls(ec.l.multi().label); b.nodes[cn].preds.push_back(newls); ls = b.nodes[cn].preds.end() - 1; } @@ -251,13 +245,16 @@ void remove_node_id_feature(recall_tree& /* b */, uint32_t /* cn */, example& ec uint32_t oas_predict(recall_tree& b, single_learner& base, uint32_t cn, example& ec) { - MULTICLASS::label_t mc = ec.l.multi; - uint32_t save_pred = ec.pred.multiclass; + MULTICLASS::label_t mc = ec.l.multi(); + uint32_t save_pred = ec.pred.multiclass(); uint32_t amaxscore = 0; add_node_id_feature(b, cn, ec); - ec.l.simple = {FLT_MAX, 0.f, 0.f}; + ec.l.reset(); + ec.l.init_as_simple() = {FLT_MAX, 0.f, 0.f}; + ec.pred.reset(); + ec.pred.init_as_scalar(); float maxscore = std::numeric_limits::lowest(); for (node_pred* ls = b.nodes[cn].preds.begin(); @@ -273,8 +270,10 @@ uint32_t oas_predict(recall_tree& b, single_learner& base, uint32_t cn, example& remove_node_id_feature(b, cn, ec); - ec.l.multi = mc; - ec.pred.multiclass = save_pred; + ec.l.reset(); + ec.l.init_as_multi() = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = save_pred; return amaxscore; } @@ -284,7 +283,7 @@ bool is_candidate(recall_tree& b, uint32_t cn, example& ec) for (node_pred* ls = b.nodes[cn].preds.begin(); ls != b.nodes[cn].preds.end() && ls < b.nodes[cn].preds.begin() + b.max_candidates; ++ls) { - if (ls->label == ec.l.multi.label) + if (ls->label == ec.l.multi().label) return true; } @@ -308,10 +307,12 @@ bool stop_recurse_check(recall_tree& b, uint32_t parent, uint32_t child) predict_type predict_from(recall_tree& b, single_learner& base, example& ec, uint32_t cn) { - MULTICLASS::label_t mc = ec.l.multi; - uint32_t save_pred = ec.pred.multiclass; - - ec.l.simple = {FLT_MAX, 0.f, 0.f}; + MULTICLASS::label_t mc = ec.l.multi(); + 
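// Note: this save -> reset/re-init -> restore dance around the label and
// prediction unions recurs throughout the tree reductions. A hedged RAII-style
// sketch (hypothetical helper, not part of this patch) that restores the
// multiclass state on scope exit:
//
//   struct multiclass_state_guard
//   {
//     example& _ec;
//     MULTICLASS::label_t _label;
//     uint32_t _pred;
//     explicit multiclass_state_guard(example& ec)
//         : _ec(ec), _label(ec.l.multi()), _pred(ec.pred.multiclass())
//     {
//       _ec.l.reset();  // caller re-inits as simple for the base learner
//       _ec.pred.reset();
//     }
//     ~multiclass_state_guard()
//     {
//       _ec.l.reset();
//       _ec.l.init_as_multi(_label);
//       _ec.pred.reset();
//       _ec.pred.init_as_multiclass(_pred);
//     }
//   };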
uint32_t save_pred = ec.pred.multiclass(); + ec.l.reset(); + ec.l.init_as_simple() = {FLT_MAX, 0.f, 0.f}; + ec.pred.reset(); + ec.pred.init_as_scalar(); while (b.nodes[cn].internal) { base.predict(ec, b.nodes[cn].base_router); @@ -324,8 +325,10 @@ predict_type predict_from(recall_tree& b, single_learner& base, example& ec, uin cn = newcn; } - ec.l.multi = mc; - ec.pred.multiclass = save_pred; + ec.l.reset(); + ec.l.init_as_multi() = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = save_pred; return predict_type(cn, oas_predict(b, base, cn, ec)); } @@ -334,13 +337,13 @@ void predict(recall_tree& b, single_learner& base, example& ec) { predict_type pred = predict_from(b, base, ec, 0); - ec.pred.multiclass = pred.class_prediction; + ec.pred.multiclass() = pred.class_prediction; } float train_node(recall_tree& b, single_learner& base, example& ec, uint32_t cn) { - MULTICLASS::label_t mc = ec.l.multi; - uint32_t save_pred = ec.pred.multiclass; + MULTICLASS::label_t mc = ec.l.multi(); + uint32_t save_pred = ec.pred.multiclass(); // minimize entropy // better than maximize expected likelihood, and the proofs go through :) @@ -355,7 +358,10 @@ float train_node(recall_tree& b, single_learner& base, example& ec, uint32_t cn) float route_label = delta_left < delta_right ? -1.f : 1.f; float imp_weight = fabs((float)(delta_left - delta_right)); - ec.l.simple = {route_label, imp_weight, 0.}; + ec.l.reset(); + ec.l.init_as_simple() = {route_label, imp_weight, 0.}; + ec.pred.reset(); + ec.pred.init_as_scalar(); base.learn(ec, b.nodes[cn].base_router); // TODO: using the updated routing seems to help @@ -363,10 +369,12 @@ float train_node(recall_tree& b, single_learner& base, example& ec, uint32_t cn) // TODO: (doesn't play well with link function) base.predict(ec, b.nodes[cn].base_router); - float save_scalar = ec.pred.scalar; + float save_scalar = ec.pred.scalar(); - ec.l.multi = mc; - ec.pred.multiclass = save_pred; + ec.l.reset(); + ec.l.init_as_multi() = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = save_pred; return save_scalar; } @@ -375,7 +383,7 @@ void learn(recall_tree& b, single_learner& base, example& ec) { predict(b, base, ec); - if (b.all->training && ec.l.multi.label != (uint32_t)-1) // if training the tree + if (b.all->training && ec.l.multi().label != (uint32_t)-1) // if training the tree { uint32_t cn = 0; @@ -404,14 +412,17 @@ void learn(recall_tree& b, single_learner& base, example& ec) if (is_candidate(b, cn, ec)) { - MULTICLASS::label_t mc = ec.l.multi; - uint32_t save_pred = ec.pred.multiclass; + MULTICLASS::label_t mc = ec.l.multi(); + uint32_t save_pred = ec.pred.multiclass(); add_node_id_feature(b, cn, ec); - ec.l.simple = {1.f, 1.f, 0.f}; + ec.l.reset(); + ec.l.init_as_simple() = {1.f, 1.f, 0.f}; + ec.pred.reset(); + ec.pred.init_as_scalar(); base.learn(ec, b.max_routers + mc.label - 1); - ec.l.simple = {-1.f, 1.f, 0.f}; + ec.l.simple() = {-1.f, 1.f, 0.f}; for (node_pred* ls = b.nodes[cn].preds.begin(); ls != b.nodes[cn].preds.end() && ls < b.nodes[cn].preds.begin() + b.max_candidates; ++ls) @@ -422,8 +433,10 @@ void learn(recall_tree& b, single_learner& base, example& ec) remove_node_id_feature(b, cn, ec); - ec.l.multi = mc; - ec.pred.multiclass = save_pred; + ec.l.reset(); + ec.l.init_as_multi() = mc; + ec.pred.reset(); + ec.pred.init_as_multiclass() = save_pred; } } } @@ -534,6 +547,6 @@ base_learner* recall_tree_setup(options_i& options, vw& all) learner& l = init_multiclass_learner( tree, as_singleline(setup_base(options, all)), learn, predict, all.p, 
tree->max_routers + tree->k);
   l.set_save_load(save_load_tree);
-
+  l.label_type = label_type_t::multi;
   return make_base(l);
 }
diff --git a/vowpalwabbit/scorer.cc b/vowpalwabbit/scorer.cc
index f755d726b3f..1250886c975 100644
--- a/vowpalwabbit/scorer.cc
+++ b/vowpalwabbit/scorer.cc
@@ -17,16 +17,21 @@ struct scorer
 
 template <bool is_learn, float (*link)(float)>
 void predict_or_learn(scorer& s, LEARNER::single_learner& base, example& ec)
 {
-  s.all->set_minmax(s.all->sd, ec.l.simple.label);
-  if (is_learn && ec.l.simple.label != FLT_MAX && ec.weight > 0)
+  // LDA uses this reduction but explicitly has no label, so we must check the label type before using it.
+  const float simple_label = ec.l.get_type() == label_type_t::simple ? ec.l.simple().label : 0.f;
+
+  s.all->set_minmax(s.all->sd, simple_label);
+  if (is_learn && simple_label != FLT_MAX && ec.weight > 0)
     base.learn(ec);
   else
     base.predict(ec);
 
-  if (ec.weight > 0 && ec.l.simple.label != FLT_MAX)
-    ec.loss = s.all->loss->getLoss(s.all->sd, ec.pred.scalar, ec.l.simple.label) * ec.weight;
+  // TODO: LDA returns the scalars prediction type - what should we do here?
+
+  if (ec.weight > 0 && simple_label != FLT_MAX)
+    ec.loss = s.all->loss->getLoss(s.all->sd, ec.pred.scalar(), simple_label) * ec.weight;
 
-  ec.pred.scalar = link(ec.pred.scalar);
+  ec.pred.scalar() = link(ec.pred.scalar());
 }
 
 template <float (*link)(float)>
@@ -34,12 +39,12 @@ inline void multipredict(scorer&, LEARNER::single_learner& base, example& ec, size_t count,
     polyprediction* pred, bool finalize_predictions)
 {
   base.multipredict(ec, 0, count, pred, finalize_predictions);  // TODO: need to thread step through???
-  for (size_t c = 0; c < count; c++) pred[c].scalar = link(pred[c].scalar);
+  for (size_t c = 0; c < count; c++) pred[c].scalar() = link(pred[c].scalar());
 }
 
 void update(scorer& s, LEARNER::single_learner& base, example& ec)
 {
-  s.all->set_minmax(s.all->sd, ec.l.simple.label);
+  s.all->set_minmax(s.all->sd, ec.l.simple().label);
   base.update(ec);
 }
 
@@ -65,7 +70,9 @@ LEARNER::base_learner* scorer_setup(options_i& options, vw& all)
       .help("Specify the link function: identity, logistic, glf1 or poisson"));
   options.add_and_parse(new_options);
 
-  // This always returns a base_learner.
+  // This always returns a base_learner, except in the case of LDA, which does not use the scorer.
+  if (options.was_supplied("lda"))
+    return nullptr;
 
   s->all = &all;
 
@@ -96,6 +103,7 @@ LEARNER::base_learner* scorer_setup(options_i& options, vw& all)
   l->set_multipredict(multipredict_f);
   l->set_update(update);
 
+  l->label_type = base->label_type;
   all.scorer = LEARNER::as_singleline(l);
 
   return make_base(*all.scorer);
diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc
index ceb0b32c2ae..cd733d8683d 100644
--- a/vowpalwabbit/search.cc
+++ b/vowpalwabbit/search.cc
@@ -110,8 +110,7 @@ struct action_repr
   {
     if (_repr != nullptr)
     {
-      repr = new features();
-      repr->deep_copy_from(*_repr);
+      repr = new features(*_repr);
     }
   }
   action_repr(action _a) : a(_a), repr(nullptr) {}
@@ -207,7 +206,7 @@ struct search_private
   action learn_oracle_action;  // store an oracle action for debugging purposes
   features last_action_repr;
 
-  polylabel* allowed_actions_cache;
+  polylabel allowed_actions_cache;
 
   size_t loss_declared_cnt;  // how many times did run declare any loss (implicitly or explicitly)?
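  // Note: allowed_actions_cache is now held by value. The polylabel tagged
  // union constructs and destroys its own active member, so the
  // calloc_or_throw/free pair and the per-branch cb/cs cost cleanup that the
  // old pointer member required disappear from ~search() below.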
v_array train_trajectory; // the training trajectory @@ -305,7 +304,6 @@ void clear_memo_foreach_action(search_private& priv) for (size_t i = 0; i < priv.memo_foreach_action.size(); i++) if (priv.memo_foreach_action[i]) { - priv.memo_foreach_action[i]->delete_v(); delete priv.memo_foreach_action[i]; } priv.memo_foreach_action.clear(); @@ -319,60 +317,22 @@ search::~search() { search_private& priv = *this->priv; - priv._random_state.~shared_ptr(); delete priv.truth_string; delete priv.pred_string; delete priv.bad_string_stream; - priv.rawOutputString.~basic_string(); - priv.test_action_sequence.~vector(); - priv.dat_new_feature_audit_ss.~basic_stringstream(); - priv.neighbor_features.delete_v(); - priv.timesteps.delete_v(); - if (priv.cb_learner) - priv.learn_losses.cb.costs.delete_v(); - else - priv.learn_losses.cs.costs.delete_v(); - if (priv.cb_learner) - priv.gte_label.cb.costs.delete_v(); - else - priv.gte_label.cs.costs.delete_v(); - - priv.condition_on_actions.delete_v(); - priv.learn_allowed_actions.delete_v(); - priv.ldf_test_label.costs.delete_v(); - priv.last_action_repr.~features(); - priv.active_uncertainty.delete_v(); - for (size_t i = 0; i < priv.active_known.size(); i++) priv.active_known[i].delete_v(); - priv.active_known.delete_v(); - - if (priv.cb_learner) - priv.allowed_actions_cache->cb.costs.delete_v(); - else - priv.allowed_actions_cache->cs.costs.delete_v(); - - priv.train_trajectory.delete_v(); - - for (auto& ar : priv.ptag_to_action) delete ar.repr; - priv.ptag_to_action.delete_v(); - clear_memo_foreach_action(priv); - priv.memo_foreach_action.delete_v(); - // destroy copied examples if we needed them - if (!priv.examples_dont_change) + delete priv.rawOutputStringStream; + for (auto& ar : priv.ptag_to_action) { - void (*delete_label)(void*) = priv.is_ldf ? CS::cs_label.delete_label : MC::mc_label.delete_label; - for (example& ec : priv.learn_ec_copy) VW::dealloc_example(delete_label, ec); - priv.learn_ec_copy.delete_v(); + if (ar.repr != nullptr) + { + delete ar.repr; + cdbg << "delete_v" << endl; + } } - priv.learn_condition_on_names.delete_v(); - priv.learn_condition_on.delete_v(); - - priv.learn_condition_on_act.delete_v(); - priv.cache_hash_map.~unordered_map(); + clear_memo_foreach_action(priv); - free(priv.allowed_actions_cache); - delete priv.rawOutputStringStream; + this->priv->~search_private(); } free(this->priv); } @@ -724,8 +684,13 @@ void reset_search_structure(search_private& priv) if (priv.beta > 1) priv.beta = 1; } - - for (auto& ar : priv.ptag_to_action) delete ar.repr; + for (auto& ar : priv.ptag_to_action) + { + if (ar.repr != nullptr) + { + delete ar.repr; + } + } priv.ptag_to_action.clear(); if (!priv.cb_learner) // was: if rollout_all_actions @@ -796,8 +761,8 @@ void add_example_conditioning(search_private& priv, example& ec, size_t conditio uint64_t extra_offset = 0; if (priv.is_ldf) - if (ec.l.cs.costs.size() > 0) - extra_offset = 3849017 * ec.l.cs.costs[0].class_index; + if (ec.l.get_type() == label_type_t::cs && ec.l.cs().costs.size() > 0) + extra_offset = 3849017 * ec.l.cs().costs[0].class_index; size_t I = condition_on_cnt; size_t N = std::max(priv.acset.max_bias_ngram_length, priv.acset.max_quad_ngram_length); @@ -891,40 +856,43 @@ void del_example_conditioning(search_private& priv, example& ec) del_features_in_top_namespace(priv, ec, conditioning_namespace); } -inline size_t cs_get_costs_size(bool isCB, polylabel& ld) { return isCB ? 
ld.cb.costs.size() : ld.cs.costs.size(); } +inline size_t cs_get_costs_size(bool isCB, polylabel& ld) +{ + return isCB ? ld.cb().costs.size() : ld.cs().costs.size(); +} inline uint32_t cs_get_cost_index(bool isCB, polylabel& ld, size_t k) { - return isCB ? ld.cb.costs[k].action : ld.cs.costs[k].class_index; + return isCB ? ld.cb().costs[k].action : ld.cs().costs[k].class_index; } inline float cs_get_cost_partial_prediction(bool isCB, polylabel& ld, size_t k) { - return isCB ? ld.cb.costs[k].partial_prediction : ld.cs.costs[k].partial_prediction; + return isCB ? ld.cb().costs[k].partial_prediction : ld.cs().costs[k].partial_prediction; } inline void cs_set_cost_loss(bool isCB, polylabel& ld, size_t k, float val) { if (isCB) - ld.cb.costs[k].cost = val; + ld.cb().costs[k].cost = val; else - ld.cs.costs[k].x = val; + ld.cs().costs[k].x = val; } inline void cs_costs_erase(bool isCB, polylabel& ld) { if (isCB) - ld.cb.costs.clear(); + ld.cb().costs.clear(); else - ld.cs.costs.clear(); + ld.cs().costs.clear(); } inline void cs_costs_resize(bool isCB, polylabel& ld, size_t new_size) { if (isCB) - ld.cb.costs.resize(new_size); + ld.cb().costs.resize(new_size); else - ld.cs.costs.resize(new_size); + ld.cs().costs.resize(new_size); } inline void cs_cost_push_back(bool isCB, polylabel& ld, uint32_t index, float value) @@ -932,12 +900,12 @@ inline void cs_cost_push_back(bool isCB, polylabel& ld, uint32_t index, float va if (isCB) { CB::cb_class cost = {value, index, 0., 0.}; - ld.cb.costs.push_back(cost); + ld.cb().costs.push_back(cost); } else { CS::wclass cost = {value, index, 0., 0.}; - ld.cs.costs.push_back(cost); + ld.cs().costs.push_back(cost); } } @@ -945,7 +913,7 @@ polylabel& allowed_actions_to_ld(search_private& priv, size_t ec_cnt, const acti size_t allowed_actions_cnt, const float* allowed_actions_cost) { bool isCB = priv.cb_learner; - polylabel& ld = *priv.allowed_actions_cache; + polylabel& ld = priv.allowed_actions_cache; uint32_t num_costs = (uint32_t)cs_get_costs_size(isCB, ld); if (priv.is_ldf) // LDF version easier @@ -1150,9 +1118,9 @@ action choose_oracle_action(search_private& priv, size_t ec_cnt, const action* o if (need_memo_foreach_action(priv) && (priv.state == INIT_TRAIN)) { v_array* this_cache = new v_array(); - *this_cache = v_init(); // TODO we don't really need to construct this polylabel - polylabel l = allowed_actions_to_ld(priv, 1, allowed_actions, allowed_actions_cnt, allowed_actions_cost); + polylabel l = + std::move(allowed_actions_to_ld(priv, 1, allowed_actions, allowed_actions_cnt, allowed_actions_cost)); size_t K = cs_get_costs_size(priv.cb_learner, l); for (size_t k = 0; k < K; k++) { @@ -1163,6 +1131,7 @@ action choose_oracle_action(search_private& priv, size_t ec_cnt, const action* o assert(priv.memo_foreach_action.size() == priv.meta_t + priv.t - 1); priv.memo_foreach_action.push_back(this_cache); cdbg << "memo_foreach_action[" << priv.meta_t + priv.t - 1 << "] = " << this_cache << " from oracle" << endl; + priv.allowed_actions_cache = std::move(l); } return a; } @@ -1173,22 +1142,23 @@ action single_prediction_notLDF(search_private& priv, example& ec, int policy, c // appropriate cost for that action { vw& all = *priv.all; - polylabel old_label = ec.l; - bool need_partial_predictions = need_memo_foreach_action(priv) || + auto old_label = std::move(ec.l); + ec.l.reset(); + const bool need_partial_predictions = need_memo_foreach_action(priv) || (priv.metaoverride && priv.metaoverride->_foreach_action) || (override_action != (action)-1) || priv.active_csoaa; 
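      // Label save/restore around the base predict call: ec.l was moved out
      // and reset above; it now receives an allowed-actions (cs) label for
      // the prediction, and the original label is moved back at the end of
      // the function. reset() must precede init_as_*, since init_as_*
      // requires an unset tag.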
if ((allowed_actions_cnt > 0) || need_partial_predictions)
-    ec.l = allowed_actions_to_ld(priv, 1, allowed_actions, allowed_actions_cnt, allowed_actions_cost);
+    ec.l = std::move(allowed_actions_to_ld(priv, 1, allowed_actions, allowed_actions_cnt, allowed_actions_cost));
   else
-    ec.l.cs = priv.empty_cs_label;
+    ec.l.init_as_cs() = priv.empty_cs_label;
 
   cdbg << "allowed_actions_cnt=" << allowed_actions_cnt << ", ec.l = [";
-  for (size_t i = 0; i < ec.l.cs.costs.size(); i++)
-    cdbg << ' ' << ec.l.cs.costs[i].class_index << ':' << ec.l.cs.costs[i].x;
+  for (size_t i = 0; i < ec.l.cs().costs.size(); i++)
+    cdbg << ' ' << ec.l.cs().costs[i].class_index << ':' << ec.l.cs().costs[i].x;
   cdbg << " ]" << endl;
 
   as_singleline(priv.base_learner)->predict(ec, policy);
-  uint32_t act = ec.pred.multiclass;
+  uint32_t act = ec.pred.multiclass();
   cdbg << "a=" << act << " from";
   if (allowed_actions)
   {
@@ -1215,7 +1185,6 @@
     if (need_memo_foreach_action(priv) && (override_action == (action)-1))
     {
       this_cache = new v_array<action_cache>();
-      *this_cache = v_init<action_cache>();
     }
     for (size_t k = 0; k < K; k++)
     {
@@ -1264,22 +1233,24 @@
     while (priv.active_known.size() <= cur_t)
     {
       priv.active_known.push_back(v_array<std::pair<CS::wclass, bool>>());
-      priv.active_known[priv.active_known.size() - 1] = v_init<std::pair<CS::wclass, bool>>();
+      priv.active_known[priv.active_known.size() - 1] = v_array<std::pair<CS::wclass, bool>>();
       cdbg << "active_known length now " << priv.active_known.size() << endl;
     }
     priv.active_known[cur_t].clear();
-    assert(ec.l.cs.costs.size() > 0);
-    for (size_t k = 0; k < ec.l.cs.costs.size(); k++)
+    assert(ec.l.cs().costs.size() > 0);
+    for (size_t k = 0; k < ec.l.cs().costs.size(); k++)
    {
-      /* priv.active_known[cur_t].push_back( ec.l.cs.costs[k].pred_is_certain
-                                             ? ec.l.cs.costs[k].partial_prediction
+      /* priv.active_known[cur_t].push_back( ec.l.cs().costs[k].pred_is_certain
+                                             ?
ec.l.cs().costs[k].partial_prediction
                                             : FLT_MAX );
         cdbg << "active_known[" << cur_t << "][" << (priv.active_known[cur_t].size() -
-      1) << "] = certain=" << ec.l.cs.costs[k].pred_is_certain << ", cost=" << ec.l.cs.costs[k].partial_prediction <<
+      1) << "] = certain=" << ec.l.cs().costs[k].pred_is_certain << ", cost=" <<
+      ec.l.cs().costs[k].partial_prediction <<
       "}" << endl; */
-      CS::wclass& wc = ec.l.cs.costs[k];
+      CS::wclass& wc = ec.l.cs().costs[k];
       // Get query_needed from pred
-      bool query_needed = v_array_contains(ec.pred.multilabels.label_v, wc.class_index);
+      bool query_needed = std::find(ec.pred.multilabels().label_v.cbegin(), ec.pred.multilabels().label_v.cend(),
+          wc.class_index) != ec.pred.multilabels().label_v.cend();
       std::pair<CS::wclass, bool> p = {wc, query_needed};
       // Push into active_known[cur_t] with wc
       priv.active_known[cur_t].push_back(p);
@@ -1287,8 +1258,8 @@ action single_prediction_notLDF(search_private& priv, example& ec, int policy, c
      // << ':' << wc.x << " pp=" << wc.partial_prediction << " query_needed=" << wc.query_needed << " max_pred=" <<
      // wc.max_pred << " min_pred=" << wc.min_pred << " is_range_overlapped=" << wc.is_range_overlapped << "
      // is_range_large=" << wc.is_range_large << endl;
-      // query_needed=" << ec.l.cs.costs[k].query_needed << ", cost=" << ec.l.cs.costs[k].partial_prediction << "}" <<
-      // endl;
+      // query_needed=" << ec.l.cs().costs[k].query_needed << ", cost=" << ec.l.cs().costs[k].partial_prediction << "}"
+      // << endl;
     }
   }
 
@@ -1306,7 +1277,12 @@ action single_prediction_notLDF(search_private& priv, example& ec, int policy, c
     all.print_text_by_ref(all.raw_prediction, priv.rawOutputStringStream->str(), ec.tag);
   }
 
-  ec.l = old_label;
+  if ((allowed_actions_cnt > 0) || need_partial_predictions)
+  {
+    priv.allowed_actions_cache = std::move(ec.l);
+  }
+
+  ec.l = std::move(old_label);
 
   priv.total_predictions_made++;
   priv.num_features += ec.num_features;
@@ -1321,7 +1297,7 @@ action single_prediction_LDF(search_private& priv, example* ecs, size_t ec_cnt,
   bool need_partial_predictions = need_memo_foreach_action(priv) ||
       (priv.metaoverride && priv.metaoverride->_foreach_action) || (override_action != (action)-1);
 
-  CS::cs_label.default_label(&priv.ldf_test_label);
+  CS::default_label(priv.ldf_test_label);
   CS::wclass wc = {0., 1, 0., 0.};
   priv.ldf_test_label.costs.push_back(wc);
 
@@ -1335,7 +1311,6 @@
   if (need_partial_predictions)
   {
     this_cache = new v_array<action_cache>();
-    *this_cache = v_init<action_cache>();
   }
 
   for (action a = (uint32_t)start_K; a < ec_cnt; a++)
   {
     if (start_K > 0)
       LabelDict::add_example_namespaces_from_example(ecs[a], ecs[0]);
 
-    polylabel old_label = ecs[a].l;
-    ecs[a].l.cs = priv.ldf_test_label;
+    polylabel old_label = std::move(ecs[a].l);
+    ecs[a].l.reset();
+    ecs[a].l.init_as_cs() = priv.ldf_test_label;
+    if (ecs[a].pred.get_type() == prediction_type_t::unset)
+    {
+      ecs[a].pred.init_as_multiclass();
+    }
 
     multi_ex tmp;
     uint64_t old_offset = ecs[a].ft_offset;
@@ -1371,7 +1351,7 @@
       this_cache->push_back(action_cache(0., a, false, ecs[a].partial_prediction));
 
     priv.num_features += ecs[a].num_features;
-    ecs[a].l = old_label;
+    ecs[a].l = std::move(old_label);
     if (start_K > 0)
       LabelDict::del_example_namespaces_from_example(ecs[a], ecs[0]);
   }
@@ -1394,7 +1374,6 @@ action single_prediction_LDF(search_private& priv, example*
ecs, size_t ec_cnt, priv.memo_foreach_action.push_back(this_cache); else { - this_cache->delete_v(); delete this_cache; } } @@ -1504,15 +1483,16 @@ void generate_training_example(search_private& priv, polylabel& losses, float we if (priv.cb_learner) { if (min_loss == FLT_MAX) - for (size_t i = 0; i < losses.cb.costs.size(); i++) min_loss = std::min(min_loss, losses.cb.costs[i].cost); - for (size_t i = 0; i < losses.cb.costs.size(); i++) losses.cb.costs[i].cost = losses.cb.costs[i].cost - min_loss; + for (size_t i = 0; i < losses.cb().costs.size(); i++) min_loss = std::min(min_loss, losses.cb().costs[i].cost); + for (size_t i = 0; i < losses.cb().costs.size(); i++) + losses.cb().costs[i].cost = losses.cb().costs[i].cost - min_loss; } else { if (min_loss == FLT_MAX) - for (size_t i = 0; i < losses.cs.costs.size(); i++) min_loss = std::min(min_loss, losses.cs.costs[i].x); - for (size_t i = 0; i < losses.cs.costs.size(); i++) - losses.cs.costs[i].x = (losses.cs.costs[i].x - min_loss) * weight; + for (size_t i = 0; i < losses.cs().costs.size(); i++) min_loss = std::min(min_loss, losses.cs().costs[i].x); + for (size_t i = 0; i < losses.cs().costs.size(); i++) + losses.cs().costs[i].x = (losses.cs().costs[i].x - min_loss) * weight; } // std::cerr << "losses = ["; for (size_t i=0; iaudit, priv.learn_ec_copy.begin() + i, ecs + i, label_size, label_copy_fn); + for (size_t i = 0; i < ec_cnt; i++) priv.learn_ec_copy[i] = ecs[i]; priv.learn_ec_ref = priv.learn_ec_copy.begin(); } @@ -1908,8 +1884,8 @@ action search_predict(search_private& priv, example* ecs, size_t ec_cnt, ptag my allowed_actions_to_label(priv, ec_cnt, allowed_actions, allowed_actions_cnt, allowed_actions_cost, oracle_actions, oracle_actions_cnt, priv.gte_label); cdbg << "priv.gte_label = ["; - for (size_t i = 0; i < priv.gte_label.cs.costs.size(); i++) - cdbg << ' ' << priv.gte_label.cs.costs[i].class_index << ':' << priv.gte_label.cs.costs[i].x; + for (size_t i = 0; i < priv.gte_label.cs().costs.size(); i++) + cdbg << ' ' << priv.gte_label.cs().costs[i].class_index << ':' << priv.gte_label.cs().costs[i].x; cdbg << " ]" << endl; priv.learn_ec_ref = ecs; @@ -2002,7 +1978,7 @@ void get_training_timesteps(search_private& priv, v_array& timesteps) timesteps.push_back(priv.active_uncertainty[i].second - 1); /* float k = (float)priv.total_examples_generated; - priv.ec_seq[t]->revert_weight = priv.all->loss->getRevertingWeight(priv.all->sd, priv.ec_seq[t].pred.scalar, + priv.ec_seq[t]->revert_weight = priv.all->loss->getRevertingWeight(priv.all->sd, priv.ec_seq[t].pred.scalar(), priv.all->eta / powf(k, priv.all->power_t)); float importance = query_decision(active_str, *priv.ec_seq[t], k); if (importance > 0.) 
timesteps.push_back(pair(0,t)); */ @@ -2045,7 +2021,7 @@ void get_training_timesteps(search_private& priv, v_array& timesteps) while ((timesteps.size() < (size_t)priv.subsample_timesteps) && (timesteps.size() < priv.T)) { size_t t = (size_t)(priv._random_state->get_and_update_random() * (float)priv.T); - if (!v_array_contains(timesteps, t)) + if (std::find(timesteps.cbegin(), timesteps.cend(), t) == timesteps.cend()) timesteps.push_back(t); } std::sort(timesteps.begin(), timesteps.end(), cmp_size_t); @@ -2065,7 +2041,6 @@ struct final_item void free_final_item(final_item* p) { - p->prefix->delete_v(); delete p->prefix; delete p; } @@ -2171,7 +2146,7 @@ void advance_from_known_actions(search_private& priv) priv.active_known[t][priv.learn_a_idx], true); */ - priv.learn_losses.cs.costs.push_back(priv.active_known[t][priv.learn_a_idx].first); + priv.learn_losses.cs().costs.push_back(priv.active_known[t][priv.learn_a_idx].first); cdbg << " --> adding " << priv.learn_a_idx << ":" << priv.active_known[t][priv.learn_a_idx].first.x << endl; priv.learn_a_idx++; advance_from_known_actions(priv); @@ -2265,9 +2240,9 @@ void train_single_example(search& sch, bool is_test_ex, bool is_holdout_ex, mult } if (priv.cb_learner) - priv.learn_losses.cb.costs.clear(); + priv.learn_losses.cb().costs.clear(); else - priv.learn_losses.cs.costs.clear(); + priv.learn_losses.cs().costs.clear(); for (size_t tid = 0; tid < priv.timesteps.size(); tid++) { @@ -2313,8 +2288,8 @@ void train_single_example(search& sch, bool is_test_ex, bool is_holdout_ex, mult // priv.learn_loss); } if (priv.active_csoaa_verify > 0.) - verify_active_csoaa( - priv.learn_losses.cs, priv.active_known[priv.learn_t], ec_seq[0]->example_counter, priv.active_csoaa_verify); + verify_active_csoaa(priv.learn_losses.cs(), priv.active_known[priv.learn_t], ec_seq[0]->example_counter, + priv.active_csoaa_verify); if (skipped_all_actions) { @@ -2335,7 +2310,7 @@ void train_single_example(search& sch, bool is_test_ex, bool is_holdout_ex, mult { for (size_t i = 0; i < priv.learn_allowed_actions.size(); i++) { - priv.learn_losses.cs.costs[i].class_index = priv.learn_allowed_actions[i]; + priv.learn_losses.cs().costs[i].class_index = priv.learn_allowed_actions[i]; } } // float min_loss = 0.; @@ -2343,22 +2318,23 @@ void train_single_example(search& sch, bool is_test_ex, bool is_holdout_ex, mult // for (size_t aid=0; aidsize(); aid++) // min_loss = std::min(min_loss, priv.memo_foreach_action[tid]->get(aid).cost); cdbg << "priv.learn_losses = ["; - for (auto& wc : priv.learn_losses.cs.costs) cdbg << " " << wc.class_index << ":" << wc.x; + for (auto& wc : priv.learn_losses.cs().costs) cdbg << " " << wc.class_index << ":" << wc.x; cdbg << " ]" << endl; cdbg << "gte" << endl; generate_training_example(priv, priv.learn_losses, 1., true); // , min_loss); // TODO: weight - if (!priv.examples_dont_change) - for (size_t n = 0; n < priv.learn_ec_copy.size(); n++) - { - if (sch.priv->is_ldf) - CS::cs_label.delete_label(&priv.learn_ec_copy[n].l.cs); - else - MC::mc_label.delete_label(&priv.learn_ec_copy[n].l.multi); - } + // Should not be needed anymore + // if (!priv.examples_dont_change) + // for (size_t n = 0; n < priv.learn_ec_copy.size(); n++) + // { + // if (sch.priv->is_ldf) + // CS::cs_label.delete_label(priv.learn_ec_copy[n].l); + // else + // MC::mc_label.delete_label(priv.learn_ec_copy[n].l); + // } if (priv.cb_learner) - priv.learn_losses.cb.costs.clear(); + priv.learn_losses.cb().costs.clear(); else - priv.learn_losses.cs.costs.clear(); + 
priv.learn_losses.cs().costs.clear();
   }
 
   if (priv.active_csoaa && (priv.save_every_k_runs > 1))
@@ -2491,7 +2467,7 @@ void end_examples(search& sch)
   }
 }
 
-bool mc_label_is_test(polylabel& lab) { return MC::mc_label.test_label(&lab.multi); }
+bool mc_label_is_test(polylabel& lab) { return MC::mc_label.test_label(lab); }
 
 void search_initialize(vw* all, search& sch)
 {
@@ -2529,10 +2505,7 @@ void search_initialize(vw* all, search& sch)
   sch.task_data = nullptr;
 
-  priv.active_uncertainty = v_init<std::pair<float, size_t>>();
-  priv.active_known = v_init<v_array<std::pair<CS::wclass, bool>>>();
-
-  CS::cs_label.default_label(&priv.empty_cs_label);
+  CS::default_label(priv.empty_cs_label);
 
   new (&priv.rawOutputString) std::string();
   priv.rawOutputStringStream = new std::stringstream(priv.rawOutputString);
@@ -2610,11 +2583,11 @@ v_array<CS::label> read_allowed_transitions(action A, const char* filename)
   }
   fclose(f);
 
-  v_array<CS::label> allowed = v_init<CS::label>();
+  v_array<CS::label> allowed;
 
   for (size_t from = 0; from < A; from++)
   {
-    v_array<CS::wclass> costs = v_init<CS::wclass>();
+    v_array<CS::wclass> costs;
 
     for (size_t to = 0; to < A; to++)
       if (bg[from * (A + 1) + to])
@@ -2799,20 +2772,19 @@ base_learner* setup(options_i& options, vw& all)
     THROW("error: --search_rollin must be 'learn', 'ref', 'mix' or 'mix_per_state'");
 
   // check if the base learner is contextual bandit, in which case we don't roll out all actions.
-  priv.allowed_actions_cache = &calloc_or_throw<polylabel>();
   if (options.was_supplied("cb"))
   {
     priv.cb_learner = true;
-    CB::cb_label.default_label(priv.allowed_actions_cache);
-    priv.learn_losses.cb.costs = v_init<CB::cb_class>();
-    priv.gte_label.cb.costs = v_init<CB::cb_class>();
+    CB::default_label(priv.allowed_actions_cache.init_as_cb());
+    priv.learn_losses.init_as_cb().costs = v_array<CB::cb_class>();
+    priv.gte_label.init_as_cb().costs = v_array<CB::cb_class>();
   }
   else
   {
     priv.cb_learner = false;
-    CS::cs_label.default_label(priv.allowed_actions_cache);
-    priv.learn_losses.cs.costs = v_init<CS::wclass>();
-    priv.gte_label.cs.costs = v_init<CS::wclass>();
+    CS::default_label(priv.allowed_actions_cache.init_as_cs());
+    priv.learn_losses.init_as_cs().costs = v_array<CS::wclass>();
+    priv.gte_label.init_as_cs().costs = v_array<CS::wclass>();
   }
 
   ensure_param(priv.beta, 0.0, 1.0, 0.5, "warning: search_beta must be in (0,1); resetting to 0.5");
@@ -2911,7 +2883,6 @@ base_learner* setup(options_i& options, vw& all)
 
   // default to OAA labels unless the task wants to override this (which they can do in initialize)
   all.p->lp = MC::mc_label;
-  all.label_type = label_type_t::mc;
   if (priv.task && priv.task->initialize)
     priv.task->initialize(*sch.get(), priv.A, options);
   if (priv.metatask && priv.metatask->initialize)
@@ -2942,6 +2913,28 @@ base_learner* setup(options_i& options, vw& all)
   l.set_end_examples(end_examples);
   l.set_finish(search_finish);
   l.set_end_pass(end_pass);
+
+  // In search, tasks can define which label should be used. There isn't a great
+  // way to do this right now; currently the only usage is for cost sensitive.
+  // So we check at this point whether the label parser is either multiclass or
+  // cost sensitive, and throw in any other case, as it is not supported yet.
+  // TODO: improve the handling of tasks specifying label types.
+  if (all.p->lp.parse_label == COST_SENSITIVE::cs_label.parse_label)
+  {
+    l.label_type = label_type_t::cs;
+    l.pred_type = prediction_type_t::multiclass;
+  }
+  else if (all.p->lp.parse_label == MC::mc_label.parse_label)
+  {
+    l.label_type = label_type_t::multi;
+    l.pred_type = prediction_type_t::multiclass;
+  }
+  else
+  {
+    THROW(
+        "Only multi and cost sensitive are supported in search right now. 
To support more, please add another check " + "for label types.") + } return make_base(l); } @@ -3021,7 +3014,7 @@ action search::predictLDF(example* ecs, size_t ec_cnt, ptag mytag, const action* // beyond the end of the array (usually resulting in a segfault at some point.) size_t action_index = a - COST_SENSITIVE::ec_is_example_header(ecs[0]) ? 0 : 1; - if ((mytag != 0) && ecs[action_index].l.cs.costs.size() > 0) + if ((mytag != 0) && ecs[action_index].l.cs().costs.size() > 0) { if (mytag < priv->ptag_to_action.size()) { @@ -3032,7 +3025,7 @@ action search::predictLDF(example* ecs, size_t ec_cnt, ptag mytag, const action* priv->ptag_to_action[mytag].repr = nullptr; } } - push_at(priv->ptag_to_action, action_repr(ecs[a].l.cs.costs[0].class_index, &(priv->last_action_repr)), mytag); + push_at(priv->ptag_to_action, action_repr(ecs[a].l.cs().costs[0].class_index, &(priv->last_action_repr)), mytag); } if (priv->auto_hamming_loss) loss(action_hamming_loss(a, oracle_actions, oracle_actions_cnt)); // TODO: action costs @@ -3085,7 +3078,7 @@ void search::set_label_parser(label_parser& lp, bool (*is_test)(polylabel&)) if (this->priv->all->vw_is_main && (this->priv->state != INITIALIZE)) std::cerr << "warning: task should not set label parser except in initialize function!" << endl; this->priv->all->p->lp = lp; - this->priv->all->p->lp.test_label = (bool (*)(void*))is_test; + this->priv->all->p->lp.test_label = is_test; this->priv->label_is_test = is_test; } @@ -3121,23 +3114,8 @@ void search::set_force_oracle(bool force) { this->priv->force_oracle = force; } // predictor implementation predictor::predictor(search& sch, ptag my_tag) - : is_ldf(false) - , my_tag(my_tag) - , ec(nullptr) - , ec_cnt(0) - , ec_alloced(false) - , weight(1.) - , oracle_is_pointer(false) - , allowed_is_pointer(false) - , allowed_cost_is_pointer(false) - , learner_id(0) - , sch(sch) -{ - oracle_actions = v_init(); - condition_on_tags = v_init(); - condition_on_names = v_init(); - allowed_actions = v_init(); - allowed_actions_cost = v_init(); + : is_ldf(false), my_tag(my_tag), ec(nullptr), ec_cnt(0), ec_alloced(false), weight(1.), learner_id(0), sch(sch) +{ } void predictor::free_ec() @@ -3145,30 +3123,14 @@ void predictor::free_ec() if (ec_alloced) { if (is_ldf) - for (size_t i = 0; i < ec_cnt; i++) - { - VW::dealloc_example(CS::cs_label.delete_label, ec[i]); - } + for (size_t i = 0; i < ec_cnt; i++) ec[i].~example(); else - { - VW::dealloc_example(nullptr, *ec); - } + ec->~example(); free(ec); } } -predictor::~predictor() -{ - if (!oracle_is_pointer) - oracle_actions.delete_v(); - if (!allowed_is_pointer) - allowed_actions.delete_v(); - if (!allowed_cost_is_pointer) - allowed_actions_cost.delete_v(); - free_ec(); - condition_on_tags.delete_v(); - condition_on_names.delete_v(); -} +predictor::~predictor() { free_ec(); } predictor& predictor::reset() { this->erase_oracles(); @@ -3224,8 +3186,8 @@ void predictor::set_input_at(size_t posn, example& ex) if (posn >= ec_cnt) THROW("call to set_input_at with too large a position: posn (" << posn << ") >= ec_cnt(" << ec_cnt << ")"); - VW::copy_example_data( - false, ec + posn, &ex, CS::cs_label.label_size, CS::cs_label.copy_label); // TODO: the false is "audit" + // Copy given example into ec. 
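  // (example's copy-assignment now performs the deep copy, label included,
  // replacing the old VW::copy_example_data call and its explicit
  // label_size/copy_label callbacks.)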
+ ec[posn] = ex; } template @@ -3240,96 +3202,52 @@ void predictor::make_new_pointer(v_array& A, size_t new_size) } template -predictor& predictor::add_to(v_array& A, bool& A_is_ptr, T a, bool clear_first) +predictor& predictor::add_to(v_array& destination, T action, bool clear_first) { - if (A_is_ptr) // we need to make our own memory - { - if (clear_first) - A.end() = A.begin(); - size_t new_size = clear_first ? 1 : (A.size() + 1); - make_new_pointer(A, new_size); - A_is_ptr = false; - A[new_size - 1] = a; - } - else // we've already allocated our own memory + if (clear_first) { - if (clear_first) - A.clear(); - A.push_back(a); + destination.clear(); } + destination.push_back(action); + return *this; } template -predictor& predictor::add_to(v_array& A, bool& A_is_ptr, T* a, size_t count, bool clear_first) +predictor& predictor::add_to(v_array& destination, T* source, size_t count, bool clear_first) { - size_t old_size = A.size(); - if (old_size > 0) + if (clear_first) { - if (A_is_ptr) // we need to make our own memory - { - if (clear_first) - { - A.end() = A.begin(); - old_size = 0; - } - size_t new_size = old_size + count; - make_new_pointer(A, new_size); - A_is_ptr = false; - if (a != nullptr) - memcpy(A.begin() + old_size, a, count * sizeof(T)); - } - else // we already have our own memory - { - if (clear_first) - A.clear(); - if (a != nullptr) - push_many(A, a, count); - } + destination.clear(); } - else // old_size == 0, clear_first is irrelevant + // TODO uncomment this + // destination.reserve(destination.size() + count); + for (size_t i = 0; i < count; i++) { - if (!A_is_ptr) - A.delete_v(); // avoid memory leak - - A.begin() = a; - if (a != nullptr) // a is not nullptr - A.end() = a + count; - else - A.end() = a; - A.end_array = A.end(); - A_is_ptr = true; + destination.push_back(source[i]); } + return *this; } predictor& predictor::erase_oracles() { - if (oracle_is_pointer) - oracle_actions.end() = oracle_actions.begin(); - else - oracle_actions.clear(); + oracle_actions.clear(); return *this; } -predictor& predictor::add_oracle(action a) { return add_to(oracle_actions, oracle_is_pointer, a, false); } +predictor& predictor::add_oracle(action a) { return add_to(oracle_actions, a, false); } predictor& predictor::add_oracle(action* a, size_t action_count) { - return add_to(oracle_actions, oracle_is_pointer, a, action_count, false); -} -predictor& predictor::add_oracle(v_array& a) -{ - return add_to(oracle_actions, oracle_is_pointer, a.begin(), a.size(), false); + return add_to(oracle_actions, a, action_count, false); } +predictor& predictor::add_oracle(v_array& a) { return add_to(oracle_actions, a.begin(), a.size(), false); } -predictor& predictor::set_oracle(action a) { return add_to(oracle_actions, oracle_is_pointer, a, true); } +predictor& predictor::set_oracle(action a) { return add_to(oracle_actions, a, true); } predictor& predictor::set_oracle(action* a, size_t action_count) { - return add_to(oracle_actions, oracle_is_pointer, a, action_count, true); -} -predictor& predictor::set_oracle(v_array& a) -{ - return add_to(oracle_actions, oracle_is_pointer, a.begin(), a.size(), true); + return add_to(oracle_actions, a, action_count, true); } +predictor& predictor::set_oracle(v_array& a) { return add_to(oracle_actions, a.begin(), a.size(), true); } predictor& predictor::set_weight(float w) { @@ -3339,53 +3257,50 @@ predictor& predictor::set_weight(float w) predictor& predictor::erase_alloweds() { - if (allowed_is_pointer) - allowed_actions.end() = allowed_actions.begin(); - else - 
allowed_actions.clear(); - if (allowed_cost_is_pointer) - allowed_actions_cost.end() = allowed_actions_cost.begin(); - else - allowed_actions_cost.clear(); + allowed_actions.clear(); + allowed_actions_cost.clear(); return *this; } -predictor& predictor::add_allowed(action a) { return add_to(allowed_actions, allowed_is_pointer, a, false); } +predictor& predictor::add_allowed(action a) { return add_to(allowed_actions, a, false); } predictor& predictor::add_allowed(action* a, size_t action_count) { - return add_to(allowed_actions, allowed_is_pointer, a, action_count, false); -} -predictor& predictor::add_allowed(v_array& a) -{ - return add_to(allowed_actions, allowed_is_pointer, a.begin(), a.size(), false); + return add_to(allowed_actions, a, action_count, false); } +predictor& predictor::add_allowed(v_array& a) { return add_to(allowed_actions, a.begin(), a.size(), false); } -predictor& predictor::set_allowed(action a) { return add_to(allowed_actions, allowed_is_pointer, a, true); } +predictor& predictor::set_allowed(action a) { return add_to(allowed_actions, a, true); } predictor& predictor::set_allowed(action* a, size_t action_count) { - return add_to(allowed_actions, allowed_is_pointer, a, action_count, true); -} -predictor& predictor::set_allowed(v_array& a) -{ - return add_to(allowed_actions, allowed_is_pointer, a.begin(), a.size(), true); + return add_to(allowed_actions, a, action_count, true); } +predictor& predictor::set_allowed(v_array& a) { return add_to(allowed_actions, a.begin(), a.size(), true); } predictor& predictor::add_allowed(action a, float cost) { - add_to(allowed_actions_cost, allowed_cost_is_pointer, cost, false); - return add_to(allowed_actions, allowed_is_pointer, a, false); + add_to(allowed_actions_cost, cost, false); + return add_to(allowed_actions, a, false); } predictor& predictor::add_allowed(action* a, float* costs, size_t action_count) { - add_to(allowed_actions_cost, allowed_cost_is_pointer, costs, action_count, false); - return add_to(allowed_actions, allowed_is_pointer, a, action_count, false); + // In sequence task this function is used with a being nullptr, but costs is valid. + // So we need to check if we can do the adds. 
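  // (set_allowed(action* a, float* costs, size_t action_count) below applies
  // the same nullptr guard for that usage.)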
+ if (costs != nullptr) + { + add_to(allowed_actions_cost, costs, action_count, false); + } + if (a != nullptr) + { + add_to(allowed_actions, a, action_count, false); + } + return *this; } predictor& predictor::add_allowed(v_array>& a) { for (size_t i = 0; i < a.size(); i++) { - add_to(allowed_actions, allowed_is_pointer, a[i].first, false); - add_to(allowed_actions_cost, allowed_cost_is_pointer, a[i].second, false); + add_to(allowed_actions, a[i].first, false); + add_to(allowed_actions_cost, a[i].second, false); } return *this; } @@ -3393,22 +3308,31 @@ predictor& predictor::add_allowed(std::vector>& a) { for (size_t i = 0; i < a.size(); i++) { - add_to(allowed_actions, allowed_is_pointer, a[i].first, false); - add_to(allowed_actions_cost, allowed_cost_is_pointer, a[i].second, false); + add_to(allowed_actions, a[i].first, false); + add_to(allowed_actions_cost, a[i].second, false); } return *this; } predictor& predictor::set_allowed(action a, float cost) { - add_to(allowed_actions_cost, allowed_cost_is_pointer, cost, true); - return add_to(allowed_actions, allowed_is_pointer, a, true); + add_to(allowed_actions_cost, cost, true); + return add_to(allowed_actions, a, true); } predictor& predictor::set_allowed(action* a, float* costs, size_t action_count) { - add_to(allowed_actions_cost, allowed_cost_is_pointer, costs, action_count, true); - return add_to(allowed_actions, allowed_is_pointer, a, action_count, true); + // In sequence task this function is used with a being nullptr, but costs is valid. + // So we need to check if we can do the adds. + if (costs != nullptr) + { + add_to(allowed_actions_cost, costs, action_count, true); + } + if (a != nullptr) + { + add_to(allowed_actions, a, action_count, true); + } + return *this; } predictor& predictor::set_allowed(v_array>& a) { diff --git a/vowpalwabbit/search.h b/vowpalwabbit/search.h index 2ee4f4980a5..f3981930589 100644 --- a/vowpalwabbit/search.h +++ b/vowpalwabbit/search.h @@ -335,22 +335,19 @@ class predictor bool ec_alloced; float weight; v_array oracle_actions; - bool oracle_is_pointer; // if we're pointing to your memory TRUE; if it's our own memory FALSE v_array condition_on_tags; v_array condition_on_names; v_array allowed_actions; - bool allowed_is_pointer; // if we're pointing to your memory TRUE; if it's our own memory FALSE v_array allowed_actions_cost; - bool allowed_cost_is_pointer; // if we're pointing to your memory TRUE; if it's our own memory FALSE size_t learner_id; search& sch; template void make_new_pointer(v_array& A, size_t new_size); template - predictor& add_to(v_array& A, bool& A_is_ptr, T a, bool clear_first); + predictor& add_to(v_array& A, T a, bool clear_first); template - predictor& add_to(v_array& A, bool& A_is_ptr, T* a, size_t count, bool clear_first); + predictor& add_to(v_array& A, T* a, size_t count, bool clear_first); void free_ec(); // prevent the user from doing something stupid :) ... 
ugh needed to turn this off for python :( diff --git a/vowpalwabbit/search_dep_parser.cc b/vowpalwabbit/search_dep_parser.cc index a0a89913879..279b17d1311 100644 --- a/vowpalwabbit/search_dep_parser.cc +++ b/vowpalwabbit/search_dep_parser.cc @@ -28,9 +28,9 @@ struct task_data v_array valid_actions, action_loss, gold_heads, gold_tags, stack, heads, tags, temp, valid_action_temp; v_array gold_actions, gold_action_temp; v_array> gold_action_losses; - v_array children[6]; // [0]:num_left_arcs, [1]:num_right_arcs; [2]: leftmost_arc, [3]: second_leftmost_arc, - // [4]:rightmost_arc, [5]: second_rightmost_arc - example *ec_buf[13]; + std::array, 6> children; // [0]:num_left_arcs, [1]:num_right_arcs; [2]: leftmost_arc, [3]: second_leftmost_arc, + // [4]:rightmost_arc, [5]: second_rightmost_arc + std::array ec_buf; bool old_style_labels; bool cost_to_go, one_learner; uint32_t transition_system; @@ -51,7 +51,7 @@ void initialize(Search::search &sch, size_t & /*num_actions*/, options_i &option vw &all = sch.get_vw_pointer_unsafe(); task_data *data = new task_data(); data->action_loss.resize(5); - data->ex = NULL; + data->ex = nullptr; sch.set_task_data(data); option_group_definition new_options("Dependency Parser Options"); @@ -74,11 +74,12 @@ void initialize(Search::search &sch, size_t & /*num_actions*/, options_i &option make_option("old_style_labels", data->old_style_labels).keep().help("Use old hack of label information")); options.add_and_parse(new_options); - data->ex = VW::alloc_examples(sizeof(polylabel), 1); + data->ex = VW::alloc_examples(1); data->ex->indices.push_back(val_namespace); for (size_t i = 1; i < 14; i++) data->ex->indices.push_back((unsigned char)i + 'A'); data->ex->indices.push_back(constant_namespace); data->ex->interactions = &sch.get_vw_pointer_unsafe().interactions; + data->ex->pred.init_as_multiclass(); if (data->one_learner) sch.set_num_learners(1); @@ -101,27 +102,15 @@ void initialize(Search::search &sch, size_t & /*num_actions*/, options_i &option else sch.set_options(AUTO_CONDITION_FEATURES | NO_CACHING); - sch.set_label_parser(COST_SENSITIVE::cs_label, [](polylabel &l) -> bool { return l.cs.costs.size() == 0; }); + sch.set_label_parser(COST_SENSITIVE::cs_label, [](polylabel &l) -> bool { return l.cs().costs.size() == 0; }); } void finish(Search::search &sch) { - task_data *data = sch.get_task_data(); - data->valid_actions.delete_v(); - data->valid_action_temp.delete_v(); - data->gold_heads.delete_v(); - data->gold_tags.delete_v(); - data->stack.delete_v(); - data->heads.delete_v(); - data->tags.delete_v(); - data->temp.delete_v(); - data->action_loss.delete_v(); - data->gold_actions.delete_v(); - data->gold_action_losses.delete_v(); - data->gold_action_temp.delete_v(); - VW::dealloc_example(COST_SENSITIVE::cs_label.delete_label, *data->ex); + task_data* data = sch.get_task_data(); + + data->ex->~example(); free(data->ex); - for (size_t i = 0; i < 6; i++) data->children[i].delete_v(); delete data; } @@ -152,9 +141,12 @@ void inline reset_ex(example *ex) size_t transition_hybrid(Search::search &sch, uint64_t a_id, uint32_t idx, uint32_t t_id, uint32_t /* n */) { task_data *data = sch.get_task_data(); - v_array &heads = data->heads, &stack = data->stack, &gold_heads = data->gold_heads, - &gold_tags = data->gold_tags, &tags = data->tags; - v_array *children = data->children; + v_array& heads = data->heads; + v_array& stack = data->stack; + v_array& gold_heads = data->gold_heads; + v_array& gold_tags = data->gold_tags; + v_array& tags = data->tags; + auto& children = 
data->children; if (a_id == SHIFT) { stack.push_back(idx); @@ -195,9 +187,12 @@ size_t transition_hybrid(Search::search &sch, uint64_t a_id, uint32_t idx, uint3 size_t transition_eager(Search::search &sch, uint64_t a_id, uint32_t idx, uint32_t t_id, uint32_t n) { task_data *data = sch.get_task_data(); - v_array &heads = data->heads, &stack = data->stack, &gold_heads = data->gold_heads, - &gold_tags = data->gold_tags, &tags = data->tags; - v_array *children = data->children; + v_array& heads = data->heads; + v_array& stack = data->stack; + v_array& gold_heads = data->gold_heads; + v_array& gold_tags = data->gold_tags; + v_array& tags = data->tags; + auto& children = data->children; if (a_id == SHIFT) { stack.push_back(idx); @@ -247,8 +242,11 @@ void extract_features(Search::search &sch, uint32_t idx, multi_ex &ec) uint64_t mask = sch.get_mask(); uint64_t multiplier = (uint64_t)all.wpp << all.weights.stride_shift(); - v_array &stack = data->stack, &tags = data->tags, *children = data->children, &temp = data->temp; - example **ec_buf = data->ec_buf; + v_array& stack = data->stack; + v_array& tags = data->tags; + auto& children = data->children; + v_array& temp = data->temp; + example** ec_buf = data->ec_buf.data(); example &ex = *(data->ex); size_t n = ec.size(); @@ -379,8 +377,10 @@ bool is_valid(uint64_t action, const v_array& valid_actions) void get_eager_action_cost(Search::search &sch, uint32_t idx, uint64_t n) { task_data *data = sch.get_task_data(); - v_array &action_loss = data->action_loss, &stack = data->stack, &gold_heads = data->gold_heads, - heads = data->heads; + v_array& action_loss = data->action_loss; + v_array& stack = data->stack; + v_array& gold_heads = data->gold_heads; + v_array& heads = data->heads; size_t size = stack.size(); size_t last = (size == 0) ? 0 : stack.last(); for (size_t i = 1; i <= 4; i++) action_loss[i] = 0; @@ -455,7 +455,8 @@ void get_cost_to_go_losses(Search::search &sch, v_array task_data *data = sch.get_task_data(); bool &one_learner = data->one_learner; uint32_t &sys = data->transition_system; - v_array &action_loss = data->action_loss, &valid_actions = data->valid_actions; + v_array& action_loss = data->action_loss; + v_array& valid_actions = data->valid_actions; uint32_t &num_label = data->num_label; gold_action_losses.clear(); @@ -487,8 +488,10 @@ void get_cost_to_go_losses(Search::search &sch, v_array void get_gold_actions(Search::search &sch, uint32_t idx, uint64_t /* n */, v_array &gold_actions) { task_data *data = sch.get_task_data(); - v_array &action_loss = data->action_loss, &stack = data->stack, &gold_heads = data->gold_heads, - &valid_actions = data->valid_actions; + v_array& action_loss = data->action_loss; + v_array& stack = data->stack; + v_array& gold_heads = data->gold_heads; + v_array& valid_actions = data->valid_actions; gold_actions.clear(); size_t size = stack.size(); size_t last = (size == 0) ? 
0 : stack.last(); @@ -554,8 +557,10 @@ void convert_to_onelearner_actions(Search::search &sch, v_array &actions void setup(Search::search &sch, multi_ex &ec) { task_data *data = sch.get_task_data(); - v_array &gold_heads = data->gold_heads, &heads = data->heads, &gold_tags = data->gold_tags, - &tags = data->tags; + v_array& gold_heads = data->gold_heads; + v_array& heads = data->heads; + v_array& gold_tags = data->gold_tags; + v_array& tags = data->tags; size_t n = ec.size(); heads.resize(n + 1); tags.resize(n + 1); @@ -565,7 +570,7 @@ void setup(Search::search &sch, multi_ex &ec) gold_tags.push_back(0); for (size_t i = 0; i < n; i++) { - const auto& costs = ec[i]->l.cs.costs; + const auto& costs = ec[i]->l.cs().costs; uint32_t head, tag; if (data->old_style_labels) { diff --git a/vowpalwabbit/search_entityrelationtask.cc b/vowpalwabbit/search_entityrelationtask.cc index 0ce16573a36..50f2607e91c 100644 --- a/vowpalwabbit/search_entityrelationtask.cc +++ b/vowpalwabbit/search_entityrelationtask.cc @@ -75,11 +75,11 @@ void initialize(Search::search& sch, size_t& /*num_actions*/, options_i& options } else { - example* ldf_examples = VW::alloc_examples(sizeof(CS::label), 10); + example* ldf_examples = VW::alloc_examples(10); CS::wclass default_wclass = {0., 0, 0., 0.}; for (size_t a = 0; a < 10; a++) { - ldf_examples[a].l.cs.costs.push_back(default_wclass); + ldf_examples[a].l.cs().costs.push_back(default_wclass); ldf_examples[a].interactions = &sch.get_vw_pointer_unsafe().interactions; } my_task_data->ldf_entity = ldf_examples; @@ -95,11 +95,10 @@ void initialize(Search::search& sch, size_t& /*num_actions*/, options_i& options void finish(Search::search& sch) { task_data* my_task_data = sch.get_task_data(); - my_task_data->y_allowed_entity.delete_v(); - my_task_data->y_allowed_relation.delete_v(); if (my_task_data->search_order == 3) { - for (size_t a = 0; a < 10; a++) VW::dealloc_example(CS::cs_label.delete_label, my_task_data->ldf_entity[a]); + for (size_t a = 0; a < 10; a++) + my_task_data->ldf_entity[a].~example(); free(my_task_data->ldf_entity); } delete my_task_data; @@ -145,8 +144,8 @@ size_t predict_entity( size_t prediction; if (my_task_data->allow_skip) { - v_array star_labels = v_init(); - star_labels.push_back(ex->l.multi.label); + v_array star_labels; + star_labels.push_back(ex->l.multi().label); star_labels.push_back(LABEL_SKIP); my_task_data->y_allowed_entity.push_back(LABEL_SKIP); prediction = Search::predictor(sch, my_tag) @@ -165,7 +164,7 @@ size_t predict_entity( { VW::copy_example_data(false, &my_task_data->ldf_entity[a], ex); update_example_indicies(true, &my_task_data->ldf_entity[a], 28904713, 4832917 * (uint64_t)(a + 1)); - CS::label& lab = my_task_data->ldf_entity[a].l.cs; + CS::label& lab = my_task_data->ldf_entity[a].l.cs(); lab.costs[0].x = 0.f; lab.costs[0].class_index = a; lab.costs[0].partial_prediction = 0.f; @@ -173,7 +172,7 @@ size_t predict_entity( } prediction = Search::predictor(sch, my_tag) .set_input(my_task_data->ldf_entity, 4) - .set_oracle(ex->l.multi.label - 1) + .set_oracle(ex->l.multi().label - 1) .set_learner_id(1) .predict() + 1; @@ -182,7 +181,7 @@ size_t predict_entity( { prediction = Search::predictor(sch, my_tag) .set_input(*ex) - .set_oracle(ex->l.multi.label) + .set_oracle(ex->l.multi().label) .set_allowed(my_task_data->y_allowed_entity) .set_learner_id(0) .predict(); @@ -195,7 +194,7 @@ size_t predict_entity( { loss = my_task_data->skip_cost; } - else if (prediction != ex->l.multi.label) + else if (prediction != ex->l.multi().label) loss = 
my_task_data->entity_cost; sch.loss(loss); return prediction; @@ -207,7 +206,7 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi task_data* my_task_data = sch.get_task_data(); size_t hist[2]; decode_tag(ex->tag, type, id1, id2); - v_array constrained_relation_labels = v_init(); + v_array constrained_relation_labels; if (my_task_data->constraints && predictions[id1] != 0 && predictions[id2] != 0) { hist[0] = predictions[id1]; @@ -228,8 +227,8 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi size_t prediction; if (my_task_data->allow_skip) { - v_array star_labels = v_init(); - star_labels.push_back(ex->l.multi.label); + v_array star_labels; + star_labels.push_back(ex->l.multi().label); star_labels.push_back(LABEL_SKIP); constrained_relation_labels.push_back(LABEL_SKIP); prediction = Search::predictor(sch, my_tag) @@ -252,12 +251,12 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi VW::copy_example_data(false, &my_task_data->ldf_relation[a], ex); update_example_indicies( true, &my_task_data->ldf_relation[a], 28904713, 4832917 * (uint64_t)(constrained_relation_labels[a])); - CS::label& lab = my_task_data->ldf_relation[a].l.cs; + CS::label& lab = my_task_data->ldf_relation[a].l.cs(); lab.costs[0].x = 0.f; lab.costs[0].class_index = constrained_relation_labels[a]; lab.costs[0].partial_prediction = 0.f; lab.costs[0].wap_value = 0.f; - if (constrained_relation_labels[a] == ex->l.multi.label) + if (constrained_relation_labels[a] == ex->l.multi().label) { correct_label = (int)a; } @@ -273,7 +272,7 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi { prediction = Search::predictor(sch, my_tag) .set_input(*ex) - .set_oracle(ex->l.multi.label) + .set_oracle(ex->l.multi().label) .set_allowed(constrained_relation_labels) .set_learner_id(1) .predict(); @@ -285,9 +284,9 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi { loss = my_task_data->skip_cost; } - else if (prediction != ex->l.multi.label) + else if (prediction != ex->l.multi().label) { - if (ex->l.multi.label == R_NONE) + if (ex->l.multi().label == R_NONE) { loss = my_task_data->relation_none_cost; } @@ -297,7 +296,6 @@ size_t predict_relation(Search::search& sch, example* ex, v_array& predi } } sch.loss(loss); - constrained_relation_labels.delete_v(); return prediction; } @@ -407,7 +405,7 @@ void run(Search::search& sch, multi_ex& ec) { task_data* my_task_data = sch.get_task_data(); - v_array predictions = v_init(); + v_array predictions; for (size_t i = 0; i < ec.size(); i++) { predictions.push_back(0); @@ -436,7 +434,6 @@ void run(Search::search& sch, multi_ex& ec) if (sch.output().good()) sch.output() << predictions[i] << ' '; } - predictions.delete_v(); } // this is totally bogus for the example -- you'd never actually do this! 
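// (Illustration of the hack below: update_example_indicies presumably remaps
// every feature hash in place, roughly idx = idx * mult_amount + plus_amount,
// so that each LDF candidate built from the same example lands in its own
// feature space.)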
void update_example_indicies(bool /* audit */, example* ec, uint64_t mult_amount, uint64_t plus_amount) diff --git a/vowpalwabbit/search_graph.cc b/vowpalwabbit/search_graph.cc index dc7d7a9f546..dffa91e3634 100644 --- a/vowpalwabbit/search_graph.cc +++ b/vowpalwabbit/search_graph.cc @@ -89,7 +89,7 @@ struct task_data float true_counts_total; }; -inline bool example_is_test(polylabel& l) { return l.cs.costs.size() == 0; } +inline bool example_is_test(polylabel& l) { return l.cs().costs.size() == 0; } void initialize(Search::search& sch, size_t& num_actions, options_i& options) { @@ -140,7 +140,7 @@ void finish(Search::search& sch) delete D; } -inline bool example_is_edge(example* e) { return e->l.cs.costs.size() > 1; } +inline bool example_is_edge(example* e) { return e->l.cs().costs.size() > 1; } void run_bfs(task_data& D, multi_ex& ec) { @@ -158,9 +158,9 @@ void run_bfs(task_data& D, multi_ex& ec) { uint32_t n = D.bfs[i]; for (size_t id : D.adj[n]) - for (size_t j = 0; j < ec[id]->l.cs.costs.size(); j++) + for (size_t j = 0; j < ec[id]->l.cs().costs.size(); j++) { - uint32_t m = ec[id]->l.cs.costs[j].class_index; + uint32_t m = ec[id]->l.cs().costs[j].class_index; if ((m > 0) && (!touched[m - 1])) { D.bfs.push_back(m - 1); @@ -200,9 +200,9 @@ void setup(Search::search& sch, multi_ex& ec) THROW("error: got a node after getting edges!"); D.N++; - if (ec[i]->l.cs.costs.size() > 0) + if (ec[i]->l.cs().costs.size() > 0) { - D.true_counts[ec[i]->l.cs.costs[0].class_index] += 1.; + D.true_counts[ec[i]->l.cs().costs[0].class_index] += 1.; D.true_counts_total += 1.; } } @@ -214,15 +214,15 @@ void setup(Search::search& sch, multi_ex& ec) for (size_t i = D.N; i < ec.size(); i++) { - for (size_t n = 0; n < ec[i]->l.cs.costs.size(); n++) + for (size_t n = 0; n < ec[i]->l.cs().costs.size(); n++) { - if (ec[i]->l.cs.costs[n].class_index > D.N) - THROW("error: edge source points to too large of a node id: " << (ec[i]->l.cs.costs[n].class_index) << " > " + if (ec[i]->l.cs().costs[n].class_index > D.N) + THROW("error: edge source points to too large of a node id: " << (ec[i]->l.cs().costs[n].class_index) << " > " << D.N); } - for (size_t n = 0; n < ec[i]->l.cs.costs.size(); n++) + for (size_t n = 0; n < ec[i]->l.cs().costs.size(); n++) { - size_t nn = ec[i]->l.cs.costs[n].class_index; + size_t nn = ec[i]->l.cs().costs[n].class_index; if ((nn > 0) && (((D.adj[nn - 1].size() == 0) || (D.adj[nn - 1][D.adj[nn - 1].size() - 1] != i)))) // don't allow dups D.adj[nn - 1].push_back(i); @@ -280,9 +280,9 @@ void add_edge_features(Search::search& sch, task_data& D, size_t n, multi_ex& ec { bool n_in_sink = true; if (D.directed) - for (size_t j = 0; j < ec[i]->l.cs.costs.size() - 1; j++) + for (size_t j = 0; j < ec[i]->l.cs().costs.size() - 1; j++) { - size_t m = ec[i]->l.cs.costs[j].class_index; + size_t m = ec[i]->l.cs().costs[j].class_index; if (m == 0) break; if (m - 1 == n) @@ -293,15 +293,15 @@ void add_edge_features(Search::search& sch, task_data& D, size_t n, multi_ex& ec } bool m_in_sink = false; - for (size_t j = 0; j < ec[i]->l.cs.costs.size(); j++) + for (size_t j = 0; j < ec[i]->l.cs().costs.size(); j++) { - size_t m = ec[i]->l.cs.costs[j].class_index; + size_t m = ec[i]->l.cs().costs[j].class_index; if (m == 0) { m_in_sink = true; continue; } - if (j == ec[i]->l.cs.costs.size() - 1) + if (j == ec[i]->l.cs().costs.size() - 1) m_in_sink = true; m--; if (m == n) @@ -411,7 +411,7 @@ void run(Search::search& sch, multi_ex& ec) for (int n_id = start; n_id != end; n_id += step) { uint32_t n = D.bfs[n_id]; - 
uint32_t k = (ec[n]->l.cs.costs.size() > 0) ? ec[n]->l.cs.costs[0].class_index : 0; + uint32_t k = (ec[n]->l.cs().costs.size() > 0) ? ec[n]->l.cs().costs[0].class_index : 0; bool add_features = /* D.use_structure && */ sch.predictNeedsExample(); // add_features = false; @@ -437,9 +437,9 @@ void run(Search::search& sch, multi_ex& ec) // add all the conditioning for (size_t i = 0; i < D.adj[n].size(); i++) { - for (size_t j = 0; j < ec[i]->l.cs.costs.size(); j++) + for (size_t j = 0; j < ec[i]->l.cs().costs.size(); j++) { - uint32_t m = ec[i]->l.cs.costs[j].class_index; + uint32_t m = ec[i]->l.cs().costs[j].class_index; if (m == 0) continue; m--; @@ -451,15 +451,15 @@ void run(Search::search& sch, multi_ex& ec) // make the prediction D.pred[n] = P.predict(); - if (ec[n]->l.cs.costs.size() > 0) // for test examples - sch.loss((ec[n]->l.cs.costs[0].class_index == D.pred[n]) ? 0.f : (last_loop ? 0.5f : loss_val)); + if (ec[n]->l.cs().costs.size() > 0) // for test examples + sch.loss((ec[n]->l.cs().costs[0].class_index == D.pred[n]) ? 0.f : (last_loop ? 0.5f : loss_val)); if (add_features) del_edge_features(D, n, ec); } } - for (uint32_t n = 0; n < D.N; n++) D.confusion_matrix[IDX(ec[n]->l.cs.costs[0].class_index, D.pred[n])]++; + for (uint32_t n = 0; n < D.N; n++) D.confusion_matrix[IDX(ec[n]->l.cs().costs[0].class_index, D.pred[n])]++; sch.loss(1.f - macro_f(D)); if (sch.output().good()) diff --git a/vowpalwabbit/search_meta.cc b/vowpalwabbit/search_meta.cc index a34c1c4fe0a..4284612dda0 100644 --- a/vowpalwabbit/search_meta.cc +++ b/vowpalwabbit/search_meta.cc @@ -69,17 +69,11 @@ struct task_data std::stringstream* kbest_out; task_data(size_t mb, size_t kb) : max_branches(mb), kbest(kb) { - branches = v_init(); - final = v_init >(); - trajectory = v_init(); output_string = nullptr; kbest_out = nullptr; } ~task_data() { - branches.delete_v(); - final.delete_v(); - trajectory.delete_v(); delete output_string; delete kbest_out; } @@ -125,7 +119,7 @@ void run(Search::search& sch, multi_ex& ec) return; // ignore the taken action task_data& d = *sch.get_metatask_data(); float delta = a_cost - min_cost; - path branch = v_init(); + path branch; push_many(branch, d.trajectory.begin(), d.trajectory.size()); branch.push_back(std::make_pair(a, a_cost)); d.branches.push_back(std::make_pair(delta, branch)); @@ -147,7 +141,7 @@ void run(Search::search& sch, multi_ex& ec) { // construct the final trajectory - path original_final = v_init(); + path original_final; copy_array(original_final, d.trajectory); d.final.push_back(std::make_pair(std::make_pair(d.total_cost, original_final), d.output_string)); } @@ -189,7 +183,7 @@ void run(Search::search& sch, multi_ex& ec) { // construct the final trajectory - path this_final = v_init(); + path this_final; copy_array(this_final, d.trajectory); d.final.push_back(std::make_pair(std::make_pair(d.total_cost, this_final), d.output_string)); } @@ -237,11 +231,9 @@ void run(Search::search& sch, multi_ex& ec) .Run(); // clean up memory - for (size_t i = 0; i < d.branches.size(); i++) d.branches[i].second.delete_v(); d.branches.clear(); for (size_t i = 0; i < d.final.size(); i++) { - d.final[i].first.second.delete_v(); delete d.final[i].second; } d.final.clear(); diff --git a/vowpalwabbit/search_multiclasstask.cc b/vowpalwabbit/search_multiclasstask.cc index ad44bd71164..8c151d28654 100644 --- a/vowpalwabbit/search_multiclasstask.cc +++ b/vowpalwabbit/search_multiclasstask.cc @@ -32,14 +32,13 @@ void initialize(Search::search& sch, size_t& num_actions, VW::config::options_i& 
void finish(Search::search& sch) { task_data* my_task_data = sch.get_task_data(); - my_task_data->y_allowed.delete_v(); delete my_task_data; } void run(Search::search& sch, multi_ex& ec) { task_data* my_task_data = sch.get_task_data(); - size_t gold_label = ec[0]->l.multi.label; + size_t gold_label = ec[0]->l.multi().label; size_t label = 0; size_t learner_id = 0; diff --git a/vowpalwabbit/search_sequencetask.cc b/vowpalwabbit/search_sequencetask.cc index 2d5789da2fc..afebd196ff7 100644 --- a/vowpalwabbit/search_sequencetask.cc +++ b/vowpalwabbit/search_sequencetask.cc @@ -42,7 +42,7 @@ void run(Search::search& sch, multi_ex& ec) Search::predictor P(sch, (ptag)0); for (size_t i = 0; i < ec.size(); i++) { - action oracle = ec[i]->l.multi.label; + action oracle = ec[i]->l.multi().label; size_t prediction = P.set_tag((ptag)i + 1) .set_input(*ec[i]) .set_oracle(oracle) @@ -96,9 +96,9 @@ void convert_bio_to_bilou(multi_ex& ec) { for (size_t n = 0; n < ec.size(); n++) { - MULTICLASS::label_t& ylab = ec[n]->l.multi; + MULTICLASS::label_t& ylab = ec[n]->l.multi(); action y = ylab.label; - action nexty = (n == ec.size() - 1) ? 0 : ec[n + 1]->l.multi.label; + action nexty = (n == ec.size() - 1) ? 0 : ec[n + 1]->l.multi().label; if (y == 1) // do nothing ; else if (y % 2 == 0) // this is a begin-X @@ -179,8 +179,6 @@ void initialize(Search::search& sch, size_t& num_actions, options_i& options) void finish(Search::search& sch) { task_data* D = sch.get_task_data(); - D->allowed_actions.delete_v(); - D->only_two_allowed.delete_v(); delete D; } @@ -198,7 +196,7 @@ void takedown(Search::search& sch, multi_ex& ec) if (D.encoding == BILOU) for (size_t n = 0; n < ec.size(); n++) { - MULTICLASS::label_t ylab = ec[n]->l.multi; + MULTICLASS::label_t ylab = ec[n]->l.multi(); ylab.label = bilou_to_bio(ylab.label); } } @@ -213,7 +211,7 @@ void run(Search::search& sch, multi_ex& ec) action last_prediction = 1; for (size_t i = 0; i < ec.size(); i++) { - action oracle = ec[i]->l.multi.label; + action oracle = ec[i]->l.multi().label; size_t len = y_allowed->size(); P.set_tag((ptag)i + 1); P.set_learner_id(pass - 1); @@ -286,7 +284,7 @@ void run(Search::search& sch, multi_ex& ec) Search::predictor P(sch, (ptag)0); for (size_t i = 0; i < ec.size(); i++) { - action oracle = ec[i]->l.multi.label; + action oracle = ec[i]->l.multi().label; for (size_t k = 0; k < K; k++) costs[k] = 1.; costs[oracle - 1] = 0.; size_t prediction = P.set_tag((ptag)i + 1) @@ -343,12 +341,12 @@ void run(Search::search& sch, multi_ex& ec) uint32_t max_prediction = 1; uint32_t max_label = 1; - for (size_t i = 0; i < ec.size(); i++) max_label = std::max(ec[i]->l.multi.label, max_label); + for (size_t i = 0; i < ec.size(); i++) max_label = std::max(ec[i]->l.multi().label, max_label); for (ptag i = 0; i < ec.size(); i++) { // labels should be 1 or 2, and our output is MAX of all predicted values - uint32_t oracle = D.predict_max ? max_label : ec[i]->l.multi.label; + uint32_t oracle = D.predict_max ? 
max_label : ec[i]->l.multi().label; uint32_t prediction = sch.predict(*ec[i], i + 1, &oracle, 1, &i, "p"); max_prediction = std::max(prediction, max_prediction); @@ -378,12 +376,12 @@ void initialize(Search::search& sch, size_t& num_actions, options_i& /*options*/ { CS::wclass default_wclass = {0., 0, 0., 0.}; - example* ldf_examples = VW::alloc_examples(sizeof(CS::label), num_actions); + example* ldf_examples = VW::alloc_examples(num_actions); for (size_t a = 0; a < num_actions; a++) { - CS::label& lab = ldf_examples[a].l.cs; - CS::cs_label.default_label(&lab); - lab.costs.push_back(default_wclass); + auto& l = ldf_examples[a].l; + CS::cs_label.default_label(l); + l.cs().costs.push_back(default_wclass); ldf_examples[a].interactions = &sch.get_vw_pointer_unsafe().interactions; } @@ -400,7 +398,8 @@ void initialize(Search::search& sch, size_t& num_actions, options_i& /*options*/ void finish(Search::search& sch) { task_data* data = sch.get_task_data(); - for (size_t a = 0; a < data->num_actions; a++) VW::dealloc_example(CS::cs_label.delete_label, data->ldf_examples[a]); + for (size_t a = 0; a < data->num_actions; a++) + data->ldf_examples[a].~example(); free(data->ldf_examples); free(data); } @@ -430,7 +429,7 @@ void run(Search::search& sch, multi_ex& ec) } // regardless of whether the example is needed or not, the class info is needed - CS::label& lab = data->ldf_examples[a].l.cs; + CS::label& lab = data->ldf_examples[a].l.cs(); // need to tell search what the action id is, so that it can add history features correctly! lab.costs[0].x = 0.; lab.costs[0].class_index = a + 1; @@ -438,7 +437,7 @@ void run(Search::search& sch, multi_ex& ec) lab.costs[0].wap_value = 0.; } - action oracle = ec[i]->l.multi.label - 1; + action oracle = ec[i]->l.multi().label - 1; action pred_id = P.set_tag((ptag)(i + 1)) .set_input(data->ldf_examples, data->num_actions) .set_oracle(oracle) diff --git a/vowpalwabbit/sender.cc b/vowpalwabbit/sender.cc index e37196fa8c6..a00df3e98c5 100644 --- a/vowpalwabbit/sender.cc +++ b/vowpalwabbit/sender.cc @@ -38,8 +38,6 @@ struct sender ~sender() { - buf->files.delete_v(); - buf->space.delete_v(); free(delay_ring); delete buf; } @@ -72,21 +70,22 @@ void receive_result(sender& s) get_prediction(s.sd, res, weight); example& ec = *s.delay_ring[s.received_index++ % s.all->p->ring_size]; - ec.pred.scalar = res; + ec.pred.scalar() = res; - label_data& ld = ec.l.simple; - ec.loss = s.all->loss->getLoss(s.all->sd, ec.pred.scalar, ld.label) * ec.weight; + label_data& ld = ec.l.simple(); + ec.loss = s.all->loss->getLoss(s.all->sd, ec.pred.scalar(), ld.label) * ec.weight; - return_simple_example(*(s.all), nullptr, ec); + return_simple_example_explicit(*(s.all), ec); } void learn(sender& s, LEARNER::single_learner&, example& ec) { + assert(ec.pred.get_type() == prediction_type_t::scalar); if (s.received_index + s.all->p->ring_size / 2 - 1 == s.sent_index) receive_result(s); - s.all->set_minmax(s.all->sd, ec.l.simple.label); - s.all->p->lp.cache_label(&ec.l, *s.buf); // send label information. + s.all->set_minmax(s.all->sd, ec.l.simple().label); + s.all->p->lp.cache_label(ec.l, *s.buf); // send label information. 
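Note: the ec.pred.scalar -> ec.pred.scalar() and ec.l.simple -> ec.l.simple() rewrites that dominate this diff replace raw union member access with checked accessors on a tagged union, and the assert added to sender's learn() above makes the expected variant explicit at the reduction boundary. A compressed sketch of the shape this implies (hypothetical, simplified type; the real polyprediction carries many more variants and matching init_as_* initializers):

#include <cassert>

enum class prediction_type_t { unset, scalar, action_probs };

struct polyprediction_sketch
{
  float& init_as_scalar()
  {
    assert(_type == prediction_type_t::unset);  // may only initialize from the unset state
    _type = prediction_type_t::scalar;
    return _scalar = 0.f;
  }
  float& scalar()
  {
    assert(_type == prediction_type_t::scalar);  // wrong-variant access is a programming error
    return _scalar;
  }
  prediction_type_t get_type() const { return _type; }

 private:
  prediction_type_t _type = prediction_type_t::unset;
  float _scalar = 0.f;  // the real type holds a union over all prediction variants
};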
cache_tag(*s.buf, ec.tag); send_features(s.buf, ec, (uint32_t)s.all->parse_mask); s.delay_ring[s.sent_index++ % s.all->p->ring_size] = &ec; @@ -124,5 +123,6 @@ LEARNER::base_learner* sender_setup(options_i& options, vw& all) LEARNER::learner& l = init_learner(s, learn, learn, 1); l.set_finish_example(finish_example); l.set_end_examples(end_examples); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/shared_feature_merger.cc b/vowpalwabbit/shared_feature_merger.cc index ce69ec052a2..2a347468c30 100644 --- a/vowpalwabbit/shared_feature_merger.cc +++ b/vowpalwabbit/shared_feature_merger.cc @@ -42,7 +42,8 @@ void predict_or_learn(sfm_data&, LEARNER::multi_learner& base, multi_ex& ec_seq) multi_ex::value_type shared_example = nullptr; - const bool has_example_header = CB::ec_is_example_header(*ec_seq[0]); + const bool has_example_header = CB::ec_is_example_header(*ec_seq[0]) + || COST_SENSITIVE::ec_is_example_header(*ec_seq[0]); if (has_example_header) { shared_example = ec_seq[0]; @@ -75,7 +76,7 @@ LEARNER::base_learner* shared_feature_merger_setup(config::options_i& options, v auto* base = LEARNER::as_multiline(setup_base(options, all)); auto& learner = LEARNER::init_learner(data, base, predict_or_learn, predict_or_learn); - + learner.label_type = base->label_type; // TODO: Incorrect feature numbers will be reported without merging the example namespaces from the // shared example in a finish_example function. However, its too expensive to perform the full operation. diff --git a/vowpalwabbit/simple_label.cc b/vowpalwabbit/simple_label.cc index f647efa43e2..539858f4975 100644 --- a/vowpalwabbit/simple_label.cc +++ b/vowpalwabbit/simple_label.cc @@ -6,32 +6,32 @@ #include #include #include - +#include "vw_string_view.h" #include "cache.h" #include "accumulate.h" #include "best_constant.h" #include "vw_string_view.h" -char* bufread_simple_label(shared_data* sd, label_data* ld, char* c) +char* bufread_simple_label(shared_data* sd, label_data& ld, char* c) { - memcpy(&ld->label, c, sizeof(ld->label)); - // std::cout << ld->label << " " << sd->is_more_than_two_labels_observed << " " << sd->first_observed_label << + memcpy(&ld.label, c, sizeof(ld.label)); + // std::cout << ld.label << " " << sd->is_more_than_two_labels_observed << " " << sd->first_observed_label << // std::endl; - c += sizeof(ld->label); - memcpy(&ld->weight, c, sizeof(ld->weight)); - c += sizeof(ld->weight); - memcpy(&ld->initial, c, sizeof(ld->initial)); - c += sizeof(ld->initial); + c += sizeof(ld.label); + memcpy(&ld.weight, c, sizeof(ld.weight)); + c += sizeof(ld.weight); + memcpy(&ld.initial, c, sizeof(ld.initial)); + c += sizeof(ld.initial); - count_label(sd, ld->label); + count_label(sd, ld.label); return c; } -size_t read_cached_simple_label(shared_data* sd, void* v, io_buf& cache) +size_t read_cached_simple_label(shared_data* sd, polylabel& in_ld, io_buf& cache) { - label_data* ld = (label_data*)v; + auto& ld = in_ld.simple(); char* c; - size_t total = sizeof(ld->label) + sizeof(ld->weight) + sizeof(ld->initial); + size_t total = sizeof(ld.label) + sizeof(ld.weight) + sizeof(ld.initial); if (cache.buf_read(c, total) < total) return 0; bufread_simple_label(sd, ld, c); @@ -39,91 +39,99 @@ size_t read_cached_simple_label(shared_data* sd, void* v, io_buf& cache) return total; } -float get_weight(void* v) -{ - label_data* ld = (label_data*)v; - return ld->weight; -} +float get_weight(polylabel& v) { return v.simple().weight; } -char* bufcache_simple_label(label_data* ld, char* c) +char* 
bufcache_simple_label(label_data& ld, char* c) { - memcpy(c, &ld->label, sizeof(ld->label)); - c += sizeof(ld->label); - memcpy(c, &ld->weight, sizeof(ld->weight)); - c += sizeof(ld->weight); - memcpy(c, &ld->initial, sizeof(ld->initial)); - c += sizeof(ld->initial); + memcpy(c, &ld.label, sizeof(ld.label)); + c += sizeof(ld.label); + memcpy(c, &ld.weight, sizeof(ld.weight)); + c += sizeof(ld.weight); + memcpy(c, &ld.initial, sizeof(ld.initial)); + c += sizeof(ld.initial); return c; } -void cache_simple_label(void* v, io_buf& cache) +void cache_simple_label(polylabel& v, io_buf& cache) { char* c; - label_data* ld = (label_data*)v; - cache.buf_write(c, sizeof(ld->label) + sizeof(ld->weight) + sizeof(ld->initial)); + auto& ld = v.simple(); + cache.buf_write(c, sizeof(ld.label) + sizeof(ld.weight) + sizeof(ld.initial)); bufcache_simple_label(ld, c); } -void default_simple_label(void* v) +void default_simple_label(polylabel& v) { - label_data* ld = (label_data*)v; + label_data* ld; + if (v.get_type() == label_type_t::unset) + { + ld = &v.init_as_simple(); + } + else if (v.get_type() == label_type_t::simple) + { + ld = &v.simple(); + } + else + { + v.reset(); + ld = &v.init_as_simple(); + } + ld->label = FLT_MAX; ld->weight = 1.; ld->initial = 0.; } -bool test_label(void* v) +bool test_label(polylabel& v) { - label_data* ld = (label_data*)v; - return ld->label == FLT_MAX; + auto& ld = v.simple(); + return ld.label == FLT_MAX; } -void delete_simple_label(void*) {} - -void parse_simple_label(parser*, shared_data* sd, void* v, v_array& words) +void parse_simple_label(parser*, shared_data* sd, polylabel& v, v_array& words) { - label_data* ld = (label_data*)v; + auto& ld = v.simple(); switch (words.size()) { case 0: break; case 1: - ld->label = float_of_string(words[0]); + ld.label = float_of_string(words[0]); break; case 2: - ld->label = float_of_string(words[0]); - ld->weight = float_of_string(words[1]); + ld.label = float_of_string(words[0]); + ld.weight = float_of_string(words[1]); break; case 3: - ld->label = float_of_string(words[0]); - ld->weight = float_of_string(words[1]); - ld->initial = float_of_string(words[2]); + ld.label = float_of_string(words[0]); + ld.weight = float_of_string(words[1]); + ld.initial = float_of_string(words[2]); break; default: std::cout << "Error: " << words.size() << " is too many tokens for a simple label: "; - for (const auto & word : words) std::cout << word; + for (const auto& word : words) std::cout << word; std::cout << std::endl; } - count_label(sd, ld->label); + count_label(sd, ld.label); } label_parser simple_label = {default_simple_label, parse_simple_label, cache_simple_label, read_cached_simple_label, - delete_simple_label, get_weight, nullptr, test_label, sizeof(label_data)}; + polylabel_delete_label, get_weight, polylabel_copy_label, test_label, sizeof(label_data)}; void print_update(vw& all, example& ec) { if (all.sd->weighted_labeled_examples + all.sd->weighted_unlabeled_examples >= all.sd->dump_interval && !all.quiet && !all.bfgs) { - all.sd->print_update(all.holdout_set_off, all.current_pass, ec.l.simple.label, ec.pred.scalar, ec.num_features, + all.sd->print_update(all.holdout_set_off, all.current_pass, ec.l.simple().label, ec.pred.scalar(), ec.num_features, all.progress_add, all.progress_arg); } } void output_and_account_example(vw& all, example& ec) { - label_data ld = ec.l.simple; + label_data ld = ec.l.simple(); all.sd->update(ec.test_only, ld.label != FLT_MAX, ec.loss, ec.weight, ec.num_features); if (ld.label != FLT_MAX && !ec.test_only) @@ 
-133,13 +141,19 @@ void output_and_account_example(vw& all, example& ec) for (size_t i = 0; i < all.final_prediction_sink.size(); i++) { int f = (int)all.final_prediction_sink[i]; - all.print_by_ref(f, ec.pred.scalar, 0, ec.tag); + all.print_by_ref(f, ec.pred.scalar(), 0, ec.tag); } print_update(all, ec); } -void return_simple_example(vw& all, void*, example& ec) +void return_simple_example_explicit(vw& all, example& ec) +{ + output_and_account_example(all, ec); + VW::finish_example(all, ec); +} + +void return_simple_example(vw& all, polylabel&, example& ec) { output_and_account_example(all, ec); VW::finish_example(all, ec); diff --git a/vowpalwabbit/simple_label.h b/vowpalwabbit/simple_label.h index 231e5246918..3bfb8c22373 100644 --- a/vowpalwabbit/simple_label.h +++ b/vowpalwabbit/simple_label.h @@ -4,6 +4,8 @@ #pragma once #include "label_parser.h" +#include + struct example; struct vw; @@ -12,9 +14,13 @@ struct label_data float label; float weight; float initial; + + label_data() : label(FLT_MAX), weight(0.f), initial(0.f) {} + label_data(float label, float weight, float initial) : label(label), weight(weight), initial(initial) {} }; -void return_simple_example(vw& all, void*, example& ec); +void return_simple_example(vw& all, polylabel&, example& ec); +void return_simple_example_explicit(vw& all, example& ec); extern label_parser simple_label; diff --git a/vowpalwabbit/stagewise_poly.cc b/vowpalwabbit/stagewise_poly.cc index f5867ee7421..58dd9cc98f8 100644 --- a/vowpalwabbit/stagewise_poly.cc +++ b/vowpalwabbit/stagewise_poly.cc @@ -75,8 +75,6 @@ struct stagewise_poly cout << "total feature number (after poly expansion!) = " << sum_sparsity << std::endl; #endif // DEBUG - //synth_ec.feature_space[tree_atomics].delete_v(); - synth_ec.indices.delete_v(); free(sd); free(depthsbits); } @@ -503,12 +501,12 @@ void predict(stagewise_poly &poly, single_learner &base, example &ec) base.predict(poly.synth_ec); ec.partial_prediction = poly.synth_ec.partial_prediction; ec.updated_prediction = poly.synth_ec.updated_prediction; - ec.pred.scalar = poly.synth_ec.pred.scalar; + ec.pred.scalar() = poly.synth_ec.pred.scalar(); } void learn(stagewise_poly &poly, single_learner &base, example &ec) { - bool training = poly.all->training && ec.l.simple.label != FLT_MAX; + bool training = poly.all->training && ec.l.simple().label != FLT_MAX; poly.original_ec = &ec; if (training) @@ -523,7 +521,7 @@ void learn(stagewise_poly &poly, single_learner &base, example &ec) base.learn(poly.synth_ec); ec.partial_prediction = poly.synth_ec.partial_prediction; ec.updated_prediction = poly.synth_ec.updated_prediction; - ec.pred.scalar = poly.synth_ec.pred.scalar; + ec.pred.scalar() = poly.synth_ec.pred.scalar(); if (ec.example_counter // following line is to avoid repeats when multiple reductions on same example. 
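Note: the new default_simple_label above is the template for how label defaulting works against a typed polylabel: reuse the active variant when it already matches, otherwise destroy whatever is live and re-initialize. Condensed sketch against the polylabel interface this diff introduces (ensure_simple is an invented helper name, not part of the PR):

label_data& ensure_simple(polylabel& v)
{
  if (v.get_type() == label_type_t::simple)
    return v.simple();        // already the right variant: reuse in place
  if (v.get_type() != label_type_t::unset)
    v.reset();                // destroy whichever variant is currently live
  return v.init_as_simple();  // construct a fresh simple label
}

default_simple_label then writes FLT_MAX / 1. / 0. into the returned label_data, preserving the old defaults.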
@@ -656,7 +654,7 @@ void save_load(stagewise_poly &poly, io_buf &model_file, bool read, bool text) //#endif //DEBUG } -base_learner *stagewise_poly_setup(options_i &options, vw &all) +base_learner* stagewise_poly_setup(options_i &options, vw &all) { auto poly = scoped_calloc_or_throw(); bool stage_poly = false; @@ -696,10 +694,12 @@ base_learner *stagewise_poly_setup(options_i &options, vw &all) poly->original_ec = nullptr; poly->next_batch_sz = poly->batch_sz; + poly->synth_ec.pred.init_as_scalar(); + learner &l = init_learner(poly, as_singleline(setup_base(options, all)), learn, predict); l.set_save_load(save_load); l.set_finish_example(finish_example); l.set_end_pass(end_pass); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/svrg.cc b/vowpalwabbit/svrg.cc index ef4cf03d3ec..0425c84a8c0 100644 --- a/vowpalwabbit/svrg.cc +++ b/vowpalwabbit/svrg.cc @@ -44,7 +44,7 @@ inline void vec_add(float& p, const float x, float& w) template inline float inline_predict(vw& all, example& ec) { - float acc = ec.l.simple.initial; + float acc = ec.l.simple().initial; GD::foreach_feature >(all, ec, acc); return acc; } @@ -59,12 +59,12 @@ float predict_stable(const svrg& s, example& ec) void predict(svrg& s, single_learner&, example& ec) { ec.partial_prediction = inline_predict(*s.all, ec); - ec.pred.scalar = GD::finalize_prediction(s.all->sd, ec.partial_prediction); + ec.pred.scalar() = GD::finalize_prediction(s.all->sd, ec.partial_prediction); } float gradient_scalar(const svrg& s, const example& ec, float pred) { - return s.all->loss->first_derivative(s.all->sd, pred, ec.l.simple.label) * ec.weight; + return s.all->loss->first_derivative(s.all->sd, pred, ec.l.simple().label) * ec.weight; } // -- Updates, taking inner steps vs. accumulating a full gradient -- @@ -93,7 +93,7 @@ void update_inner(const svrg& s, example& ec) { update u; // |ec| already has prediction according to inner weights. 
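Note: stagewise_poly (above) shows the two obligations this refactor puts on a reduction that owns a synthetic example: the prediction slot must be put into the expected variant before the base learner touches it (poly->synth_ec.pred.init_as_scalar() in setup), and the learner must declare the label variant it consumes (l.label_type = label_type_t::simple). A toy model of why the first matters, using mock types rather than the real VW ones:

#include <cassert>

enum class pred_t { unset, scalar };

struct pred_slot
{
  pred_t type = pred_t::unset;
  float value = 0.f;
  void init_as_scalar() { assert(type == pred_t::unset); type = pred_t::scalar; }
  float& scalar() { assert(type == pred_t::scalar); return value; }
};

struct base_learner_mock
{
  void predict(pred_slot& p) { p.scalar() = 0.5f; }  // writes through the checked accessor
};

int main()
{
  pred_slot synth;
  base_learner_mock base;
  // base.predict(synth);   // would assert: the scalar variant was never initialized
  synth.init_as_scalar();   // what stagewise_poly_setup now does once, up front
  base.predict(synth);      // fine: reads and writes the scalar variant
}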
- u.g_scalar_inner = gradient_scalar(s, ec, ec.pred.scalar); + u.g_scalar_inner = gradient_scalar(s, ec, ec.pred.scalar()); u.g_scalar_stable = gradient_scalar(s, ec, predict_stable(s, ec)); u.eta = s.all->eta; u.norm = (float)s.stable_grad_count; @@ -190,5 +190,6 @@ base_learner* svrg_setup(options_i& options, vw& all) all.weights.stride_shift(2); learner& l = init_learner(s, learn, predict, UINT64_ONE << all.weights.stride_shift()); l.set_save_load(save_load); + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/topk.cc b/vowpalwabbit/topk.cc index 959b0867424..9d88e21f38e 100644 --- a/vowpalwabbit/topk.cc +++ b/vowpalwabbit/topk.cc @@ -43,7 +43,7 @@ void VW::topk::predict(LEARNER::single_learner& base, multi_ex& ec_seq) for (auto ec : ec_seq) { base.predict(*ec); - update_priority_queue(ec->pred.scalar, ec->tag); + update_priority_queue(ec->pred.scalar(), ec->tag); } } @@ -52,7 +52,7 @@ void VW::topk::learn(LEARNER::single_learner& base, multi_ex& ec_seq) for (auto ec : ec_seq) { base.learn(*ec); - update_priority_queue(ec->pred.scalar, ec->tag); + update_priority_queue(ec->pred.scalar(), ec->tag); } } @@ -101,7 +101,7 @@ void print_result(int file_descriptor, std::pairupdate(ec.test_only, ld.label != FLT_MAX, ec.loss, ec.weight, ec.num_features); if (ld.label != FLT_MAX) @@ -142,6 +142,6 @@ LEARNER::base_learner* topk_setup(options_i& options, vw& all) LEARNER::learner& l = init_learner(data, as_singleline(setup_base(options, all)), predict_or_learn, predict_or_learn); l.set_finish_example(finish_example); - + l.label_type = label_type_t::simple; return make_base(l); } diff --git a/vowpalwabbit/util.h b/vowpalwabbit/util.h new file mode 100644 index 00000000000..ec1abdbca9f --- /dev/null +++ b/vowpalwabbit/util.h @@ -0,0 +1,20 @@ +#pragma once + +#include "example.h" +#include "prediction.h" + +inline void swap_to_scores(multi_ex& examples) +{ + for (auto& ex : examples) + { + ex->pred.reinterpret(prediction_type_t::action_scores); + } +} + +inline void swap_to_probs(multi_ex& examples) +{ + for (auto& ex : examples) + { + ex->pred.reinterpret(prediction_type_t::action_probs); + } +} \ No newline at end of file diff --git a/vowpalwabbit/v_array.h b/vowpalwabbit/v_array.h index 48b4ec9a72b..feb795a54a3 100644 --- a/vowpalwabbit/v_array.h +++ b/vowpalwabbit/v_array.h @@ -22,12 +22,28 @@ #endif #include "memory.h" +#include "future_compat.h" const size_t erase_point = ~((1u << 10u) - 1u); template struct v_array { + private: + void delete_v_array() + { + if (_begin != nullptr) + { + for (T* item = _begin; item != _end; ++item) item->~T(); + free(_begin); + } + _begin = nullptr; + _end = nullptr; + end_array = nullptr; + erase_count = 0; + } + + // private: T* _begin; T* _end; @@ -46,12 +62,50 @@ struct v_array inline T* cbegin() const { return _begin; } inline T* cend() const { return _end; } - // v_array cannot have a user-defined constructor, because it participates in various unions. - // union members cannot have user-defined constructors. 
- // v_array() : _begin(nullptr), _end(nullptr), end_array(nullptr), erase_count(0) {} - // ~v_array() { - // delete_v(); - // } + v_array() : _begin(nullptr), _end(nullptr), end_array(nullptr), erase_count(0) {} + ~v_array() { delete_v_array(); } + + v_array(v_array&& other) + { + erase_count = 0; + _begin = nullptr; + _end = nullptr; + end_array = nullptr; + + std::swap(_begin, other._begin); + std::swap(_end, other._end); + std::swap(end_array, other.end_array); + std::swap(erase_count, other.erase_count); + } + + v_array& operator=(v_array&& other) + { + delete_v_array(); + std::swap(_begin, other._begin); + std::swap(_end, other._end); + std::swap(end_array, other.end_array); + std::swap(erase_count, other.erase_count); + return *this; + } + + v_array(const v_array& other) + { + _begin = nullptr; + _end = nullptr; + end_array = nullptr; + erase_count = 0; + + // TODO this should use the other version when T is trivially copyable and this otherwise. + copy_array_no_memcpy(*this, other); + } + + v_array& operator=(const v_array& other) + { + delete_v_array(); + copy_array_no_memcpy(*this, other); + return *this; + } + T last() const { return *(_end - 1); } T pop() { return *(--_end); } bool empty() const { return _begin == _end; } @@ -93,14 +147,10 @@ struct v_array for (T* item = _begin; item != _end; ++item) item->~T(); _end = _begin; } - void delete_v() - { - if (_begin != nullptr) - { - for (T* item = _begin; item != _end; ++item) item->~T(); - free(_begin); - } - _begin = _end = end_array = nullptr; + + VW_DEPRECATED("delete_v is no longer supported. Use the destructor of the object to clean up.") + void delete_v() { + delete_v_array(); } void push_back(const T& new_ele) { @@ -109,7 +159,15 @@ struct v_array new (_end++) T(new_ele); } + void push_back(T&& new_ele) + { + if (_end == end_array) + resize(2 * (end_array - _begin) + 3); + new (_end++) T(std::move(new_ele)); + } + void push_back_unchecked(const T& new_ele) { new (_end++) T(new_ele); } + void push_back_unchecked(T&& new_ele) { new (_end++) T(std::move(new_ele)); } template void emplace_back(Args&&... args) @@ -180,12 +238,26 @@ struct v_array return false; } + + template + friend void copy_array(v_array& dst, const v_array& src); + template + friend void copy_array_no_memcpy(v_array& dst, const v_array& src); + template + friend void copy_array(v_array& dst, const v_array& src, U (*copy_item)(U&)); + template + friend void push_many(v_array& v, const U* _begin, size_t num); + template + friend void calloc_reserve(v_array& v, size_t length); + + friend class io_buf; }; template +VW_DEPRECATED("v_init is no longer supported, use the constructor.") inline v_array v_init() { - return {nullptr, nullptr, nullptr, 0}; + return v_array(); } template @@ -232,18 +304,20 @@ void calloc_reserve(v_array& v, size_t length) } template +VW_DEPRECATED("This performs a copy return and is no longer possible. 
Need to work out a better way here.") v_array pop(v_array >& stack) { - if (stack._end != stack._begin) - return *(--stack._end); + if (stack.end() != stack.begin()) + return *(--stack.end()); else return v_array(); } template +VW_DEPRECATED("Use std::find") bool v_array_contains(v_array& A, T x) { - for (T* e = A._begin; e != A._end; ++e) + for (T* e = A.begin(); e != A.end(); ++e) if (*e == x) return true; return false; @@ -253,7 +327,7 @@ template std::ostream& operator<<(std::ostream& os, const v_array& v) { os << '['; - for (T* i = v._begin; i != v._end; ++i) os << ' ' << *i; + for (const T* i = v.begin(); i != v.end(); ++i) os << ' ' << *i; os << " ]"; return os; } @@ -262,24 +336,7 @@ template std::ostream& operator<<(std::ostream& os, const v_array >& v) { os << '['; - for (std::pair* i = v._begin; i != v._end; ++i) os << ' ' << i->first << ':' << i->second; + for (const std::pair* i = v.begin(); i != v.end(); ++i) os << ' ' << i->first << ':' << i->second; os << " ]"; return os; } - -typedef v_array v_string; - -inline v_string string2v_string(const std::string& s) -{ - v_string res = v_init(); - if (!s.empty()) - push_many(res, (unsigned char*)s.data(), s.size()); - return res; -} - -inline std::string v_string2string(const v_string& v_s) -{ - std::string res; - for (unsigned char* i = v_s._begin; i != v_s._end; ++i) res.push_back(*i); - return res; -} diff --git a/vowpalwabbit/v_array_pool.h b/vowpalwabbit/v_array_pool.h deleted file mode 100644 index f0e615b4217..00000000000 --- a/vowpalwabbit/v_array_pool.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) by respective owners including Yahoo!, Microsoft, and -// individual contributors. All rights reserved. Released under a BSD (revised) -// license as described in the file LICENSE. - -#pragma once - -#include "v_array.h" -#include "object_pool.h" - -namespace VW -{ -template -struct v_array_allocator -{ - v_array operator()() { return v_init(); } -}; - -template -struct v_array_deleter -{ - void operator()(v_array& array) { array.delete_v(); } -}; - -template -using v_array_pool = VW::value_object_pool, v_array_allocator, v_array_deleter>; -} // namespace VW \ No newline at end of file diff --git a/vowpalwabbit/vw.h b/vowpalwabbit/vw.h index ede377c1c08..570d4758574 100644 --- a/vowpalwabbit/vw.h +++ b/vowpalwabbit/vw.h @@ -84,8 +84,13 @@ example* import_example(vw& all, const std::string& label, primitive_feature_spa // thus any delay introduced when freeing examples must be at least as long as the one // introduced by all.l->finish_example implementations. // e.g. multiline examples as used by cb_adf must not be released before the finishing newline example. 
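Note on the vw.h changes that follow: with labels and predictions now self-managing, example no longer needs label-size bookkeeping or a caller-supplied deleter, so alloc_examples(size_t, size_t) and dealloc_example(...) are deprecated in favor of ordinary construction and destruction (the warm_cb destructor later in this diff already uses the explicit-destructor form). Sketch of the migration, assuming example is default-constructible and its destructor tears down any live variants, which is the point of this PR:

// Before: label size threaded through allocation, label-specific deleter at teardown.
example* ec = VW::alloc_examples(CB::cb_label.label_size, 1);
VW::dealloc_example(CB::cb_label.delete_label, *ec);
free(ec);

// After: transitional deprecated overload; the destructor knows how to clean up...
example* ec2 = VW::alloc_examples(1);
ec2->~example();
free(ec2);

// ...or, per the deprecation messages, simply:
example* ec3 = new example;
delete ec3;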
+VW_DEPRECATED("Do not need to specify label size, use new instead") example* alloc_examples(size_t, size_t); -void dealloc_example(void (*delete_label)(void*), example& ec, void (*delete_prediction)(void*) = nullptr); +VW_DEPRECATED("Use new instead") +example* alloc_examples(size_t); + +VW_DEPRECATED("Examples can simply be deleted now.") +void dealloc_example(void (*delete_label)(polylabel&), example& ec, void (*delete_prediction)(void*) = nullptr); void parse_example_label(vw& all, example& ec, std::string label); void setup_examples(vw& all, v_array& examples); @@ -117,7 +122,8 @@ void finish_example(vw& all, example& ec); void finish_example(vw& all, multi_ex& ec); void empty_example(vw& all, example& ec); -void copy_example_data(bool audit, example*, example*, size_t, void (*copy_label)(void*, void*)); +VW_DEPRECATED("Copy the label object directly.") +void copy_example_data(bool audit, example*, example*, size_t, void (*copy_label)(polylabel&, polylabel&)); void copy_example_metadata(bool audit, example*, example*); void copy_example_data(bool audit, example*, example*); // metadata + features, don't copy the label void move_feature_namespace(example* dst, example* src, namespace_index c); diff --git a/vowpalwabbit/vw.vcxproj b/vowpalwabbit/vw.vcxproj index f5f0bc0a3d5..848aabe70ff 100644 --- a/vowpalwabbit/vw.vcxproj +++ b/vowpalwabbit/vw.vcxproj @@ -1,4 +1,4 @@ - + @@ -150,4 +150,4 @@ - \ No newline at end of file + diff --git a/vowpalwabbit/vw_core.vcxproj b/vowpalwabbit/vw_core.vcxproj index 5490691c009..3a1169411e9 100644 --- a/vowpalwabbit/vw_core.vcxproj +++ b/vowpalwabbit/vw_core.vcxproj @@ -28,7 +28,7 @@ v141 $(MSBuildProjectDirectory)\..\sdl\SDL-7.0-Recommended.ruleset - true + false 10.0.16299.0 @@ -159,6 +159,7 @@ + @@ -192,7 +193,7 @@ - + @@ -275,6 +276,7 @@ + diff --git a/vowpalwabbit/warm_cb.cc b/vowpalwabbit/warm_cb.cc index b979e1cb237..88cc10e4a5e 100644 --- a/vowpalwabbit/warm_cb.cc +++ b/vowpalwabbit/warm_cb.cc @@ -11,6 +11,7 @@ #include "hash.h" #include "explore.h" #include "vw_exception.h" +#include "util.h" #include #include @@ -85,31 +86,19 @@ struct warm_cb ~warm_cb() { - CB::cb_label.delete_label(&cb_label); - a_s.delete_v(); + delete[] csls; + delete[] cbls; - for (size_t a = 0; a < num_actions; ++a) + for (auto& ex : ecs) { - COST_SENSITIVE::cs_label.delete_label(&csls[a]); + ex->~example(); + free(ex); } - free(csls); - free(cbls); - for (size_t a = 0; a < num_actions; ++a) + for (auto& ex : ws_vali) { - ecs[a]->pred.a_s.delete_v(); - VW::dealloc_example(CB::cb_label.delete_label, *ecs[a]); - free_it(ecs[a]); - } - - a_s_adf.delete_v(); - for (size_t i = 0; i < ws_vali.size(); ++i) - { - if (use_cs) - VW::dealloc_example(COST_SENSITIVE::cs_label.delete_label, *ws_vali[i]); - else - VW::dealloc_example(MULTICLASS::mc_label.delete_label, *ws_vali[i]); - free(ws_vali[i]); + ex->~example(); + free(ex); } } }; @@ -137,20 +126,10 @@ float loss_cs(warm_cb& data, v_array& costs, uint32_t fi } template -uint32_t find_min(std::vector arr) +uint32_t find_min(std::vector& arr) { - T min_val = FLT_MAX; - uint32_t argmin = 0; - - for (uint32_t i = 0; i < arr.size(); i++) - { - if (arr[i] < min_val) - { - min_val = arr[i]; - argmin = i; - } - } - return argmin; + auto min = std::min_element(arr.begin(), arr.end()); + return static_cast(std::distance(arr.begin(), min)); } void finish(warm_cb& data) @@ -175,8 +154,12 @@ void copy_example_to_adf(warm_cb& data, example& ec) { auto& eca = *data.ecs[a]; // clear label - auto& lab = eca.l.cb; - 
CB::cb_label.default_label(&lab); + CB::default_label(eca.l.cb()); + if (eca.pred.get_type() != prediction_type_t::action_probs) + { + eca.pred.reset(); + eca.pred.init_as_action_probs(); + } // copy data VW::copy_example_data(false, &eca, &ec); @@ -191,7 +174,7 @@ void copy_example_to_adf(warm_cb& data, example& ec) } // avoid empty example by adding a tag (hacky) - if (CB_ALGS::example_is_newline_not_header(eca) && CB::cb_label.test_label(&eca.l)) + if (CB_ALGS::example_is_newline_not_header(eca) && CB::cb_label.test_label(eca.l)) { eca.tag.push_back('n'); } @@ -308,7 +291,7 @@ uint32_t predict_sublearner_adf(warm_cb& data, multi_learner& base, example& ec, { copy_example_to_adf(data, ec); base.predict(data.ecs, i); - return data.ecs[0]->pred.a_s[0].action + 1; + return data.ecs[0]->pred.action_probs()[0].action + 1; } void accumu_costs_iv_adf(warm_cb& data, multi_learner& base, example& ec) @@ -328,13 +311,9 @@ template void add_to_vali(warm_cb& data, example& ec) { // TODO: set the first parameter properly - example* ec_copy = VW::alloc_examples(sizeof(polylabel), 1); - - if (use_cs) - VW::copy_example_data(false, ec_copy, &ec, 0, COST_SENSITIVE::cs_label.copy_label); - else - VW::copy_example_data(false, ec_copy, &ec, 0, MULTICLASS::mc_label.copy_label); - + example* ec_copy = VW::alloc_examples(1); + // Label copy is automatic now -> hence the nullptr + VW::copy_example_data(false, ec_copy, &ec, 0, nullptr); data.ws_vali.push_back(ec_copy); } @@ -355,19 +334,22 @@ void learn_sup_adf(warm_cb& data, example& ec, int ec_type) { csls[a].costs[0].class_index = a + 1; if (use_cs) - csls[a].costs[0].x = loss_cs(data, ec.l.cs.costs, a + 1); + csls[a].costs[0].x = loss_cs(data, ec.l.cs().costs, a + 1); else - csls[a].costs[0].x = loss(data, ec.l.multi.label, a + 1); + csls[a].costs[0].x = loss(data, ec.l.multi().label, a + 1); } for (size_t a = 0; a < data.num_actions; ++a) { - cbls[a] = data.ecs[a]->l.cb; - data.ecs[a]->l.cs = csls[a]; + cbls[a] = std::move(data.ecs[a]->l.cb()); + data.ecs[a]->l.reset(); + data.ecs[a]->l.init_as_cs(std::move(csls[a])); } std::vector old_weights; for (size_t a = 0; a < data.num_actions; ++a) old_weights.push_back(data.ecs[a]->weight); + swap_to_scores(data.ecs); + for (uint32_t i = 0; i < data.choices_lambda; i++) { float weight_multiplier = compute_weight_multiplier(data, i, ec_type); @@ -376,9 +358,16 @@ void learn_sup_adf(warm_cb& data, example& ec, int ec_type) cs_learner->learn(data.ecs, i); } + swap_to_probs(data.ecs); + for (size_t a = 0; a < data.num_actions; ++a) data.ecs[a]->weight = old_weights[a]; - for (size_t a = 0; a < data.num_actions; ++a) data.ecs[a]->l.cb = cbls[a]; + for (size_t a = 0; a < data.num_actions; ++a) + { + csls[a] = std::move(data.ecs[a]->l.cs()); + data.ecs[a]->l.reset(); + data.ecs[a]->l.init_as_cb(std::move(cbls[a])); + } } template @@ -389,7 +378,7 @@ void predict_or_learn_sup_adf(warm_cb& data, multi_learner& base, example& ec, i if (ind_update(data, ec_type)) learn_sup_adf(data, ec, ec_type); - ec.pred.multiclass = action; + ec.pred.multiclass() = action; } uint32_t predict_bandit_adf(warm_cb& data, multi_learner& base, example& ec) @@ -401,12 +390,12 @@ uint32_t predict_bandit_adf(warm_cb& data, multi_learner& base, example& ec) auto& out_ec = *data.ecs[0]; uint32_t chosen_action; - if (sample_after_normalizing(data.app_seed + data.example_counter++, begin_scores(out_ec.pred.a_s), - end_scores(out_ec.pred.a_s), chosen_action)) + if (sample_after_normalizing(data.app_seed + data.example_counter++, 
begin_scores(out_ec.pred.action_probs()), + end_scores(out_ec.pred.action_probs()), chosen_action)) THROW("Failed to sample from pdf"); auto& a_s = data.a_s_adf; - copy_array(a_s, out_ec.pred.a_s); + copy_array(a_s, out_ec.pred.action_probs()); return chosen_action; } @@ -417,7 +406,7 @@ void learn_bandit_adf(warm_cb& data, multi_learner& base, example& ec, int ec_ty // add cb label to chosen action auto& cl = data.cl_adf; - auto& lab = data.ecs[cl.action - 1]->l.cb; + auto& lab = data.ecs[cl.action - 1]->l.cb(); lab.costs.push_back(cl); std::vector old_weights; @@ -447,9 +436,9 @@ void predict_or_learn_bandit_adf(warm_cb& data, multi_learner& base, example& ec THROW("No action with non-zero probability found!"); if (use_cs) - cl.cost = loss_cs(data, ec.l.cs.costs, cl.action); + cl.cost = loss_cs(data, ec.l.cs().costs, cl.action); else - cl.cost = loss(data, ec.l.multi.label, cl.action); + cl.cost = loss(data, ec.l.multi().label, cl.action); if (ec_type == INTERACTION) accumu_costs_iv_adf(data, base, ec); @@ -457,7 +446,7 @@ void predict_or_learn_bandit_adf(warm_cb& data, multi_learner& base, example& ec if (ind_update(data, ec_type)) learn_bandit_adf(data, base, ec, ec_type); - ec.pred.multiclass = cl.action; + ec.pred.multiclass() = cl.action; } void accumu_var_adf(warm_cb& data, multi_learner& base, example& ec) @@ -477,12 +466,12 @@ void predict_or_learn_adf(warm_cb& data, multi_learner& base, example& ec) { // Corrupt labels (only corrupting multiclass labels as of now) if (use_cs) - data.cs_label = ec.l.cs; + data.cs_label = ec.l.cs(); else { - data.mc_label = ec.l.multi; + data.mc_label = ec.l.multi(); if (data.ws_iter < data.ws_period) - ec.l.multi.label = corrupt_action(data, data.mc_label.label, WARM_START); + ec.l.multi().label = corrupt_action(data, data.mc_label.label, WARM_START); } // Warm start phase @@ -508,14 +497,14 @@ void predict_or_learn_adf(warm_cb& data, multi_learner& base, example& ec) else { ec.weight = 0; - ec.pred.multiclass = 1; + ec.pred.multiclass() = 1; } // Restore the original labels if (use_cs) - ec.l.cs = data.cs_label; + ec.l.cs() = std::move(data.cs_label); else - ec.l.multi = data.mc_label; + ec.l.multi() = data.mc_label; } void init_adf_data(warm_cb& data, const uint32_t num_actions) @@ -528,19 +517,19 @@ void init_adf_data(warm_cb& data, const uint32_t num_actions) data.ecs.resize(num_actions); for (size_t a = 0; a < num_actions; ++a) { - data.ecs[a] = VW::alloc_examples(CB::cb_label.label_size, 1); - auto& lab = data.ecs[a]->l.cb; - CB::cb_label.default_label(&lab); + data.ecs[a] = VW::alloc_examples(1); + auto& lab = data.ecs[a]->l.init_as_cb(); + CB::default_label(lab); } // The rest of the initialization is for warm start CB - data.csls = calloc_or_throw(num_actions); + data.csls = new COST_SENSITIVE::label[num_actions]; for (uint32_t a = 0; a < num_actions; ++a) { - COST_SENSITIVE::cs_label.default_label(&data.csls[a]); + COST_SENSITIVE::default_label(data.csls[a]); data.csls[a].costs.push_back({0, a + 1, 0, 0}); } - data.cbls = calloc_or_throw(num_actions); + data.cbls = new CB::label[num_actions]; data.ws_train_size = data.ws_period; data.ws_vali_size = 0; @@ -614,7 +603,6 @@ base_learner* warm_cb_setup(options_i& options, vw& all) } data->app_seed = uniform_hash("vw", 2, 0); - data->a_s = v_init(); data->all = &all; data->_random_state = all.get_random_state(); data->use_cs = use_cs; @@ -645,14 +633,19 @@ base_learner* warm_cb_setup(options_i& options, vw& all) } if (use_cs) + { l = &init_cost_sensitive_learner( data, base, 
predict_or_learn_adf, predict_or_learn_adf, all.p, data->choices_lambda); + l->label_type = label_type_t::cs; + } else + { l = &init_multiclass_learner( data, base, predict_or_learn_adf, predict_or_learn_adf, all.p, data->choices_lambda); + l->label_type = label_type_t::multi; + } l->set_finish(finish); - all.delete_prediction = nullptr; return make_base(*l); }
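Note, closing out the warm_cb changes: with polylabel now a tagged type, the old union trick of saving one member and assigning another (cbls[a] = ecs[a]->l.cb; ecs[a]->l.cs = csls[a];) is no longer legal, so learn_sup_adf swaps label variants with an explicit move-out / reset / init-as sequence and restores them the same way. Condensed from the diff above (error handling elided):

// Flip each example's label from CB to CS before calling the cost-sensitive learner...
cbls[a] = std::move(data.ecs[a]->l.cb());   // move the live CB variant out
data.ecs[a]->l.reset();                     // destroy the moved-from CB variant
data.ecs[a]->l.init_as_cs(std::move(csls[a]));

// ...and restore afterwards, moving the CS label back out first.
csls[a] = std::move(data.ecs[a]->l.cs());
data.ecs[a]->l.reset();
data.ecs[a]->l.init_as_cb(std::move(cbls[a]));

The swap_to_scores / swap_to_probs helpers from the new util.h handle the prediction side of the same problem more cheaply via pred.reinterpret(), which works because action_scores and action_probs share a single underlying representation.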