test/fix_cbtest_feature_count.patch

diff --git b/test/train-sets/ref/cb_adf_crash2.stderr a/test/train-sets/ref/cb_adf_crash2.stderr
index 2a6a7c6c..8784d074 100644
--- b/test/train-sets/ref/cb_adf_crash2.stderr
+++ a/test/train-sets/ref/cb_adf_crash2.stderr
@@ -8,11 +8,11 @@ Reading datafile = train-sets/cb_adf_crash_2.data
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-    n.a.     n.a.            1            1.0  unknown        0:1...        2
+    n.a.     n.a.            1            1.0  unknown        0:1...        3
 
 finished run
 number of examples = 1
 weighted example sum = 1.000000
 weighted label sum = 0.000000
 average loss = n.a.
-total feature number = 4
+total feature number = 3
diff --git b/test/train-sets/ref/cb_adf_dm.stderr a/test/train-sets/ref/cb_adf_dm.stderr
index 5ee390d3..d790ee21 100644
--- b/test/train-sets/ref/cb_adf_dm.stderr
+++ a/test/train-sets/ref/cb_adf_dm.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-2.000000 2.000000            1            1.0    known        0:0...       12
+2.000000 2.000000            1            1.0    known        0:0...       18
 1.000000 0.000000            2            2.0    known        1:-0.0849907...        8
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 1.000000
-total feature number = 31
+total feature number = 34
diff --git b/test/train-sets/ref/cb_adf_dr.stderr a/test/train-sets/ref/cb_adf_dr.stderr
index 13366ba5..265c65e4 100644
--- b/test/train-sets/ref/cb_adf_dr.stderr
+++ a/test/train-sets/ref/cb_adf_dr.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-2.000000 2.000000            1            1.0    known        0:0...       12
+2.000000 2.000000            1            1.0    known        0:0...       18
 0.931417 -0.137166            2            2.0    known        1:0.121728...        8
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.931417
-total feature number = 31
+total feature number = 34
diff --git b/test/train-sets/ref/cb_adf_mtr.stderr a/test/train-sets/ref/cb_adf_mtr.stderr
index f0759d88..b7401476 100644
--- b/test/train-sets/ref/cb_adf_mtr.stderr
+++ a/test/train-sets/ref/cb_adf_mtr.stderr
@@ -7,7 +7,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-2.000000 2.000000            1            1.0    known        0:0...        9
+2.000000 2.000000            1            1.0    known        0:0...       15
 1.000000 0.000000            2            2.0    known        1:0...        6
 
 finished run
@@ -15,4 +15,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 1.000000
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cb_adf_rank.stderr a/test/train-sets/ref/cb_adf_rank.stderr
index 1e9968e0..a7d50aac 100644
--- b/test/train-sets/ref/cb_adf_rank.stderr
+++ a/test/train-sets/ref/cb_adf_rank.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-2.000000 2.000000            1            1.0    known        0:0...        9
+2.000000 2.000000            1            1.0    known        0:0...       15
 1.000000 0.000000            2            2.0    known        1:0...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 1.000000
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cb_adf_sm.stderr a/test/train-sets/ref/cb_adf_sm.stderr
index cfd35a18..55dc63a1 100644
--- b/test/train-sets/ref/cb_adf_sm.stderr
+++ a/test/train-sets/ref/cb_adf_sm.stderr
@@ -8,13 +8,13 @@ Reading datafile = train-sets/cb_adf_sm.data
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-6.000000 6.000000            1            1.0    known        0:0...        9
-3.000000 0.000000            2            2.0    known        2:-0.532113...        9
-1.500000 0.000000            4            4.0    known        2:-0.165079...       12
+6.000000 6.000000            1            1.0    known        0:0...       12
+3.000000 0.000000            2            2.0    known        2:-0.532113...       12
+1.500000 0.000000            4            4.0    known        2:-0.165079...       16
 
 finished run
 number of examples = 4
 weighted example sum = 4.000000
 weighted label sum = 0.000000
 average loss = 1.500000
-total feature number = 47
+total feature number = 52
diff --git b/test/train-sets/ref/cbe_adf_bag.stderr a/test/train-sets/ref/cbe_adf_bag.stderr
index bf60365f..967dd62a 100644
--- b/test/train-sets/ref/cbe_adf_bag.stderr
+++ a/test/train-sets/ref/cbe_adf_bag.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        0:0.5...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_cover.stderr a/test/train-sets/ref/cbe_adf_cover.stderr
index 6880a093..3035d093 100644
--- b/test/train-sets/ref/cbe_adf_cover.stderr
+++ a/test/train-sets/ref/cbe_adf_cover.stderr
@@ -9,7 +9,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        0:0.5...        6
 
 finished run
@@ -17,4 +17,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_cover_dr.json.stderr a/test/train-sets/ref/cbe_adf_cover_dr.json.stderr
index b351a486..818a5cc4 100644
--- b/test/train-sets/ref/cbe_adf_cover_dr.json.stderr
+++ a/test/train-sets/ref/cbe_adf_cover_dr.json.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.json
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        0:0.5...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_cover_dr.stderr a/test/train-sets/ref/cbe_adf_cover_dr.stderr
index 08b1f32a..979eb436 100644
--- b/test/train-sets/ref/cbe_adf_cover_dr.stderr
+++ a/test/train-sets/ref/cbe_adf_cover_dr.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        0:0.5...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_cover_dr256.json.stderr a/test/train-sets/ref/cbe_adf_cover_dr256.json.stderr
index a4909f33..78196105 100644
--- b/test/train-sets/ref/cbe_adf_cover_dr256.json.stderr
+++ a/test/train-sets/ref/cbe_adf_cover_dr256.json.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test256.json
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        0:0.5...        6
 0.333333 0.333333            4            4.0    known        1:0.591752...        6
 0.297761 0.262189            8            8.0    known        1:0.666667...        6
@@ -23,4 +23,4 @@ number of examples = 260
 weighted example sum = 260.000000
 weighted label sum = 0.000000
 average loss = 0.068875
-total feature number = 2210
+total feature number = 2730
diff --git b/test/train-sets/ref/cbe_adf_dsjson.stderr a/test/train-sets/ref/cbe_adf_dsjson.stderr
index abef6034..ee6fca6e 100644
--- b/test/train-sets/ref/cbe_adf_dsjson.stderr
+++ a/test/train-sets/ref/cbe_adf_dsjson.stderr
@@ -9,13 +9,13 @@ Reading datafile = train-sets/decisionservice.json
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
--0.102041 -0.102041            1            1.0    known        0:0.0833333...      361
--0.051020 0.000000            2            2.0    known        6:0.816667...      361
--0.040816 -0.020408            3            3.0    known        6:0.816667...      361
+-0.102041 -0.102041            1            1.0    known        0:0.0833333...      433
+-0.051020 0.000000            2            2.0    known        6:0.816667...      433
+-0.040816 -0.020408            3            3.0    known        6:0.816667...      433
 
 finished run
 number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = -0.040816
-total feature number = 1104
+total feature number = 1299
diff --git b/test/train-sets/ref/cbe_adf_epsilon.stderr a/test/train-sets/ref/cbe_adf_epsilon.stderr
index 98e59395..dacf6bfc 100644
--- b/test/train-sets/ref/cbe_adf_epsilon.stderr
+++ a/test/train-sets/ref/cbe_adf_epsilon.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        1:0.95...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_first.stderr a/test/train-sets/ref/cbe_adf_first.stderr
index af2100bb..35158afb 100644
--- b/test/train-sets/ref/cbe_adf_first.stderr
+++ a/test/train-sets/ref/cbe_adf_first.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        1:0.5...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/cbe_adf_softmax.stderr a/test/train-sets/ref/cbe_adf_softmax.stderr
index 543bc037..3ea3464b 100644
--- b/test/train-sets/ref/cbe_adf_softmax.stderr
+++ a/test/train-sets/ref/cbe_adf_softmax.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-0.666667 0.666667            1            1.0    known        0:0.333333...        9
+0.666667 0.666667            1            1.0    known        0:0.333333...       15
 0.333333 0.000000            2            2.0    known        1:0.559575...        6
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 0.333333
-total feature number = 23
+total feature number = 27
diff --git b/test/train-sets/ref/no_shared_features.stderr a/test/train-sets/ref/no_shared_features.stderr
index 23c135ee..83a007ce 100644
--- b/test/train-sets/ref/no_shared_features.stderr
+++ a/test/train-sets/ref/no_shared_features.stderr
@@ -14,4 +14,4 @@ number of examples = 1
 weighted example sum = 1.000000
 weighted label sum = 0.000000
 average loss = 0.000000
-total feature number = 9
+total feature number = 8
diff --git b/test/train-sets/ref/sparse.stderr a/test/train-sets/ref/sparse.stderr
index 5ee390d3..d790ee21 100644
--- b/test/train-sets/ref/sparse.stderr
+++ a/test/train-sets/ref/sparse.stderr
@@ -8,7 +8,7 @@ Reading datafile = train-sets/cb_test.ldf
 num sources = 1
 average  since         example        example  current  current  current
 loss     last          counter         weight    label  predict features
-2.000000 2.000000            1            1.0    known        0:0...       12
+2.000000 2.000000            1            1.0    known        0:0...       18
 1.000000 0.000000            2            2.0    known        1:-0.0849907...        8
 
 finished run
@@ -16,4 +16,4 @@ number of examples = 3
 weighted example sum = 3.000000
 weighted label sum = 0.000000
 average loss = 1.000000
-total feature number = 31
+total feature number = 34