Skip to content

Commit

Permalink
Add chain hash option (#2214)
Browse files Browse the repository at this point in the history
* Add chain hash option

* Fix comment

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
  • Loading branch information
cheng-tan and jackgerrits committed Feb 20, 2020
1 parent 899ad76 commit 39b5d37
Show file tree
Hide file tree
Showing 16 changed files with 233 additions and 30 deletions.
2 changes: 1 addition & 1 deletion cs/unittest/RunTests.tt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ var lines = File.ReadAllLines(Path.Combine(testRoot, "RunTests"))
var skipList = new[] { 13, 32, 39, 40, 41, 59, 60, 61, 66, 68, 90,
25, 26, // crash
92, 95, 96, 98, 91, 99, 118, 119, 120,
176, 177, //depend on shell scripts for input/output
176, 177, 207, 208, //depend on shell scripts for input/output
14, 16, 17, 31, 33, 34,53, 101, 102, 103, 105, 106, 111, 112, // float delta
71, // --examples to test parser
143, 144, 146, 158, 189, 202, // native json parsing
Expand Down
22 changes: 20 additions & 2 deletions test/RunTests
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,8 @@ sub lenient_array_compare($$) {
next if ($word1 eq $word2);

# Some output contains '...', remove this for comparison.
$word1 =~ s/\.\.\.//;
$word2 =~ s/\.\.\.//;
$word1 =~ s/\.\.\.//;
$word2 =~ s/\.\.\.//;

# There's some difference, is it significant?
unless (looks_like_number($word1)) {
Expand Down Expand Up @@ -1787,4 +1787,22 @@ python3 ./cluster_test.py --vw ../build/vowpalwabbit/vw --spanning_tree ../build
train-sets/ref/cache_interaction_audit.stdout
train-sets/ref/cache_interaction_audit.stderr
# Test 207: Enable chain hash option for json example
{VW} --audit --json -d train-sets/chain_hash_json_test.json --invert_hash chain_hash_json_result.cmp --chain_hash && \
tail -n +2 chain_hash_json_result.cmp > chain_hash_json_result.cmp.new && \
rm chain_hash_json_result.cmp && \
mv chain_hash_json_result.cmp.new chain_hash_json_result.cmp
test-sets/ref/chain_hash_json_test.stderr
test-sets/ref/chain_hash_json_test.stdout
test-sets/ref/chain_hash_json_result.cmp
# Test 208: Enable chain hash option for text example
{VW} --audit -d train-sets/chain_hash_text_test.dat --invert_hash chain_hash_text_result.cmp --chain_hash && \
tail -n +2 chain_hash_text_result.cmp > chain_hash_text_result.cmp.new && \
rm chain_hash_text_result.cmp && \
mv chain_hash_text_result.cmp.new chain_hash_text_result.cmp
test-sets/ref/chain_hash_text_result.stderr
test-sets/ref/chain_hash_text_result.stdout
test-sets/ref/chain_hash_text_result.cmp
# Do not delete this line or the empty line above it
14 changes: 14 additions & 0 deletions test/test-sets/ref/chain_hash_json_result.cmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Id
Min label:0
Max label:1
bits:18
lda:0
0 ngram:
0 skip:
options:
Checksum: 2704463348
:0
Constant:116060:0.702477
emotion^happiness:196388:0.221601
emotion^happiness^true:248861:0.205255
happiness:143395:0.341188
19 changes: 19 additions & 0 deletions test/test-sets/ref/chain_hash_json_test.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = train-sets/chain_hash_json_test.json
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 0.0000 1
0.683985 0.367969 2 2.0 1.0000 0.3934 2
0.399624 0.115263 4 4.0 1.0000 0.5724 2

finished run
number of examples = 5
weighted example sum = 5.000000
weighted label sum = 5.000000
average loss = 0.344573
total feature number = 9
10 changes: 10 additions & 0 deletions test/test-sets/ref/chain_hash_json_test.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0
Constant:116060:1:0@0
0.393395
Constant:116060:1:0.393395@4 happiness:143395:1:0@0
0.781713
Constant:116060:1:0.526028@5.47188 happiness:143395:1:0.255685@1.47188
0.572360
Constant:116060:1:0.57236@5.66247 emotion^happiness:196388:1:0@0
0.647337
Constant:116060:1:0.647337@6.39398 emotion^happiness^true:248861:1:0@0
16 changes: 16 additions & 0 deletions test/test-sets/ref/chain_hash_text_result.cmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Id
Min label:0
Max label:1
bits:18
lda:0
0 ngram:
0 skip:
options:
Checksum: 2704463348
:0
Constant:116060:0.651544
emotion^happiness:196388:0.218498
emotion^happiness^true:54560:0.208726
happiness:143395:0.36921
happiness^true:172839:0.297672
sadness^false:94443:0.146142
21 changes: 21 additions & 0 deletions test/test-sets/ref/chain_hash_text_result.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = train-sets/chain_hash_text_test.dat
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 0.0000 2
0.621600 0.243201 2 2.0 1.0000 0.5068 2
0.428942 0.236284 4 4.0 1.0000 0.7267 3

finished run
number of examples = 6
weighted example sum = 6.000000
weighted label sum = 6.000000
average loss = 0.352323
best constant = 1.000000
best constant's loss = 0.000000
total feature number = 13
12 changes: 12 additions & 0 deletions test/test-sets/ref/chain_hash_text_test.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
0
happiness:143395:1:0@0 Constant:116060:1:0@0
0.506846
happiness:143395:1:0.253423@4 Constant:116060:1:0.253423@4
0.369210
Constant:116060:1:0.36921@4.9728 happiness^true:172839:1:0@0
0.726737
Constant:116060:1:0.487168@6.56439 happiness^true:172839:1:0.239569@1.59158 sadness^false:94443:1:0@0
0.517663
Constant:116060:1:0.517663@6.86308 emotion^happiness:196388:1:0@0
0.593155
Constant:116060:1:0.593155@7.79367 emotion^happiness^true:54560:1:0@0
5 changes: 5 additions & 0 deletions test/train-sets/chain_hash_json_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"_label":1,"happiness":null}
{"_label":1,"happiness":1}
{"_label":1,"happiness":true}
{"_label":1,"emotion":{"happiness":1}}
{"_label":1,"emotion":{"happiness":"true"}}
6 changes: 6 additions & 0 deletions test/train-sets/chain_hash_text_test.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
+1 | happiness
+1 | happiness:1
+1 | happiness:true
+1 | happiness: true sadness:false
+1 |emotion happiness:1
+1 |emotion happiness:true
2 changes: 2 additions & 0 deletions vowpalwabbit/global_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ struct vw
AllReduceType all_reduce_type;
AllReduce* all_reduce;

bool chain_hash = false;

LEARNER::base_learner* l; // the top level learner
LEARNER::single_learner* scorer; // a scoring function
LEARNER::base_learner* cost_sensitive; // a cost sensitive learning algorithm. can be single or multi line learner
Expand Down
5 changes: 4 additions & 1 deletion vowpalwabbit/parse_args.cc
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,10 @@ input_options parse_source(vw& all, options_i& options)
.help(
"use gzip format whenever possible. If a cache file is being created, this option creates a "
"compressed cache file. A mixture of raw-text & compressed inputs are supported with autodetection."))
.add(make_option("no_stdin", all.stdin_off).help("do not default to reading from stdin"));
.add(make_option("no_stdin", all.stdin_off).help("do not default to reading from stdin"))
.add(make_option("chain_hash", parsed_options.chain_hash)
.help("enable chain hash for feature name and string feature value. e.g. {'A': {'B': 'C'}} is hashed as A^B^C"));


options.add_and_parse(input_options);

Expand Down
1 change: 1 addition & 0 deletions vowpalwabbit/parse_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct input_options
bool dsjson;
bool kill_cache;
bool compressed;
bool chain_hash;
};

// trace listener + context need to be passed at initialization to capture all messages.
Expand Down
102 changes: 79 additions & 23 deletions vowpalwabbit/parse_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// individual contributors. All rights reserved. Released under a BSD (revised)
// license as described in the file LICENSE.

#include <cmath>
#include <cmath>
#include <cctype>
#include "parse_example.h"
Expand Down Expand Up @@ -67,6 +66,7 @@ class TC_parser
v_array<char> _spelling;
uint32_t _hash_seed;
uint64_t _parse_mask;
bool _chain_hash;

std::array<std::vector<std::shared_ptr<feature_dict>>, NUM_NAMESPACES>* _namespace_dictionaries;

Expand All @@ -93,37 +93,59 @@ class TC_parser
}
}

inline float featureValue()
inline VW::string_view stringFeatureValue(VW::string_view sv)
{
if (_read_idx >= _line.size() || _line[_read_idx] == ' ' || _line[_read_idx] == '\t' ||
_line[_read_idx] == '|' || _line[_read_idx] == '\r')
return 1.;
size_t start_idx = sv.find_first_not_of(" \t\r\n");
if (start_idx > 0 && start_idx != std::string::npos)
{
_read_idx += start_idx;
sv.remove_prefix(start_idx);
}

size_t end_idx = sv.find_first_of(" \t\r\n");
if (end_idx == std::string::npos)
{
end_idx = sv.size();
}
_read_idx += end_idx;
return sv.substr(0, end_idx);
}

// Attempts to parse the feature value at the current read position as a float.
// Returns true when a float value was determined: either an explicit number
// after ':', or an implicit 1 when the feature has no value (end of token),
// or 0 on a syntax error (with a warning). Returns false only when ':' was
// seen but the following text is not numeric — the chain-hash path then
// re-reads that text as a string value via stringFeatureValue(). On success
// the parsed value is stored in both float_feature_value and the _v member.
// NOTE(review): removed stale pre-refactor lines interleaved as diff residue
// (old `_v = parseFloat(...)`, `_v = 0.f;`, `return _v;`, `return 0.f;`, and
// the old multi-line parserWarning) which duplicated/conflicted with the new
// statements and made the function ill-formed.
inline bool isFeatureValueFloat(float& float_feature_value)
{
  // End of token / separator => implicit value of 1 (e.g. "|ns feature").
  if (_read_idx >= _line.size() || _line[_read_idx] == ' ' || _line[_read_idx] == '\t' || _line[_read_idx] == '|' ||
      _line[_read_idx] == '\r')
  {
    float_feature_value = 1.;
    return true;
  }
  else if (_line[_read_idx] == ':')
  {
    // featureValue --> ':' 'Float'
    ++_read_idx;
    size_t end_read = 0;
    VW::string_view sv = _line.substr(_read_idx);
    _v = float_feature_value = parseFloat(sv.begin(), end_read, sv.end());
    if (end_read == 0)
    {
      // Not a number after ':' — signal the caller so it can fall back to
      // treating the text as a chain-hash string value.
      parserWarning("malformed example! Float expected after : \"", _line.substr(0, _read_idx), "\"");
      return false;
    }
    if (std::isnan(_v))
    {
      _v = float_feature_value = 0.f;
      parserWarning(
          "warning: invalid feature value:\"", _line.substr(_read_idx), "\" read as NaN. Replacing with 0.");
    }
    _read_idx += end_read;
    return true;
  }
  else
  {
    _v = float_feature_value = 0.f;
    // syntax error
    parserWarning("malformed example! '|', ':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
    return true;
  }
}

Expand All @@ -148,20 +170,57 @@ class TC_parser
{
// maybeFeature --> 'String' FeatureValue
VW::string_view feature_name = read_name();
_v = _cur_channel_v * featureValue();
VW::string_view string_feature_value;

float float_feature_value = 0.f;
bool is_feature_float = isFeatureValueFloat(float_feature_value);

if (_chain_hash && !is_feature_float)
{
string_feature_value = stringFeatureValue(_line.substr(_read_idx));
_v = 1;
}
else
{
_v = _cur_channel_v * float_feature_value;
}


uint64_t word_hash;
if (!feature_name.empty())

if (_chain_hash && !string_feature_value.empty())
{
word_hash = (_p->hasher(feature_name.begin(), feature_name.length(),
_p->hasher(string_feature_value.begin(), string_feature_value.length(), _channel_hash)) & _parse_mask);
}
else if (!feature_name.empty())
{
word_hash = (_p->hasher(feature_name.begin(), feature_name.length(), _channel_hash) & _parse_mask);
}
else
{
word_hash = _channel_hash + _anon++;
}

if (_v == 0)
return; // dont add 0 valued features to list of features
features& fs = _ae->feature_space[_index];
fs.push_back(_v, word_hash);

if (audit)
{
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), feature_name.to_string())));
if (_chain_hash && !string_feature_value.empty())
{
std::stringstream ss;
ss << feature_name << "^" << string_feature_value;
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), ss.str())));
}
else
{
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), feature_name.to_string())));
}
}

if (((*_affix_features)[_index] > 0) && (!feature_name.empty()))
{
features& affix_fs = _ae->feature_space[affix_namespace];
Expand All @@ -182,8 +241,7 @@ class TC_parser
affix_name.remove_prefix(affix_name.size() - len);
}

word_hash =
_p->hasher(affix_name.begin(), affix_name.length(), (uint64_t)_channel_hash) * (affix_constant + (affix & 0xF) * quadratic_constant);
word_hash = _p->hasher(affix_name.begin(), affix_name.length(), (uint64_t)_channel_hash) * (affix_constant + (affix & 0xF) * quadratic_constant);
affix_fs.push_back(_v, word_hash);
if (audit)
{
Expand Down Expand Up @@ -260,7 +318,7 @@ class TC_parser
if (audit)
for (const auto& id : feats->indicies)
{
std::stringstream ss;
std::stringstream ss;
ss << _index << '_';
ss << feature_name;
ss << '=' << id;
Expand Down Expand Up @@ -301,8 +359,7 @@ class TC_parser
else
{
// syntax error
parserWarning(
"malformed example! '|',':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
parserWarning("malformed example! '|',':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
}
}

Expand Down Expand Up @@ -387,8 +444,7 @@ class TC_parser

inline void listNameSpace()
{
while (
(_read_idx < _line.size()) && (_line[_read_idx] == '|')) // ListNameSpace --> '|' NameSpace ListNameSpace
while ((_read_idx < _line.size()) && (_line[_read_idx] == '|')) // ListNameSpace --> '|' NameSpace ListNameSpace
{
++_read_idx;
nameSpace();
Expand All @@ -415,7 +471,7 @@ class TC_parser
this->_namespace_dictionaries = &all.namespace_dictionaries;
this->_hash_seed = all.hash_seed;
this->_parse_mask = all.parse_mask;

this->_chain_hash = all.chain_hash;
listNameSpace();
}
}
Expand Down

0 comments on commit 39b5d37

Please sign in to comment.