Skip to content

Commit

Permalink
Add chain hash option (#2214)
Browse files Browse the repository at this point in the history
* Add chain hash option

* Fix comment

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
  • Loading branch information
cheng-tan and jackgerrits committed Feb 20, 2020
1 parent 899ad76 commit 39b5d37
Show file tree
Hide file tree
Showing 16 changed files with 233 additions and 30 deletions.
2 changes: 1 addition & 1 deletion cs/unittest/RunTests.tt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ var lines = File.ReadAllLines(Path.Combine(testRoot, "RunTests"))
var skipList = new[] { 13, 32, 39, 40, 41, 59, 60, 61, 66, 68, 90,
25, 26, // crash
92, 95, 96, 98, 91, 99, 118, 119, 120,
176, 177, //depend on shell scripts for input/output
176, 177, 207, 208, //depend on shell scripts for input/output
14, 16, 17, 31, 33, 34,53, 101, 102, 103, 105, 106, 111, 112, // float delta
71, // --examples to test parser
143, 144, 146, 158, 189, 202, // native json parsing
Expand Down
22 changes: 20 additions & 2 deletions test/RunTests
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,8 @@ sub lenient_array_compare($$) {
next if ($word1 eq $word2);

# Some output contains '...', remove this for comparison.
$word1 =~ s/\.\.\.//;
$word2 =~ s/\.\.\.//;
$word1 =~ s/\.\.\.//;
$word2 =~ s/\.\.\.//;

# There's some difference, is it significant?
unless (looks_like_number($word1)) {
Expand Down Expand Up @@ -1787,4 +1787,22 @@ python3 ./cluster_test.py --vw ../build/vowpalwabbit/vw --spanning_tree ../build
train-sets/ref/cache_interaction_audit.stdout
train-sets/ref/cache_interaction_audit.stderr
# Test 207: Enable chain hash option for json example
{VW} --audit --json -d train-sets/chain_hash_json_test.json --invert_hash chain_hash_json_result.cmp --chain_hash && \
tail -n +2 chain_hash_json_result.cmp > chain_hash_json_result.cmp.new && \
rm chain_hash_json_result.cmp && \
mv chain_hash_json_result.cmp.new chain_hash_json_result.cmp
test-sets/ref/chain_hash_json_test.stderr
test-sets/ref/chain_hash_json_test.stdout
test-sets/ref/chain_hash_json_result.cmp
# Test 208: Enable chain hash option for text example
{VW} --audit -d train-sets/chain_hash_text_test.dat --invert_hash chain_hash_text_result.cmp --chain_hash && \
tail -n +2 chain_hash_text_result.cmp > chain_hash_text_result.cmp.new && \
rm chain_hash_text_result.cmp && \
mv chain_hash_text_result.cmp.new chain_hash_text_result.cmp
test-sets/ref/chain_hash_text_result.stderr
test-sets/ref/chain_hash_text_result.stdout
test-sets/ref/chain_hash_text_result.cmp
# Do not delete this line or the empty line above it
14 changes: 14 additions & 0 deletions test/test-sets/ref/chain_hash_json_result.cmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Id
Min label:0
Max label:1
bits:18
lda:0
0 ngram:
0 skip:
options:
Checksum: 2704463348
:0
Constant:116060:0.702477
emotion^happiness:196388:0.221601
emotion^happiness^true:248861:0.205255
happiness:143395:0.341188
19 changes: 19 additions & 0 deletions test/test-sets/ref/chain_hash_json_test.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = train-sets/chain_hash_json_test.json
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 0.0000 1
0.683985 0.367969 2 2.0 1.0000 0.3934 2
0.399624 0.115263 4 4.0 1.0000 0.5724 2

finished run
number of examples = 5
weighted example sum = 5.000000
weighted label sum = 5.000000
average loss = 0.344573
total feature number = 9
10 changes: 10 additions & 0 deletions test/test-sets/ref/chain_hash_json_test.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0
Constant:116060:1:0@0
0.393395
Constant:116060:1:0.393395@4 happiness:143395:1:0@0
0.781713
Constant:116060:1:0.526028@5.47188 happiness:143395:1:0.255685@1.47188
0.572360
Constant:116060:1:0.57236@5.66247 emotion^happiness:196388:1:0@0
0.647337
Constant:116060:1:0.647337@6.39398 emotion^happiness^true:248861:1:0@0
16 changes: 16 additions & 0 deletions test/test-sets/ref/chain_hash_text_result.cmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Id
Min label:0
Max label:1
bits:18
lda:0
0 ngram:
0 skip:
options:
Checksum: 2704463348
:0
Constant:116060:0.651544
emotion^happiness:196388:0.218498
emotion^happiness^true:54560:0.208726
happiness:143395:0.36921
happiness^true:172839:0.297672
sadness^false:94443:0.146142
21 changes: 21 additions & 0 deletions test/test-sets/ref/chain_hash_text_result.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = train-sets/chain_hash_text_test.dat
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 0.0000 2
0.621600 0.243201 2 2.0 1.0000 0.5068 2
0.428942 0.236284 4 4.0 1.0000 0.7267 3

finished run
number of examples = 6
weighted example sum = 6.000000
weighted label sum = 6.000000
average loss = 0.352323
best constant = 1.000000
best constant's loss = 0.000000
total feature number = 13
12 changes: 12 additions & 0 deletions test/test-sets/ref/chain_hash_text_test.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
0
happiness:143395:1:0@0 Constant:116060:1:0@0
0.506846
happiness:143395:1:0.253423@4 Constant:116060:1:0.253423@4
0.369210
Constant:116060:1:0.36921@4.9728 happiness^true:172839:1:0@0
0.726737
Constant:116060:1:0.487168@6.56439 happiness^true:172839:1:0.239569@1.59158 sadness^false:94443:1:0@0
0.517663
Constant:116060:1:0.517663@6.86308 emotion^happiness:196388:1:0@0
0.593155
Constant:116060:1:0.593155@7.79367 emotion^happiness^true:54560:1:0@0
5 changes: 5 additions & 0 deletions test/train-sets/chain_hash_json_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"_label":1,"happiness":null}
{"_label":1,"happiness":1}
{"_label":1,"happiness":true}
{"_label":1,"emotion":{"happiness":1}}
{"_label":1,"emotion":{"happiness":"true"}}
6 changes: 6 additions & 0 deletions test/train-sets/chain_hash_text_test.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
+1 | happiness
+1 | happiness:1
+1 | happiness:true
+1 | happiness: true sadness:false
+1 |emotion happiness:1
+1 |emotion happiness:true
2 changes: 2 additions & 0 deletions vowpalwabbit/global_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,8 @@ struct vw
AllReduceType all_reduce_type;
AllReduce* all_reduce;

bool chain_hash = false;

LEARNER::base_learner* l; // the top level learner
LEARNER::single_learner* scorer; // a scoring function
LEARNER::base_learner* cost_sensitive; // a cost sensitive learning algorithm. can be single or multi line learner
Expand Down
5 changes: 4 additions & 1 deletion vowpalwabbit/parse_args.cc
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,10 @@ input_options parse_source(vw& all, options_i& options)
.help(
"use gzip format whenever possible. If a cache file is being created, this option creates a "
"compressed cache file. A mixture of raw-text & compressed inputs are supported with autodetection."))
.add(make_option("no_stdin", all.stdin_off).help("do not default to reading from stdin"));
.add(make_option("no_stdin", all.stdin_off).help("do not default to reading from stdin"))
.add(make_option("chain_hash", parsed_options.chain_hash)
.help("enable chain hash for feature name and string feature value. e.g. {'A': {'B': 'C'}} is hashed as A^B^C"));


options.add_and_parse(input_options);

Expand Down
1 change: 1 addition & 0 deletions vowpalwabbit/parse_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct input_options
bool dsjson;
bool kill_cache;
bool compressed;
bool chain_hash;
};

// trace listener + context need to be passed at initialization to capture all messages.
Expand Down
102 changes: 79 additions & 23 deletions vowpalwabbit/parse_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// individual contributors. All rights reserved. Released under a BSD (revised)
// license as described in the file LICENSE.

#include <cmath>
#include <cmath>
#include <cctype>
#include "parse_example.h"
Expand Down Expand Up @@ -67,6 +66,7 @@ class TC_parser
v_array<char> _spelling;
uint32_t _hash_seed;
uint64_t _parse_mask;
bool _chain_hash;

std::array<std::vector<std::shared_ptr<feature_dict>>, NUM_NAMESPACES>* _namespace_dictionaries;

Expand All @@ -93,37 +93,59 @@ class TC_parser
}
}

inline float featureValue()
inline VW::string_view stringFeatureValue(VW::string_view sv)
{
if (_read_idx >= _line.size() || _line[_read_idx] == ' ' || _line[_read_idx] == '\t' ||
_line[_read_idx] == '|' || _line[_read_idx] == '\r')
return 1.;
size_t start_idx = sv.find_first_not_of(" \t\r\n");
if (start_idx > 0 && start_idx != std::string::npos)
{
_read_idx += start_idx;
sv.remove_prefix(start_idx);
}

size_t end_idx = sv.find_first_of(" \t\r\n");
if (end_idx == std::string::npos)
{
end_idx = sv.size();
}
_read_idx += end_idx;
return sv.substr(0, end_idx);
}

// Attempts to parse the feature value at the current read position as a float.
// Returns true when a float value was determined: either an explicit number
// after ':', or an implicit 1 when the feature has no value (end of token),
// or 0 on a syntax error (with a warning). Returns false only when ':' was
// seen but the following text is not numeric — the chain-hash path then
// re-reads that text as a string value via stringFeatureValue(). On success
// the parsed value is stored in both float_feature_value and the _v member.
// NOTE(review): removed stale pre-refactor lines interleaved as diff residue
// (old `_v = parseFloat(...)`, `_v = 0.f;`, `return _v;`, `return 0.f;`, and
// the old multi-line parserWarning) which duplicated/conflicted with the new
// statements and made the function ill-formed.
inline bool isFeatureValueFloat(float& float_feature_value)
{
  // End of token / separator => implicit value of 1 (e.g. "|ns feature").
  if (_read_idx >= _line.size() || _line[_read_idx] == ' ' || _line[_read_idx] == '\t' || _line[_read_idx] == '|' ||
      _line[_read_idx] == '\r')
  {
    float_feature_value = 1.;
    return true;
  }
  else if (_line[_read_idx] == ':')
  {
    // featureValue --> ':' 'Float'
    ++_read_idx;
    size_t end_read = 0;
    VW::string_view sv = _line.substr(_read_idx);
    _v = float_feature_value = parseFloat(sv.begin(), end_read, sv.end());
    if (end_read == 0)
    {
      // Not a number after ':' — signal the caller so it can fall back to
      // treating the text as a chain-hash string value.
      parserWarning("malformed example! Float expected after : \"", _line.substr(0, _read_idx), "\"");
      return false;
    }
    if (std::isnan(_v))
    {
      _v = float_feature_value = 0.f;
      parserWarning(
          "warning: invalid feature value:\"", _line.substr(_read_idx), "\" read as NaN. Replacing with 0.");
    }
    _read_idx += end_read;
    return true;
  }
  else
  {
    _v = float_feature_value = 0.f;
    // syntax error
    parserWarning("malformed example! '|', ':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
    return true;
  }
}

Expand All @@ -148,20 +170,57 @@ class TC_parser
{
// maybeFeature --> 'String' FeatureValue
VW::string_view feature_name = read_name();
_v = _cur_channel_v * featureValue();
VW::string_view string_feature_value;

float float_feature_value = 0.f;
bool is_feature_float = isFeatureValueFloat(float_feature_value);

if (_chain_hash && !is_feature_float)
{
string_feature_value = stringFeatureValue(_line.substr(_read_idx));
_v = 1;
}
else
{
_v = _cur_channel_v * float_feature_value;
}


uint64_t word_hash;
if (!feature_name.empty())

if (_chain_hash && !string_feature_value.empty())
{
word_hash = (_p->hasher(feature_name.begin(), feature_name.length(),
_p->hasher(string_feature_value.begin(), string_feature_value.length(), _channel_hash)) & _parse_mask);
}
else if (!feature_name.empty())
{
word_hash = (_p->hasher(feature_name.begin(), feature_name.length(), _channel_hash) & _parse_mask);
}
else
{
word_hash = _channel_hash + _anon++;
}

if (_v == 0)
return; // dont add 0 valued features to list of features
features& fs = _ae->feature_space[_index];
fs.push_back(_v, word_hash);

if (audit)
{
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), feature_name.to_string())));
if (_chain_hash && !string_feature_value.empty())
{
std::stringstream ss;
ss << feature_name << "^" << string_feature_value;
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), ss.str())));
}
else
{
fs.space_names.push_back(audit_strings_ptr(new audit_strings(_base.to_string(), feature_name.to_string())));
}
}

if (((*_affix_features)[_index] > 0) && (!feature_name.empty()))
{
features& affix_fs = _ae->feature_space[affix_namespace];
Expand All @@ -182,8 +241,7 @@ class TC_parser
affix_name.remove_prefix(affix_name.size() - len);
}

word_hash =
_p->hasher(affix_name.begin(), affix_name.length(), (uint64_t)_channel_hash) * (affix_constant + (affix & 0xF) * quadratic_constant);
word_hash = _p->hasher(affix_name.begin(), affix_name.length(), (uint64_t)_channel_hash) * (affix_constant + (affix & 0xF) * quadratic_constant);
affix_fs.push_back(_v, word_hash);
if (audit)
{
Expand Down Expand Up @@ -260,7 +318,7 @@ class TC_parser
if (audit)
for (const auto& id : feats->indicies)
{
std::stringstream ss;
std::stringstream ss;
ss << _index << '_';
ss << feature_name;
ss << '=' << id;
Expand Down Expand Up @@ -301,8 +359,7 @@ class TC_parser
else
{
// syntax error
parserWarning(
"malformed example! '|',':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
parserWarning("malformed example! '|',':', space, or EOL expected after : \"", _line.substr(0, _read_idx), "\"");
}
}

Expand Down Expand Up @@ -387,8 +444,7 @@ class TC_parser

inline void listNameSpace()
{
while (
(_read_idx < _line.size()) && (_line[_read_idx] == '|')) // ListNameSpace --> '|' NameSpace ListNameSpace
while ((_read_idx < _line.size()) && (_line[_read_idx] == '|')) // ListNameSpace --> '|' NameSpace ListNameSpace
{
++_read_idx;
nameSpace();
Expand All @@ -415,7 +471,7 @@ class TC_parser
this->_namespace_dictionaries = &all.namespace_dictionaries;
this->_hash_seed = all.hash_seed;
this->_parse_mask = all.parse_mask;

this->_chain_hash = all.chain_hash;
listNameSpace();
}
}
Expand Down

0 comments on commit 39b5d37

Please sign in to comment.