-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
sklearn_vw.py
561 lines (460 loc) · 20.2 KB
/
sklearn_vw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
# -*- coding: utf-8 -*-
# pylint: unused-argument, invalid-name, too-many-arguments, too-many-locals
"""
Utilities to support integration of Vowpal Wabbit and scikit-learn
"""
import numpy as np
import re
import io
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin
from sklearn.datasets.svmlight_format import dump_svmlight_file
from sklearn.utils.validation import check_is_fitted
from sklearn.externals import joblib
from vowpalwabbit import pyvw
# default (empty) namespace used when emitting vw-format example strings
DEFAULT_NS = ''
# hash slot of vw's constant (intercept) feature -- TODO confirm against the
# vw build in use; this value depends on vw's internal hashing
CONSTANT_HASH = 116060
# characters with special meaning in the vw input format ('|', ':', space,
# newline); occurrences in string features are replaced before conversion
INVALID_CHARS = re.compile(r"[\|: \n]+")
class VW(BaseEstimator):
    """Vowpal Wabbit Scikit-learn Base Estimator wrapper

    Attributes
    ----------
    params : {dict}
        dictionary of model parameter keys and values
    fit_ : {bool}
        this variable is only created after the model is fitted
    """

    params = dict()

    def __init__(self,
                 probabilities=None,
                 random_seed=None,
                 ring_size=None,
                 convert_to_vw=None,
                 bfgs=None,
                 mem=None,
                 ftrl=None,
                 ftrl_alpha=None,
                 ftrl_beta=None,
                 learning_rate=None,
                 l=None,
                 power_t=None,
                 decay_learning_rate=None,
                 initial_t=None,
                 feature_mask=None,
                 initial_regressor=None,
                 i=None,
                 initial_weight=None,
                 random_weights=None,
                 input_feature_regularizer=None,
                 audit=None,
                 a=None,
                 progress=None,
                 P=None,
                 quiet=None,
                 no_stdin=None,
                 hash=None,
                 ignore=None,
                 keep=None,
                 redefine=None,
                 bit_precision=None,
                 b=None,
                 noconstant=None,
                 constant=None,
                 C=None,
                 ngram=None,
                 skips=None,
                 feature_limit=None,
                 affix=None,
                 spelling=None,
                 dictionary=None,
                 dictionary_path=None,
                 interactions=None,
                 permutations=None,
                 leave_duplicate_interactions=None,
                 quadratic=None,
                 q=None,
                 cubic=None,
                 testonly=None,
                 t=None,
                 min_prediction=None,
                 max_prediction=None,
                 sort_features=None,
                 loss_function=None,
                 link=None,
                 quantile_tau=None,
                 l1=None,
                 l2=None,
                 named_labels=None,
                 final_regressor=None,
                 f=None,
                 readable_model=None,
                 invert_hash=None,
                 passes=None,
                 save_resume=None,
                 output_feature_regularizer_binary=None,
                 output_feature_regularizer_text=None,
                 oaa=None,
                 ect=None,
                 csoaa=None,
                 wap=None):
        """VW model constructor, exposing all supported parameters to keep sklearn happy

        Parameters
        ----------
        probabilities
        random_seed (int): seed random number generator
        ring_size (int): size of example ring
        convert_to_vw (bool): flag to convert X input to vw format

        Update options
        bfgs: use L-BFGS optimization algorithm
        mem: set the rank of the inverse hessian approximation used by bfgs
        ftrl: use FTRL-Proximal optimization algorithm
        ftrl_alpha: ftrl alpha parameter
        ftrl_beta: ftrl beta parameter
        learning_rate,l (float): Set learning rate
        power_t (float): t power value
        decay_learning_rate (float): Set Decay factor for learning_rate between passes
        initial_t (float): initial t value
        feature_mask (str): Use existing regressor to determine which parameters may be updated.
            If no initial_regressor given, also used for initial weights.

        Weight options
        initial_regressor,i (str): Initial regressor(s)
        initial_weight (float): Set all weights to an initial value of arg.
        random_weights (bool): make initial weights random
        input_feature_regularizer (str): Per feature regularization input file

        Diagnostic options
        audit,a (bool): print weights of features
        progress,P (str): Progress update frequency. int: additive, float: multiplicative
        quiet (bool): Don't output disgnostics and progress updates

        Feature options
        hash (str): how to hash the features. Available options: strings, all
        ignore (str): ignore namespaces beginning with character <arg>
        keep (str): keep namespaces beginning with character <arg>
        redefine (str): Redefine namespaces beginning with characters of string S as namespace N. <arg> shall be in
            form 'N:=S' where := is operator. Empty N or S are treated as default namespace.
            Use ':' as a wildcard in S.
        bit_precision,b (int): number of bits in the feature table
        noconstant (bool): Don't add a constant feature
        constant,C (float): Set initial value of constant
        ngram (str): Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.
        skips (str): Generate skips in N grams. This in conjunction with the ngram tag can be used to generate
            generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be fN.
        feature_limit (str): limit to N features. To apply to a single namespace 'foo', arg should be fN
        affix (str): generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for
            namespace a, 3-char suffixes for b and 1 char prefixes for default namespace
        spelling (str): compute spelling features for a give namespace (use '_' for default namespace)
        dictionary (str): read a dictionary for additional features (arg either 'x:file' or just 'file')
        dictionary_path (str): look in this directory for dictionaries; defaults to current directory or env{PATH}
        interactions (str): Create feature interactions of any level between namespaces.
        permutations (bool): Use permutations instead of combinations for feature interactions of same namespace.
        leave_duplicate_interactions (bool): Don't remove interactions with duplicate combinations of namespaces. For
            ex. this is a duplicate: '-q ab -q ba' and a lot more in '-q ::'.
        quadratic,q (str): Create and use quadratic features, q:: corresponds to a wildcard for all printable characters
        cubic (str): Create and use cubic features

        Example options
        testonly,t (bool): Ignore label information and just test
        min_prediction (float): Smallest prediction to output
        max_prediction (float): Largest prediction to output
        sort_features (bool): turn this on to disregard order in which features have been defined. This will lead to
            smaller cache sizes
        loss_function (str): default_value("squared"), "Specify the loss function to be used, uses squared by default.
            Currently available ones are squared, classic, hinge, logistic and quantile.
        link (str): apply a link function to convert output: e.g. 'logistic'
        quantile_tau (float): default_value(0.5), "Parameter \\tau associated with Quantile loss. Defaults to 0.5
        l1 (float): l_1 lambda
        l2 (float): l_2 lambda
        named_labels (str): use names for labels (multiclass, etc.) rather than integers, argument specified all
            possible labels, comma-sep, eg \"--named_labels Noun,Verb,Adj,Punc\"

        Output model
        final_regressor,f (str): Final regressor
        readable_model (str): Output human-readable final regressor with numeric features
        invert_hash (str): Output human-readable final regressor with feature names. Computationally expensive.
        passes (int): Number of training passes
        save_resume (bool): save extra state so learning can be resumed later with new data
        output_feature_regularizer_binary (str): Per feature regularization output file
        output_feature_regularizer_text (str): Per feature regularization output file, in text

        Multiclass options
        oaa (int): Use one-against-all multiclass learning with labels
        ect (int): Use error correcting tournament multiclass learning
        csoaa (int): Use cost sensitive one-against-all multiclass learning
        wap (int): Use weighted all pairs multiclass learning

        Returns
        -------
        (BaseEstimator): Returns self
        """

        # clear estimator attributes left over from an earlier fit so that
        # set_params() can rebuild the model from a clean slate
        if hasattr(self, 'fit_'):
            del self.fit_
        if hasattr(self, 'passes_'):
            del self.passes_
        if hasattr(self, 'convert_to_vw_'):
            del self.convert_to_vw_
        if hasattr(self, 'vw_'):
            del self.vw_

        # reset params and quiet models by default
        self.params = {'quiet': True}

        # assign all explicitly-provided constructor args to the params dict
        args = dict(locals())
        for k, v in args.items():
            if k != 'self' and k != '__class__' and v is not None:
                self.params[k] = v

        # store passes separately to be used in fit; it is an estimator
        # parameter, not a pyvw one
        self.passes_ = self.params.pop('passes', 1)

        # pull out convert_to_vw from params for the same reason
        self.convert_to_vw_ = self.params.pop('convert_to_vw', True)
        self.vw_ = None
        super(VW, self).__init__()

    def get_vw(self):
        """Factory to create a vw instance on demand

        Returns
        -------
        pyvw.vw instance
        """
        if self.vw_ is None:
            self.vw_ = pyvw.vw(**self.params)
        return self.vw_

    def fit(self, X, y=None, sample_weight=None):
        """Fit the model according to the given training data

        TODO: for first pass create and store example objects.
        for N-1 passes use example objects directly (simulate cache file...but in memory for faster processing)

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1 if not convert_to_vw) or
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels
        y : array-like, shape (n_samples,), optional if not convert_to_vw
            Target vector relative to X.
        sample_weight : array-like, shape (n_samples,)
            sample weight vector relative to X.

        Returns
        -------
        return self so pipeline can call transform() after fit
        """
        if self.convert_to_vw_:
            X = tovw(x=X, y=y, sample_weight=sample_weight)

        model = self.get_vw()

        # add examples to model, re-shuffling before every pass after the
        # first (bug fix: the original tested `n > 1`, which silently skipped
        # shuffling the second pass)
        for n in range(self.passes_):
            if n > 0:
                np.random.shuffle(X)
            for x in X:
                model.learn(x)
        self.fit_ = True
        return self

    def transform(self, X, y=None):
        """Transform does nothing by default besides closing the model. Transform is required for any estimator
        in a sklearn pipeline that isn't the final estimator

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1 if not convert_to_vw) or
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels
        y : array-like, shape (n_samples,), optional if not convert_to_vw
            Target vector relative to X.

        Returns
        -------
        return X to be passed into next estimator in pipeline
        """
        if not self.get_vw().finished:
            self.get_vw().finish()
        return X

    def predict(self, X):
        """Predict with Vowpal Wabbit model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features or 1)
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
            if not using convert_to_vw, X is expected to be a list of vw formatted feature vector strings with labels

        Returns
        -------
        y : array-like, shape (n_samples,)
            Output vector relative to X.
        """
        check_is_fitted(self, 'fit_')

        # determine sample count before X is (possibly) converted to a list
        # of vw strings below
        try:
            num_samples = X.shape[0] if X.ndim > 1 else len(X)
        except AttributeError:
            num_samples = len(X)

        if self.convert_to_vw_:
            X = tovw(X)

        model = self.get_vw()
        label_type = model.get_label_type()

        y = np.empty([num_samples])
        # add test examples to model
        for idx, x in enumerate(X):
            y[idx] = model.predict(ec=x, labelType=label_type)
        return y

    def __str__(self):
        # always return a string; the original returned None (breaking
        # str()/repr()) when params was unset
        return str(self.params)

    def __repr__(self):
        return self.__str__()

    def __del__(self):
        # explicitly release the underlying pyvw model, which owns native
        # (C++) resources
        if self.vw_ is not None:
            self.vw_.__del__()

    def get_params(self, deep=True):
        """This returns the set of vw and estimator parameters currently in use"""
        out = dict()
        # add in the vw params
        out.update(self.params)
        # add in the estimator params
        out['passes'] = self.passes_
        out['convert_to_vw'] = self.convert_to_vw_
        return out

    def set_params(self, **params):
        """This destroys and recreates the Vowpal Wabbit model with updated parameters
        any parameters not provided will remain as they were initialized to at construction

        Parameters
        ----------
        params : {dict}
            dictionary of model parameter keys and values to update
        """
        self.params.update(params)

        # manage passes and convert_to_vw params different because they are estimator params, not vw params
        if 'passes' not in params:
            self.params['passes'] = self.passes_
        if 'convert_to_vw' not in params:
            self.params['convert_to_vw'] = self.convert_to_vw_

        # re-run the constructor so the pyvw model is rebuilt with the
        # merged parameter set
        self.__init__(**self.params)
        return self

    def get_coefs(self):
        """Returns coefficient weights as ordered sparse matrix

        Returns
        -------
        {sparse matrix} coefficient weights for model
        """
        model = self.get_vw()
        return csr_matrix([model.get_weight(i) for i in range(model.num_weights())])

    def set_coefs(self, coefs):
        """Sets coefficients weights from ordered sparse matrix

        Parameters
        ----------
        coefs : {sparse matrix} coefficient weights for model
        """
        model = self.get_vw()
        for i in range(coefs.getnnz()):
            model.set_weight(int(coefs.indices[i]), 0, float(coefs.data[i]))

    def get_intercept(self):
        """Returns intercept weight for model

        Returns
        -------
        {int} intercept value, 0 if noconstant
        """
        return self.get_vw().get_weight(CONSTANT_HASH)

    def save(self, filename):
        # NOTE: accessing self.fit_ raises AttributeError if the model has
        # not been fitted yet
        joblib.dump(dict(params=self.get_params(), coefs=self.get_coefs(), fit=self.fit_), filename=filename)

    def load(self, filename):
        obj = joblib.load(filename=filename)
        self.set_params(**obj['params'])
        self.set_coefs(obj['coefs'])
        self.fit_ = obj['fit']
class ThresholdingLinearClassifierMixin(LinearClassifierMixin):
    """Mixin for linear classifiers. A threshold is used to specify the positive
    class cutoff

    Handles prediction for sparse and dense X.
    """

    # fixed binary label set; index 0 is the negative class, index 1 the positive
    classes_ = np.array([-1., 1.])

    def __init__(self, **params):
        # assume 0 as positive score threshold
        self.pos_threshold = params.pop('pos_threshold', 0.0)
        super(ThresholdingLinearClassifierMixin, self).__init__(**params)

    def predict(self, X):
        """Predict class labels for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        C : array, shape = [n_samples]
            Predicted class label per sample.
        """
        scores = self.decision_function(X)
        if scores.ndim == 1:
            # binary case: the bool comparison yields {0, 1} indices into
            # classes_. Bug fix: np.int was deprecated (NumPy 1.20) and
            # removed (NumPy 1.24); the builtin int is the documented
            # replacement.
            indices = (scores >= self.pos_threshold).astype(int)
        else:
            indices = scores.argmax(axis=1)
        return self.classes_[indices]
class VWClassifier(SparseCoefMixin, ThresholdingLinearClassifierMixin, VW):
    """Vowpal Wabbit Classifier model

    Only supports binary classification currently. Use VW directly for multiclass classification
    note - don't try to apply link='logistic' on top of the existing functionality
    """

    def __init__(self, **params):
        # default to logistic loss unless the caller chose one explicitly
        params.setdefault('loss_function', 'logistic')
        super(VWClassifier, self).__init__(**params)

    def decision_function(self, X):
        """Predict confidence scores for samples.

        The confidence score for a sample is the signed distance of that
        sample to the hyperplane.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
            Samples.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Confidence scores per (sample, class) combination. In the binary
            case, confidence score for self.classes_[1] where >0 means this
            class would be predicted.
        """
        # call VW.predict explicitly: the thresholding mixin's predict()
        # calls decision_function, so going through the MRO would recurse
        raw_scores = VW.predict(self, X=X)
        return raw_scores
class VWRegressor(VW, RegressorMixin):
    """Vowpal Wabbit Regressor model.

    Inherits all fitting/prediction behaviour from VW and the default
    R^2 scoring from sklearn's RegressorMixin; adds nothing of its own.
    """
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.
    sample_weight : {array-like}, shape (n_samples,), optional
        sample weight vector relative to X.

    Returns
    -------
    out : {array-like}, shape (n_samples, 1)
        Training vectors in VW string format
    """
    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    # only convert y when provided; the original passed None through
    # np.array(), producing a useless 0-d object array
    if use_truth and not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if use_truth and y.ndim == 0:
        y = y.reshape(1)
    rows, cols = x.shape

    # sanitize characters that carry special meaning in the vw input format
    # ('|', ':', space, newline) when the array holds strings.
    # Bug fix: the original wrote `for row in rows` / `for col in cols`,
    # iterating over the plain ints returned by x.shape (TypeError);
    # iterate over range(...) instead. Also accept unicode dtype ('U') in
    # addition to bytes ('S'), since the str-valued regex cannot substitute
    # into bytes under Python 3 anyway.
    if x.dtype.kind in ('S', 'U'):
        for row in range(rows):
            for col in range(cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format
    s = io.BytesIO()
    dump_svmlight_file(x, np.zeros(rows), s)

    # parse entries to construct VW format
    rows = s.getvalue().decode('ascii').split('\n')[:-1]
    out = []
    for idx, row in enumerate(rows):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        # drop the dummy '0 ' label emitted by dump_svmlight_file, keeping
        # only the 'index:value' feature list
        features = row.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append(('{y} {w} |{ns} {x}'.format(y=truth, w=weight, ns=DEFAULT_NS, x=features)))
    s.close()

    return out