In [None]:
# Binary Classification

from __future__ import print_function

# import warnings filter
from warnings import simplefilter
# ignore future warning stating that import of LinearClassifierMixin and SparseCoefMixin will be 
# removed from sklearn v0.24
simplefilter(action='ignore', category=FutureWarning)

from vowpalwabbit.sklearn_vw import VWClassifier

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# synthetic binary classification data: 10,000 samples, features cast to float32
X, y = datasets.make_hastie_10_2(random_state=1, n_samples=10000)
X = X.astype(np.float32)

# hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=256, test_size=0.2)

# fit a Vowpal Wabbit classifier with its default settings
model = VWClassifier()
model.fit(X_train, y_train)

# report the score on the data the model was fit on, then on the held-out data
train_score = model.score(X_train, y_train)
print('training score: {}'.format(train_score))
test_score = model.score(X_test, y_test)
print('testing score: {}'.format(test_score))

In [None]:
# Parameter Grid Search
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py

import pandas as pd
from operator import itemgetter
from time import time
try:
    from sklearn.grid_search import RandomizedSearchCV
except ImportError as _:
    from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import uniform

# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a short summary of the best-scoring rows of a results table.

    The rows with the ``n_top`` largest ``mean_test_score`` values are
    selected (highest first). For each one, its position in that ordering
    (starting at 1), its mean and standard deviation of the test score, and
    its parameter setting are printed, followed by an empty line.

    Parameters
    ----------
    grid_scores : pandas.DataFrame
        Results table; the ``mean_test_score``, ``std_test_score`` and
        ``params`` columns are read.
    n_top : int, default 3
        How many rows to print.

    Returns
    -------
    None
    """
    best_rows = grid_scores.nlargest(n_top, 'mean_test_score')
    for rank, row in enumerate(best_rows.itertuples(index=False), start=1):
        print("Model with rank: {0}".format(rank))
        score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
        print(score_line.format(row.mean_test_score, row.std_test_score))
        print("Parameters: {0}".format(row.params))
        print("")
        
# Randomized search: n_iter candidate settings are sampled from the
# distributions/lists below (this is not an exhaustive grid).
# No random_state is passed to the search, so it presumably draws from NumPy's
# global RNG, which is seeded here for repeatability.
np.random.seed(0)
n_iter = 20
# scipy's uniform(loc, scale) spans [loc, loc + scale]; uniform() is [0, 1]
params = {"l2": uniform(0.0001, 0.01),
          "l": [0.01, 0.1, 1.0],
          "power_t": uniform()}

# run search
search = RandomizedSearchCV(VWClassifier(), param_distributions=params, n_iter=n_iter)
start = time()
search.fit(X, y)
# stop the clock immediately after fitting so the reported duration covers the
# search only, not the DataFrame construction that follows
elapsed = time() - start

# one row per sampled candidate
results = pd.DataFrame(search.cv_results_)

print("Parameter search took %.2f seconds for %d candidate parameter settings."
      % (elapsed, results.shape[0]))
report(results)

In [None]:
# refit on the training split with logistic loss and hand-picked l / l2 values
model = VWClassifier(l=0.01, l2=0.1, loss_function='logistic')
model.fit(X_train, y_train)

# score on the fitting data first, then on the held-out split
train_score = model.score(X_train, y_train)
print('training score: {}'.format(train_score))
test_score = model.score(X_test, y_test)
print('testing score: {}'.format(test_score))

# drop the reference to the classifier now that evaluation is done
del model

In [None]:
# Linear Regression

from vowpalwabbit.sklearn_vw import VWRegressor
from sklearn import datasets

# scikit-learn's bundled diabetes data: feature matrix and regression target
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# fit a Vowpal Wabbit regressor on the full dataset (no train/test split here)
model = VWRegressor(l=100)
model.fit(X, y)

# inspect the fitted model: intercept, first ten predictions, score on the fit data
intercept = model.get_intercept()
print('intercept: {}'.format(intercept))
first_predictions = model.predict(X[:10])
print('predictions: {}'.format(first_predictions))
r2_score = model.score(X, y)
print('training R2 score: {}'.format(r2_score))

In [None]:
# Write the fitted regressor to disk, discard it, and load it into a new instance
model_path = 'test.model'
model.save(model_path)
del model

model = VWRegressor()
model.load(model_path)

# print the same diagnostics as before saving so the values can be compared
intercept = model.get_intercept()
print('intercept: {}'.format(intercept))
first_predictions = model.predict(X[:10])
print('predictions: {}'.format(first_predictions))
r2_score = model.score(X, y)
print('training R2 score: {}'.format(r2_score))