# HIGGS example using IBM PowerAI Snap ML

In this example we will train a Logistic Regression model on the HIGGS dataset, using both scikit-learn and snap-ml-local.

The HIGGS dataset is available in the UCI machine learning repository.

### Preprocess the data

### Training and Evaluating a Logistic Regression Model using CPU

In [None]:
# Download and decompress the data from the LIBSVM repository
# This may take some time! 
!mkdir -p data; cd data; wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/HIGGS.bz2; bunzip2 HIGGS.bz2; cd ../

In [8]:
# Preprocess the data: load the LIBSVM file, split it, densify, normalize and
# save the resulting arrays under <defaultPath>/data/.
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

defaultPath = "."

# X is loaded in sparse form (it is densified below); y holds the labels
X,y = load_svmlight_file(defaultPath + "/data/HIGGS")

# Make the train-test split (25% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert to dense numpy arrays.
# toarray() yields an ndarray directly; going through todense() and np.array()
# creates an intermediate np.matrix plus an extra full copy of the dense data.
X_train = X_train.toarray()
X_test  = X_test.toarray()

# L1-normalize each row (sample) of both the training and the test matrices
X_train = normalize(X_train, axis=1, norm='l1')
X_test  = normalize(X_test,  axis=1, norm='l1')

# Save the dense matrices (np.save appends the ".npy" extension)
np.save(defaultPath + "/data/HIGGS.X_train", X_train)
np.save(defaultPath + "/data/HIGGS.X_test",  X_test)

# Save the labels
np.save(defaultPath + "/data/HIGGS.y_train", y_train)
np.save(defaultPath + "/data/HIGGS.y_test", y_test)

In [9]:
# Load the preprocessed HIGGS arrays from disk and report how long it takes
import argparse
import time

import numpy as np
from scipy import sparse

defaultPath = "."

# Files are read in a fixed order: train/test features first, then train/test labels
t0 = time.time()
X_train, X_test, y_train, y_test = (
    np.load(defaultPath + npy_file)
    for npy_file in (
        "/data/HIGGS.X_train.npy",
        "/data/HIGGS.X_test.npy",
        "/data/HIGGS.y_train.npy",
        "/data/HIGGS.y_test.npy",
    )
)
load_seconds = time.time() - t0
print("Data load time (s):  {0:.2f}".format(load_seconds))

Data load time (s):  11.59


In [10]:
# ---- snap.ml: logistic regression from pai4sk, CPU only ----
from pai4sk import LogisticRegression
lr = LogisticRegression(
    use_gpu=False,
    max_iter=15,
    dual=True,
    num_threads=32,
    device_ids=[],
)

# Fit on the training split and report the wall-clock time of fit() alone
t0 = time.time()
lr.fit(X_train, y_train)
fit_seconds = time.time() - t0
print("[snap.ml] Training time (s):  {0:.2f}".format(fit_seconds))

# Class probabilities for the held-out split
proba_test = lr.predict_proba(X_test)

# Log-loss of the snap.ml model on the test set
from sklearn.metrics import log_loss
logloss_snap = log_loss(y_test, proba_test)
print("[snap.ml] Logarithmic loss:   {0:.4f}".format(logloss_snap))

# ---- scikit-learn: same experiment with the liblinear solver ----
# NOTE: this import rebinds the name LogisticRegression (and, below, lr)
# from the pai4sk class/model to the scikit-learn one.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(
    fit_intercept=False,
    dual=True,
    solver="liblinear",
)

# Fit on the training split and report the wall-clock time of fit() alone
t0 = time.time()
lr.fit(X_train, y_train)
fit_seconds = time.time() - t0
print("[sklearn] Training time (s):  {0:.2f}".format(fit_seconds))

# Class probabilities for the held-out split
proba_test = lr.predict_proba(X_test)

# Log-loss of the scikit-learn model on the test set
logloss_sklearn = log_loss(y_test, proba_test)
print("[sklearn] Logarithmic loss:   {0:.4f}".format(logloss_sklearn))

[Info] Training will run in multi-threaded mode on CPU.
[snap.ml] Training time (s):  6.99
[snap.ml] Logarithmic loss:   0.6374
[sklearn] Training time (s):  76.45
[sklearn] Logarithmic loss:   0.6374
