In [2]:
! pwd

/Users/Home/Documents/QML/Hackathon/qml_hackathon


In [2]:
import os

import pandas as pd
import numpy as np

X_tr = np.load('X_train.npy')
X_te = np.load('X_test.npy')
y1_tr = np.load('y1_train.npy')
y2_tr = np.load('y2_train.npy')

# Fill in your API token:


sapi_token = 'CDL8-df1de1d5d76560ee73a82ffca3833a1a444536d3'
url = 'https://cloud.dwavesys.com/sapi'
token = sapi_token
solver_name = 'c4-sw_sample'#'DW_2000Q_2'

# import necessary packages
from sklearn import preprocessing, metrics
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.datasets.mldata import fetch_mldata
from sklearn.datasets import load_breast_cancer
from dwave.system.samplers import DWaveSampler
from dwave.system.composites import EmbeddingComposite

from qboost import WeakClassifiers, QBoostClassifier, QboostPlus


# Define the functions required in this example

def metric(y, y_pred):
    """
    :param y: true label
    :param y_pred: predicted label
    :return: metric score
    """

    return metrics.accuracy_score(y, y_pred)

# performance metric

def train_model(X_train, y_train, X_test, y_test, lmd):
    """
    :param X_train: training data
    :param y_train: training label
    :param X_test: testing data
    :param y_test: testing label
    :param lmd: lambda used in regularization
    :return:
    """

    # define parameters used in this function
    NUM_READS = 1000
    NUM_WEAK_CLASSIFIERS = 30
    TREE_DEPTH = 4
    DW_PARAMS = {'num_reads': NUM_READS,
                 'auto_scale': True,
                 'num_spin_reversal_transforms': 10,
                 'postprocess': 'optimization',
                 }

    # define sampler
    dwave_sampler = DWaveSampler(token=sapi_token, endpoint = url)
    emb_sampler = EmbeddingComposite(dwave_sampler)

    N_train = len(X_train)
    N_test = len(X_test)
    print("\n======================================")
    print("Train size: %d, Test size: %d" %(N_train, N_test))
    print('Num weak classifiers:', NUM_WEAK_CLASSIFIERS)

    # Preprocessing data
    imputer = preprocessing.Imputer()
    scaler = preprocessing.StandardScaler()
    normalizer = preprocessing.Normalizer()

    X_train = scaler.fit_transform(X_train)
    X_train = normalizer.fit_transform(X_train)

    X_test = scaler.fit_transform(X_test)
    X_test = normalizer.fit_transform(X_test)
    
    ## Adaboost
    print('\nAdaboost')
    clf1 = AdaBoostClassifier(n_estimators=NUM_WEAK_CLASSIFIERS)
    clf1.fit(X_train, y_train)
    y_train1 = clf1.predict(X_train)
    y_test1 = clf1.predict(X_test)
#     print(clf1.estimator_weights_)
    print('accu (train): %5.2f'%(metric(y_train, y_train1)))
    print('accu (test): %5.2f'%(metric(y_test, y_test1)))

        # Ensembles of Decision Tree
    print('\nDecision tree')
    clf2 = WeakClassifiers(n_estimators=NUM_WEAK_CLASSIFIERS, max_depth=TREE_DEPTH)
    clf2.fit(X_train, y_train)
    y_train2 = clf2.predict(X_train)
    y_test2 = clf2.predict(X_test)
#     print(clf2.estimator_weights)
    print('accu (train): %5.2f' % (metric(y_train, y_train2)))
    print('accu (test): %5.2f' % (metric(y_test, y_test2)))
    
    # Random forest
    print('\nRandom Forest')
    clf3 = RandomForestClassifier(max_depth=TREE_DEPTH, n_estimators=NUM_WEAK_CLASSIFIERS)
    clf3.fit(X_train, y_train)
    y_train3 = clf3.predict(X_train)
    y_test3 = clf3.predict(X_test)
    print('accu (train): %5.2f' % (metric(y_train, y_train3)))
    print('accu (test): %5.2f' % (metric(y_test, y_test3)))
    
    # Qboost
    print('\nQBoost')
    clf4 = QBoostClassifier(n_estimators=NUM_WEAK_CLASSIFIERS, max_depth=TREE_DEPTH)
    clf4.fit(X_train, y_train, emb_sampler, lmd=lmd, **DW_PARAMS)
    y_train4 = clf4.predict(X_train)
    y_test4 = clf4.predict(X_test)
    print(clf4.estimator_weights)
    print('accu (train): %5.2f' % (metric(y_train, y_train4)))
    print('accu (test): %5.2f' % (metric(y_test, y_test4)))

#    QboostPlus
    print('\nQBoostPlus')
    clf5 = QboostPlus([clf1, clf2, clf3, clf4])
    clf5.fit(X_train, y_train, emb_sampler, lmd=lmd, **DW_PARAMS)
    y_train5 = clf5.predict(X_train)
    y_test5 = clf5.predict(X_test)
    print(clf5.estimator_weights)
    print('accu (train): %5.2f' % (metric(y_train, y_train5)))
    print('accu (test): %5.2f' % (metric(y_test, y_test5)))

    
    return [clf4]

# start training the model

idx = np.arange(len(X_tr))
np.random.shuffle(idx)  # shuffles index

y_train = y1_tr
y_bin = 2*(y1_tr >0.25) - 1

X_train = X_tr[:int(len(idx)*.8)]
y_train = y_bin[:int(len(idx)*.8)]

X_test = X_tr[int(len(idx)*.8):]
y_test = y_bin[int(len(idx)*.8):]

# start training the model
#X_train = X_tr
#y_train = y1_tr
#y_train = 2*(y_train >0.25) - 1
#X_test = X_train
#y_test = y_train


In [3]:
clfs = train_model(X_train, y_train, X_test, y_test, 1.0)



Train size: 1920, Test size: 480
('Num weak classifiers:', 30)

Adaboost
accu (train):  0.93
accu (test):  0.89

Decision tree
accu (train):  0.97
accu (test):  0.91

Random Forest
accu (train):  0.92
accu (test):  0.90

QBoost
[1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1]
accu (train):  0.94
accu (test):  0.88

QBoostPlus
[1 1 1 1]
accu (train):  0.95
accu (test):  0.91


In [4]:
# create bins and allocate y to each bin

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

y = y1_tr
x = X_tr

# split into n_classes, or into n_classes bins
n_classes = 10
bins = np.linspace(np.min(y), np.max(y), num=n_classes)
inds = np.digitize(y, bins)

inds

array([2, 4, 3, ..., 2, 5, 2])

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
from sklearn.metrics import accuracy_score
# implement one versus rest classifier

random_state = np.random.RandomState(0)
y = inds # label_binarize(inds, classes=bins)
#n_classes = y.shape[1]
print np.min(y),np.max(y)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2,
                                                    random_state=0)
print len(X_test)

#OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train).predict(X_test)

#classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
#y_score = classifier.fit(X_train, y_train)
#y_pred = classifier.predict(X_test)
#acc = accuracy_score(y_test, y_pred)
#print acc

1 10
480


In [7]:
# run qboost with dicrete variables
# does not work well

clfs = train_model(X_train, y_train, X_test, y_test, 1.0)


Train size: 1920, Test size: 480
('Num weak classifiers:', 30)

Adaboost
accu (train):  0.29
accu (test):  0.27

Decision tree
accu (train):  0.10
accu (test):  0.08

Random Forest
accu (train):  0.61
accu (test):  0.51

QBoost
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
accu (train):  0.10
accu (test):  0.08

QBoostPlus
[1 1 1 1]
accu (train):  0.10
accu (test):  0.08


In [9]:
# one vs rest classifer with AdaBoost
# works well, but low performance due to unbalances classes 
NUM_WEAK_CLASSIFIERS=30
classifier = OneVsRestClassifier(AdaBoostClassifier(n_estimators=NUM_WEAK_CLASSIFIERS))
y_score = classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print y_pred, acc

[4 4 2 1 6 3 1 2 5 6 4 4 4 2 2 3 2 7 3 4 2 3 2 4 1 1 3 4 4 1 4 3 4 4 4 3 2
 6 2 5 1 2 2 4 1 5 1 3 2 4 1 6 2 4 4 2 1 2 3 2 2 2 2 3 6 4 3 4 4 3 6 3 6 2
 3 5 5 2 4 2 1 4 2 3 3 3 5 3 4 4 2 3 5 5 4 3 4 3 3 3 4 2 3 4 6 5 4 3 5 6 1
 4 3 3 2 1 4 5 5 2 4 2 4 5 6 5 4 2 3 6 3 5 2 4 5 2 7 4 4 2 6 3 1 6 2 2 2 2
 5 4 5 3 5 5 5 4 3 3 6 2 3 4 4 6 2 2 4 6 2 3 2 4 5 7 4 4 3 4 2 2 6 5 2 4 4
 5 5 4 3 5 4 4 3 3 3 2 2 2 1 3 1 2 3 3 3 6 3 4 5 4 3 4 2 3 5 5 4 3 3 4 3 2
 4 1 2 1 2 4 3 3 4 4 5 4 4 4 3 3 2 4 2 2 6 3 3 5 1 2 3 1 4 5 4 4 2 3 4 7 4
 2 6 4 2 3 6 2 6 4 2 4 2 5 3 4 4 2 7 2 2 4 4 4 6 3 4 4 4 4 5 3 6 2 5 2 3 5
 4 2 3 1 2 3 2 4 5 6 3 4 3 5 3 1 4 2 3 2 4 4 6 1 3 4 2 2 1 5 6 2 3 4 2 3 5
 3 6 3 5 3 4 2 2 2 2 5 5 3 3 2 4 4 2 3 3 4 6 4 3 4 3 2 2 4 4 4 3 2 2 3 4 4
 2 2 6 4 2 4 4 2 6 5 5 2 2 2 4 2 5 1 3 2 5 4 2 6 3 3 4 5 5 3 2 3 6 4 3 5 4
 3 4 2 2 4 3 3 3 4 5 3 6 3 4 1 4 3 4 3 6 1 4 2 6 2 4 3 3 3 5 5 2 4 1 3 5 6
 4 4 4 6 2 4 5 1 4 4 4 3 5 4 3 1 6 7 1 3 4 6 3 4 3 3 5 3 3 4 4 6 4 6 6 5] 0.6125


In [50]:
## One vs Rest classifier

y = y1_tr
x = X_tr

# split into n_classes, or into n_classes bins
n_classes = 3
bins = np.linspace(np.min(y), np.max(y), num=n_classes)
inds = np.digitize(y, bins)


# split data

X_train, X_test, y_train, y_test = train_test_split(x, inds, test_size=.2,
                                                    random_state=0)
##

NUM_READS = 1000
NUM_WEAK_CLASSIFIERS = 30
TREE_DEPTH = 4
DW_PARAMS = {'num_reads': NUM_READS,
             'auto_scale': True,
             'num_spin_reversal_transforms': 10,
             'postprocess': 'optimization',
             }

# define sampler
dwave_sampler = DWaveSampler(token=sapi_token, endpoint = url)
emb_sampler = EmbeddingComposite(dwave_sampler)
lmd = 0.2

classifiers = []
predictions = []
pred_test_labels = [0]*len(y_test)
pred_train_labels = [0]*len(y_train)

for i in range(n_classes):
    print 'Working on Qboost: ',i+1,'/',n_classes
    new_label = 2*(y_train==i+1)-1
     
    new_label = np.array(new_label)
    clf = QBoostClassifier(n_estimators=NUM_WEAK_CLASSIFIERS, max_depth=TREE_DEPTH)
    clf.fit(X_train, new_label, emb_sampler, lmd=lmd, **DW_PARAMS)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    classifiers.append(clf)
    predictions.append(y_test_pred)

for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        if predictions[i][j] == 1:
            pred_test_labels[j] = i+1
            
print pred_test_labels
acc = accuracy_score(y_test, pred_test_labels)

print 'Accuracy Score:', acc

Working on Qboost:  1 / 3
Working on Qboost:  2 / 3
Working on Qboost:  3 / 3
[1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 