In [1]:
# -*- coding: utf-8 -*-
import numpy as np
from scipy import sparse
from sklearn.svm import LinearSVC
from sklearn.metrics import log_loss, roc_auc_score

from grad_hinge import batch_grad_hinge_loss, grad_hinge_loss_theta, hessian_hingle_loss_theta
from grad_hinge import inverse_hvp_hinge_newtonCG

from dataset import load_data_v1,select_from_one_class

import argparse
import time
import pdb
import os

# Seed NumPy's global RNG so that the label shuffling (np.random.shuffle) and
# the uniform baseline sampling (np.random.choice) below are repeatable.
np.random.seed(2019)

In [2]:
# Name of the data set to run on.
# NOTE: the processed data set was too large to ship with the supplementary
# material, so this notebook is provided for demonstration only and cannot be
# re-run as distributed.
dataset_name = "cancer"
# Numerator of the sigmoid slope used when building the sampling
# probabilities (it is divided by the range of the influence scores).
sigmoid_k = 10
# Fraction of the training set to keep when subsampling.
sample_ratio = 0.9
# Fraction of the training labels that are flipped to simulate label noise.
flip_ratio = 0.4


def acc_func(x, y):
    """Return the fraction of positions at which ``x`` and ``y`` agree.

    Parameters
    ----------
    x, y : numpy.ndarray
        Label arrays of equal length (e.g. ground truth and predictions).

    Returns
    -------
    float
        Number of matching entries divided by ``y.shape[0]``.
    """
    return (x == y).sum() / y.shape[0]

In [3]:
# Load the train / validation (Va) / test (Te) splits; va_ratio=0.3 asks the
# loader to hold out 30% as the Va set.
start_time = time.time()
x_train, y_train, x_va, y_va, x_te, y_te = load_data_v1(dataset_name, va_ratio=0.3)

# Report the number of samples and features of every feature matrix.
shape_reports = (
    ("x_train, nr sample {}, nr feature {}", x_train),
    ("x_va,    nr sample {}, nr feature {}", x_va),
    ("x_te,    nr sample {}, nr feature {}", x_te),
)
for template, features in shape_reports:
    print(template.format(features.shape[0], features.shape[1]))

# Report the class balance of every label vector (labels are still 0/1 here).
label_reports = (
    ("Tr: Pos {} Neg {}", y_train),
    ("Va: Pos {} Neg {}", y_va),
    ("Te: Pos {} Neg {}", y_te),
)
for template, labels in label_reports:
    positives = labels[labels == 1]
    negatives = labels[labels == 0]
    print(template.format(positives.shape[0], negatives.shape[0]))

elapsed = time.time() - start_time
print("Load data, cost {:.1f} sec".format(elapsed))

x_train, nr sample 350, nr feature 11
x_va,    nr sample 150, nr feature 11
x_te,    nr sample 183, nr feature 11
Tr: Pos 164 Neg 186
Va: Pos 33 Neg 117
Te: Pos 42 Neg 141
Load data, cost 0.0 sec


In [4]:
# Target size of the subsampled training set.
num_tr_sample = x_train.shape[0]
obj_sample_size = int(sample_ratio * num_tr_sample)

# The full-set model \hat{\theta}: a linear SVM with squared hinge loss,
# solved in the primal and without an intercept term.
clf = LinearSVC(loss="squared_hinge", dual=False, fit_intercept=False)

# Inject label noise: flip the 0/1 label of a random `flip_ratio` share of
# the training samples.
# NOTE: y_train, y_va and y_te are modified in place below, so this cell must
# be executed exactly once per kernel session.
idxs = np.arange(y_train.shape[0])
np.random.shuffle(idxs)
num_flip = int(flip_ratio * len(idxs))
flip_idx = idxs[:num_flip]
y_train[flip_idx] = np.logical_not(y_train[flip_idx]).astype(int)

# Re-encode the negative class from 0 to -1 in every split.
for labels in (y_train, y_va, y_te):
    labels[labels == 0] = -1

# Fit on the full (noisy) training set; as the last expression this also
# displays the fitted estimator.
clf.fit(x_train, y_train)


LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [5]:
# Keep a copy of the full-set model's weight vector for later comparison.
weight_ar = clf.coef_.flatten()

# Validation (Va) split.
y_va_pred = clf.predict(x_va)
full_acc = acc_func(y_va, y_va_pred)

# Test (Te) split.
# NOTE(review): the AUC is computed from the hard class predictions returned
# by clf.predict, not from continuous decision scores.
y_te_pred = clf.predict(x_te)
full_te_acc = acc_func(y_te, y_te_pred)
full_te_auc = roc_auc_score(y_te, y_te_pred)

# Print the full-set model's accuracies.
for template, score in (
    ("[FullSet] Va acc {:.6f}", full_acc),
    ("[FullSet] Te acc {:.6f}", full_te_acc),
):
    print(template.format(score))

[FullSet] Va acc 0.820000
[FullSet] Te acc 0.857923


In [6]:
# Time the whole influence-function (IF) computation.
if_start_time = time.time()

# Hard predictions of the full-set model on the training samples.
# (clf.decision_function(x_train) was considered as an alternative input but
# is not used.)
tr_pred = clf.predict(x_train)
# Loss gradient evaluated on the validation split.
test_grad_loss_val = grad_hinge_loss_theta(y_va, y_va_pred, x_va)

# Number of training rows handled per gradient batch.
batch_size = 10000

# Inverse-Hessian-vector product of the validation gradient.
# NOTE(review): the meaning of the trailing positional arguments
# (1e-5, True, 1.0) is defined in grad_hinge and not visible here.
iv_hvp = inverse_hvp_hinge_newtonCG(
    x_train, y_train, tr_pred, test_grad_loss_val, 1e-5, True, 1.0
)

# Influence score of every training sample, accumulated batch by batch.
total_batch = int(np.ceil(x_train.shape[0] / float(batch_size)))
predicted_loss_diff = []
for batch_no in range(total_batch):
    rows = slice(batch_no * batch_size, (batch_no + 1) * batch_size)
    train_grad_loss_val = batch_grad_hinge_loss(
        y_train[rows], tr_pred[rows], x_train[rows]
    )
    batch_scores = np.array(train_grad_loss_val.dot(iv_hvp)).flatten()
    predicted_loss_diff.extend(batch_scores)

predicted_loss_diffs = np.asarray(predicted_loss_diff)

duration = time.time() - if_start_time
print("The Influence function's computation completed, cost {:.1f} sec".format(duration))

Function value: -0.08498508673969181
Split function value: 0.0, -0.08498508673969181
Optimization terminated successfully.
         Current function value: -0.084985
         Iterations: 1
         Function evaluations: 2
         Gradient evaluations: 2
         Hessian evaluations: 1
implicit hessian-vector products mean: -0.07872199256311764
implicit hessian-vector products norm: 0.29152201758991003
Inverse HVP took 0.0 sec
The Influence function's computation completed, cost 0.0 sec


In [7]:
def _score_on_va_te(model):
    """Score an already fitted classifier on the Va and Te splits.

    Each split is predicted exactly once and the notebook's shared
    ``acc_func`` is used for accuracy.

    Returns
    -------
    tuple
        ``(va_pred, te_pred, va_acc, te_acc, te_auc)``. The AUC is computed
        from the hard test predictions, matching how ``full_te_auc`` is
        computed for the full-set model.
    """
    va_pred = model.predict(x_va)
    te_pred = model.predict(x_te)
    return (
        va_pred,
        te_pred,
        acc_func(y_va, va_pred),
        acc_func(y_te, te_pred),
        roc_auc_score(y_te, te_pred),
    )


print("=="*30)
print("IF Stats: mean {:.10f}, max {:.10f}, min {:.10f}".format(
    predicted_loss_diffs.mean(), predicted_loss_diffs.max(), predicted_loss_diffs.min())
)

# Build the sampling probability: a sigmoid of the negated influence scores
# whose slope is sigmoid_k divided by the range of those scores.
# NOTE(review): if every influence score is identical, IF_interval is 0 and
# the division below is a division by zero.
phi_ar = - predicted_loss_diffs
IF_interval = phi_ar.max() - phi_ar.min()
a_param = sigmoid_k / IF_interval
prob_pi = 1 / (1 + np.exp(a_param * phi_ar))
print("Pi Stats:",np.percentile(prob_pi,[10,25,50,75,90]))

# Subsample each class separately with the probabilities above, then merge.
pos_idx = select_from_one_class(y_train,prob_pi,1,sample_ratio)
neg_idx = select_from_one_class(y_train,prob_pi,-1,sample_ratio)
sb_idx = np.union1d(pos_idx,neg_idx)
sb_x_train = x_train[sb_idx]
sb_y_train = y_train[sb_idx]
sb_size = sb_x_train.shape[0]

# Train and score the subset model \tilde{\theta}.
clf.fit(sb_x_train,sb_y_train)
y_va_pred, y_te_pred, sb_acc, sb_te_acc, sb_te_auc = _score_on_va_te(clf)
# Distance between the full-set and subset weight vectors; must be read
# before clf is refitted for the baseline below.
sb_weight = clf.coef_.flatten()
diff_w_norm = np.linalg.norm(weight_ar - sb_weight)

# Baseline: uniform random sampling of obj_sample_size training samples.
u_idxs = np.arange(x_train.shape[0])
uniform_idxs = np.random.choice(u_idxs,obj_sample_size,replace=False)
us_x_train = x_train[uniform_idxs]
us_y_train = y_train[uniform_idxs]
us_size = us_x_train.shape[0]
clf.fit(us_x_train, us_y_train)
y_va_pred, y_te_pred, us_acc, us_te_acc, us_te_auc = _score_on_va_te(clf)

# Summary: influence-based subsampling vs. random baseline vs. full set.
print("=="*30)
print("Result Summary on Va")
print("[SigUIDS]  acc {:.6f}, # {}".format(sb_acc,sb_size))
print("[Random]   acc {:.6f}, # {}".format(us_acc,us_size))
print("[Full]     acc {:.6f}, # {}".format(full_acc,num_tr_sample))
print("Result Summary on Te")
print("[SigUIDS]  acc {:.6f}, # {}".format(sb_te_acc,sb_size))
print("[Random]   acc {:.6f}, # {}".format(us_te_acc,us_size))
print("[Full]     acc {:.6f}, # {}".format(full_te_acc,num_tr_sample))
print("Result Summary on Te")
print("[SigUIDS]  auc {:.6f}, # {}".format(sb_te_auc,sb_size))
print("[Random]   auc {:.6f}, # {}".format(us_te_auc,us_size))
print("[Full]     auc {:.6f}, # {}".format(full_te_auc,num_tr_sample))
print("=="*30)

IF Stats: mean -0.1603376189, max 0.7835683592, min -0.8479409791
Pi Stats: [0.00814249 0.14694704 0.5        0.5        0.5       ]
Result Summary on Va
[SigUIDS]  acc 0.920000, # 314
[Random]   acc 0.720000, # 315
[Full]     acc 0.820000, # 350
Result Summary on Te
[SigUIDS]  acc 0.961749, # 314
[Random]   acc 0.759563, # 315
[Full]     acc 0.857923, # 350
Result Summary on Te
[SigUIDS]  auc 0.950101, # 314
[Random]   auc 0.743668, # 315
[Full]     auc 0.824215, # 350
