In [10]:
!pip3 install cvxpy mosek
# !pip3 install scipy --upgrade

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting mosek
  Downloading Mosek-10.0.45-cp38-cp38-manylinux2014_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 8.2 MB/s eta 0:00:01
Installing collected packages: mosek
Successfully installed mosek-10.0.45


In [1]:
import os
import pickle as pkl

import cvxpy as cp
import numpy as np
import scipy.linalg as spl
import scipy.sparse as sp
import torch
import torch.nn.functional as F
import torch.optim as optim
from scipy.linalg import pinv, inv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Utils

In [2]:
def normalize_adj(adj):
    """
    Symmetric degree normalisation of an adjacency matrix.

    Every entry (i, j) is scaled by 1 / sqrt(d_i * d_j), where d is the vector
    of row sums. Rows whose sum is zero get a scaling factor of 0 instead of
    infinity. The result is returned as a scipy COO matrix.
    """
    adj_coo = sp.coo_matrix(adj)
    row_degrees = np.array(adj_coo.sum(1))

    # d^{-1/2}; zero-degree rows produce inf, which is replaced by 0
    inv_sqrt_deg = np.power(row_degrees, -0.5).flatten()
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)

    # (A D^{-1/2})^T D^{-1/2}
    column_scaled = adj_coo.dot(scaling)
    normalized = column_scaled.transpose().dot(scaling)
    return normalized.tocoo()

def propagation_matrix(adj, alpha=0.85, sigma=1, nodes=None):
    """
    Computes the propagation matrix  (1-alpha)(I - alpha D^{-sigma} A D^{sigma-1})^{-1}.

    Parameters
    ----------
    adj : sp.spmatrix, shape [n, n]
        Sparse adjacency matrix. Integer-typed matrices are accepted; degrees
        are converted to float before the negative powers are taken.
    alpha : float
        (1-alpha) is the teleport probability.
    sigma
        Hyper-parameter controlling the propagation style.
        Set sigma=1 to obtain the PPR matrix.
    nodes : np.ndarray, shape [?]
        Nodes for which we want to compute Personalized PageRank.
        If None, all nodes are used.

    Returns
    -------
    prop_matrix : np.ndarray, shape [n, n], or [len(nodes), n] when `nodes` is given
        Propagation matrix; row i is the row of the full matrix for node
        `nodes[i]` (or node i when `nodes` is None).
    """
    n = adj.shape[0]

    # Float degrees: np.power on an integer array with a negative integer
    # exponent raises ValueError. np.asarray(...).ravel() also avoids relying
    # on the np.matrix-only `.A1` attribute.
    deg = np.asarray(adj.sum(1), dtype=float).ravel()

    # Zero-degree nodes would give inf scaling factors; use 0 for them instead,
    # the same convention normalize_adj applies.
    with np.errstate(divide='ignore'):
        deg_min_sig_diag = np.power(deg, -sigma)
        deg_sig_min_diag = np.power(deg, sigma - 1)
    deg_min_sig_diag[~np.isfinite(deg_min_sig_diag)] = 0.
    deg_sig_min_diag[~np.isfinite(deg_sig_min_diag)] = 0.

    deg_min_sig = sp.diags(deg_min_sig_diag)
    deg_sig_min = sp.diags(deg_sig_min_diag)
    pre_inv = sp.eye(n) - alpha * deg_min_sig @ adj @ deg_sig_min

    # solve for x in: pre_inv.T @ x = b, i.e. pick rows of pre_inv^{-1}
    b = np.eye(n)
    if nodes is not None:
        b = b[:, nodes]

    return (1 - alpha) * spl.solve(pre_inv.toarray().T, b).T


def flip_label(lbl, total_classes):
    """
    Return a copy of the one-hot label `lbl` with the hot entry moved to a
    uniformly chosen class other than its argmax. The input is left untouched.
    """
    true_class = np.argmax(lbl)

    # every class index except the current one, in increasing order
    wrong_classes = np.flatnonzero(np.arange(total_classes) != true_class)

    flipped = lbl.copy()
    flipped[true_class] = 0.
    flipped[np.random.choice(wrong_classes)] = 1.
    return flipped


def poison_labels_random(labels, train_idx, BUDGET):
    """
    Randomly poison `BUDGET` of the rows listed in `train_idx`.

    The rows are sampled without replacement and each sampled row is replaced
    by `flip_label` of itself. Works on a copy; `labels` is not modified.
    """
    poisoned = labels.copy()
    n_classes = poisoned.shape[1]

    victims = np.random.choice(train_idx, size=BUDGET, replace=False)
    for node in victims:
        poisoned[node] = flip_label(poisoned[node], n_classes)

    return poisoned

# SGC Binary Attack

In [3]:
def sgc_binclass(pie, A_square_X, SGC_solution, 
                   train_ids, test_ids, dataset, 
                   poison_budget, CV, flip=False, yhat=None, verbose=False):   
    """
    Binary label-flipping attack on the closed-form SGC solution.

    The two most frequent classes (a and b) are selected. A boolean program
    then chooses exactly `poison_budget` training nodes from those two classes
    whose label is swapped (a <-> b), minimising the summed scores that the
    re-fitted model assigns to the target labels of the test nodes.

    Parameters
    ----------
    pie
        Not read by this function; kept so existing call sites keep working.
    A_square_X
        Propagated features, one row per node; predictions are
        ``A_square_X @ W``.
    SGC_solution
        Linear map from training labels to weights
        (``W = SGC_solution @ Y_train``).
    train_ids, test_ids
        Row indices into ``dataset['labels']``.
    dataset : dict
        Must contain ``'labels'`` (n x c). Labels are decoded with ``argmax``,
        so a one-hot encoding is assumed.
    poison_budget
        Number of a/b training labels to flip.
    CV
        Not read by this function; kept for call compatibility.
    flip : bool
        If True, the solver's choice is inverted: the a/b training nodes the
        solver left untouched are swapped instead, and W and the test accuracy
        are recomputed from those labels.
    yhat
        Optional (n x c) target matrix used in the objective instead of the
        true labels.
    verbose : bool
        Print the test accuracy (only when ``flip`` is False; with ``flip``
        the recomputed accuracy is always printed).

    Returns
    -------
    (Y_copy, H_long, W, test_acc_lp, ab_train_ids)
        Copy of the label matrix with poisoned training rows, the cvxpy
        selection variable, the weights, the test accuracy against the true
        labels, and the positions (within ``train_ids``) of the a/b nodes.

    Raises
    ------
    RuntimeError
        If the solver returns no solution (e.g. the budget exceeds the number
        of a/b training nodes).
    """
    Y = dataset['labels']
    Y_L = Y[train_ids]
    num_classes = Y.shape[1]

    # Decode numeric labels from the dataset passed in rather than depending
    # on a notebook-level `y_gt` global.
    y_gt = Y.argmax(1)
    y_train = y_gt[train_ids]

    # the two most frequent classes
    class_a, class_b = np.bincount(y_gt).argsort()[::-1][:2]

    is_a = y_train == class_a
    is_b = y_train == class_b

    # positions (within train_ids) of the nodes belonging to class a or b
    ab_train_ids = np.flatnonzero(is_a | is_b).tolist()

    print("Total ab ids:", len(ab_train_ids))

    # construct Y_L_flipped: a <-> b swapped, every other class unchanged
    swapped = y_train.copy()
    swapped[is_a] = class_b
    swapped[is_b] = class_a
    Y_L_flipped = np.eye(num_classes)[swapped]

    # SGC-binary Objective

    # H_long[i] == 1  <=>  training node i receives its swapped label
    H_long = cp.Variable((len(train_ids), 1), boolean=True)

    # poisoned labels of size (n x c)
    poison_preds = cp.multiply(H_long, Y_L_flipped)

    # labels of the nodes that are left untouched
    clean_labels = cp.multiply(1 - H_long, Y_L)

    # create poisoned training labels by combining clean labels and poison preds
    Y_poisoned = poison_preds + clean_labels

    # W* by using SGC closed form
    W = SGC_solution @ Y_poisoned

    # predictions of SGC using W*
    Y_pred_poisoned = A_square_X @ W

    # budget constraint: exactly `poison_budget` flips among the a/b nodes
    constraints = [cp.sum(H_long[ab_train_ids]) == poison_budget]

    # minimise the scores assigned to the target labels of the test nodes
    target = Y if yhat is None else yhat
    multi_obj = cp.Minimize(cp.sum(cp.multiply(Y_pred_poisoned[test_ids], target[test_ids])))

    # Solve
    prob = cp.Problem(multi_obj, constraints) 
    prob.solve(solver=cp.MOSEK, verbose=False)

    if Y_pred_poisoned.value is None:
        raise RuntimeError(
            "sgc_binclass: solver returned no solution (status: {})".format(prob.status))

    predictions_argmax = Y_pred_poisoned.value.argmax(1)
    labels_argmax = y_gt

    test_acc_lp = accuracy_score(predictions_argmax[test_ids], labels_argmax[test_ids])

    if verbose and not flip:
        print("Test Acc: {:.4f}".format(test_acc_lp))        

    # create poisoned labels 
    Y_copy = Y.copy()
    poisoned_labels = Y_poisoned.value.argmax(1)
    Y_copy[train_ids] = 0.
    Y_copy[train_ids, poisoned_labels] = 1.

    if flip:
        # a/b nodes whose label the solver did NOT change
        untouched = Y_copy.argmax(1)[train_ids][ab_train_ids] == y_train[ab_train_ids]
        flip_ids = np.array(ab_train_ids)[untouched]

        # start again from the clean labels and swap exactly those nodes
        Y_copy = Y.copy()

        for flip_id in flip_ids:
            node = train_ids[flip_id]
            flip_class = Y_copy[node].argmax()
            new_class = class_b if flip_class == class_a else class_a
            Y_copy[node] = np.eye(num_classes)[new_class]

        # recompute SGC solution and test accuracy
        W = SGC_solution @ Y_copy[train_ids]
        Y_pred_poisoned = A_square_X @ W
        predictions_argmax = Y_pred_poisoned.argmax(1)
        test_acc_lp = accuracy_score(predictions_argmax[test_ids], labels_argmax[test_ids])
        print("Test Acc: {:.4f}".format(test_acc_lp))

        return Y_copy, H_long, W, test_acc_lp, ab_train_ids

    else:
        return Y_copy, H_long, W.value, test_acc_lp, ab_train_ids

# SGC MultiClass Attack

In [5]:
def sgc_multiclass(pie, A_square_X, SGC_solution, 
                   train_ids, test_ids, dataset, 
                   poison_budget, CV, verbose=False):   
    """
    Multi-class label-flipping attack on the closed-form SGC solution.

    For every training node there is one boolean variable per false class.
    The program selects exactly `poison_budget` (node, false class) pairs, at
    most one per node, so that the summed scores the re-fitted model assigns
    to the true labels of the test nodes is minimised.

    Parameters
    ----------
    pie
        Not read by this function; kept so existing call sites keep working.
    A_square_X
        Propagated features, one row per node; predictions are
        ``A_square_X @ W``.
    SGC_solution
        Linear map from training labels to weights
        (``W = SGC_solution @ Y_train``).
    train_ids, test_ids
        Row indices into ``dataset['labels']``.
    dataset : dict
        Must contain ``'labels'`` (n x c). Labels are decoded with ``argmax``,
        so a one-hot encoding is assumed.
    poison_budget
        Number of training labels to flip.
    CV
        Not read by this function; kept for call compatibility.
    verbose : bool
        Print the test accuracy of the poisoned model.

    Returns
    -------
    (Y_copy, H_long, W, test_acc_lp)
        Copy of the label matrix with poisoned training rows, the cvxpy
        selection variable, the solved weights and the test accuracy against
        the true labels.

    Raises
    ------
    RuntimeError
        If the solver returns no solution.
    """
    Y = dataset['labels']
    Y_L = Y[train_ids]
    num_classes = Y.shape[1]
    n_train = Y_L.shape[0]

    # Decode numeric labels from the dataset passed in rather than depending
    # on a notebook-level `y_gt` global.
    y_gt = Y.argmax(1)

    # for each training node, its (c-1) false classes in increasing order;
    # the groups are laid out consecutively, node after node
    false_labels = [cls
                    for label in y_gt[train_ids]
                    for cls in range(num_classes) if cls != label]
    Y_L_expanded = np.eye(num_classes)[false_labels]

    # SGC Based Objective

    # one selection variable per (training node, false class) pair
    H_long = cp.Variable((Y_L_expanded.shape[0], 1), boolean=True)

    # poisoned labels of size (n x c-1) x c 
    poison_preds = cp.multiply(H_long, Y_L_expanded)

    # number of selected false classes per training node
    flips_per_node = cp.sum(cp.reshape(H_long, (num_classes-1, n_train)).T, axis=1)

    # 1 for nodes that keep their label, 0 for flipped nodes
    clean_labels = 1 - flips_per_node

    # reshape poisoned preds and clean labels
    # by creating n groups of size (c-1 x c) and summing them along axis 0
    idxs = np.arange(Y_L_expanded.shape[0]).reshape(n_train, num_classes - 1)

    poison_preds_reshaped = cp.vstack([cp.sum(poison_preds[idxs[i],:], axis=0) for i in range(n_train)])
    clean_labels_reshaped = cp.multiply(clean_labels[:, None], Y_L)

    # create poisoned training labels by combining clean labels and poison preds
    Y_poisoned = poison_preds_reshaped + clean_labels_reshaped

    # W* by using SGC closed form
    W = SGC_solution @ Y_poisoned

    # predictions of SGC using W*
    Y_pred_poisoned = A_square_X @ W

    # multiclass objective
    multi_obj = cp.Minimize(cp.sum(cp.multiply(Y_pred_poisoned[test_ids], Y[test_ids])))

    # define constraints
    constraints = [ # budget constraint
                    cp.sum(poison_preds_reshaped) == poison_budget, 

                    # exclusivity constraint: at most one false class per node
                    flips_per_node <= 1, 
                    ] 

    # Solve
    prob = cp.Problem(multi_obj, constraints) 
    prob.solve(solver=cp.MOSEK, verbose=False)

    if Y_pred_poisoned.value is None:
        raise RuntimeError(
            "sgc_multiclass: solver returned no solution (status: {})".format(prob.status))

    predictions_argmax = Y_pred_poisoned.value.argmax(1)
    labels_argmax = y_gt

    test_acc_lp = accuracy_score(predictions_argmax[test_ids], labels_argmax[test_ids])

    if verbose:
        print("Test Acc: {:.4f}".format(test_acc_lp))

    # create poisoned labels 
    Y_copy = Y.copy()
    poisoned_labels = Y_poisoned.value.argmax(1)
    Y_copy[train_ids] = 0.
    Y_copy[train_ids, poisoned_labels] = 1.

    return Y_copy, H_long, W.value, test_acc_lp

# Run Experiments

In [36]:
# load dataset 
dset = 'pubmed'
# NOTE: unpickling can execute arbitrary code; only load dataset files you trust.
# The context manager closes the file handle once the dataset is read.
with open('../small_val_random/{}.pkl'.format(dset), 'rb') as handle:
    dataset = pkl.load(handle)
X = dataset['X']
# numeric class ids decoded from the label matrix
y_gt = dataset['labels'].argmax(1)
num_classes = y_gt.max() + 1

In [37]:
# Adjacency matrix stored in the dataset under 'sym_adj'
# (presumably already symmetrised — TODO confirm against the data preparation).
A = dataset['sym_adj']
# Degree-normalised adjacency, see normalize_adj.
A_hat = normalize_adj(A)
# Two-hop propagation operator A_hat^2.
# Alternative: use the PPNP operator instead via propagation_matrix(A_hat).
pie = A_hat@A_hat
# Features after propagation; used below to fit the logistic-regression y-hat model.
diffused_X = pie@X

In [38]:
# CV setting: when True the validation nodes are merged into the training set
CV = True

# using new LP optimization
poisoned_labels_lp = {'dataset_name': dset}
all_test_accs = []

A_square_full = (A_hat@A_hat)
A_square_X = A_square_full @ X

test_accs = []
for split_no in (range(10)):  #tqdm
    print("split: {:2d}".format(split_no))
    split_key = 'split_{}'.format(split_no)
    split = dataset[split_key]

    if CV:
        train_ids = np.append(split['train_ids'], split['val_ids'])
    else:
        train_ids = split['train_ids']

    test_ids = split['test_ids']

    A_square = A_square_full[train_ids] 
    lamb = 1

    # Get y-hat: predictions of a logistic regression fitted on the diffused features.
    # predict() returns class ids directly, so the one-hot encoding stays correct
    # even if a class is missing from the training split (predict_proba columns
    # follow clf.classes_, not the raw class ids).
    clf1 = LogisticRegression(random_state=123, C=1.).fit((diffused_X)[train_ids], y_gt[train_ids])
    yhat = np.eye(num_classes)[clf1.predict(diffused_X)]

    # precompute P = ( (A^2 X).T (A^2 X) + lamb I )^{+} (A^2 X).T  (rows restricted to train_ids)
    # The training-row product is computed once and reused.
    AX_train = A_square @ X
    inverse = pinv(AX_train.T @ AX_train + lamb*sp.eye(X.shape[1]))
    P = inverse @ AX_train.T # @ Y_L inside opt

    for poison_per in [5, 10, 15, 20, 30]:

        # Multiply before dividing so an exact integer budget is not pushed
        # just above the integer by floating-point rounding before ceil.
        poison_budget = int(np.ceil(len(train_ids) * poison_per / 100))

        # y_poison, h_long, W_star, t_acc = sgc_multiclass(pie, A_square_X, 
        #                                                    P, train_ids, test_ids, 
        #                                                    dataset, poison_budget,
        #                                                    CV=CV, verbose=True)

        # `pie` is passed as-is (sparse): the attack does not read it, so building
        # a dense n x n copy with .todense() on every call is wasted memory and time.
        y_poison, h_long, W_star, t_acc, ab_train_ids = sgc_binclass(pie, A_square_X, 
                                                         P, train_ids, test_ids, 
                                                         dataset, poison_budget,
                                                         CV=CV, flip=False, verbose=True, yhat=yhat)

        print("Given Budget: {:2d}   Total flips: {:2d}".format(int(poison_budget), int((y_poison[train_ids].argmax(1) != y_gt[train_ids]).sum())))
        print()

        test_accs.append(t_acc)

        # log poisoned labels to dict (one sub-dict per split, created on first use)
        poisoned_labels_lp.setdefault(split_key, {})['{}_percent_poison'.format(poison_per)] = y_poison

print("Average Test Acc: {:.2f} ({:.2f})".format(np.mean(test_accs)*100, np.std(test_accs)*100))

split:  0
Total ab ids: 98
Test Acc: 0.7145
Given Budget:  6   Total flips:  6

Total ab ids: 98
Test Acc: 0.6508
Given Budget: 12   Total flips: 12

Total ab ids: 98
Test Acc: 0.5832
Given Budget: 18   Total flips: 18

Total ab ids: 98
Test Acc: 0.5124
Given Budget: 24   Total flips: 24

Total ab ids: 98
Test Acc: 0.4043
Given Budget: 36   Total flips: 36

split:  1
Total ab ids: 99
Test Acc: 0.7099
Given Budget:  6   Total flips:  6

Total ab ids: 99
Test Acc: 0.6320
Given Budget: 12   Total flips: 12

Total ab ids: 99
Test Acc: 0.5573
Given Budget: 18   Total flips: 18

Total ab ids: 99
Test Acc: 0.4971
Given Budget: 24   Total flips: 24

Total ab ids: 99
Test Acc: 0.4077
Given Budget: 36   Total flips: 36

split:  2
Total ab ids: 91
Test Acc: 0.7163
Given Budget:  6   Total flips:  6

Total ab ids: 91
Test Acc: 0.6014
Given Budget: 12   Total flips: 12

Total ab ids: 91
Test Acc: 0.5216
Given Budget: 18   Total flips: 18

Total ab ids: 91
Test Acc: 0.4628
Given Budget: 24   Total f

In [40]:
#dump SGC poisoned labels
out_dir = 'sbin_neurips_cv'
# Create the output directory if needed so the results of the (long) attack run
# are not lost to a FileNotFoundError on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)
with open('{}/{}_sgcBin_yhat_cv_poisoned_labels.pkl'.format(out_dir, dset), 'wb') as handle:
    pkl.dump(poisoned_labels_lp, handle, protocol=4)