In [3]:
import torch
import torch.nn.functional as F
import torch.optim as optim

import cvxpy as cp
import pickle as pkl
import numpy as np
import scipy.sparse as sp
from scipy.linalg import pinv, inv
import scipy.linalg as spl
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix.

    Computes ``D^{-1/2} A^T D^{-1/2}`` where ``D`` is the diagonal matrix
    holding the row sums of ``adj``. For a symmetric ``adj`` this is the usual
    ``D^{-1/2} A D^{-1/2}``.

    Parameters
    ----------
    adj : array-like or sp.spmatrix, shape [n, n]
        Adjacency matrix; anything ``sp.coo_matrix`` accepts.

    Returns
    -------
    sp.coo_matrix, shape [n, n]
        Normalized adjacency matrix. Rows and columns belonging to nodes whose
        row sum is zero are all zeros.
    """
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    # A zero row sum produces inf here. That is expected and patched on the
    # next line, so silence NumPy's divide-by-zero RuntimeWarning for this
    # single call instead of letting it leak into the notebook output.
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    # (A D^{-1/2})^T D^{-1/2} == D^{-1/2} A^T D^{-1/2}
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

def propagation_matrix(adj, alpha=0.85, sigma=1, nodes=None):
    """
    Computes the propagation matrix  (1-alpha)(I - alpha D^{-sigma} A D^{sigma-1})^{-1}.
    Parameters
    ----------
    adj : sp.spmatrix, shape [n, n]
        Sparse adjacency matrix (any dtype; anything ``sp.csr_matrix`` accepts).
    alpha : float
        (1-alpha) is the teleport probability.
    sigma
        Hyper-parameter controlling the propagation style.
        Set sigma=1 to obtain the PPR matrix.
    nodes : np.ndarray, shape [?]
        Nodes for which we want to compute Personalized PageRank.
    Returns
    -------
    prop_matrix : np.ndarray, shape [n, n]
        Propagation matrix. When ``nodes`` is given, only the rows of the
        selected nodes are returned, i.e. shape [len(nodes), n].
    """
    adj = sp.csr_matrix(adj)
    n = adj.shape[0]
    # Degrees as floats: NumPy refuses to raise an integer array to a negative
    # integer power, which the default sigma=1 would otherwise trigger for an
    # integer-valued adjacency matrix.
    deg = np.asarray(adj.sum(1), dtype=float).ravel()

    # Zero-degree nodes would produce inf scaling factors; they have no edges
    # to scale, so use 0 for them instead.
    with np.errstate(divide='ignore'):
        deg_pow_left = np.power(deg, -sigma)
        deg_pow_right = np.power(deg, sigma - 1)
    deg_pow_left[~np.isfinite(deg_pow_left)] = 0.
    deg_pow_right[~np.isfinite(deg_pow_right)] = 0.

    deg_min_sig = sp.diags(deg_pow_left)
    deg_sig_min = sp.diags(deg_pow_right)
    pre_inv = sp.eye(n) - alpha * deg_min_sig @ adj @ deg_sig_min

    # solve for x in: pre_inv.T @ x = b, so that x.T holds the requested rows
    # of pre_inv^{-1}
    b = np.eye(n)
    if nodes is not None:
        b = b[:, nodes]

    return (1 - alpha) * spl.solve(pre_inv.toarray().T, b).T


def flip_label(lbl, total_classes):
    """
    Return a copy of ``lbl`` in which the argmax entry is zeroed and one
    randomly drawn other class (out of ``total_classes``) is set to 1.
    The input vector is left untouched.
    """
    current = np.argmax(lbl)

    # every class index except the one the label currently points at
    candidates = np.arange(total_classes)
    candidates = candidates[candidates != current]

    flipped = lbl.copy()
    flipped[current] = 0.
    flipped[np.random.choice(candidates)] = 1.
    return flipped


def poison_labels_random(labels, train_idx, BUDGET):
    """
    Return a copy of ``labels`` in which ``BUDGET`` rows, drawn without
    replacement from ``train_idx``, have been replaced by a randomly flipped
    label (see ``flip_label``). ``labels`` itself is not modified.
    """
    poisoned = labels.copy()
    n_classes = poisoned.shape[1]

    # pick the victims first, then flip them one at a time
    victims = np.random.choice(train_idx, size=BUDGET, replace=False)
    for node in victims:
        poisoned[node] = flip_label(poisoned[node], n_classes)

    return poisoned

In [5]:
# load dataset
dset = 'corafull_pca'
# NOTE(security): unpickling can execute arbitrary code, so only load dataset
# files that come from a trusted source.
# The context manager closes the file handle as soon as the pickle is read.
with open('small_val_random/{}.pkl'.format(dset), 'rb') as dataset_file:
    dataset = pkl.load(dataset_file)
X = dataset['X']
# class index of every node, taken as the argmax over each row of 'labels'
y_gt = dataset['labels'].argmax(1)
num_classes = y_gt.max() + 1

In [6]:
# Normalized adjacency built from the dataset's 'sym_adj' entry.
A = dataset['sym_adj']
A_hat = normalize_adj(A)

# Two-step propagation operator A_hat^2. The PPNP-style alternative would be
# propagation_matrix(A_hat).
pie = A_hat @ A_hat

# Features after propagation.
diffused_X = pie @ X

In [7]:
# CV setting: when True, the validation nodes of each split are added to its
# training nodes.
CV = True

N_SPLITS = 10
POISON_PERCENTAGES = [5, 10, 15, 20, 30]  # [1, 3, 5]
# Ridge regularisation strength used in the closed-form solution below.
lamb = 1
# Node that is always appended to the set the logistic regression is fit on.
# The code below treats column j of predict_proba as class j, which only holds
# when every class occurs in the fitting set; this node is presumably added to
# guarantee that -- TODO confirm, and check that it is not a test node.
EXTRA_FIT_NODE = 6870

# using new LP optimization
poisoned_labels_lp = {'dataset_name': dset}
all_test_accs = []

A_square_full = (A_hat@A_hat)
A_square_X = A_square_full @ X

# Pair the classes by frequency rank (most frequent <-> 2nd, 3rd <-> 4th, ...)
# and record each class's swap partner. This depends only on y_gt, so it is
# computed once instead of once per split. With an odd number of classes the
# least frequent class has no partner and maps to itself (it is never flipped).
ordered_classes = np.bincount(y_gt).argsort()[::-1]
swap_partner = np.arange(len(ordered_classes))
for i in range(0, len(ordered_classes) - 1, 2):
    class_a, class_b = ordered_classes[i], ordered_classes[i + 1]
    swap_partner[class_a] = class_b
    swap_partner[class_b] = class_a

# NOTE: test_accs pools the accuracies of every split AND every poison
# percentage, so the final average mixes all budgets together.
test_accs = []
for split_no in range(N_SPLITS):
    print("split: {:2d}".format(split_no))
    split = dataset['split_{}'.format(split_no)]

    if CV:
        train_ids = np.append(split['train_ids'], split['val_ids'])
    else:
        train_ids = split['train_ids']
    test_ids = split['test_ids']

    # Get y-hat
    tids = np.append(train_ids, [EXTRA_FIT_NODE])
    clf1 = LogisticRegression(random_state=123, C=1.).fit(diffused_X[tids], y_gt[tids])
    preds = clf1.predict_proba(diffused_X)
    yhat = np.eye(num_classes)[preds.argmax(1)]

    # top false class (only needed by the commented-out H_train variant below)
    margin_preds = ((1-dataset['labels'])*preds).argmax(1)[train_ids]

    bin_acc = accuracy_score(y_gt[test_ids], yhat.argmax(1)[test_ids])*100

    print("Prediction Accuracy: {:.2f}".format(bin_acc))

    # precompute P = ( (A^2 X)_L.T (A^2 X)_L + lamb*I )^{-1} (A^2 X)_L.T
    # where (A^2 X)_L are the training rows of the propagated features.
    train_feats = A_square_X[train_ids]
    gram = train_feats.T @ train_feats + lamb * np.eye(X.shape[1])
    inverse = pinv(gram)
    P = inverse @ train_feats.T  # @ Y_L inside opt
    print("computed inverse")

    # closed form solution for SGCbin
    X_test = diffused_X[test_ids]
    Y_train = dataset['labels'][train_ids]
    Y_test = dataset['labels'][test_ids]

    # construct binary flips (H_train): every training label replaced by its
    # frequency-rank partner class
    H_train = np.eye(num_classes)[swap_partner[y_gt[train_ids]]]
    #H_train = np.eye(num_classes)[margin_preds] # margin predictions (top false class)
    #H_train = poison_labels_random(dataset['labels'], train_ids, len(train_ids))[train_ids]

    # c[k] = sum_{i,j} (X_test @ P)[i, k] * (Y_train - H_train)[k, j] * Y_test[i, j]
    # Contracting over the test nodes first avoids materialising the dense
    # (n_test x n_train) matrix X_test @ P.
    test_proj = P.T @ (X_test.T @ Y_test)
    c = np.einsum('kj,kj->k', test_proj, Y_train - H_train)
    argsort_c = np.argsort(-c)

    for poison_per in POISON_PERCENTAGES:

        #print("Poison percentage: {:2d}%".format(poison_per))
        poison_budget = int(np.ceil(len(train_ids)*(poison_per/100)))

        # training nodes with the largest score c are flipped first
        flip_ids = argsort_c[:poison_budget]

        # construct poisoned labels
        y_poison = dataset['labels'].copy()
        y_poison[train_ids[flip_ids]] = H_train[flip_ids]
        # multiply right-to-left so the intermediate is (n_features x n_classes)
        # rather than (n_nodes x n_train)
        predictions = A_square_X @ (P @ y_poison[train_ids])

        t_acc = accuracy_score(y_gt[test_ids], predictions[test_ids].argmax(1))

        print("Given Budget: {:2d}   Total flips: {:2d} TestAcc: {:.2f}".format(poison_budget, int((y_poison[train_ids].argmax(1) != y_gt[train_ids]).sum()), t_acc*100))
        print()

        test_accs.append(t_acc)

        # log poisoned labels to dict
        split_key = 'split_{}'.format(split_no)
        poisoned_labels_lp.setdefault(split_key, {})['{}_percent_poison'.format(poison_per)] = y_poison

print("Average Test Acc: {:.2f} ({:.2f})".format(np.mean(test_accs)*100, np.std(test_accs)*100))

split:  0
Prediction Accuracy: 66.52
computed inverse
2800
Given Budget: 140   Total flips: 140 TestAcc: 63.06

Given Budget: 280   Total flips: 280 TestAcc: 55.88

Given Budget: 420   Total flips: 420 TestAcc: 46.97

Given Budget: 560   Total flips: 560 TestAcc: 39.13

Given Budget: 840   Total flips: 840 TestAcc: 23.28

split:  1
Prediction Accuracy: 66.24
computed inverse
2800
Given Budget: 140   Total flips: 140 TestAcc: 62.74

Given Budget: 280   Total flips: 280 TestAcc: 56.46

Given Budget: 420   Total flips: 420 TestAcc: 46.52

Given Budget: 560   Total flips: 560 TestAcc: 37.92

Given Budget: 840   Total flips: 840 TestAcc: 25.73

split:  2
Prediction Accuracy: 67.21
computed inverse
2800
Given Budget: 140   Total flips: 140 TestAcc: 63.60

Given Budget: 280   Total flips: 280 TestAcc: 58.38

Given Budget: 420   Total flips: 420 TestAcc: 48.16

Given Budget: 560   Total flips: 560 TestAcc: 39.12

Given Budget: 840   Total flips: 840 TestAcc: 24.59

split:  3
Prediction Accurac

In [8]:
# Persist the poisoned labels collected in poisoned_labels_lp as a
# protocol-4 pickle named after the dataset.
out_path = '{}_sgcOpt_multibinary_cv_poisoned_labels.pkl'.format(dset)
with open(out_path, 'wb') as handle:
    pkl.dump(poisoned_labels_lp, handle, protocol=4)