In [1]:
import os
import json
import pickle
import torch
import glob
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split
import time
import networkx
import torchvision
import torchvision.transforms as transforms

import scipy.io as sio
import argparse
from models import Model

In [2]:
import torch_geometric as geo

import random
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from matplotlib import pyplot as plt
%matplotlib inline
os.environ["CUDA_VISIBLE_DEVICES"] = "0"#, 1, 2"

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize(h, color):
    z = TSNE(n_components=2).fit_transform(out.detach().cpu().numpy())

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()

In [3]:
dataset_name = 'PROTEINS'

In [4]:
def getDataset(root, name, transform):
    if name.lower() in ['cora', 'pubmed', 'citeseer']:
        dataset = geo.datasets.Planetoid(root=root, name=name, transform=transform)
    elif name.lower() in ['mutag', 'imdb-binary', 'ethanol', 'proteins']:
        dataset =geo.datasets.TUDataset(root=root, name=name, transform=transform,use_node_attr=True)
    else:
        raise NotImplementedError("{} not supported!".format(name))
    return dataset

In [5]:
dataset = getDataset('data', dataset_name, None)

In [6]:
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: PROTEINS(1113):
Number of graphs: 1113
Number of features: 4
Number of classes: 2

Data(edge_index=[2, 162], x=[42, 4], y=[1])
Number of nodes: 42
Number of edges: 162
Average node degree: 3.86
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True


## define model architecture 

In [7]:
if dataset_name == 'PROTEINS':
    print("USING Hierarchical Graph Pooling with Structure Learning")
    with open('config-{}.pickle'.format(dataset_name), 'rb') as handle:
        args = pickle.load(handle)
    args.device = 'cpu'
    print(args)
    num_training = int(len(dataset) * 0.8)
    num_val = int(len(dataset) * 0.1)
    num_test = len(dataset) - (num_training + num_val)
    training_set, validation_set, test_set = random_split(dataset, [num_training, num_val, num_test])
    
    train_loader = geo.data.DataLoader(training_set, batch_size=args.batch_size, shuffle=True)
    val_loader = geo.data.DataLoader(validation_set, batch_size=args.batch_size, shuffle=False)
    test_loader = geo.data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False)
    try:
        model = Model(args).to(args.device)
    except RuntimeError:
        args.device = 'cpu'
        model = Model(args).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    
    def train():
        min_loss = 1e10
        patience_cnt = 0
        val_loss_values = []
        best_epoch = 0

        t = time.time()
        model.train()
        for epoch in range(args.epochs):
            loss_train = 0.0
            correct = 0
            for i, data in enumerate(train_loader):
                optimizer.zero_grad()
                data = data.to(args.device)
                out = model(data)
                loss = F.nll_loss(out, data.y)
                loss.backward()
                optimizer.step()
                loss_train += loss.item()
                pred = out.max(dim=1)[1]
                correct += pred.eq(data.y).sum().item()
            acc_train = correct / len(train_loader.dataset)
            acc_val, loss_val = compute_test(val_loader)
            print('Epoch: {:04d}'.format(epoch + 1), 'loss_train: {:.6f}'.format(loss_train),
                  'acc_train: {:.6f}'.format(acc_train), 'loss_val: {:.6f}'.format(loss_val),
                  'acc_val: {:.6f}'.format(acc_val), 'time: {:.6f}s'.format(time.time() - t))

            val_loss_values.append(loss_val)
            torch.save(model.state_dict(), 'models/PROTEINS/{}.pth'.format(epoch))
            if val_loss_values[-1] < min_loss:
                min_loss = val_loss_values[-1]
                best_epoch = epoch
                patience_cnt = 0
            else:
                patience_cnt += 1

            if patience_cnt == args.patience:
                break

            files = glob.glob('models/PROTEINS/*.pth')
            for f in files:
                epoch_nb = int(f.split('/')[-1].split('.')[0])
                if epoch_nb < best_epoch:
                    os.remove(f)

        files = glob.glob('models/PROTEINS/*.pth')
        for f in files:
            epoch_nb = int(f.split('/')[-1].split('.')[0])
            if epoch_nb > best_epoch:
                os.remove(f)
        print('Optimization Finished! Total time elapsed: {:.6f}'.format(time.time() - t))

        return best_epoch


    def compute_test(loader):
        model.eval()
        correct = 0.0
        loss_test = 0.0
        for data in loader:
            data = data.to(args.device)
            out = model(data)
            pred = out.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
            loss_test += F.nll_loss(out, data.y).item()
        return correct / len(loader.dataset), loss_test
    
else:
    class GCN_node(torch.nn.Module):
        def __init__(self, hidden_channels):
            super(GCN_node, self).__init__()
            torch.manual_seed(0)
            self.conv1 = geo.nn.GCNConv(dataset.num_features, hidden_channels)
            self.conv2 = geo.nn.GCNConv(hidden_channels, dataset.num_classes)

        def forward(self, x, edge_index):
            x = self.conv1(x, edge_index)
            x = x.relu()
            x = nn.functional.dropout(x, p=0.5, training=self.training)
            x = self.conv2(x, edge_index)
            return x

    class GCN_graph(torch.nn.Module):
        def __init__(self, hidden_channels):
            super(GCN_graph, self).__init__()
            torch.manual_seed(0)
            self.conv1 = geo.nn.GCNConv(dataset.num_node_features, hidden_channels)
            self.conv2 = geo.nn.GCNConv(hidden_channels, hidden_channels)
            self.conv3 = geo.nn.GCNConv(hidden_channels, hidden_channels)
            self.lin = nn.Linear(hidden_channels, dataset.num_classes)

        def forward(self, x, edge_index, batch):
            # not using edge attributes as it's hard to fake
            # 1. Obtain node embeddings 
            x = self.conv1(x, edge_index)
            x = x.relu()
            x = self.conv2(x, edge_index)
            x = x.relu()
            x = self.conv3(x, edge_index)

            # 2. Readout layer
            x = geo.nn.global_mean_pool(x, batch)  # [batch_size, hidden_channels]

            # 3. Apply a final classifier
            x = nn.functional.dropout(x, p=0.5, training=self.training)
            x = self.lin(x)

            return x
    device = torch.device('cuda')
    victim_model = GCN_graph(hidden_channels=512).to(device)
    optimizer = torch.optim.Adam(victim_model.parameters(), lr=0.01)
    criterion = torch.nn.CrossEntropyLoss()

    def train():
        victim_model.train()

        for data in train_loader:  # Iterate in batches over the training dataset.
            data = data.to(device)
            out = victim_model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
            loss = criterion(out, data.y)  # Compute the loss.
            loss.backward()  # Derive gradients.
            optimizer.step()  # Update parameters based on gradients.
            optimizer.zero_grad()  # Clear gradients.

    def test(loader):
        victim_model.eval()
        correct = 0
        for data in loader:  # Iterate in batches over the training/test dataset.
            data = data.to(device)
            out = victim_model(data.x, data.edge_index, data.batch)  
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.
        return correct / len(loader.dataset)  # Derive ratio of correct predictions.

USING Hierarchical Graph Pooling with Structure Learning
Namespace(batch_size=512, dataset='PROTEINS', device='cpu', dropout_ratio=0.0, epochs=1000, lamb=1.0, lr=0.001, nhid=128, num_classes=2, num_features=4, patience=100, pooling_ratio=0.5, sample_neighbor=True, seed=777, sparse_attention=True, structure_learning=True, weight_decay=0.001)


In [8]:
trained = True

## freeze model parameters and make it deterministic

In [9]:
if not trained:
    if dataset_name != "PROTEINS":
        for epoch in range(1, 501):
            train()
            if epoch % 20 == 0:
                train_acc = test(train_loader)
                test_acc = test(test_loader)
                print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
    else:
        # Model training
        best_model = train()
        # Restore best model for test set
        model.load_state_dict(torch.load('models/PROTEINS/{}.pth'.format(best_model)))
        test_acc, test_loss = compute_test(test_loader)
        print('Test set results, loss = {:.6f}, accuracy = {:.6f}'.format(test_loss, test_acc))
else:
    model.load_state_dict(torch.load('models/PROTEINS/220.pth', map_location=args.device))
    model.eval()
    test_acc, test_loss = compute_test(test_loader)
    print('Test set results, loss = {:.6f}, accuracy = {:.6f}'.format(test_loss, test_acc))
    for param in model.parameters():
        param.requires_grad = False

Test set results, loss = 0.477780, accuracy = 0.812500


## check all possible candidates (assume prior knowledge)

In [10]:
cands = {}
for i in range(len(dataset)):
    for j in range(dataset[i].x.shape[0]):
        key = '{} {}'.format(dataset[i].x[j, 0].item(), dataset[i].x[j, 1:].argmax().item())
        try:
            cands[key] += 1
        except KeyError:
            cands[key] = 1

## load generated class impressions

In [11]:
cldataList = []
for i in range(2):
    label = torch.Tensor([i]).long()
    for fin in os.listdir(os.path.join('data', dataset_name, 'classImpression', str(i)+'_tropology')):
        tmp = torch.load(os.path.join('data', dataset_name, 'classImpression', str(i)+'_tropology', fin))
        tmp.y = label
        tmp = geo.data.Data(x=tmp.x, edge_index=tmp.edge_index, edge_attr=tmp.edge_attr, y=tmp.y)
        cldataList.append(tmp.to(args.device))
        if (tmp.edge_index.max().item() - tmp.x.shape[0]) != -1:
            print(fin, i)
        if (tmp.edge_index.shape[1] != tmp.edge_attr.shape[0]):
            print(fin, i)

In [44]:
class surrogateData(geo.data.Dataset):
    def __init__(self, dataList):
        super().__init__()
        self.data = dataList
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        self.data[index].x.long
        return self.data[index]

In [46]:
surData = surrogateData(cldataList)

In [64]:
print('======================')
print(f'Number of graphs: {len(surData)}')
print(f'Number of features: {surData.num_features}')
# print(f'Number of classes: {surData.num_classes}')
num_nodes = 0
num_edges = 0
for i in range(len(surData)):
    data = surData[i]  # Get the first graph object.
    num_nodes += data.num_nodes
    num_edges += (data.edge_attr>=0.98).sum()
print(num_edges.item()/num_nodes)

Number of graphs: 677
Number of features: 4
45.92490515454739


In [69]:
idx2adj(data)

tensor([[0., 1., 1.,  ..., 1., 1., 1.],
        [1., 0., 1.,  ..., 1., 1., 1.],
        [1., 1., 0.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 0., 1., 1.],
        [1., 1., 1.,  ..., 1., 0., 1.],
        [1., 1., 1.,  ..., 1., 1., 0.]])

In [67]:
data.edge_index[:, data.edge_attr >= 0.98]

tensor([[  2,   3,   3,  ..., 304, 305, 306],
        [  0,   1,   2,  ...,   0,   0,   0]])

## randomly initialize a node as our universal trigger (previous approach)

In [12]:
def getNodes(n):    
    # node features are 4D vectors, the first dimension means the van de wall force and the next 3 are one-hot-encoded category
    tmp = torch.cat((torch.randint(-500, 800, (n,1), device=args.device),
                     torch.nn.functional.one_hot(torch.randint(0, 3, (n,), device=args.device), num_classes=3)), dim=1)
    return tmp.float().clone()

In [13]:
def getCands(n, idx=None):
    # get n nodes based on candidate set, idx is the index of node feature
    if not idx:
        idxs = random.choices(list(range(len(list(cands.keys())))), k=n)
    else:
        idxs = idx
    dataList = []
    for i in idxs:
        vandewallF, encoded = list(cands.keys())[i].split()
        dataList.append(torch.cat((torch.as_tensor([float(vandewallF)], device=args.device).long(), 
                                   F.one_hot(torch.as_tensor([int(encoded)], device=args.device).long(), num_classes=3).squeeze())).unsqueeze(0))
    return torch.cat(dataList, dim=0).float().clone()

In [14]:
def getTrigger(n, idx=None, weighted=False):
    # idx is the list of required indecies
    if idx:
        assert n == len(idx)
    trigger = geo.data.Data()
    trigger.x = getCands(n, idx)
    adv_adj = torch.zeros(size=(n, n)).bool()
    for i in range(n):
        adv_adj[i, i:].random_(0, 2)
    adv_adj = adv_adj.int()
    for i in range(n):
        for j in range(i, n):
            adv_adj[j, i] = adv_adj[i, j]
    if n > 1:
        try:
            trigger.edge_index, _ = geo.utils.remove_self_loops(adj2idx(adv_adj).long())
        except RuntimeError:
            # in case of no edge generated
            return getTrigger(n, idx, weighted)
        if geo.utils.contains_isolated_nodes(trigger.edge_index, num_nodes=n):
            return getTrigger(n, idx, weighted)
        else:
            if weighted:
                trigger.edge_attr = torch.ones((trigger.edge_index.shape[1], ))
            return trigger
    else:
        return trigger

In [15]:
def idx2adj(data):
    device = data.edge_index.device
    edge_index = torch.zeros(size=(data.num_nodes, data.num_nodes), device=device)
    for i in range(data.edge_index.shape[1]):
        edge_index[data.edge_index[0][i]][data.edge_index[1][i]] = 1
    return edge_index

def adj2idx(edge_index):
    assert edge_index.shape[0] == edge_index.shape[1]
    tmp = []
    for i in range(edge_index.shape[0]):
        for j in range(edge_index.shape[0]):
            if edge_index[i][j] == 1:
                tmp.append([i, j])
    return torch.Tensor(tmp).permute(1,0).to(edge_index.device)

In [16]:
def append(ori_graph, appendix, anchor_pos):
    # function to append a group of nodes to original graph
    idx_anchor = (ori_graph.edge_index[0] > anchor_pos).int().argmin() + 1
    first_half = ori_graph.edge_index[:, :idx_anchor]
    second_half = ori_graph.edge_index[:, idx_anchor + 1:]
    new_edge0 = torch.as_tensor([[anchor_pos], [ori_graph.num_nodes]], 
                             device=ori_graph.x.device).long()
    new_edge1 = torch.as_tensor([[ori_graph.num_nodes], [anchor_pos]],
                             device=ori_graph.x.device).long()
    if appendix.num_nodes > 1:
        new_edge_index = torch.cat([first_half, new_edge0, 
                                    second_half, new_edge1,
                                    appendix.edge_index + ori_graph.num_nodes],
                                   dim=1)
    else:
        new_edge_index = torch.cat([first_half, new_edge0, second_half, new_edge1], dim=1)
    new_x = torch.cat([ori_graph.x, appendix.x], dim=0)
    ori_graph.edge_index = new_edge_index
    ori_graph.x = new_x
    ori_graph.num_nodes += appendix.num_nodes
    if ori_graph.edge_attr is not None:
#         print("Appending to a weighted graph")
        first_half = ori_graph.edge_attr[:idx_anchor]
        second_half = ori_graph.edge_attr[idx_anchor + 1:]
        new_edge0 = torch.as_tensor([1.0], device=ori_graph.x.device)
        new_edge1 = torch.as_tensor([1.0], device=ori_graph.x.device)
        if appendix.num_nodes > 1:
            edge_attr = torch.cat([first_half, new_edge0, second_half, new_edge1, appendix.edge_attr])
        else:
            edge_attr = torch.cat([first_half, new_edge0, second_half, new_edge1])
        ori_graph.edge_attr = edge_attr
    return ori_graph

In [17]:
def batch_from_list(dataList):
    # generate a batched data from a list of geometric data
    # using this since wrong number of batch for surrogate data
    outp = geo.data.Batch()
    data = dataList[0]
    keys = data.keys
    keys.append('batch')
    batchedData = {key: [] for key in keys}
    cnt = 0
    prev_nodes = 0
    for data in dataList:
        for key in data.keys:
            if key == 'edge_index':
                data[key] += prev_nodes
            batchedData[key].append(data[key])
        batchedData['batch'].append(torch.ones((data.x.shape[0], ), dtype=torch.int64) * cnt)
        prev_nodes += data.x.shape[0]
        cnt += 1
    for key in keys:
        if key == 'edge_index':
            catDim = -1
        else:
            catDim = 0
        outp[key] = torch.cat(batchedData[key], dim=catDim)
    return outp

In [18]:
def BatchAppend(ori_graphs, trigger, pos_mode='deg'):
    # taking list of graphs, graph as input, return batch of graphes
    modi_graphs = []
    for i in range(len(ori_graphs)):
        tmp = ori_graphs[i].clone()
        pos_mode = pos_mode.lower()
        if pos_mode == 'deg':
            glueLoc = geo.utils.degree(tmp.edge_index[0]).argmax() # append to node with the highest degree
        elif pos_mode == 'min':
            # for experiment only, every other centrality measures take the highest value
            glueLoc = geo.utils.degree(tmp.edge_index[0]).argmin() # append to node with the lowest degree
        elif pos_mode == 'eig':
            # eigen vector centrality measures
            G = geo.utils.to_networkx(tmp)
            eigen_centrality = networkx.algorithms.centrality.eigenvector_centrality_numpy(G)
            glueLoc = sorted(eigen_centrality, key=eigen_centrality.get, reverse=True)[0]
        elif pos_mode == 'btw':
            # betweenness centrality measures
            G = geo.utils.to_networkx(tmp)
            btw_centrality = networkx.algorithms.centrality.betweenness_centrality(G)
            glueLoc = sorted(btw_centrality, key=btw_centrality.get, reverse=True)[0]
        else:
            raise NotImplementedError("Only degree, eigen-vector, and betweenness centrality measures!")
        modi_graph = append(tmp, trigger, glueLoc)
        modi_graphs.append(modi_graph)
#         print(modi_graph, modi_graphs)
#     modi_graphs = geo.data.Batch.from_data_list(modi_graphs).to(args.device)
    modi_graphs = batch_from_list(modi_graphs)
    return modi_graphs

## generate list of all possible triggers

In [19]:
triggerList = []
for i in range(len(cands.keys())):
    triggerList.append(getTrigger(1, [i]))

## adopt hot-flip universal adversarial attack
1. Generate candidates for universal trigger, it should contains all feasible nodes
2. compute replace nodes in triggers by argmin(e_i - e_adv_i)^T \nabla_e_adv_i loss (note that embedding can be extracted from model)

### question: how to get individual embedding? 
* difference by dropping it?
* different edges will result in different embedding, how to solve it?
* current solution: limit trigger length to 1 (bit like an exhaustive search?)
* single node embedding by difference of graph embeddings
* gradient of node embedding wrt loss by difference of gradients of graph embeddings wrt losses

In [20]:
def getOutput(embed):
    outp = F.relu(model.lin1(embed))
    outp = F.dropout(outp, p=model.dropout_ratio, training=model.training)
    outp = F.relu(model.lin2(outp))
    outp = F.dropout(outp, p=model.dropout_ratio, training=model.training)
    outp = F.log_softmax(model.lin3(outp), dim=-1)
    return outp

## sample code for non-batched graph UAA

In [21]:
# modi_graph = BatchAppend([dataset[0]], trigger)

# modi_graph.x.requires_grad = True

# embed_adv = model(modi_graph)

# ori_graph = geo.data.Batch.from_data_list([dataset[0]]).to(args.device)
# ori_graph.x.requires_grad = True

# embed_ori = model(ori_graph)
# embed_others = model(BatchAppend([dataset[0]], getTrigger(1)).to(args.device))

# output = getOutput(embed_adv)
# loss = F.nll_loss(output, dataset[0].y)

# grad_embed_adv = torch.autograd.grad(loss, embed_adv)#, allow_unused=True)

# output = getOutput(embed_ori)
# loss = F.nll_loss(output, dataset[0].y)
# grad_embed_ori = torch.autograd.grad(loss, embed_ori)#, allow_unused=True)

# torch.matmul(grad_embed_ori[0] - grad_embed_adv[0], (embed_others - embed_adv).T)

## sample code for batched graph UAA

In [22]:
# modi_graph = BatchAppend(dataset[:8], trigger)

# modi_graph.x.requires_grad = True

# embed_adv = model(modi_graph)

# ori_graph = geo.data.Batch.from_data_list(dataset[:8]).to(args.device)
# ori_graph.x.requires_grad = True

# embed_ori = model(ori_graph)
# embed_others = model(BatchAppend(dataset[:8], getTrigger(1)).to(args.device))

# labels = geo.data.Batch.from_data_list(dataset[:8]).y
# output = getOutput(embed_adv)
# loss = F.nll_loss(output, labels)

# grad_embed_adv = torch.autograd.grad(loss, embed_adv)#, allow_unused=True)

# output = getOutput(embed_ori)
# loss = F.nll_loss(output, labels)
# grad_embed_ori = torch.autograd.grad(loss, embed_ori)#, allow_unused=True)

# torch.matmul((embed_others - embed_adv).mean(dim=0).unsqueeze(0), (grad_embed_ori[0] - grad_embed_adv[0]).mean(dim=0).unsqueeze(-1)).item()

## generate universal trigger via graph embedding difference

In [23]:
batch_size = 1

In [24]:
result = {}

In [25]:
pos_modes = ['deg', 'min', 'btw', 'eig']

In [None]:
for pos_mode in pos_modes:
    log = {}
    trigger_idx = random.choice(range(len(triggerList)))
    trigger = triggerList[trigger_idx].to(args.device)
    graph_idx = list(range(len(training_set)))
    start = time.time()
    while len(graph_idx) > 0:
        cur_batch = random.choices(graph_idx, k=batch_size)
        graph_idx = [tmp for tmp in graph_idx if tmp not in cur_batch]
        modi_graph = BatchAppend([training_set[i].to(args.device)
                                 for i in cur_batch], trigger, pos_mode)
        modi_graph.x.requires_grad = True
        embed_adv = model(modi_graph)

        ori_graph = geo.data.Batch.from_data_list(
            [training_set[i].to(args.device) for i in cur_batch]).to(args.device)
        ori_graph.x.requires_grad = True

        embed_ori = model(ori_graph)

        labels = geo.data.Batch.from_data_list(
            [training_set[i].to(args.device) for i in cur_batch]).y

        output = getOutput(embed_adv)
        loss = F.nll_loss(output, labels)
        grad_embed_adv = torch.autograd.grad(
            loss, embed_adv)  # , allow_unused=True)

        output = getOutput(embed_ori)
        loss = F.nll_loss(output, labels)
        grad_embed_ori = torch.autograd.grad(
            loss, embed_ori)  # , allow_unused=True)

        cur_score = torch.ones((len(triggerList, )))
        for triggerCand in range(len(triggerList)):
            embed_others = model(BatchAppend(
                [training_set[i].to(args.device) for i in cur_batch],
                triggerList[triggerCand],
                pos_mode
            ).to(args.device))
            cur_score[triggerCand] = torch.matmul((embed_others - embed_adv).mean(dim=0).unsqueeze(0),
                                                  (grad_embed_ori[0] - grad_embed_adv[0]).mean(dim=0).unsqueeze(-1)).item()
        trigger_idx = cur_score.argmin(-1)
    time_used = time.time() - start
    modi_graph = BatchAppend(test_set, triggerList[trigger_idx], pos_mode)
    embed_adv = model(modi_graph)
    output = getOutput(embed_adv)
    accu = (output.argmax(-1) ==
            geo.data.Batch.from_data_list(test_set).y).int().sum().item() / len(test_set)
    log['trigger_idx'] = trigger_idx.item()
    log['accu'] = accu
    log['timeCost'] = time_used
    result[pos_mode] = log


In [43]:
result

{'length: 1 mode: deg': {'trigger_idx': [82],
  'accu': 0.5714285714285714,
  'timeCost': 992.6306567192078},
 'length: 1 mode: min': {'trigger_idx': [106],
  'accu': 0.5357142857142857,
  'timeCost': 946.7149651050568},
 'length: 1 mode: btw': {'trigger_idx': [80],
  'accu': 0.7410714285714286,
  'timeCost': 938.4616959095001},
 'length: 1 mode: eig': {'trigger_idx': [106],
  'accu': 0.5892857142857143,
  'timeCost': 938.4162719249725},
 'length: 2 mode: deg': {'trigger_idx': [106, 106],
  'accu': 0.5178571428571429,
  'timeCost': 936.9213075637817},
 'length: 2 mode: min': {'trigger_idx': [106, 106],
  'accu': 0.5535714285714286,
  'timeCost': 936.5684282779694},
 'length: 2 mode: btw': {'trigger_idx': [106, 106],
  'accu': 0.5714285714285714,
  'timeCost': 937.3475587368011},
 'length: 2 mode: eig': {'trigger_idx': [106, 106],
  'accu': 0.5357142857142857,
  'timeCost': 934.8087739944458},
 'length: 3 mode: deg': {'trigger_idx': [106, 106, 106],
  'accu': 0.5803571428571429,
  'time

In [None]:
with open('graphEmbeddingResult.json', 'w+') as fout:
    json.dump(result, fout)

In [None]:
ori_graph = geo.data.Batch.from_data_list(test_set).to(args.device)
embed_ori = model(ori_graph)
output = getOutput(embed_ori)
(output.argmax(-1) == geo.data.Batch.from_data_list(test_set).y).int().sum().item() / len(test_set)

### experiment records:
* hotflip attack embedding difference version: 
    * to max degree, 76.19% accu
    * to lowest degree, 75.29% accu

## new idea: use another similarity metric rather than difference, use node feature as embedding (variable trigger length)

### just replace embedding:

In [None]:
result = {}
for triggerLen in range(1, 4):
    for pos_mode in pos_modes:
        start = time.time()
        log = {}
        graph_idx = list(range(len(training_set)))
        trigger_idx = random.choices(range(len(triggerList)), k=triggerLen)
        while len(graph_idx) > 0:
            trigger = getTrigger(n=triggerLen, idx=trigger_idx)
            cur_batch = random.choices(graph_idx, k=batch_size)
            graph_idx = [tmp for tmp in graph_idx if tmp not in cur_batch]
            modi_graph = BatchAppend([training_set[i] for i in cur_batch], trigger, pos_mode)
            modi_graph.x.requires_grad_()
            embed_adv = model(modi_graph)

            ori_graph = geo.data.Batch.from_data_list([training_set[i] for i in cur_batch]).to(args.device)
            ori_graph.x.requires_grad_()

            embed_ori = model(ori_graph)

            labels = geo.data.Batch.from_data_list([training_set[i] for i in cur_batch]).y

            output = getOutput(embed_adv)
            loss = F.nll_loss(output, labels)
            grad_embed_adv = torch.autograd.grad(loss, modi_graph.x)#, allow_unused=True)

            output = getOutput(embed_ori)
            loss = F.nll_loss(output, labels)
            grad_embed_ori = torch.autograd.grad(loss, ori_graph.x)#, allow_unused=True)

            for i in range(len(trigger_idx)):
                cur_score = torch.ones((len(triggerList, )))
                for triggerCand in range(len(triggerList)):
                    cur_score[triggerCand] = torch.matmul((triggerList[triggerCand].x - modi_graph.x[-(triggerLen-i)]).mean(dim=0).unsqueeze(0),
                                                          (grad_embed_adv[0][-(triggerLen-i)]).unsqueeze(-1)).item()
                trigger_idx[i] = cur_score.argmin(-1).item()
        time_used = time.time() - start
        trigger = getTrigger(n=triggerLen, idx=trigger_idx)
        modi_graph = BatchAppend(test_set, trigger, pos_mode)
        embed_adv = model(modi_graph)
        output = getOutput(embed_adv)
        accu = (output.argmax(-1) == geo.data.Batch.from_data_list(test_set).y).int().sum().item() / len(test_set)
        log['trigger_idx'] = trigger_idx
        log['accu'] = accu
        log['timeCost'] = time_used
        result['length: {} mode: {}'.format(triggerLen, pos_mode)] = log

In [None]:
result

In [None]:
with open('nodeEmbeddingResult.json', 'w+') as fout:
    json.dump(result, fout)

### records:
* benign: 77.89%
* length=1: 63.61% (lowest degree), 
* length=2: 66.49%
* length=3: 65.41%

In [None]:
# for i in range(len(list(cands.keys()))):
#     trigger = getTrigger(n=1, idx=[i])
#     modi_graph = BatchAppend(dataset, trigger)
#     embed_adv = model(modi_graph)
#     output = getOutput(embed_adv)
#     accu = (output.argmax(-1) == geo.data.Batch.from_data_list(dataset).y).int().sum().item() / len(dataset)
#     print(i, accu)

In [None]:
# len(list(cands.keys()))

## on surrogate data

In [None]:
result = {}
for triggerLen in range(1, 4):
    for pos_mode in pos_modes:
        log = {}
        start = time.time()
        graph_idx = list(range(len(cldataList)))
        trigger_idx = random.choices(range(len(triggerList)), k=triggerLen)
        while len(graph_idx) > 0:
            trigger = getTrigger(n=triggerLen, idx=trigger_idx, weighted=True)
            cur_batch = random.choices(graph_idx, k=batch_size)
            graph_idx = [tmp for tmp in graph_idx if tmp not in cur_batch]
            modi_graph = BatchAppend([cldataList[i] for i in cur_batch], trigger)
            modi_graph.x.requires_grad_()
            embed_adv = model(modi_graph)

            ori_graph = geo.data.Batch.from_data_list([cldataList[i] for i in cur_batch]).to(args.device)
            ori_graph.x.requires_grad_()

            embed_ori = model(ori_graph)

            labels = geo.data.Batch.from_data_list([cldataList[i] for i in cur_batch]).y

            output = getOutput(embed_adv)
            loss = F.nll_loss(output, labels)
            grad_embed_adv = torch.autograd.grad(loss, modi_graph.x)#, allow_unused=True)

            output = getOutput(embed_ori)
            loss = F.nll_loss(output, labels)
            grad_embed_ori = torch.autograd.grad(loss, ori_graph.x)#, allow_unused=True)

            for i in range(len(trigger_idx)):
                cur_score = torch.ones((len(triggerList, )))
                for triggerCand in range(len(triggerList)):
                    cur_score[triggerCand] = torch.matmul((triggerList[triggerCand].x - modi_graph.x[-(triggerLen-i)]).mean(dim=0).unsqueeze(0),
                                                          (grad_embed_adv[0][-(triggerLen-i)]).unsqueeze(-1)).item()
                trigger_idx[i] = cur_score.argmin(-1).item()
        time_used = time.time() - start
        trigger = getTrigger(n=triggerLen, idx=trigger_idx, weighted=True)
        modi_graph = BatchAppend(test_set, trigger, pos_mode)
        embed_adv = model(modi_graph)
        output = getOutput(embed_adv)
        accu = (output.argmax(-1) == geo.data.Batch.from_data_list(test_set).y).int().sum().item() / len(test_set)
        log['trigger_idx'] = trigger_idx
        log['accu'] = accu
        log['timeCost'] = time_used
        result['length: {} mode: {}'.format(triggerLen, pos_mode)] = log

In [None]:
(grad_embed_adv[0][-(triggerLen-i)]).unsqueeze(-1).shape, (triggerList[triggerCand].x - modi_graph.x[-(triggerLen-i)]).mean(dim=0).unsqueeze(0).shape

In [None]:
triggerLen

In [None]:
min([tmp.x.shape[0] for tmp in cldataList])

In [None]:
with open('nodeEmbeddingResult_ClsImprs.json', 'w+') as fout:
    json.dump(result, fout)

In [None]:
fileNames = [tmp for tmp in os.listdir() if tmp.endswith('.json') and 'Result' in tmp]

In [None]:
overallResult = pd.DataFrame()
for fileName in fileNames:
    with open(fileName, 'r') as fin:
        tmp = json.load(fin)
    for key in list(tmp.keys()):
        overallResult = overallResult.append(pd.DataFrame({tmpKey: str(tmp[key][tmpKey]) for tmpKey in list(tmp[key].keys())}, 
                                index=['{} {}'.format(fileName.split('.')[0], key)]))

In [None]:
overallResult.to_csv('OverallResult_updated.csv')

In [73]:
result = {}
graph_idx = list(range(len(cldataList)))
for triggerLen in range(1, 4):
    for pos_mode in pos_modes:
        log = {}
        tmp = 0
        for i in range(5): # repeat several times
            trigger_idx = random.choices(range(len(triggerList)), k=triggerLen)
            trigger = getTrigger(n=triggerLen, idx=trigger_idx, weighted=True)
            modi_graph = BatchAppend(test_set, trigger, pos_mode)
            embed_adv = model(modi_graph)
            output = getOutput(embed_adv)
            tmp += (output.argmax(-1) == geo.data.Batch.from_data_list(test_set).y).int().sum().item() / len(test_set)
        log['accu'] = tmp / 5.0
        result['length: {} mode: {}'.format(triggerLen, pos_mode)] = log

In [76]:
pd.DataFrame(result).to_csv('random_baseline.csv')