In [1]:
import os
import numpy as np
from scipy.linalg import eigh
from scipy import sparse
from node_embedding_attack.utils import *
from node_embedding_attack.embedding import *
from node_embedding_attack.perturbation_attack import *

### Load and preprocess the data

In [4]:
def load_cora(data_dir, largest_cc=False):
    """Read the Cora citation graph; node features are not loaded.

    Parameters
    ----------
    data_dir : str
        Directory containing ``cora.cites``; a leading ``~`` is expanded.
    largest_cc : bool, optional
        When True, keep only the largest connected component and print its
        node and edge counts.

    Returns
    -------
    (g_nx, adj_matrix)
        The networkx graph and its adjacency matrix as a scipy CSR matrix,
        with rows/columns following the graph's node iteration order.
    """
    cites_path = os.path.join(data_dir, "cora.cites")
    g_nx = nx.read_edgelist(path=os.path.expanduser(cites_path))

    # Every edge in this dataset is a citation; tag it as such.
    for _, _, attrs in g_nx.edges(data=True):
        attrs['label'] = 'cites'

    if largest_cc:
        # Isolated nodes and small subgraphs do not stop the algorithm from
        # running; dropping them here is optional.
        components = (
            g_nx.subgraph(nodes).copy() for nodes in nx.connected_components(g_nx)
        )
        g_nx = max(components, key=len)
        print("Largest subgraph statistics: {} nodes, {} edges".format(
            g_nx.number_of_nodes(), g_nx.number_of_edges()))

    # Go through a dense array first, then convert to CSR.
    adj_matrix = sparse.csr_matrix(nx.to_numpy_array(g_nx))

    return g_nx, adj_matrix

In [5]:
def load_citeseer(data_home):
    """Load the CiteSeer citation graph; node features are not returned.

    Parameters
    ----------
    data_home : str
        Directory containing ``citeseer.content`` and ``citeseer.cites``;
        a leading ``~`` is expanded, matching ``load_cora``.

    Returns
    -------
    (G, adj_matrix)
        ``G`` is the networkx graph whose nodes are relabelled to the integer
        row position of the paper in ``citeseer.content``. ``adj_matrix`` is
        its adjacency matrix in CSR format (the same format ``load_cora``
        returns). Rows/columns follow ``sorted(G.nodes())``, so row ``k`` is
        the k-th smallest node id present in the edge list, which is not
        necessarily node id ``k`` if some papers have no edges.
    """
    data_home = os.path.expanduser(data_home)

    # The content table is only needed for its index (the paper ids); the
    # feature and label columns are ignored.
    df = pd.read_csv(
        os.path.join(data_home, "citeseer.content"),
        sep=r"\s+",
        header=None,
        index_col=0,
    )
    df.index = df.index.map(str)

    edge_list_df = pd.read_csv(
        os.path.join(data_home, "citeseer.cites"), sep=r"\s+", dtype=str, header=None
    )

    # Paper id (string) -> integer row position in citeseer.content.
    idx_map = {j: i for i, j in enumerate(df.index)}

    H = nx.from_pandas_edgelist(edge_list_df, 0, 1)
    G = nx.relabel.relabel_nodes(H, idx_map)

    # This dataset has about 15 nodes in the edge list that don't have corresponding entries
    # in citeseer.content, that is don't have features. They were not relabelled above, so
    # they are still strings; remove them along with all the edges to/from them.
    nodes_to_remove = [n for n in G.nodes() if isinstance(n, str)]
    G.remove_nodes_from(nodes_to_remove)

    # CSR keeps the return type consistent with load_cora.
    adj_matrix = nx.to_scipy_sparse_matrix(G, nodelist=sorted(G.nodes()), format="csr")

    return G, adj_matrix


In [None]:
if False:
    # Disabled cell: reads cora.cites directly and builds the CSR adjacency
    # matrix inline, without going through load_cora.
    data_dir = os.path.expanduser("~/data/cora/")
    cora_location = os.path.expanduser(os.path.join(data_dir, "cora.cites"))
    g_nx = nx.read_edgelist(path=cora_location)
    adj_matrix = sparse.csr_matrix(nx.to_numpy_array(g_nx))

### Select the dataset

In [7]:
dataset = "cora"  # or "citeseer"

# Supported datasets mapped to (data directory, loader function).
# TODO: How about polblogs?
dataset_registry = {
    "cora": ("~/data/cora", load_cora),
    "citeseer": ("~/data/citeseer/", load_citeseer),
}

# Fail loudly on an unsupported name instead of silently falling back to
# load_cora with an undefined (or stale) data_dir.
if dataset not in dataset_registry:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_registry)}"
    )
data_dir, load_data = dataset_registry[dataset]

In [11]:
# Load the selected dataset with the loader chosen above.
g_nx, adj_matrix = load_data(data_dir)
n_nodes = adj_matrix.shape[0]
# The example code selects the largest connected component, but we don't do
# that here. Open question: does this affect the quality of the attacks?

In [13]:
# Sanity check: the adjacency matrix should be square with n_nodes rows.
adj_matrix.shape, n_nodes

((2708, 2708), 2708)

In [14]:
# Show which matrix class the loader returned.
type(adj_matrix)

scipy.sparse.csr.csr_matrix

In [None]:
if False:
    # Disabled alternative: load the graph from the packaged .npz file and
    # pass its adjacency matrix and labels through standardize.
    graph = load_dataset('data/cora.npz')
    adj_matrix, labels = standardize(graph['adj_matrix'], graph['labels'])
    n_nodes = adj_matrix.shape[0]

### Set hyperparameters

In [29]:
n_flips = 1000  # number of edge flips requested from attack_graph
dim = 32  # forwarded to perturbation_top_flips; presumably the embedding dimension -- TODO confirm
window_size = 5  # forwarded to perturbation_top_flips; presumably the random-walk window size -- TODO confirm
attack_method = "add"  # "add" inserts edges; any other value makes attack_graph remove edges

### Compute the attacked graph

In [24]:
def attack_graph(adj_matrix, n_flips, dim, 
                 window_size, seed=0, method="add", n_candidates=None):
    """Attack a graph by adding or removing edges.

    A pool of candidate edge flips is generated, ``perturbation_top_flips``
    picks ``n_flips`` of them, and only those selected flips are applied
    (symmetrically) to a dense copy of the adjacency matrix.

    Parameters
    ----------
    adj_matrix : scipy sparse matrix
        Adjacency matrix of the clean graph; it is not modified.
    n_flips : int
        Number of edge flips to apply.
    dim : int
        Forwarded to ``perturbation_top_flips``.
    window_size : int
        Forwarded to ``perturbation_top_flips``.
    seed : int, optional
        Seed forwarded to the candidate generators.
    method : str, optional
        ``"add"`` draws candidates with ``generate_candidates_addition``;
        any other value uses ``generate_candidates_removal``.
    n_candidates : int or None, optional
        Size of the candidate pool when ``method == "add"``. Defaults to
        ``n_flips``, which reproduces the previous behaviour.
        NOTE(review): with a pool no larger than the budget every candidate
        is selected, so the ranking has no effect; pass a larger pool to make
        the selection meaningful.

    Returns
    -------
    numpy.ndarray
        Dense adjacency matrix of the attacked graph.
    """
    if method == "add":
        if n_candidates is None:
            n_candidates = n_flips
        candidates = generate_candidates_addition(adj_matrix=adj_matrix,
                                                  n_candidates=n_candidates,
                                                  seed=seed)
    else:
        candidates = generate_candidates_removal(adj_matrix=adj_matrix,
                                                 seed=seed)

    # Assumes perturbation_top_flips returns the selected (row, col) pairs as
    # an (n_flips, 2) integer array, as in the node_embedding_attack reference
    # example -- TODO confirm against the installed version.
    our_flips = perturbation_top_flips(adj_matrix,
                                       candidates,
                                       n_flips,
                                       dim,
                                       window_size)

    A = np.array(adj_matrix.todense())
    rows, cols = our_flips[:, 0], our_flips[:, 1]

    # Apply only the selected flips. Previously every candidate was flipped,
    # which in removal mode removed the whole candidate pool, not n_flips edges.
    # Both directions read from the untouched copy A so the result stays symmetric.
    A_flipped = A.copy()
    A_flipped[rows, cols] = 1 - A[rows, cols]
    A_flipped[cols, rows] = 1 - A[cols, rows]

    return A_flipped  # The attacked adjacency matrix

In [30]:
# Run one attack with the hyperparameters configured above.
A_flipped = attack_graph(
    adj_matrix=adj_matrix,
    n_flips=n_flips,
    dim=dim,
    window_size=window_size,
    method=attack_method,
    seed=0,
)

### Save attacked graph to disk

In [31]:
# Version counter that is embedded in the saved file name.
i = 1
ele = 'attack'
# Output layout: attacked_datasets/<dataset>/<ele>/<attack_method>
dir_name = os.path.join("attacked_datasets", dataset, ele, attack_method)
print(dir_name)

attacked_datasets/cora/attack/add


In [33]:
# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists test.
os.makedirs(dir_name, exist_ok=True)
file_name = f"{dataset}_{ele}_{attack_method}_{n_flips}_v{i}"
print(f"file_name: {file_name}")
# np.save appends ".npy" because file_name carries no extension.
np.save(os.path.join(dir_name, file_name), A_flipped)

file_name: cora_attack_add_1000_v1


In [35]:
# Signed flip budgets for the sweep below: a negative value requests that many
# edge removals, a positive value that many edge additions.
num_flips = [ -2000, -1000, -500, 500, 1000, 2000, 5000 ]

In [37]:
# Sweep over the signed budgets in num_flips, saving each attacked graph both
# as a dense .npy adjacency matrix and as a pickled networkx graph.
#
# NOTE(review): the loop reuses the names n_flips, method, A_flipped and
# file_name, so the values set in earlier cells are overwritten after it runs.
# NOTE(review): dir_name and the file name embed the global attack_method,
# not the per-iteration method, so removal runs are written as
# "..._add_<n>_remove". Kept as-is because other code may load these names.

# The target directory does not change between iterations; create it once.
os.makedirs(dir_name, exist_ok=True)

for n_flips in num_flips:
    print(f"Calculating for n_flips={n_flips}")
    # The sign selects the attack type; the magnitude is the flip budget.
    method = "remove" if n_flips < 0 else "add"
    n_flips = abs(n_flips)

    A_flipped = attack_graph(adj_matrix=adj_matrix,
                             n_flips=n_flips,
                             dim=dim,
                             window_size=window_size,
                             method=method,
                             seed=0)

    file_name = f"{dataset}_{ele}_{attack_method}_{n_flips}_{method}"
    print(f"file_name: {file_name}")
    # np.save appends ".npy" because file_name carries no extension.
    np.save(os.path.join(dir_name, file_name), A_flipped)

    graph = nx.from_numpy_array(A_flipped)
    file_name += ".gpickle"
    nx.write_gpickle(graph, os.path.join(dir_name, file_name))

Calculating for n_flips=-2000
file_name: cora_attack_add_2000_remove
Calculating for n_flips=-1000
file_name: cora_attack_add_1000_remove
Calculating for n_flips=-500
file_name: cora_attack_add_500_remove
Calculating for n_flips=500
file_name: cora_attack_add_500_add
Calculating for n_flips=1000
file_name: cora_attack_add_1000_add
Calculating for n_flips=2000
file_name: cora_attack_add_2000_add
Calculating for n_flips=5000
file_name: cora_attack_add_5000_add


In [38]:
# Show the shape of the clean adjacency matrix after the sweep.
adj_matrix.shape

(2708, 2708)