# GraphSAGE Implementation
> Link Stealing Attack

#### Imports

In [1]:
import argparse
import time
import numpy as np
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.nn.pytorch.conv import SAGEConv
import matplotlib.pyplot as plt

Using backend: pytorch


#### Random

In [2]:
import random

# Seed every random number generator this notebook can draw from so that a
# re-run reproduces the same results.
random.seed(1)        # used by the synthetic social-network generation
np.random.seed(1)     # NOTE(review): seeded in case a library draws from NumPy's global RNG
torch.manual_seed(1)  # PyTorch RNG (weight initialisation, dropout)

#### Disable Warnings

In [3]:
import warnings
# Install a global "ignore" filter: every warning raised from here on is
# suppressed, whatever its category or the module it comes from.
warnings.filterwarnings('ignore')

#### Dataset

- Synthetic social network
- Nodes are people
- Two people are connected if they know each other
- Node features:
    - `City` they live in
    - `Street` they live on
    - `Age`
- `School` is the label the GNN aims to predict

In [4]:
def generate_DGLGraph(node_amount):
    """Generate a random, undirected social network as a DGLGraph.

    Every node is a person with the feature vector ``[city, street, age]``
    (``ndata['feat']``, float) and a school id in ``0..9`` (``ndata['label']``,
    int64).  Two people are linked, in both directions, when
    ``know_each_other`` accepts the pair.  Nodes are split by index into
    boolean ``train_mask`` / ``val_mask`` / ``test_mask`` node data: the first
    ``int(0.2 * node_amount)`` nodes are training nodes, the following
    ``int(0.4 * node_amount)`` validation nodes and the remainder test nodes.

    All randomness is drawn from the global ``random`` module, so seeding it
    makes the generated network reproducible.

    Args:
        node_amount: Number of people (nodes) to generate.

    Returns:
        A tuple ``(graph, number_of_classes, number_of_undirected_edges)``.
        The graph stores every undirected edge as two directed edges; the
        returned edge count is an ``int``.
    """
    print(" [+] Initialize Graph")
    G = dgl.DGLGraph()
    G.add_nodes(node_amount)

    print(" [+] Generate Features and Labels")
    features, labels = [], []
    for _ in range(node_amount):
        # The draw order (street, age, school) is kept on purpose: with a
        # seeded `random` module it yields the same people as before.
        street = random.randint(0, 49)
        age = random.randint(5, 20)
        school = random.randint(0, 9)
        # Ten consecutive streets form one city.
        features.append([street // 10, street, age])
        labels.append(school)

    # reshape keeps the (node_amount, 3) layout even for an empty graph
    feat = torch.tensor(features, dtype=torch.float).reshape(node_amount, 3)
    label = torch.tensor(labels, dtype=torch.int64)
    G.ndata['feat'] = feat
    G.ndata['label'] = label

    print(" [+] Generate Edges")
    # know_each_other only reads data['feat'] and data['label'], so a plain
    # dict of the two tensors is enough.
    node_data = {'feat': feat, 'label': label}
    src_list, dst_list = [], []
    undirected_edges = 0
    for i in range(node_amount):
        # The relation is symmetric, so each unordered pair (i < j) is tested
        # once; this also rules out self loops and duplicate edges.
        for j in range(i + 1, node_amount):
            if know_each_other(node_data, i, j):
                # store the undirected edge as two directed edges
                src_list += (i, j)
                dst_list += (j, i)
                undirected_edges += 1

    G.add_edges(src_list, dst_list)

    print(" [+] Generate Masks")
    train_val_split = int(node_amount * 0.2)
    val_test_split = train_val_split + int(node_amount * 0.4)

    node_ids = torch.arange(node_amount)
    G.ndata['train_mask'] = node_ids < train_val_split
    G.ndata['val_mask'] = (node_ids >= train_val_split) & (node_ids < val_test_split)
    G.ndata['test_mask'] = node_ids >= val_test_split

    # return Graph, number of classes, number of undirected edges
    return G, 10, undirected_edges

The label `School` is also used to decide whether two people know each other.

In [5]:
def know_each_other(data, i, j, threshold=0.5):
    """Decide whether persons ``i`` and ``j`` know each other.

    A similarity score is accumulated from three criteria and compared with
    ``threshold``:

    * city:   +0.25 if both live in the same city, otherwise +0.1
    * street: +0.25 if the street numbers differ by less than 4, otherwise +0.1
    * school: +0.5 if both attend the same school and their ages differ by
      less than 2, +0.2 if only the school matches, otherwise nothing

    The score does not depend on the order of ``i`` and ``j``.

    Args:
        data: Mapping with ``data['feat'][k]`` holding ``[city, street, age]``
            and ``data['label'][k]`` holding the school id of person ``k``
            (both indexed as tensors).
        i: Index of the first person.
        j: Index of the second person.
        threshold: Minimum score at which the two count as acquainted.

    Returns:
        ``True`` if the score reaches ``threshold``, ``False`` otherwise.
    """
    # Convert to plain Python numbers once, so every comparison below works on
    # scalars rather than on a mix of scalars and tensors.
    person_i = data['feat'][i].tolist()
    person_j = data['feat'][j].tolist()
    school_i = data['label'][i].item()
    school_j = data['label'][j].item()

    score = 0.0

    # City
    score += 0.25 if person_i[0] == person_j[0] else 0.1
    # Street
    score += 0.25 if abs(person_i[1] - person_j[1]) < 4 else 0.1
    # School & Age
    if school_i == school_j:
        score += 0.5 if abs(person_i[2] - person_j[2]) < 2 else 0.2

    return score >= threshold

#### Generate new Social Network

In [6]:
# Number of people (nodes) in the generated social network.
AMOUNT_OF_MEMBERS = 300
# Build the graph; also keep the class count and the edge count it reports.
social_network_graph, num_classes, num_edges = generate_DGLGraph(AMOUNT_OF_MEMBERS)

 [+] Initialize Graph
 [+] Generate Features and Labels
 [+] Generate Edges
 [+] Generate Masks


In [7]:
#nx_graph = social_network_graph.to_networkx().to_undirected()
#pos = nx.kamada_kawai_layout(nx_graph)
#nx.draw(nx_graph, pos, node_color=[[.6, .4, .8]])

#### GraphSAGE model

In [8]:
class GraphSAGE(nn.Module):
    """Stack of ``SAGEConv`` layers for node classification.

    The network consists of one input layer (``in_feats -> n_hidden``),
    ``n_layers - 1`` hidden layers (``n_hidden -> n_hidden``) and one output
    layer (``n_hidden -> n_classes``).  Activation and dropout are applied
    after every layer except the output layer; dropout is additionally
    applied to the raw input features.
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        """Create the layer stack.

        Args:
            in_feats: Size of the input node features.
            n_hidden: Size of the hidden representations.
            n_classes: Number of output classes (size of the logits).
            n_layers: Hidden-layer setting; ``n_layers - 1`` hidden layers are
                placed between the input and the output layer.
            activation: Callable applied after every non-output layer.
            dropout: Dropout probability.
            aggregator_type: Aggregator passed on to every ``SAGEConv``.
        """
        super().__init__()

        # (in, out) sizes in stacking order: input, hidden..., output
        layer_sizes = [(in_feats, n_hidden)]
        layer_sizes += [(n_hidden, n_hidden)] * (n_layers - 1)
        layer_sizes.append((n_hidden, n_classes))

        self.layers = nn.ModuleList(
            [SAGEConv(size_in, size_out, aggregator_type)
             for size_in, size_out in layer_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, graph, inputs):
        """Compute the class logits for every node of ``graph``.

        Args:
            graph: Graph handed to every layer.
            inputs: Input node features.

        Returns:
            The output of the last layer (no activation, no dropout).
        """
        last_index = len(self.layers) - 1
        hidden = self.dropout(inputs)
        for index, conv in enumerate(self.layers):
            hidden = conv(graph, hidden)
            if index == last_index:
                # the output layer yields raw logits
                continue
            hidden = self.dropout(self.activation(hidden))
        return hidden

#### Classification

In [9]:
def evaluate(model, graph, features, labels, nid):
    """Return the classification accuracy of ``model`` on the nodes ``nid``.

    The model is switched to evaluation mode and run without gradient
    tracking on the whole graph; only the rows selected by ``nid`` are scored.

    Args:
        model: Module called as ``model(graph, features)`` to get the logits.
        graph: Graph passed to the model.
        features: Node features passed to the model.
        labels: Ground-truth class of every node.
        nid: Indices of the nodes to evaluate.

    Returns:
        Fraction of the selected nodes whose highest-scoring class equals
        their label, as a float.
    """
    model.eval()
    with torch.no_grad():
        selected_logits = model(graph, features)[nid]
        targets = labels[nid]
        # index of the largest logit per row = predicted class
        predicted = selected_logits.max(dim=1).indices
        n_correct = (predicted == targets).sum().item()
    return float(n_correct) / len(targets)

In [10]:
def main(g, num_classes, num_edges, gpu=0, *,
         n_hidden=16, n_layers=2, dropout=0.5, aggregator_type='gcn',
         lr=0.02, weight_decay=5e-4, n_epochs=200):
    """Train a GraphSAGE node classifier on ``g`` and print its accuracy.

    The graph must carry the node data ``feat``, ``label``, ``train_mask``,
    ``val_mask`` and ``test_mask``.  The model is trained full-batch with Adam
    on the training nodes; after every epoch the loss, the validation
    accuracy and the throughput are printed, and the test accuracy is printed
    at the end.

    Args:
        g: Graph to train on.
        num_classes: Number of label classes (size of the model output).
        num_edges: Edge count shown in the data statistics.  The throughput
            figure uses the graph's own directed edge count instead.
        gpu: CUDA device index; a negative value selects the CPU.  If CUDA is
            requested but not available, training falls back to the CPU.
        n_hidden: Hidden size of the GraphSAGE layers.
        n_layers: Hidden-layer setting passed to ``GraphSAGE``.
        dropout: Dropout probability.
        aggregator_type: Aggregator used by the ``SAGEConv`` layers.
        lr: Learning rate of the Adam optimizer.
        weight_decay: Weight decay of the Adam optimizer.
        n_epochs: Number of training epochs.
    """
    # load and preprocess dataset
    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    in_feats = features.shape[1]
    n_classes = num_classes
    n_edges = num_edges

    print("""--------Data statistics--------
    
    Edges   %d
    Classes %d
    
    Train samples %d
    Val samples   %d
    Test samples  %d""" %
          (n_edges, n_classes,
           train_mask.int().sum().item(),
           val_mask.int().sum().item(),
           test_mask.int().sum().item()))

    # Only use CUDA when it was requested AND is actually present, so the
    # default gpu=0 also works on CPU-only machines.
    cuda = gpu >= 0 and torch.cuda.is_available()
    if gpu >= 0 and not cuda:
        print("\n    Cuda requested but not available, falling back to CPU")
    if cuda:
        torch.cuda.set_device(gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()
        print("\n    Cuda in use", gpu)

    # nonzero() on a 1-D mask yields shape (k, 1).  Squeezing only dim 1 keeps
    # a 1-D index tensor even when k == 1; a bare squeeze() would produce a
    # 0-dim tensor and break the row selection during evaluation.
    train_nid = train_mask.nonzero().squeeze(1)
    val_nid = val_mask.nonzero().squeeze(1)
    test_nid = test_mask.nonzero().squeeze(1)

    # graph preprocess; from here on n_edges is the directed edge count
    g = dgl.remove_self_loop(g)
    n_edges = g.number_of_edges()
    if cuda:
        g = g.int().to(gpu)

    # create GraphSAGE model
    model = GraphSAGE(in_feats,
                      n_hidden,
                      n_classes,
                      n_layers,
                      F.relu,
                      dropout,
                      aggregator_type)

    if cuda:
        model.cuda()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # per-epoch training durations (forward + backward + optimizer step)
    dur = []
    print("\n\n--------Training process--------\n")
    for epoch in range(n_epochs):
        model.train()
        t0 = time.time()

        # forward
        logits = model(g, features)
        loss = F.cross_entropy(logits[train_nid], labels[train_nid])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        dur.append(time.time() - t0)

        acc = evaluate(model, g, features, labels, val_nid)
        print("Epoch {:03d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                            acc, n_edges / np.mean(dur) / 1000))

    print()
    acc = evaluate(model, g, features, labels, test_nid)
    print("Test Accuracy {:.4f}".format(acc))

In [11]:
# Train and evaluate GraphSAGE on the generated social network (default gpu=0).
main(social_network_graph, num_classes, num_edges)

--------Data statistics--------
    
    Edges   6487
    Classes 10
    
    Train samples 60
    Val samples   120
    Test samples  120

    Cuda in use 0


--------Training process--------

Epoch 000 | Time(s) 0.4076 | Loss 10.2357 | Accuracy 0.0917 | ETputs(KTEPS) 31.83
Epoch 001 | Time(s) 0.2064 | Loss 5.1872 | Accuracy 0.0583 | ETputs(KTEPS) 62.86
Epoch 002 | Time(s) 0.1389 | Loss 4.8330 | Accuracy 0.1250 | ETputs(KTEPS) 93.40
Epoch 003 | Time(s) 0.1050 | Loss 5.5929 | Accuracy 0.1667 | ETputs(KTEPS) 123.54
Epoch 004 | Time(s) 0.0847 | Loss 4.6502 | Accuracy 0.1750 | ETputs(KTEPS) 153.09
Epoch 005 | Time(s) 0.0712 | Loss 3.4021 | Accuracy 0.1333 | ETputs(KTEPS) 182.13
Epoch 006 | Time(s) 0.0615 | Loss 3.0734 | Accuracy 0.1167 | ETputs(KTEPS) 210.85
Epoch 007 | Time(s) 0.0543 | Loss 2.8069 | Accuracy 0.0917 | ETputs(KTEPS) 239.04
Epoch 008 | Time(s) 0.0487 | Loss 2.8461 | Accuracy 0.0917 | ETputs(KTEPS) 266.31
Epoch 009 | Time(s) 0.0442 | Loss 2.6663 | Accuracy 0.0917 | ETputs(KT

Epoch 113 | Time(s) 0.0071 | Loss 2.1007 | Accuracy 0.0750 | ETputs(KTEPS) 1835.77
Epoch 114 | Time(s) 0.0071 | Loss 2.2674 | Accuracy 0.0833 | ETputs(KTEPS) 1839.53
Epoch 115 | Time(s) 0.0070 | Loss 2.1387 | Accuracy 0.0750 | ETputs(KTEPS) 1841.47
Epoch 116 | Time(s) 0.0070 | Loss 2.1044 | Accuracy 0.0667 | ETputs(KTEPS) 1849.27
Epoch 117 | Time(s) 0.0070 | Loss 2.2228 | Accuracy 0.0750 | ETputs(KTEPS) 1857.52
Epoch 118 | Time(s) 0.0070 | Loss 2.2184 | Accuracy 0.0833 | ETputs(KTEPS) 1865.90
Epoch 119 | Time(s) 0.0069 | Loss 2.2111 | Accuracy 0.1000 | ETputs(KTEPS) 1874.26
Epoch 120 | Time(s) 0.0069 | Loss 2.1644 | Accuracy 0.1000 | ETputs(KTEPS) 1882.14
Epoch 121 | Time(s) 0.0069 | Loss 2.1360 | Accuracy 0.0917 | ETputs(KTEPS) 1890.27
Epoch 122 | Time(s) 0.0068 | Loss 2.2020 | Accuracy 0.0750 | ETputs(KTEPS) 1898.29
Epoch 123 | Time(s) 0.0068 | Loss 2.0680 | Accuracy 0.0750 | ETputs(KTEPS) 1905.55
Epoch 124 | Time(s) 0.0068 | Loss 2.0334 | Accuracy 0.0750 | ETputs(KTEPS) 1913.58
Epoc