# PyTorch GAE (Graph Auto-Encoder)

## Dataset

For the PyG dataset format, refer to [Creating your own datasets](https://rusty1s.github.io/pytorch_geometric/build/html/notes/create_dataset.html) and [Data handling of graphs](https://rusty1s.github.io/pytorch_geometric/build/html/notes/introduction.html#data-handling-of-graphs).  

We only need ```data.x``` and ```data.edge_index``` for the original GAE.  

For bank transfer anomaly detection, we may need an extra field, ```data.edge_attr```, so that the edges are weighted.  
We also want to handle directed edges.  
Presumably all of this information could be merged into the adjacency matrix.  

[GCNConv.forward](https://rusty1s.github.io/pytorch_geometric/build/html/_modules/torch_geometric/nn/conv/gcn_conv.html#GCNConv.forward)  
Since GCNConv (the encoder layer used by GAE) supports ```edge_weight```, ```data.edge_attr``` could store the transfer amount as the weight and be passed in as ```edge_weight```.  

For node features, we only have the account custom type (values 0-3).  
For edge features, we have:  
  1. amount - total transfer amount during a given period.  
  2. count - total transfer count.  
  3. source - transfer channel.  
  

In [None]:
from tqdm import tqdm
import multiprocessing


In [34]:
# data preprocessing

import os
import pandas as pd

import torch

# consts
root_path = '/home/voyager/project/lz-graph/data/'

# Node table: the CUSTOMTYPE column is the only node feature that is used.
node_df = pd.read_csv(os.path.join(root_path, 'nodes.csv'))
type_list = node_df['CUSTOMTYPE']

# zip() over a single column yields 1-tuples, i.e. one feature row per node.
x = torch.tensor(list(zip(type_list)), dtype=torch.float)

# Edge table: endpoints plus the summed amount of each edge.
link_df = pd.read_csv(os.path.join(root_path, 'stat_links.csv'))
source_list, target_list = link_df['source'], link_df['target']
amount_list = link_df['sum'] # total transfer amount

# Row 0 holds the edge sources, row 1 the edge targets.
edge_index = torch.tensor(
    [source_list.tolist(), target_list.tolist()], dtype=torch.long)
# One attribute row per edge, carrying that edge's amount.
edge_attr = torch.tensor(list(zip(amount_list)), dtype=torch.float)


In [35]:
# push graph into networkx

import networkx as nx

# Directed graph with one node per row of the node table (ids 0..N-1).
# NOTE(review): assumes the 'source'/'target' values in stat_links.csv are
# those same 0-based row positions - confirm against the data export.
G = nx.DiGraph()
G.add_nodes_from(range(len(type_list)))
# Walk both endpoint columns in lockstep as plain Python ints. This avoids a
# label-based Series lookup for every edge and keeps numpy scalar objects out
# of the adjacency structure.
G.add_edges_from(zip(source_list.tolist(), target_list.tolist()))

print('networkX G - nodes: {}, edges: {}\n'.format(G.number_of_nodes(), G.number_of_edges()))


networkX G - nodes: 1631199, edges: 2770964



In [50]:
# pull out subgraph of each node
#
# For every node, collect the edges reachable within `bfs_depth` hops and the
# nodes they touch. Results are overwritten on each iteration, so only the
# last node's subgraph survives the loop.
# NOTE: the recorded run of this cell was interrupted by hand; its progress
# bar showed on the order of a second per node over 1631199 nodes.

# consts
bfs_depth = 3

node = None
nodes, edges = [], []
for node in tqdm(range(len(type_list))):
    edges = list(nx.bfs_edges(G, node, depth_limit=bfs_depth))
    nodes = [node] + [v for _, v in edges]

# Report the node that was actually processed last (the message used to name a
# fixed node id), and skip the summary when there were no nodes at all.
if node is not None:
    print('bfs (depth {}) subgraph for node {} - nodes: {}, edges: {}'.format(bfs_depth, node, len(nodes), len(edges)))



  0%|          | 0/1631199 [00:00<?, ?it/s][A[A

  0%|          | 1/1631199 [00:03<1666:27:21,  3.68s/it][A[A

  0%|          | 2/1631199 [00:03<1196:06:13,  2.64s/it][A[A

  0%|          | 3/1631199 [00:05<1107:41:42,  2.44s/it][A[A

  0%|          | 5/1631199 [00:08<983:03:42,  2.17s/it] [A[A

  0%|          | 6/1631199 [00:09<716:33:16,  1.58s/it][A[A

  0%|          | 7/1631199 [00:09<527:46:32,  1.16s/it][A[A

  0%|          | 10/1631199 [00:11<459:20:59,  1.01s/it][A[A

  0%|          | 12/1631199 [00:11<334:40:04,  1.35it/s][A[A

  0%|          | 13/1631199 [00:13<502:25:58,  1.11s/it][A[A

  0%|          | 14/1631199 [00:16<769:14:02,  1.70s/it][A[A

  0%|          | 15/1631199 [00:16<566:43:35,  1.25s/it][A[A

  0%|          | 20/1631199 [00:18<451:21:36,  1.00it/s][A[A

  0%|          | 21/1631199 [00:18<344:20:22,  1.32it/s][A[A

  0%|          | 22/1631199 [00:19<267:05:51,  1.70it/s][A[A

  0%|          | 23/1631199 [00:19<213:14:54,  2.12it

KeyboardInterrupt: 

In [33]:
import os

import pandas as pd
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset

# consts

class TranStatDataset(InMemoryDataset):
    """Single-graph in-memory dataset built from the transfer statistics.

    The graph itself is not parsed here: `process` wraps the module-level
    tensors `x`, `edge_index` and `edge_attr`, which must already exist when
    the dataset is first processed.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        # `pre_filter` is not forwarded, so it keeps the base-class default.
        super().__init__(root, transform, pre_transform)
        # Load the collated graph written by `process`.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the raw CSV files this dataset is associated with."""
        return ['nodes.csv', 'stat_links.csv']

    @property
    def processed_file_names(self):
        """Name of the file that holds the processed, collated graph."""
        return ['data.pt']

    def download(self):
        """Do nothing: the raw files are expected to be present locally."""
        # we don't download
        pass

    def process(self):
        """Wrap the global tensors into one `Data` graph and save it.

        The optional `pre_filter` and `pre_transform` hooks are applied, in
        that order, before the list is collated and written to
        `processed_paths[0]`.
        """
        graphs = [Data(x=x, edge_index=edge_index, edge_attr=edge_attr)]

        keep = self.pre_filter
        if keep is not None:
            graphs = list(filter(keep, graphs))

        convert = self.pre_transform
        if convert is not None:
            graphs = list(map(convert, graphs))

        merged, slices = self.collate(graphs)
        torch.save((merged, slices), self.processed_paths[0])
        
# Instantiate the dataset rooted at root_path and take its only graph
# (process() stores a single Data object).
# NOTE(review): NormalizeFeatures is applied to a one-column feature matrix;
# presumably a per-row normalisation collapses every non-zero account type to
# the same value - confirm this is intended.
feature_transform = T.NormalizeFeatures()
dataset = TranStatDataset(root_path, transform=feature_transform)
data = dataset[0]

summary = 'PyG data - {}'.format(data)
print(summary)

Processing...
networkX G - nodes: 1631199, edges: 2770964

bfs subgraph for node 1 - nodes: 822252, edges: 0
Done!
PyG data - Data(edge_attr=[2770964, 1], edge_index=[2, 2770964], x=[1631199, 1])


# GAE Model

[Variational Graph Auto-Encoders](https://arxiv.org/pdf/1611.07308.pdf)

GAE and VGAE are originally designed for link prediction.  
  
Given an undirected, unweighted graph G(V, E), introduce an adjacency matrix A and node feature matrix X.  
The problem is to reconstruct adjacency matrix A*.  
  
In practice, we could use GAE/VGAEs to reconstruct A* (from actual A and X) to predict transfers between accounts.  
Predicted edges that differ from the actual edges are considered anomalies.  

In [18]:
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, GAE, VGAE

# consts
# Selects both the wrapper class looked up in `kwargs` and the layers built by Encoder.
MODEL_TYPE = 'GAE' # one of ['GAE', 'VGAE']

# vars
# Maps each MODEL_TYPE value to the model class that wraps the encoder.
kwargs = {'GAE': GAE, 'VGAE': VGAE}
channels = 16 # embedding dimensions (passed to Encoder as out_channels)

class Encoder(torch.nn.Module):
    """Two-layer GCN encoder whose output head depends on MODEL_TYPE.

    The first layer maps `in_channels` to `2 * out_channels`. For 'GAE' a
    single second layer produces the embedding; for 'VGAE' two parallel
    second layers produce the mean and the log-variance.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        if MODEL_TYPE == 'GAE':
            self.conv2 = GCNConv(hidden_channels, out_channels, cached=True)
        elif MODEL_TYPE == 'VGAE':
            self.conv_mu = GCNConv(hidden_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        """Encode node features.

        Returns the embedding for 'GAE', a `(mu, logvar)` pair for 'VGAE',
        and None for any other MODEL_TYPE.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        if MODEL_TYPE == 'GAE':
            return self.conv2(hidden, edge_index)
        if MODEL_TYPE == 'VGAE':
            mu = self.conv_mu(hidden, edge_index)
            logvar = self.conv_logvar(hidden, edge_index)
            return mu, logvar
        return None
        
# Use the GPU when torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Wrap the encoder in the model class selected by MODEL_TYPE.
model_cls = kwargs[MODEL_TYPE]
model = model_cls(Encoder(dataset.num_features, channels)).to(device)

# Clear the node-level masks and labels before the edges are split.
data.train_mask = data.val_mask = data.test_mask = data.y = None
# NOTE(review): the recorded run of this cell stopped with "not enough memory:
# you tried to allocate 2478GB". 1631199 ** 2 bytes is about 2478 GiB, so
# presumably split_edges allocates a dense node-by-node structure - confirm
# before running this on the full graph.
data = model.split_edges(data)
x, edge_index = (
    data.x.to(device),
    data.edge_index.to(device),
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

RuntimeError: $ Torch: not enough memory: you tried to allocate 2478GB. Buy new RAM! at /opt/conda/conda-bld/pytorch_1549628766161/work/aten/src/TH/THGeneral.cpp:201

# Training pipeline:

In [None]:
# consts
# Number of iterations of the training loop; each one calls train() once.
EPOCHS = 200

def train():
    """Run one optimisation step on the training edges.

    Encodes the module-level `x` / `edge_index`, scores the embeddings with
    the model's reconstruction loss on `data.train_pos_edge_index`, adds the
    KL term scaled by 0.001 when MODEL_TYPE is 'VGAE', then backpropagates
    and steps the optimizer. Returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(x, edge_index)
    objective = model.recon_loss(embeddings, data.train_pos_edge_index)
    if MODEL_TYPE == 'VGAE':
        kl_penalty = 0.001 * model.kl_loss()
        objective = objective + kl_penalty

    objective.backward()
    optimizer.step()

def test(pos_edge_index, neg_edge_index):
    """Score the model on the given positive and negative edge sets.

    Puts the model in eval mode, encodes the module-level `x` / `edge_index`
    without gradient tracking, and returns whatever `model.test` yields for
    the resulting embeddings (the training loop unpacks it as `auc, ap`).
    """
    model.eval()

    # Inference only: gradient tracking is switched off for the encoding.
    with torch.no_grad():
        embeddings = model.encode(x, edge_index)

    scores = model.test(embeddings, pos_edge_index, neg_edge_index)
    return scores
    
# One training step per epoch, followed by scoring on the validation edge sets.
epoch_report = 'Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'
for epoch in range(EPOCHS):
    train()
    auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index)

    print(epoch_report.format(epoch, auc, ap))

In [20]:
import requests

# Fetch the page and dump its body. requests applies no timeout unless one is
# given, so bound the wait to keep an unreachable host from hanging the kernel.
r = requests.get('https://office.orientsoft.cn:8443', timeout=10)
print(r.text)

<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta http-equiv="x-ua-compatible" content="ie=edge,chrome=1" />
    <meta name="viewport" content="width=device-width" />
    <title>moop-ui</title>
    <link rel="stylesheet" href="/static/css/bootstrap4.min.css">
    <link rel="stylesheet" href="/static/css/open-iconic-bootstrap.min.css">
    <link rel="stylesheet" href="/static/css/main.css">
  <link rel="shortcut icon" href="/favicon.png"><link href="/css\index.css" rel="stylesheet"></head>

  <body>
    <div id="ice-container"></div>
    <script src="/static/js/jquery.slim.min.js" ></script>
    <script src="/static/js/popper.min.js" ></script>
    <script src="/static/js/bootstrap.min.js" ></script>
  <script type="text/javascript" src="/js\index.js"></script></body>
</html>

