# PyTorch GAE (Graph Auto-Encoder)

## Dataset

For PyG dataset, refer to [Creating your own datasets](https://rusty1s.github.io/pytorch_geometric/build/html/notes/create_dataset.html) and [Data handling of graphs](https://rusty1s.github.io/pytorch_geometric/build/html/notes/introduction.html#data-handling-of-graphs).  

We only need ```data.x``` and ```data.edge_index``` for the original GAE.  

For bank transfer anomaly detection, we may need an extra field, ```data.edge_attr```, so that the edges are weighted.  
We also want to handle directed edges.  
All of this information could presumably be merged into the adjacency matrix.  

[GCNConv.forward](https://rusty1s.github.io/pytorch_geometric/build/html/_modules/torch_geometric/nn/conv/gcn_conv.html#GCNConv.forward)  
Since GCNConv (encoder of GAE) supports ```edge_weight```, ```data.edge_attr``` could store amount as weight and be fed into edge_weight.  

For node features, we only have the account custom type (0-3).  
For edge features, we have:  
  1. amount - total transfer amount during a given period.  
  2. count - total transfer count  
  3. source - transfer channel  
  

In [2]:
from tqdm import tqdm_notebook as tqdm
import multiprocessing


In [3]:
# data preprocessing

import os
import pandas as pd

import torch

# consts
root_path = '/home/xd/data/lz_trans/'

# Node table: one row per account, `type` is the only node feature.
node_df = pd.read_csv(os.path.join(root_path, 'extra_type.csv'))
type_list = node_df['type']

# Convert the whole column at once instead of looping over it in Python;
# unsqueeze(1) gives the same [num_nodes, 1] shape as a list of 1-element rows.
x = torch.tensor(type_list.values, dtype=torch.float).unsqueeze(1)

# Edge table: one row per (source, target) pair.
link_df = pd.read_csv(os.path.join(root_path, 'extra_link.csv'))
source_list = link_df['source']
target_list = link_df['target']
amount_list = link_df['amount'] # total transfer amount

# edge_index is [2, num_edges]: row 0 holds sources, row 1 holds targets.
edge_index = torch.stack([
    torch.tensor(source_list.values, dtype=torch.long),
    torch.tensor(target_list.values, dtype=torch.long),
])
# edge_attr is [num_edges, 1]: the transfer amount of each edge.
edge_attr = torch.tensor(amount_list.values, dtype=torch.float).unsqueeze(1)


In [4]:
# push graph into networkx

import networkx as nx

# Directed graph with one node per row of the node table.
G = nx.DiGraph()
G.add_nodes_from(range(len(type_list)))
# Walk both edge columns positionally in a single pass rather than doing a
# per-row label lookup (`source_list[i]`). The two are equivalent for the
# default 0..n-1 index that read_csv produces. `total` keeps the progress bar
# sized, since zip() has no length.
G.add_edges_from(tqdm(zip(source_list, target_list), total=len(source_list)))

print('networkX G - nodes: {}, edges: {}\n'.format(
    G.number_of_nodes(),
    G.number_of_edges()
))


HBox(children=(IntProgress(value=0, max=2770964), HTML(value='')))


networkX G - nodes: 2624591, edges: 2770964



In [None]:
# pull out subgraph of each node

# consts
bfs_depth = 5
max_reported_edges = 40000  # only BFS trees with more edges than this are reported

for node in tqdm(range(len(type_list))):
    # Edges of the BFS tree rooted at `node`, limited to `bfs_depth` hops.
    edges = list(nx.bfs_edges(G, node, depth_limit=bfs_depth))
    edge_count = len(edges)

    # The node list used to be rebuilt as [root] + one target per edge, so its
    # length was always edge_count + 1 and the old "edges != nodes - 1" check
    # could never fire. Only the size threshold decides what gets reported.
    if edge_count > max_reported_edges:
        print('bfs (depth {}) subgraph for node {} - nodes: {}, edges: {}'.format(
            bfs_depth,
            node,
            edge_count + 1,
            edge_count
        ))

HBox(children=(IntProgress(value=0, max=2624591), HTML(value='')))

bfs (depth 5) subgraph for node 2182 - nodes: 94200, edges: 94199
bfs (depth 5) subgraph for node 2249 - nodes: 94191, edges: 94190
bfs (depth 5) subgraph for node 2849 - nodes: 131912, edges: 131911
bfs (depth 5) subgraph for node 3145 - nodes: 131899, edges: 131898
bfs (depth 5) subgraph for node 3565 - nodes: 132009, edges: 132008
bfs (depth 5) subgraph for node 4701 - nodes: 94199, edges: 94198
bfs (depth 5) subgraph for node 5209 - nodes: 94412, edges: 94411
bfs (depth 5) subgraph for node 5390 - nodes: 131978, edges: 131977
bfs (depth 5) subgraph for node 6237 - nodes: 94191, edges: 94190
bfs (depth 5) subgraph for node 6342 - nodes: 131978, edges: 131977
bfs (depth 5) subgraph for node 6636 - nodes: 131899, edges: 131898
bfs (depth 5) subgraph for node 6828 - nodes: 131978, edges: 131977
bfs (depth 5) subgraph for node 6868 - nodes: 94191, edges: 94190
bfs (depth 5) subgraph for node 6983 - nodes: 94191, edges: 94190
bfs (depth 5) subgraph for node 7307 - nodes: 131899, edges: 1

In [3]:
# Since pulling out subgraph is very slow
# we are using parallel machines to speed up

from multiprocessing.managers import BaseManager

# consts
bfs_depth = 3


class QueueManager(BaseManager):
    """Client-side manager; the remote accessors are registered by name below."""


# Register the three accessors this client calls on the manager.
for accessor in ('get_task_queue', 'get_result_queue', 'get_init_dict'):
    QueueManager.register(accessor)

# Connect to the manager listening on localhost:50000.
manager = QueueManager(address=('127.0.0.1', 50000), authkey=b'asdfjkl;')
manager.connect()

task_queue = manager.get_task_queue()
result_queue = manager.get_result_queue()
init_data = manager.get_init_dict()
print(init_data.get('node_count'))

node_total = len(type_list)

# Submit one BFS job per node id.
for node in tqdm(range(node_total)):
    task = dict(node=node, bfs_depth=bfs_depth)
    task_queue.put(task)

# Collect one result per submitted job. Results are appended as they arrive,
# so `results` keeps whatever was received if the connection drops.
results = []
for node in tqdm(range(node_total)):
    result = result_queue.get()
    results.append(result)
    

  0%|          | 487/1631199 [00:00<05:35, 4864.66it/s]

1631199


100%|██████████| 1631199/1631199 [00:53<00:00, 30407.86it/s]
 76%|███████▌  | 1239223/1631199 [4:13:16<136:31:11,  1.25s/it]

EOFError: 

In [60]:
import os
import pandas as pd

from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data

# consts

class TranStatDataset(InMemoryDataset):
    """Single-graph in-memory dataset built from the notebook's tensors.

    NOTE(review): `process` does not read the files named in
    `raw_file_names`; it packs the module-level `x`, `edge_index` and
    `edge_attr` tensors, which must already exist when the dataset is first
    processed. The raw file names also differ from the CSV files the
    preprocessing cell reads ('extra_type.csv', 'extra_link.csv').
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # The base-class constructor has run; load the collated graph from
        # the first processed file.
        processed_file = self.processed_paths[0]
        self.data, self.slices = torch.load(processed_file)

    @property
    def raw_file_names(self):
        """Names of the raw files this dataset declares."""
        return ['nodes.csv', 'stat_links.csv']

    @property
    def processed_file_names(self):
        """Names of the processed files written by `process`."""
        return ['data.pt']

    def download(self):
        """Do nothing: this dataset has no download step."""

    def process(self):
        """Wrap the global tensors in one `Data` object and save it collated."""
        graphs = [Data(x=x, edge_index=edge_index, edge_attr=edge_attr)]

        # Optional user hooks: filter first, then transform.
        keep = self.pre_filter
        if keep is not None:
            graphs = [graph for graph in graphs if keep(graph)]

        convert = self.pre_transform
        if convert is not None:
            graphs = [convert(graph) for graph in graphs]

        collated, slices = self.collate(graphs)
        torch.save((collated, slices), self.processed_paths[0])
        
# Build the dataset under root_path and apply NormalizeFeatures as the
# transform.
# NOTE(review): `T` must already be in scope here; in this file
# `import torch_geometric.transforms as T` only appears in a later cell.
dataset = TranStatDataset(root_path, transform=T.NormalizeFeatures())
# process() stores exactly one graph, so index 0 is the whole dataset.
data = dataset[0]

print('PyG data - {}'.format(data))

PyG data - Data(edge_attr=[2770964, 1], edge_index=[2, 2770964], x=[1631199, 1])


# GAE Model

[Variational Graph Auto-Encoders](https://arxiv.org/pdf/1611.07308.pdf)

GAE and VGAE are originally designed for link prediction.  
  
Given an undirected, unweighted graph G(V, E), introduce an adjacency matrix A and node feature matrix X.  
The problem is to reconstruct adjacency matrix A*.  
  
In practice, we could use GAE/VGAEs to reconstruct A* (from actual A and X) to predict transfers between accounts.  
Predicted edges that differ from the actual ones are considered anomalies.  

In [18]:
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, GAE, VGAE

# consts
MODEL_TYPE = 'GAE' # which auto-encoder variant to build; one of ['GAE', 'VGAE']

# vars
kwargs = {'GAE': GAE, 'VGAE': VGAE} # maps MODEL_TYPE to the model class that wraps the encoder
channels = 16 # embedding dimensions (passed to Encoder as out_channels)

class Encoder(torch.nn.Module):
    """Two-layer GCN encoder used by the GAE / VGAE wrapper.

    The first layer maps `in_channels` to `2 * out_channels` and is followed
    by a ReLU. The second stage depends on the variant:

    * 'GAE'  - one GCN layer producing the embedding.
    * 'VGAE' - two parallel GCN layers producing (mu, logvar).

    Args:
        in_channels: Number of input node features.
        out_channels: Embedding size.
        model_type: 'GAE' or 'VGAE'. Defaults to the module-level
            `MODEL_TYPE` as it is at construction time.

    Raises:
        ValueError: If the resolved model type is neither 'GAE' nor 'VGAE'.
    """

    def __init__(self, in_channels, out_channels, model_type=None):
        super(Encoder, self).__init__()
        # Pin the variant on the instance. `forward` used to re-read the
        # global, so changing MODEL_TYPE after construction made it look for
        # layers that were never created, and an unknown value made it return
        # None silently.
        self.model_type = MODEL_TYPE if model_type is None else model_type
        if self.model_type not in ('GAE', 'VGAE'):
            raise ValueError(
                "model_type must be 'GAE' or 'VGAE', got {!r}".format(
                    self.model_type))

        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        if self.model_type == 'GAE':
            self.conv2 = GCNConv(hidden_channels, out_channels, cached=True)
        else:
            self.conv_mu = GCNConv(hidden_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(
                hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        """Encode node features.

        Returns:
            The embedding tensor for 'GAE', or a `(mu, logvar)` tuple for
            'VGAE'.
        """
        x = F.relu(self.conv1(x, edge_index))
        if self.model_type == 'GAE':
            return self.conv2(x, edge_index)
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)
        
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Wrap the GCN encoder in the model class selected by MODEL_TYPE.
model = kwargs[MODEL_TYPE](Encoder(dataset.num_features, channels)).to(device)

# Clear the node-level masks and labels before splitting the edges.
data.train_mask = data.val_mask = data.test_mask = data.y = None
# NOTE(review): the recorded run of this cell aborted with "not enough memory:
# you tried to allocate 2478GB". The graph has 1631199 nodes, and a dense
# N x N matrix with one byte per entry is ~2478 GB, so the allocation
# presumably comes from a dense adjacency built during the edge split.
# Confirm against the installed PyG version.
data = model.split_edges(data)
# This rebinds the module-level `x` and `edge_index` created in the
# preprocessing cell; train() and test() read these device copies as globals.
x, edge_index = data.x.to(device), data.edge_index.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

RuntimeError: $ Torch: not enough memory: you tried to allocate 2478GB. Buy new RAM! at /opt/conda/conda-bld/pytorch_1549628766161/work/aten/src/TH/THGeneral.cpp:201

# Training pipeline:

In [None]:
# consts
EPOCHS = 200

def train():
    """Run one full-batch optimisation step on the global model.

    Encodes the graph from the global `x` / `edge_index`, scores the training
    positive edges with the reconstruction loss, adds the KL term scaled by
    0.001 for the VGAE variant, then back-propagates and steps the optimizer.
    Returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(x, edge_index)
    objective = model.recon_loss(embeddings, data.train_pos_edge_index)
    if MODEL_TYPE == 'VGAE':
        # Variational variant only: add the KL regulariser.
        objective = objective + 0.001 * model.kl_loss()

    objective.backward()
    optimizer.step()

def test(pos_edge_index, neg_edge_index):
    """Score the model on the given positive and negative edges.

    The graph is encoded from the global `x` / `edge_index` in eval mode with
    gradients disabled; the result of `model.test` on the embeddings is
    returned unchanged.
    """
    model.eval()

    # The encoder forward pass needs no gradient tracking.
    with torch.no_grad():
        embeddings = model.encode(x, edge_index)

    scores = model.test(embeddings, pos_edge_index, neg_edge_index)
    return scores
    
# One optimisation step per epoch, then score the validation edges and log
# the two values returned by test() as AUC and AP.
for epoch in range(EPOCHS):
    train()
    auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index)
    
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

In [20]:
import requests
# Fetch the page with a bounded wait. Without `timeout`, requests can block
# indefinitely when the server does not answer.
r = requests.get('https://office.orientsoft.cn:8443', timeout=10)
print(r.text)

<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta http-equiv="x-ua-compatible" content="ie=edge,chrome=1" />
    <meta name="viewport" content="width=device-width" />
    <title>moop-ui</title>
    <link rel="stylesheet" href="/static/css/bootstrap4.min.css">
    <link rel="stylesheet" href="/static/css/open-iconic-bootstrap.min.css">
    <link rel="stylesheet" href="/static/css/main.css">
  <link rel="shortcut icon" href="/favicon.png"><link href="/css\index.css" rel="stylesheet"></head>

  <body>
    <div id="ice-container"></div>
    <script src="/static/js/jquery.slim.min.js" ></script>
    <script src="/static/js/popper.min.js" ></script>
    <script src="/static/js/bootstrap.min.js" ></script>
  <script type="text/javascript" src="/js\index.js"></script></body>
</html>

