In [32]:
#%%
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.nn.models import VGAE
import anomaly_injection

In [47]:
# Citation graph to perturb. The other Planetoid options tried in this
# notebook are "CiteSeer" and "PubMed" (each with root "data/<name>").
dataset_name = "Cora"
dataset = Planetoid(root=f"data/{dataset_name}", name=dataset_name)

In [48]:
data = dataset[0]

# Anomaly budget: 10% of the nodes, split evenly between the two anomaly types.
total_anomaly_count = data.x.shape[0] // 10
struct_anomaly_count = total_anomaly_count // 2
att_anomaly_count = total_anomaly_count // 2

# Injecting attribute anomalies
new_data, att_anomalies = anomaly_injection.attribute_perturb(data, att_anomaly_count)

# Injecting structural anomalies on top of the attribute-perturbed graph.
# The third argument is the number of groups of `num_per_clique` nodes, so the
# total is rounded down to a multiple of `num_per_clique`.
num_per_clique = 5
new_data, struct_anomalies = anomaly_injection.structurally_perturb(
    new_data, num_per_clique, struct_anomaly_count // num_per_clique
)
# Collapse to a 1-D tensor (presumably node indices, one row per clique before
# flattening -- confirm against anomaly_injection).
struct_anomalies = struct_anomalies.flatten()

Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...


In [49]:
# Report how many anomalies of each kind were actually injected.
num_struct_anomalies = len(struct_anomalies)
num_att_anomalies = len(att_anomalies)
print(f"Total anomalies: {total_anomaly_count}")
print(f"Structural anomalies: {num_struct_anomalies}")
print(f"Attribute anomalies: {num_att_anomalies}")

Total anomalies: 270
Structural anomalies: 135
Attribute anomalies: 135


In [50]:
import os

# Persist the perturbed graph and both anomaly index tensors side by side.
output_dir = "perturbed_data/Cora"
os.makedirs(output_dir, exist_ok=True)
artifacts = (
    (new_data, "perturbed_data.pt"),
    (att_anomalies, "att_anomalies.pt"),
    (struct_anomalies, "struct_anomalies.pt"),
)
for artifact, filename in artifacts:
    torch.save(artifact, f"{output_dir}/{filename}")

In [26]:
# Rebind `dataset` to CiteSeer; every later cell that reads `dataset` / `data`
# now sees this graph instead of the Cora one loaded earlier.
dataset = Planetoid(root="data/CiteSeer", name="CiteSeer")

In [27]:
import anomaly_injection

In [28]:
# Take the first graph object from the dataset.
data = dataset[0]

In [29]:
# Peek at column 2 of the 2 x num_edges edge list: the two endpoints of one edge.
data.edge_index[:,2]

tensor([486,   1])

In [None]:
# Structural perturbation first, then attribute perturbation on the result.
# NOTE(review): the third argument (number of cliques in the Cora cell) is
# omitted here, leaving a dangling comma; this only works if
# `structurally_perturb` has a default for it -- confirm and pass it explicitly.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the variables below may be stale or undefined in a fresh kernel.
new_data, struct_anomalies = anomaly_injection.structurally_perturb(data, 5, )
# 475 attribute anomalies is a hard-coded count, unlike the 10%-of-nodes budget
# used for Cora -- TODO confirm the intended value.
new_new_data, att_anomalies = anomaly_injection.attribute_perturb(new_data, 475)

Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resamp

KeyboardInterrupt: 

In [None]:
# Merge both anomaly index sets and drop duplicates (a node can be hit by
# both perturbations).
combined_anomalies = torch.cat((struct_anomalies.flatten(), att_anomalies))
all_anomalies = torch.unique(combined_anomalies)
print(all_anomalies.shape)

torch.Size([936])


In [None]:
# Sanity-check what kind of object the dataset is before indexing into it.
print(type(dataset))
# %%
data = dataset[0]

# Node count, then the attribute names stored on the graph object.
for summary in (data.num_nodes, data.keys()):
    print(summary)

<class 'torch_geometric.datasets.planetoid.Planetoid'>
3327
['edge_index', 'y', 'train_mask', 'val_mask', 'test_mask', 'x']


In [12]:
# Display the full edge list (2 x num_edges; one column per edge).
data.edge_index

tensor([[ 628,  158,  486,  ..., 2820, 1643,   33],
        [   0,    1,    1,  ..., 3324, 3325, 3326]])

In [24]:
# Save the graph after both perturbations.
# NOTE(review): the filename says "pubmed", but the only dataset loads visible
# in this notebook are Cora and CiteSeer (PubMed is commented out) -- confirm
# which graph `new_new_data` actually held when this ran.
torch.save(new_new_data, "pubmed_anomalous_graph.pt")

In [25]:
# Save the de-duplicated anomaly indices that go with the saved graph.
# NOTE(review): "pubmed" in the filename may not match the dataset currently
# loaded in `dataset` -- confirm before reusing this file.
torch.save(all_anomalies, "pubmed_anomalous_nodes.pt")

In [3]:
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges

In [4]:
class VariationalGCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder with separate heads for the VGAE latent parameters.

    A shared first layer maps `in_channels` to `2 * out_channels`; two parallel
    GCN layers then map that hidden representation to `out_channels` each,
    returned as the `(mu, logstd)` pair.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        # cached=True is only appropriate for transductive learning.
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        self.conv_mu = GCNConv(hidden_channels, out_channels, cached=True)
        self.conv_logstd = GCNConv(hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        """Return `(mu, logstd)` computed from features `x` over `edge_index`."""
        hidden = self.conv1(x, edge_index).relu()
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd

In [5]:
# Check the edge-list shape (2 x num_edges).
# NOTE(review): the recorded output shows over 114 million edges, far more than
# the edge list displayed for the loaded dataset elsewhere in this notebook;
# presumably `data` pointed at a perturbed graph from an earlier session -- confirm.
data.edge_index.shape

torch.Size([2, 114615892])

In [6]:
# --- Hyper-parameters ---
out_channels = 2  # latent dimensionality
num_features = dataset.num_features
epochs = 300

# --- Model and inputs on the target device ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = VariationalGCNEncoder(num_features, out_channels)
model = VGAE(encoder).to(device)
x = data.x.to(device)

# --- Training edges: keep an edge only when both endpoints are training nodes ---
edge_src, edge_dst = data.edge_index
train_edge_mask = data.train_mask[edge_src] & data.train_mask[edge_dst]
train_pos_edge_index = data.edge_index[:, train_edge_mask].to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [7]:
def train():
    """Run one optimisation step on the training edges and return the loss.

    Uses the notebook-level `model`, `optimizer`, `x`, `train_pos_edge_index`
    and `data`. The loss is the reconstruction loss on the training edges plus
    the KL term scaled by `1 / data.num_nodes`.

    Returns:
        float: the total loss for this step.
    """
    model.train()
    optimizer.zero_grad()

    latent = model.encode(x, train_pos_edge_index)
    reconstruction = model.recon_loss(latent, train_pos_edge_index)
    kl_weight = 1 / data.num_nodes
    total_loss = reconstruction + kl_weight * model.kl_loss()

    total_loss.backward()
    optimizer.step()
    return float(total_loss)


def test(pos_edge_index, neg_edge_index):
    """Score the given positive/negative edges with the current model.

    The embeddings are computed without gradients from `x` over the training
    edges (`train_pos_edge_index`), not over the edges being evaluated.

    Returns:
        Whatever `model.test` returns; the training loop unpacks it as
        `(auc, ap)`.
    """
    model.eval()
    with torch.no_grad():
        latent = model.encode(x, train_pos_edge_index)
    metrics = model.test(latent, pos_edge_index, neg_edge_index)
    return metrics

In [8]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
# The run name is built from the configuration so the log directory cannot
# drift out of sync with `out_channels` / `epochs` (it used to say 100 epochs
# while `epochs` was 300).
run_name = f'runs/GAE1_experiment_{out_channels}d_{epochs}_epochs'

# NOTE(review): `data.test_pos_edge_index` / `data.test_neg_edge_index` are not
# among the keys printed for `data` in this notebook; presumably they are meant
# to come from `train_test_split_edges(data)`, which is imported but never
# called in the visible cells -- confirm before running.

# The context manager closes the writer even if training is interrupted, so
# buffered events are flushed to disk.
with SummaryWriter(run_name) as writer:
    for epoch in range(1, epochs + 1):
        loss = train()
        auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

        # The loss comes from the training edges; AUC/AP are computed on the
        # held-out test edges, so they are tagged accordingly.
        writer.add_scalar('loss train', loss, epoch)
        writer.add_scalar('auc test', auc, epoch)
        writer.add_scalar('ap test', ap, epoch)

In [None]:

#%%



# data.edge_index is a 2 x num_edges tensor;
# each column [i, j] means there is an edge from node i to node j.
# data.y holds the node labels.
#   - We are not predicting these labels; they can be added as a feature or ignored.
# data.x is n x d, where n is the number of nodes and d is the number of features.

# how are we using train, val, test split?

# TODO: encode and decode data
# TODO: make anomaly detector for data (missing/new edges, significantly different features)
#       - sort all nodes, most to least likely to be anomaly?

# TODO: 


# might be helpful: https://github.com/Flawless1202/VGAE_pyG/
# also, the VGAE() class
# https://github.com/DaehanKim/vgae_pytorch
# https://antoniolonga.github.io/Pytorch_geometric_tutorials/posts/post6.html
#%%
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges


#%%

# GAN: https://github.com/hwwang55/GraphGAN
# https://arxiv.org/abs/1711.08267
# https://medium.com/@_psycoplankton/graphgan-generative-adversarial-networks-for-graphs-ff4584375a81



def rank_anomalous(original_data, reconstructed_data):
    """Rank nodes from most to least likely to be anomalous.

    The score of a node is the sum of two terms:

    * the Euclidean norm of the difference between its original and
      reconstructed feature rows (`.x`), and
    * the number of directed edges touching it that appear in exactly one of
      the two graphs (edges deleted or inserted by the reconstruction). Each
      such edge adds 1 to both of its endpoints; if an undirected edge is
      stored as two directed columns, it therefore counts twice per endpoint.

    Args:
        original_data: graph object exposing `num_nodes`, `x` (n x d tensor)
            and `edge_index` (2 x num_edges integer tensor).
        reconstructed_data: graph object with the same attributes and the same
            node ordering as `original_data`.

    Returns:
        list[int]: node indices sorted by descending score; ties keep
        ascending index order because the sort is stable.
    """
    num_nodes = original_data.num_nodes

    # Per-node reconstruction error of the feature vectors.
    feature_diff = original_data.x.float() - reconstructed_data.x.float()
    scores = torch.linalg.norm(feature_diff, dim=1).tolist()

    # Directed edges as hashable (source, target) pairs. This runs at Python
    # level and is linear in the number of edges.
    original_edges = {(u, v) for u, v in original_data.edge_index.t().tolist()}
    reconstructed_edges = {(u, v) for u, v in reconstructed_data.edge_index.t().tolist()}

    # Symmetric difference = deleted edges plus inserted edges.
    for u, v in original_edges ^ reconstructed_edges:
        scores[u] += 1
        scores[v] += 1

    return sorted(range(num_nodes), key=lambda i: -scores[i])