<a href="https://colab.research.google.com/github/SheidaEmdadi/ML_Final/blob/main/ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
# Show which PyTorch version is installed in this runtime.
print(torch.__version__)


In [None]:
import numpy as np
import scipy.sparse as sp
import time
import random
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_dense_adj
from pygod.utils import load_data

from sklearn.metrics import  roc_auc_score

from IPython.display import display

import matplotlib as mpl


#Helper Functions

In [None]:
def _aug_random_edge(nb_nodes, edge_index, perturb_percent=0.2, self_loop = True):


    total_edges = edge_index.shape[1]
    avg_degree = int(total_edges/nb_nodes)


    edge_dict = {}
    for i in range(nb_nodes):
        edge_dict[i] = set()

    for edge in edge_index:
        i,j = edge[0],edge[1]
        i = i.item()
        j = j.item()
        edge_dict[i].add(j)
        edge_dict[j].add(i)


    for i in range(nb_nodes):

            d = len(edge_dict[i])

            num_edge_to_drop = int(d * perturb_percent)

            node_list = list(edge_dict[i])
            num_edge_to_drop = min(num_edge_to_drop, d)
            sampled_nodes = random.sample(node_list, num_edge_to_drop)

            for j in sampled_nodes:
                edge_dict[i].discard(j)
                edge_dict[j].discard(i)

    node_list = [i for i in range(nb_nodes)]

    add_list = []
    for i in range(nb_nodes):

        d = len(edge_dict[i])
        num_edge_to_add = int(d * perturb_percent)

        sampled_nodes = random.sample(node_list, num_edge_to_add)
        for j in sampled_nodes:
            add_list.append((i,j))

    if self_loop:
        for i in range(nb_nodes):
            edge_dict[i].add(i)

    updated_edges = set()
    for i in range(nb_nodes):
        for j in edge_dict[i]:
            updated_edges.add((i,j))
            updated_edges.add((j,i))

    row = []
    col = []
    for edge in updated_edges:
        u = edge[0]
        v = edge[1]
        row.append(u)
        col.append(v)

    aug_edge_index = [row,col]
    aug_edge_index = torch.tensor(aug_edge_index)

    return aug_edge_index


def preprocess_features(features):
    """Row-normalise a dense feature matrix.

    Every row is divided by its sum. Rows whose sum is zero are multiplied by
    zero instead, so they come back as all-zero rows rather than inf/NaN.

    Args:
        features: Dense array-like that is 2-D once singleton dimensions are
            squeezed away (e.g. ``[N, D]`` or ``[1, N, D]``). Non-floating
            input is promoted to ``float64``.

    Returns:
        A new ``numpy.ndarray`` of shape ``[N, D]`` with the scaled rows.

    Raises:
        ValueError: If the squeezed input is not 2-D.
    """
    matrix = np.asarray(features).squeeze()
    if matrix.ndim != 2:
        raise ValueError(
            "preprocess_features expects a 2-D feature matrix after squeezing, "
            "got shape %s" % (matrix.shape,)
        )

    # Negative powers are undefined for integer arrays, so promote first.
    if not np.issubdtype(matrix.dtype, np.floating):
        matrix = matrix.astype(np.float64)

    rowsum = matrix.sum(axis=1)

    # 1 / 0 is expected for empty rows; silence the warning and zero the
    # resulting infinities right after.
    with np.errstate(divide="ignore"):
        r_inv = np.power(rowsum, -1.0)
    r_inv[np.isinf(r_inv)] = 0.

    # Scaling row i by r_inv[i] is the same as left-multiplying by diag(r_inv).
    return matrix * r_inv[:, np.newaxis]


#MLP

In [None]:
class MLP(nn.Module):
    """Multi-layer perceptron with ReLU between its linear layers.

    Args:
        num_layers: Number of linear layers (must be >= 1). With one layer the
            model is a single ``input_dim -> output_dim`` linear map.
        input_dim: Size of the last dimension of the input.
        hidden_dim: Width of every hidden layer.
        output_dim: Size of the last dimension of the output.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, num_layers, input_dim, hidden_dim, output_dim):

        super(MLP, self).__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1, got %r" % (num_layers,))

        self.num_layers = num_layers

        self.linears = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        if num_layers == 1:
            # A single layer has no hidden part; map straight to output_dim.
            self.linears.append(nn.Linear(input_dim, output_dim))
        else:
            self.linears.append(nn.Linear(input_dim, hidden_dim))
            for _ in range(num_layers - 2):
                self.linears.append(nn.Linear(hidden_dim, hidden_dim))
            self.linears.append(nn.Linear(hidden_dim, output_dim))

        # NOTE: these batch-norm layers are registered but forward() never
        # applies them. They are kept so the parameter set and state_dict keys
        # stay the same as before.
        for _ in range(num_layers - 1):
            self.batch_norms.append(nn.BatchNorm1d((hidden_dim)))

    def forward(self, x):
        """Apply all but the last linear layer with ReLU, then the last one."""
        h = x
        for linear in self.linears[:-1]:
            h = F.relu(linear(h))
        return self.linears[-1](h)

#GNN Model (GCN)

In [None]:
class GNN(nn.Module):
    """Per-node scoring network: MLP -> GCNConv -> linear -> ReLU -> exp.

    Args:
        in_dim: Size of the last dimension of the input node features.
        out_dim: Width used for the MLP hidden/output layers and for the
            graph convolution.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()

        # Three-layer MLP that maps the raw features to out_dim.
        self.mlp0 = MLP(
            num_layers=3,
            input_dim=in_dim,
            hidden_dim=out_dim,
            output_dim=out_dim,
        )

        self.graphconv1 = GCNConv(out_dim, out_dim, aggr='mean')

        # Projects each node embedding to a single value.
        self.mlp1 = nn.Linear(out_dim, 1)
        # Registered, but forward() does not call it.
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x, edge_index):
        """Return ``exp(relu(.))`` of the projected node embeddings.

        Because the ReLU output is non-negative, every returned value is >= 1.
        """
        embedded = self.mlp0(x)
        propagated = self.graphconv1(embedded, edge_index)
        activated = self.relu(self.mlp1(propagated))
        return torch.exp(activated)



#Datasets

In [None]:
# Per-dataset run settings. Every entry carries the same keys, which the main
# loop reads: dataset name, edge perturbation fraction, RNG seed, number of
# epochs, hidden width, Adam learning rate, Adam weight decay (l2_coef),
# whether to add self loops to the augmented graph, and whether to run
# preprocess_features on the node features.
weibo = {
    "dataset": "weibo",
    "perturb_percent": 0.05,
    "seed": 42,
    "nb_epochs": 200,
    "hidden_dim": 64,
    "lr": 0.1,
    "l2_coef": 100,
    "self_loop": False,
    "preprocess_feat": False,
}

reddit = {
    "dataset": "reddit",
    "perturb_percent": 0.05,
    "seed": 10,
    "nb_epochs": 100,
    "hidden_dim": 16,
    "lr": 0.001,
    "l2_coef": 10.0,
    "self_loop": True,
    "preprocess_feat": True,
}

disney = {
    "dataset": "disney",
    "perturb_percent": 0.05,
    "seed": 42,
    "nb_epochs": 200,
    "hidden_dim": 16,
    "lr": 0.01,
    "l2_coef": 0,
    "self_loop": True,
    "preprocess_feat": True,
}

books = {
    "dataset": "books",
    "perturb_percent": 0.05,
    "seed": 10,
    "nb_epochs": 100,
    "hidden_dim": 16,
    "lr": 0.01,
    "l2_coef": 10,
    "self_loop": True,
    "preprocess_feat": True,
}

#Main

In [None]:
# Train and evaluate one GNN per dataset configuration, in this fixed order.
dataset_name = [reddit, disney, books, weibo]

for config in dataset_name:

    dataset_str = config['dataset']
    perturb_percent = config['perturb_percent']

    # Seed every run from its own configuration. Previously only "disney" was
    # seeded, so the other datasets were not reproducible even though each
    # configuration carries a seed.
    seed = config['seed']
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    nb_epochs = config['nb_epochs']
    lr = config['lr']
    l2_coef = config['l2_coef']
    hidden_dim = config['hidden_dim']
    self_loop = config['self_loop']
    preprocess_feat = config['preprocess_feat']

    print("_" * 100)
    print("dataset: " + dataset_str)
    print("_" * 100)

    # Number of augmented ("neighbour") graphs contrasted against the original.
    num_neigh = 1

    data = load_data(dataset_str)
    edge_index = data.edge_index

    # NOTE(review): adj and anomaly_nodes are not read anywhere in this cell;
    # they are kept in case later cells rely on them -- TODO confirm and drop.
    adj = to_dense_adj(edge_index).squeeze()
    features = data.x
    labels = data.y
    y = labels.bool()

    anomaly_nodes = np.nonzero(y)

    nb_nodes = features.shape[0]
    input_dim = features.shape[1]

    model = GNN(input_dim, hidden_dim)
    optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef)

    if preprocess_feat:
        features = preprocess_features(features)

    start_time = time.time()
    mx_auc = 0

    # The augmented graphs are sampled once and reused in every epoch.
    aug_edge_indexes = []
    for _ in range(num_neigh):
        aug_edge_index = _aug_random_edge(nb_nodes, edge_index, perturb_percent=perturb_percent, self_loop=self_loop)
        aug_edge_indexes.append(aug_edge_index)

    # Add a leading batch dimension: [N, D] -> [1, N, D].
    features = torch.FloatTensor(features[np.newaxis])

    # Per-epoch loss values, stored detached so the autograd graph of every
    # epoch can be freed.
    losses = []

    for epoch in range(nb_epochs):

        model.train()
        optimiser.zero_grad()

        # Scores on the original graph with the original features.
        p_data = model(features, edge_index)
        loss = 0

        for aug_edge_index in aug_edge_indexes:

            # Scores on the augmented graph with node features shuffled
            # across nodes.
            idx = np.random.permutation(nb_nodes)
            shuffle_features = features[:, idx, :]

            p_neigh = model(shuffle_features, aug_edge_index)

            # Ratio r = p_neigh / p_data and its inverse.
            c_theta_j1 = p_neigh / p_data
            c_theta_j2 = p_data / p_neigh

            # Objective: mean(r^2 + 2r) - mean(2 / r).
            j1 = (c_theta_j1**2 + 2 * c_theta_j1).mean()
            j2 = (2 * c_theta_j2).mean()

            neigh_loss = j1 - j2
            neigh_loss = neigh_loss.mean()
            loss += neigh_loss

        loss = loss / num_neigh

        losses.append(loss.detach())

        # AUC of the pre-update scores on the original graph against the labels.
        logits = p_data.squeeze().detach()
        auc_score = roc_auc_score(y_true = y.numpy(), y_score = logits.numpy()) * 100

        print("Epoch: ", epoch, " Loss: ", loss.item(), " AUC Score: ", auc_score)
        mx_auc = max(mx_auc, auc_score)

        loss.backward()
        optimiser.step()

    end_time = time.time()

    print("Maximum AUC: ", mx_auc)
    print("Required Time: ", end_time - start_time)

____________________________________________________________________________________________________
dataset: reddit
____________________________________________________________________________________________________
Epoch:  0  Loss:  1.2658501863479614  AUC Score:  41.99129069411979
Epoch:  1  Loss:  1.2586008310317993  AUC Score:  41.99116203333447
Epoch:  2  Loss:  1.2513861656188965  AUC Score:  41.99127782804125
Epoch:  3  Loss:  1.2442251443862915  AUC Score:  41.99112343509887
Epoch:  4  Loss:  1.2371094226837158  AUC Score:  41.97672629322102
Epoch:  5  Loss:  1.230026364326477  AUC Score:  41.97581280164522
Epoch:  6  Loss:  1.2229855060577393  AUC Score:  41.97522096203272
Epoch:  7  Loss:  1.2160072326660156  AUC Score:  41.975182363797124
Epoch:  8  Loss:  1.2091035842895508  AUC Score:  41.975143765561526
Epoch:  9  Loss:  1.202283501625061  AUC Score:  41.97511803340445
Epoch:  10  Loss:  1.1955606937408447  AUC Score:  41.975118033404456
Epoch:  11  Loss:  1.18894815444