# MC-SVD Procedure - Triad Prediction

In [11]:
import torch
import torch.nn as nn
import numpy as np
from torch.nn import init
from random import shuffle, randint
import torch.nn.functional as F
from torch_geometric.datasets import Reddit, PPI, Planetoid
from itertools import combinations, combinations_with_replacement
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
import pickle

## Define the dataset, the type of prediction and the number of samples

In [2]:
DATASET = 'cora'
PREDICTION = 'triad'
RUN_COUNT = 1
NUM_SAMPLES = 1
PATH_TO_DATASETS_DIRECTORY = './'

In [3]:
datasets = {
    'reddit': Reddit(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/Reddit'),
    'cora' : Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/Cora/', name='Cora'),
    'citeseer' : Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/CiteSeer/', name='CiteSeer'),
    'pubmed' : Planetoid(root=PATH_TO_DATASETS_DIRECTORY + '/datasets/PubMed/', name='PubMed'),
}
dataset = datasets[DATASET]
data = dataset[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/reddit.zip
Extracting datasets/Reddit/raw/reddit.zip
Processing...
Done!
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimi

In [4]:
predictions = {
    'node' : dataset.num_classes,
    'link' : 2,
    'triad' : 4,
}

In [12]:
dataset_types = ['train', 'validation', 'test']
triads_stores = dict()
triad_loc = '/scratch-ml00/bsriniv/pos2struct/triad_store/'

for set_nature in dataset_types :
    zero_filename = triad_loc + set_nature + '/' + DATASET + '_triad_zero.pickle'
    one_filename = triad_loc + set_nature + '/' + DATASET + '_triad_one.pickle'
    two_filename = triad_loc + set_nature + '/' + DATASET + '_triad_two.pickle'
    three_filename = triad_loc + set_nature + '/' + DATASET + '_triad_three.pickle'
    with open(zero_filename, 'rb') as f:
        zeros = pickle.load(f)
    with open(one_filename, 'rb') as f:
        ones = pickle.load(f)
    with open(two_filename, 'rb') as f:
        twos = pickle.load(f)
    with open(three_filename, 'rb') as f:
        threes = pickle.load(f)
    triads_stores[set_nature] = dict()
    triads_stores[set_nature]['zeros'] = zeros
    triads_stores[set_nature]['ones'] = ones
    triads_stores[set_nature]['twos'] = twos
    triads_stores[set_nature]['threes'] = threes

In [5]:
print("Printing Dataset Characteristics")
print("Name: ", DATASET)
print("Total Number of Nodes: ", data.num_nodes)
print("Total Number of Training Nodes: ", data.train_mask.sum().item())
print("Total Number of Val Nodes: ", data.val_mask.sum().item())
print("Total Number of Test Nodes: ", data.test_mask.sum().item())
print("Num Node Features: ", data.num_features)
print("Num Node Classes: ", dataset.num_classes)
print("Number of Edges: ", data.edge_index.shape[1])
print("Number of Samples for structural: ", NUM_SAMPLES)
print("Prediction Type: ", PREDICTION)

Printing Dataset Characteristics
Name:  cora
Total Number of Nodes:  2708
Total Number of Training Nodes:  140
Total Number of Val Nodes:  500
Total Number of Test Nodes:  1000
Num Node Features:  1433
Num Node Classes:  7
Number of Edges:  10556
Number of Samples for structural:  1
Prediction Type:  triad


In [6]:
data.train_mask = 1 - data.val_mask - data.test_mask

adj_mat = torch.zeros((data.num_nodes,data.num_nodes))
edges = data.edge_index.t()
adj_mat[edges[:,0], edges[:,1]] = 1

## Build the non-overlapping induced subgraphs

In [7]:
adj_train = adj_mat[data.train_mask].t()[data.train_mask].t()
adj_validation = adj_mat[data.val_mask].t()[data.val_mask].t()
adj_test = adj_mat[data.test_mask].t()[data.test_mask].t()



## Corrupt a small fraction of the edges

In [8]:
def corrupt_adj(adj_mat, task, percent=2):
    """ Returns the corrupted version of the adjacency matrix """
    if task == 'link':
        edges = adj_mat.triu().nonzero()
        num_edges = edges.shape[0]
        num_to_corrupt = int(percent/100.0 * num_edges)
        random_corruption = np.random.randint(num_edges, size=num_to_corrupt)
        adj_mat_corrupted = adj_mat.clone()
        false_edges, false_non_edges = [], []
        #Edge Corruption
        for ed in edges[random_corruption]:
            adj_mat_corrupted[ed[0], ed[1]] = 0
            adj_mat_corrupted[ed[1], ed[0]] = 0
            false_non_edges.append(ed.tolist())
        #Non Edge Corruption
        random_non_edge_corruption = list(np.random.randint(adj_mat.shape[0], size = 6*num_to_corrupt))
        non_edge_to_corrupt = []
        for k in range(len(random_non_edge_corruption)-1):
            to_check = [random_non_edge_corruption[k], random_non_edge_corruption[k+1]]
            if to_check not in edges.tolist():
                non_edge_to_corrupt.append(to_check)
            if len(non_edge_to_corrupt) == num_to_corrupt:
                break
        non_edge_to_corrupt = torch.Tensor(non_edge_to_corrupt).type(torch.int16)
        for n_ed in non_edge_to_corrupt:
            adj_mat_corrupted[n_ed[0], n_ed[1]] = 1
            adj_mat_corrupted[n_ed[1], n_ed[0]] = 1
            false_edges.append(n_ed.tolist())
    return adj_mat_corrupted, false_edges, false_non_edges


In [9]:
adj_train_corrupted, train_false_edges, train_false_non_edges = corrupt_adj(adj_train, 'link', percent=2)
adj_val_corrupted, val_false_edges, val_false_non_edges = corrupt_adj(adj_validation, 'link', percent=2)
adj_test_corrupted, test_false_edges, test_false_non_edges  = corrupt_adj(adj_test, 'link', percent=2)

## Define the Supervised Learning Network

In [13]:
num_neurons = 256
input_rep = num_neurons + data.num_features

class StructMLP(nn.Module):
    def __init__(self, node_set_size=1):
        super(StructMLP, self).__init__()

        self.node_set_size = node_set_size
        #Deepsets MLP

        self.ds_layer_1 = nn.Linear(input_rep, num_neurons)
        self.ds_layer_2 = nn.Linear(num_neurons, num_neurons)
        self.rho_layer_1 = nn.Linear(num_neurons, num_neurons)
        self.rho_layer_2 = nn.Linear(num_neurons, num_neurons)

        #One Hidden Layer
        self.layer1 = nn.Linear(num_neurons, num_neurons)
        self.layer2 = nn.Linear(num_neurons, predictions[PREDICTION])
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_tensor, samples):
        #Deepsets initially on each of the samples
        num_nodes = input_tensor.shape[1]
        sum_tensor = torch.zeros(samples.shape[0], num_neurons).to(device)
        for i in range(input_tensor.shape[0]):
            #Process the input tensor to form n choose k combinations and create a zero tensor
            set_init_rep = input_tensor[i].view(-1, input_rep)
            x = self.ds_layer_1(set_init_rep)
            x = self.relu(x)
            x = self.ds_layer_2(x)
            x = x[samples]
            x = torch.sum(x, dim=1)
            x = self.rho_layer_1(x)
            sum_tensor += x

        x = sum_tensor / input_tensor.shape[0]

        #One Hidden Layer for predictor
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

    def compute_loss(self, input_tensor, samples, target):
        pred = self.forward(input_tensor, samples)
        return F.cross_entropy(pred, target)

In [14]:
if PREDICTION == 'node':
    node_set_size = 1
elif PREDICTION == 'link':
    node_set_size = 2
else:
    node_set_size = 3

mlp = StructMLP(node_set_size).to(device)
mlp_optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)
mlp_model = 'best_mlp_model.model'

## Training the Supervised Learning Network

In [15]:
def sample_triads(set_nature, small_samples=100):
    zeros_shape = triads_stores[set_nature]['zeros'].shape[0]
    ones_shape = triads_stores[set_nature]['ones'].shape[0]
    twos_shape = triads_stores[set_nature]['twos'].shape[0]
    threes_shape = triads_stores[set_nature]['threes'].shape[0]

    zeros = triads_stores[set_nature]['zeros'][np.random.randint(zeros_shape, size=min(small_samples, zeros_shape))]
    ones = triads_stores[set_nature]['ones'][np.random.randint(ones_shape, size=min(small_samples, ones_shape))]
    twos = triads_stores[set_nature]['twos'][np.random.randint(twos_shape, size=min(small_samples, twos_shape))]
    threes = triads_stores[set_nature]['threes'][np.random.randint(threes_shape, size=min(small_samples, threes_shape))]

    target_zeros = torch.zeros(zeros.shape[0])
    target_ones = torch.ones(ones.shape[0])
    target_twos = 2.0 * torch.ones(twos.shape[0])
    target_threes = 3.0 * torch.ones(threes.shape[0])

    out = torch.cat((zeros, ones, twos, threes), dim=0).view(-1,3).type(torch.long)
    target = torch.cat((target_zeros, target_ones, target_twos, target_threes), dim=0).type(torch.long)
    return out.to(device), target.to(device)

In [16]:
epochs = 50
validation_loss = 10000.0
small_samples = 200
for num_epoch in range(epochs):
    mlp_optimizer.zero_grad()
    numbers = list(np.random.randint(500, size=NUM_SAMPLES))
    hidden_samples_train = []
    for number in numbers :
        svd = TruncatedSVD(n_components=256, n_iter=10, random_state=number)
        u_train = svd.fit_transform(adj_train_corrupted)
        hidden_samples_train.append(torch.Tensor(u_train).to(device))
    for i in range(NUM_SAMPLES):
        hidden_samples_train[i] = torch.cat((hidden_samples_train[i].to(device), data.x[data.train_mask].to(device)),1)
    input_ = torch.stack(hidden_samples_train)
    input_ = input_.detach()
    sampled, target = sample_triads('train', small_samples=small_samples)
    loss = mlp.compute_loss(input_, sampled, target=target)
    print("Training Loss: ", loss.item())
    with torch.no_grad():
        #Do Validation and check if validation loss has gone down
        numbers = list(np.random.randint(500, size=NUM_SAMPLES))
        hidden_samples_validation = []
        for number in numbers :
            svd = TruncatedSVD(n_components=256, n_iter=10, random_state=number)
            u_validation = svd.fit_transform(adj_val_corrupted)
            hidden_samples_validation.append(torch.Tensor(u_validation).to(device))
        for i in range(NUM_SAMPLES):
            hidden_samples_validation[i] = torch.cat((hidden_samples_validation[i].to(device), data.x[data.val_mask].to(device)),1)
        input_val = torch.stack(hidden_samples_validation)
        input_val = input_val.detach()
        sampled, target = sample_triads('validation', small_samples=small_samples)
        compute_val_loss = mlp.compute_loss(input_val, sampled, target=target)
        if compute_val_loss < validation_loss:
            validation_loss = compute_val_loss
            print("Validation Loss: ", validation_loss)
            #Save Model
            torch.save(mlp.state_dict(), mlp_model)
    loss.backward()
    mlp_optimizer.step()



Training Loss:  1.3859809637069702




Validation Loss:  tensor(1.3738, device='cuda:0')




Training Loss:  1.3751635551452637




Validation Loss:  tensor(1.3661, device='cuda:0')




Training Loss:  1.3576452732086182




Validation Loss:  tensor(1.3658, device='cuda:0')




Training Loss:  1.3349465131759644




Validation Loss:  tensor(1.3646, device='cuda:0')




Training Loss:  1.290409803390503




Validation Loss:  tensor(1.3594, device='cuda:0')




Training Loss:  1.2478160858154297




Validation Loss:  tensor(1.3531, device='cuda:0')




Training Loss:  1.194199800491333




Validation Loss:  tensor(1.3438, device='cuda:0')




Training Loss:  1.175276279449463




Training Loss:  1.1658800840377808




Training Loss:  1.1363428831100464




Training Loss:  1.0785000324249268




Training Loss:  1.065596580505371




Training Loss:  0.9999157190322876




Validation Loss:  tensor(1.3152, device='cuda:0')




Training Loss:  0.9728954434394836




Validation Loss:  tensor(1.2541, device='cuda:0')




Training Loss:  0.9094236493110657




Validation Loss:  tensor(1.2224, device='cuda:0')




Training Loss:  0.845712423324585




Training Loss:  0.9091130495071411




Training Loss:  0.8864877223968506




Training Loss:  0.8747356534004211




Training Loss:  0.8469822406768799




Training Loss:  0.858034610748291




Training Loss:  0.8680769801139832




Training Loss:  0.8360507488250732




Training Loss:  0.8214761018753052




Training Loss:  0.7636245489120483




Training Loss:  0.7735505700111389




Training Loss:  0.7947033047676086




Training Loss:  0.8162539601325989




Training Loss:  0.7739742398262024




Training Loss:  0.7652398943901062




Training Loss:  0.7437338829040527




Training Loss:  0.7433636784553528




Training Loss:  0.7323870062828064




Training Loss:  0.7431440949440002




Training Loss:  0.7561774849891663




Training Loss:  0.758399248123169




Training Loss:  0.7340292930603027




Training Loss:  0.7091309428215027




Training Loss:  0.7572685480117798




Training Loss:  0.7668723464012146




Training Loss:  0.7478084564208984




Training Loss:  0.7384220361709595




Training Loss:  0.7202812433242798




Training Loss:  0.6992852687835693




Training Loss:  0.7470446228981018




Training Loss:  0.70884108543396




Training Loss:  0.6916447877883911




Training Loss:  0.6721842885017395




Training Loss:  0.710475504398346




Training Loss:  0.7115380764007568




## Load the best model

In [17]:
mlp = StructMLP(node_set_size).to(device)
mlp.load_state_dict(torch.load(mlp_model))

<All keys matched successfully>

## Forward pass on the test graphs

In [22]:
numbers = list(np.random.randint(500, size=NUM_SAMPLES))
hidden_samples_test = []
for number in numbers :
    svd = TruncatedSVD(n_components=256, n_iter=10, random_state=number)
    u_test = svd.fit_transform(adj_test_corrupted)
    hidden_samples_test.append(torch.Tensor(u_test).to(device))
for i in range(NUM_SAMPLES):
    hidden_samples_test[i] = torch.cat((hidden_samples_test[i].to(device), data.x[data.test_mask].to(device)),1)

    
small_samples = 200
sampled_test, target_test = sample_triads('test', small_samples)


t_test = target_test.to("cpu").numpy()
input_test = torch.stack(hidden_samples_test)
input_test = input_test.detach()

with torch.no_grad():
    test_pred = mlp.forward(input_test, sampled_test)
    pred = F.log_softmax(test_pred, dim=1)
pred = pred.detach().to("cpu").numpy()
pred = np.argmax(pred, axis=1)



## Test results

In [23]:
print("Test Micro F1 Score: ", f1_score(t_test, pred, average='micro'))
print("Test Weighted F1 Score: ", f1_score(t_test, pred, average='weighted'))
print("Test Accuracy Score: ", accuracy_score(t_test, pred))

Test Micro F1 Score:  0.3617021276595745
Test Weighted F1 Score:  0.28886531366825613
Test Accuracy Score:  0.3617021276595745
