In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from google.colab import drive
drive.mount('/content/drive')
device='cuda' if torch.cuda.is_available() else 'cpu'
dataset=118
x=np.load('/content/drive/MyDrive/gnn/data/data_%d_linkpred/%d_linkpred_x.npy'%(dataset,dataset)).transpose()
y=np.load('/content/drive/MyDrive/gnn/data/data_%d_linkpred/%d_linkpred_y.npy'%(dataset,dataset)).transpose()
y[y==2]=1
w=np.load('/content/drive/MyDrive/gnn/data/data_%d_linkpred/%d_linkpred_w.npy'%(dataset,dataset)).transpose()
print(torch.cuda.get_device_name(0))

Mounted at /content/drive
Tesla P100-PCIE-16GB


In [2]:
# sort activity
activity=np.sum(y,axis=0)
edges=np.argsort(activity)[::-1]
edge=edges[:10]
y=y[:,edge]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=18)
print('Training data size:',x_train.shape)
print('Training label size:',y_train.shape)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, features, labels, device='cpu'):
        self.features=torch.from_numpy(features).float()
        self.labels=torch.from_numpy(labels).float()
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        if torch.is_tensor(idx): idx = idx.tolist()
        # Select sample
        X=self.features[idx]
        y=self.labels[idx]
        X=torch.reshape(X,(x.shape[1],dataset))
        return X,y
params = {'batch_size': 128,
          'shuffle': True,
          'num_workers': 2}
# Dataset Generators
training_set = Dataset(features=x_train,labels=y_train,device=device)
training_generator = torch.utils.data.DataLoader(training_set,**params)
validation_set = Dataset(features=x_test,labels=y_test,device=device)
validation_generator = torch.utils.data.DataLoader(validation_set,**params)

Training data size: (8000, 4, 118)
Training label size: (8000, 10)


In [3]:
W1 = w.copy()
W1 = np.asarray(W1)
W1 = W1 / 1
W2 = W1.copy()
print(type(w))
lam,v = np.linalg.eig(W1)
print(lam[0])
for i in range(2):
  W1 = np.matmul(W1,W2) / np.max(np.matmul(W1,W2))
  lam,v = np.linalg.eig(W1)
  print(lam[0])

<class 'numpy.ndarray'>
10.391198194095473
1.1997444434330236
1.219576528758643


In [4]:
class v2v(nn.Module):
    def __init__(self,in_feats,out_feats,w,n,bias=True):
        super(v2v,self).__init__()
        self.register_buffer('w',torch.from_numpy(w).float())
        self.register_buffer('n0',torch.tensor(n))
        # self.mapping=nn.Parameter(torch.Tensor(n,W.shape[0],W.shape[1]))
        # torch.nn.init.xavier_uniform_(self.mapping.data)
        self.scale0=nn.Parameter(torch.Tensor(out_feats,in_feats))
        torch.nn.init.xavier_uniform_(self.scale0.data)
        self.scale=nn.Parameter(torch.Tensor(n-1,out_feats,out_feats))
        torch.nn.init.xavier_uniform_(self.scale.data)
        self.bias=nn.Parameter(torch.Tensor(out_feats,1))
        torch.nn.init.xavier_uniform_(self.bias.data)

    def forward(self,input):
        # h=torch.mul(self.mapping[0,:,:],self.w) # element-wise
        # print(h.shape,input.shape)
        h=torch.matmul(input,self.w) # transpose due to batch generator
        h=torch.matmul(self.scale0,h)
        for i in range(self.n0 - 1):
          W1 = torch.matrix_power(self.w,i+2)/torch.max(torch.matrix_power(self.w,i+2))
          h1=torch.matmul(h,W1) # transpose due to batch generator 
          h=torch.matmul(self.scale[i,:,:],h)
        return h+self.bias 

# GNN using DGL v2v graph convolution and our own v2e graph convolution
class GCN(nn.Module):
    def __init__(self, in_feats, hidden_size, W, num_bus, n_hop, fc_params):
        super(GCN, self).__init__()
        self.v2v1=v2v(in_feats,hidden_size[0],W,n_hop)
        self.v2v2=v2v(hidden_size[0],hidden_size[1],W,n_hop)
        self.v2v3=v2v(hidden_size[1],hidden_size[2],W,n_hop)
        self.v2v4=v2v(hidden_size[2],hidden_size[3],W,n_hop)
        self.v2v5=v2v(hidden_size[3],hidden_size[4],W,n_hop)
        self.v2v6=v2v(hidden_size[4],hidden_size[5],W,n_hop)
        self.linear1=nn.Linear(num_bus*hidden_size[-1],fc_params[0])
        self.linear2=nn.Linear(fc_params[0],2*fc_params[1])
         
    def forward(self, inputs):
        h=self.v2v1(inputs)
        h=torch.relu(h)
        h=self.v2v2(h)
        h=torch.relu(h)
        h=self.v2v3(h)
        h=torch.relu(h)
        h=self.v2v4(h)
        h=torch.relu(h)
        h=self.v2v5(h)
        h=torch.relu(h)
        h=self.v2v6(h)
        h=torch.reshape(h,(h.shape[0],-1))
        h=self.linear1(h)
        h=torch.relu(h)
        h=self.linear2(h)
        h=torch.reshape(h,(h.shape[0],2,-1))
        return h
n_bus=x.shape[2]
w_params=[10,50,50,50,50,10]
n_hop=4
fc_params=[2000,y_train.shape[1]]
# W = W / 10 # normalize
net=GCN(x.shape[1],w_params,w,n_bus,n_hop,fc_params) # Laplacian
net=net.to(device)

# set loss_func & optimizer
optimizer=torch.optim.Adam(net.parameters(),weight_decay=0.01)
loss_optm=[]
loss_val=[]
print(net)
print('number of params: %d'%(sum(temp.numel() for temp in net.parameters() if temp.requires_grad)))

GCN(
  (v2v1): v2v()
  (v2v2): v2v()
  (v2v3): v2v()
  (v2v4): v2v()
  (v2v5): v2v()
  (v2v6): v2v()
  (linear1): Linear(in_features=1180, out_features=2000, bias=True)
  (linear2): Linear(in_features=2000, out_features=20, bias=True)
)
number of params: 2441380


In [5]:
## Training
t0=time.time()
max_epochs=20
eval_epoch=5

# earlystopping
tolerance=4
min_delta=5e-4
previous=0

W_tensor = torch.from_numpy(w).float().to(device)
my_loss_func=nn.CrossEntropyLoss()
for epoch in range(max_epochs):
  # training loop
  train_loss=0.0
  for local_batch,local_label in training_generator:
    optimizer.zero_grad() # clear the past gradient
    local_batch,local_label=local_batch.to(device),local_label.to(device)
    logits=net(local_batch)
    loss=my_loss_func(logits,local_label.long())
    loss.backward()
    train_loss+=loss.item()
    optimizer.step() # update parameters of net
  loss_optm.append(train_loss/len(training_generator.dataset))
  print("Epoch %d | Training loss: %.8f"%(epoch,train_loss/len(training_generator.dataset)))
  # eval
  if (epoch+1)%eval_epoch==0:
    net.eval()
    eval_loss=0.0
    for eval_batch,eval_label in validation_generator:
      eval_batch,eval_label=eval_batch.to(device),eval_label.to(device)
      logits=net(eval_batch)
      loss=my_loss_func(logits,eval_label.long())
      eval_loss+=loss.item()
    eval_avg=eval_loss/len(validation_generator.dataset)
    if (epoch==0): previous=eval_avg
    else:
      if previous-eval_avg<min_delta: tolerance-=1
      if tolerance==0: break
      previous=eval_avg
    print("Epoch %d | Eval loss: %.8f" % (epoch, eval_avg))
    loss_val.append([epoch, eval_loss/len(validation_generator.dataset)])
    net.train()
t1=time.time()
print("Training time:%.4fs"%(t1-t0))
path='/content/drive/MyDrive/gnn/linkpred/%d_gnn.pickle'%(dataset)
torch.save(net.state_dict(),path)

Epoch 0 | Training loss: 0.00068057
Epoch 1 | Training loss: 0.00055332
Epoch 2 | Training loss: 0.00055311
Epoch 3 | Training loss: 0.00055273
Epoch 4 | Training loss: 0.00055197
Epoch 4 | Eval loss: 0.00053783
Epoch 5 | Training loss: 0.00055227
Epoch 6 | Training loss: 0.00055066
Epoch 7 | Training loss: 0.00055068
Epoch 8 | Training loss: 0.00055259
Epoch 9 | Training loss: 0.00055085
Epoch 9 | Eval loss: 0.00053177
Epoch 10 | Training loss: 0.00055380
Epoch 11 | Training loss: 0.00055403
Epoch 12 | Training loss: 0.00055098
Epoch 13 | Training loss: 0.00055498
Epoch 14 | Training loss: 0.00055355
Epoch 14 | Eval loss: 0.00053447
Epoch 15 | Training loss: 0.00055138
Epoch 16 | Training loss: 0.00055050
Epoch 17 | Training loss: 0.00055411
Epoch 18 | Training loss: 0.00055151
Epoch 19 | Training loss: 0.00055363
Training time:18.5245s


In [6]:
net.load_state_dict(torch.load(path))
# validate on test set
net.eval()
x_test_feed = torch.from_numpy(x_test).float()
x_test_feed = x_test_feed.to(device)
y_pred = net(x_test_feed)
y_pred=torch.argmax(y_pred,dim=1)
y_pred1 = y_pred.cpu().detach()
y_pred1 = y_pred1.numpy().transpose()
y_test=y_test.transpose()
print('Validation dataset size:',x_test_feed.shape)
print(y_pred.shape)
print(y_test.shape)

Validation dataset size: torch.Size([2000, 4, 118])
torch.Size([2000, 10])
(10, 2000)


In [9]:
y_diff = np.abs(y_test - y_pred1)
print(np.sum(y_diff)/np.sum(y_test))
# print(y_pred1,y_test)
print(np.sum(y_pred1),np.sum(y_test))
print('--')
print('positive accuracy:',np.sum(y_pred1==y_test)/y_test.shape[0]/y_test.shape[1])
print('--')
for edge in range(10):
  print(f1_score(y_pred1[edge,:],y_test[edge,:]))
print('--')
print(f1_score(y_pred1.reshape(-1,),y_test.reshape(-1,)))

0.07456445993031359
4000 4305.0
--
positive accuracy: 0.98395
--
1.0
0.9979959919839679
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
--
0.9613485851896448


  average, "true nor predicted", 'F-score is', len(true_sum)
