In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix

import args

class LoadData():
    """Load per-sample graph data stored as CSV adjacency matrices.

    Each entry of ``samples`` is a file name that is appended to
    ``file_path`` and read as a CSV whose first column is the row index.
    The remaining cells are used as a weighted adjacency matrix, which the
    methods below assume to be square.

    Every ``get_*`` method re-reads the files and returns a list with one
    item per sample, in the order of ``samples``.
    """

    def __init__(self, file_path, family, samples):
        """Store the data location and the sample file names.

        Args:
            file_path: Prefix prepended verbatim to every sample name
                (include the trailing path separator).
            family: Label kept on the instance; not used by this class.
            samples: Sequence of CSV file names, one per graph.
        """
        self.family = family
        self.samples = samples
        self.file_path = file_path
        self.nSample = len(samples)

    @staticmethod
    def _to_sparse_tensor(coords, values, shape):
        """Build a float32 sparse COO tensor from a (coords, values, shape) triple.

        ``coords`` has one (row, col) pair per row, so it is transposed into
        the 2 x nnz index layout that torch expects.
        """
        return torch.sparse_coo_tensor(torch.LongTensor(coords.T),
                                       torch.FloatTensor(values),
                                       torch.Size(shape))

    def get_adj_wgh(self):
        """Return the raw weighted adjacency matrix of every sample.

        Returns:
            list of 2-D numpy arrays, freshly loaded from disk.
        """
        adj_orig_list = []
        for sample in self.samples:
            f_name = self.file_path + sample
            # np.array always copies, so callers may edit the result in place
            # without touching (possibly read-only) pandas-owned memory.
            adj = np.array(pd.read_csv(f_name, index_col=0))
            adj_orig_list.append(adj)

        return adj_orig_list

    def get_adj_label(self):
        """Return the reconstruction target (binary adjacency + identity).

        Returns:
            list of sparse float tensors, one per sample.
        """
        adj_label_list = []
        for adj in self.get_adj_m():
            adj_label = adj + sp.eye(adj.shape[0])
            adj_label = sparse_to_tuple(sp.coo_matrix(adj_label))
            adj_label_list.append(self._to_sparse_tensor(*adj_label))
        return adj_label_list

    def get_adj_norm(self):
        """Return the normalised adjacency produced by ``preprocess_graph``.

        Returns:
            list of sparse float tensors, one per sample.
        """
        adj_norm_list = []
        for adj in self.get_adj_m():
            adj_norm = preprocess_graph(adj)
            adj_norm_list.append(self._to_sparse_tensor(*adj_norm))

        return adj_norm_list

    def get_adj_m(self):
        """Return the binary, symmetric, zero-diagonal adjacency matrices.

        Positive weights become 1, the diagonal is cleared, and every edge
        (i, j) equal to 1 is mirrored to (j, i). The matrix size is taken
        from the data instead of being fixed.

        Returns:
            list of 2-D numpy arrays, one per sample.
        """
        adj_m_list = []
        for adj in self.get_adj_wgh():
            adj[adj > 0] = 1
            np.fill_diagonal(adj, 0)
            # Mirror every 1-entry onto its transposed position; the mask is
            # evaluated before the assignment, so this is a symmetric OR.
            adj[(adj == 1).T] = 1
            adj_m_list.append(adj)
        return adj_m_list

    def get_feature(self):
        """Return the feature matrix X of every sample.

        The raw weighted adjacency matrix itself is used as the node
        features.

        Returns:
            list of sparse float tensors, one per sample.
        """
        x_list = []
        for adj in self.get_adj_wgh():
            x_feature = sparse_to_tuple(csr_matrix(adj))
            x_list.append(self._to_sparse_tensor(*x_feature))

        return x_list


def sparse_to_tuple(sparse_mx):
    """Split a scipy sparse matrix into a (coords, values, shape) triple.

    Args:
        sparse_mx: A scipy sparse matrix; converted to COO format if it is
            not already in that format.

    Returns:
        coords: Array with one (row, col) pair per stored entry.
        values: The stored entries, in the same order as ``coords``.
        shape: The matrix shape as a tuple.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    index_pairs = np.stack((coo.row, coo.col), axis=1)
    return index_pairs, coo.data, coo.shape

def preprocess_graph(adj):
    """Add self-loops to ``adj`` and scale it by inverse square-root degrees.

    With A_ = adj + I and D the diagonal matrix of the row sums of A_, the
    result is (A_ D^-1/2)^T D^-1/2, which equals D^-1/2 A_ D^-1/2 when
    ``adj`` is symmetric.

    Args:
        adj: Square adjacency matrix (anything ``sp.coo_matrix`` accepts).

    Returns:
        The (coords, values, shape) triple produced by ``sparse_to_tuple``.
    """
    with_self_loops = sp.coo_matrix(adj)
    with_self_loops = with_self_loops + sp.eye(with_self_loops.shape[0])

    degrees = np.array(with_self_loops.sum(1))
    d_inv_sqrt = sp.diags(np.power(degrees, -0.5).flatten())

    column_scaled = with_self_loops.dot(d_inv_sqrt)
    normalized = column_scaled.transpose().dot(d_inv_sqrt).tocoo()

    return sparse_to_tuple(normalized)

from os import walk

def get_filename(mypath):
    """Return the names of the files located directly inside ``mypath``.

    Sub-directories are not descended into. An empty list is returned when
    the directory walk yields nothing.
    """
    _top, _subdirs, top_level_files = next(walk(mypath), (None, None, []))
    return list(top_level_files)

class VGAE(nn.Module):
    """Variational graph auto-encoder with a two-layer graph-conv encoder.

    A shared first layer feeds two linear-activation layers that output the
    mean and the log standard deviation of the latent node embeddings. The
    decoder is ``dot_product_decode`` applied to the sampled embeddings.
    """

    def __init__(self, input_dim, hidden1_dim, hidden2_dim):
        """Create the three encoder layers.

        Args:
            input_dim: Number of input features per node.
            hidden1_dim: Output width of the shared first layer.
            hidden2_dim: Width of the latent embedding (mean / log-std).
        """
        super().__init__()
        self.base_gcn = GraphConvSparse(input_dim, hidden1_dim)
        self.gcn_mean = GraphConvSparse(hidden1_dim, hidden2_dim, activation=lambda x: x)
        self.gcn_logstddev = GraphConvSparse(hidden1_dim, hidden2_dim, activation=lambda x: x)

    def encode(self, x, adj):
        """Sample latent embeddings with the reparameterisation trick.

        Stores the predicted mean and log-std on ``self.mean`` and
        ``self.logstd`` so that callers can compute the KL term afterwards.

        Args:
            x: Node feature matrix.
            adj: Adjacency matrix passed to every graph-conv layer.

        Returns:
            Tensor ``mean + noise * exp(logstd)`` with the shape of ``mean``.
        """
        hidden = self.base_gcn(x, adj)
        self.mean = self.gcn_mean(hidden, adj)
        self.logstd = self.gcn_logstddev(hidden, adj)
        # Size the noise from the layer output itself rather than from a
        # global setting, so it always matches the hidden2_dim this model was
        # built with (and lives on the same device / dtype as the mean).
        gaussian_noise = torch.randn_like(self.mean)
        sampled_z = gaussian_noise * torch.exp(self.logstd) + self.mean
        return sampled_z

    def forward(self, x, adj):
        """Encode the graph and return the decoded edge probabilities."""
        Z = self.encode(x, adj)
        A_pred = dot_product_decode(Z)
        return A_pred

class GraphConvSparse(nn.Module):
    """Single graph-convolution layer: ``activation(adj @ (inputs @ weight))``."""

    def __init__(self, input_dim, output_dim, activation = F.relu, **kwargs):
        """Create the layer.

        Args:
            input_dim: Number of input features.
            output_dim: Number of output features.
            activation: Callable applied to the propagated features.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.activation = activation
        self.weight = glorot_init(input_dim, output_dim)

    def forward(self, inputs, adj):
        """Project the inputs with the weight, propagate over ``adj``, activate."""
        projected = torch.mm(inputs, self.weight)
        propagated = torch.mm(adj, projected)
        return self.activation(propagated)


def dot_product_decode(Z):
    """Return ``sigmoid(Z Z^T)``: a score in (0, 1) for every pair of rows of Z."""
    pairwise_logits = Z @ Z.t()
    return pairwise_logits.sigmoid()

def glorot_init(input_dim, output_dim):
    """Return an ``input_dim x output_dim`` parameter with Glorot-uniform values.

    Entries are drawn uniformly from [-r, r) with
    r = sqrt(6 / (input_dim + output_dim)).
    """
    bound = np.sqrt(6.0 / (input_dim + output_dim))
    unit_uniform = torch.rand(input_dim, output_dim)
    # Rescale [0, 1) samples to [-bound, bound).
    return nn.Parameter(unit_uniform * 2 * bound - bound)



In [40]:
from tqdm import tqdm

# --- Data ------------------------------------------------------------------
# Every file directly inside ./data/ is treated as one sample graph.
samples = get_filename("./data/")
data = LoadData(file_path="./data/", family="EGF", samples=samples)

adj_label_list = data.get_adj_label()
adj_norm_list= data.get_adj_norm()
adj_wgh_list = data.get_adj_wgh()
adj_m = data.get_adj_m()  # NOTE(review): loaded but not used below
x_list = data.get_feature()

if data.nSample == 0:
    # Fail early with a clear message; the per-epoch averaging below would
    # otherwise divide by zero.
    raise RuntimeError("No sample files found in ./data/")

# --- Model and optimizer -----------------------------------------------------
# The input width is read from the features instead of being hard-coded; all
# samples are assumed to share the size of the first one.
# NOTE(review): both hidden sizes are fixed to 1 here; confirm this is
# intended rather than taken from `args`.
model = VGAE(x_list[0].shape[1], 1, 1)
optimizer = Adam(model.parameters(), lr=args.learning_rate)

# --- Per-sample constants ----------------------------------------------------
# Targets, class weights and the loss scale depend only on the data, so they
# are computed once here instead of once per epoch.
label_dense_list = []
weight_tensor_list = []
norm_list = []
for adj_wgh, adj_label in zip(adj_wgh_list, adj_label_list):
    n_entries = adj_wgh.shape[0] * adj_wgh.shape[0]
    # NOTE(review): this is the sum of the raw weights, not the number of
    # edges of the binary adjacency; confirm that is the intended quantity.
    wgh_total = adj_wgh.sum()
    pos_weight = float(n_entries - wgh_total) / wgh_total
    norm = n_entries / float((n_entries - wgh_total) * 2)

    label_dense = adj_label.to_dense().view(-1)
    weight_tensor = torch.ones(label_dense.size(0))
    # Entries whose target is exactly 1 get the positive-class weight.
    weight_tensor[label_dense == 1] = pos_weight

    label_dense_list.append(label_dense)
    weight_tensor_list.append(weight_tensor)
    norm_list.append(norm)

# --- Training ----------------------------------------------------------------
for epoch in range(1000000):
    loss_total = 0
    per_sample = zip(x_list, adj_norm_list, label_dense_list, weight_tensor_list, norm_list)
    for x_feature, adj_norm, label_dense, weight_tensor, norm in per_sample:
        A_pred = model(x_feature, adj_norm)

        # Weighted reconstruction loss against the dense target.
        loss =  norm*F.binary_cross_entropy(A_pred.view(-1), label_dense, weight = weight_tensor)

        # The KL term is subtracted from the loss; model.mean / model.logstd
        # are set by the forward pass just above.
        kl_divergence = 0.5/ A_pred.size(0) * (1 + 2*model.logstd - model.mean**2 - torch.exp(model.logstd)**2).sum(1).mean()
        loss -= kl_divergence
        loss_total += loss

    # One optimizer step per epoch on the mean loss over all samples.
    loss_total = loss_total/data.nSample
    optimizer.zero_grad()
    loss_total.backward()
    optimizer.step()

    # NOTE(review): the factors 5 and 20 only rescale the printed values;
    # confirm what they are meant to represent.
    print("Loss:", 5*loss_total, epoch*20)

Loss: tensor(15.1088, grad_fn=<MulBackward0>) 0
Loss: tensor(13.1328, grad_fn=<MulBackward0>) 20
Loss: tensor(11.9995, grad_fn=<MulBackward0>) 40
Loss: tensor(10.3099, grad_fn=<MulBackward0>) 60
Loss: tensor(9.3911, grad_fn=<MulBackward0>) 80
Loss: tensor(8.3010, grad_fn=<MulBackward0>) 100
Loss: tensor(7.6526, grad_fn=<MulBackward0>) 120
Loss: tensor(7.0333, grad_fn=<MulBackward0>) 140
Loss: tensor(7.1112, grad_fn=<MulBackward0>) 160
Loss: tensor(7.2037, grad_fn=<MulBackward0>) 180
Loss: tensor(7.7984, grad_fn=<MulBackward0>) 200
Loss: tensor(7.5850, grad_fn=<MulBackward0>) 220
Loss: tensor(8.5920, grad_fn=<MulBackward0>) 240
Loss: tensor(8.6071, grad_fn=<MulBackward0>) 260
Loss: tensor(9.3125, grad_fn=<MulBackward0>) 280
Loss: tensor(10.3056, grad_fn=<MulBackward0>) 300
Loss: tensor(10.1636, grad_fn=<MulBackward0>) 320
Loss: tensor(10.7128, grad_fn=<MulBackward0>) 340
Loss: tensor(10.6039, grad_fn=<MulBackward0>) 360
Loss: tensor(9.9830, grad_fn=<MulBackward0>) 380
Loss: tensor(9.989