In [1]:
import time
import argparse
from sklearn.model_selection import train_test_split
from model import MODEL
from layers import *
from utlis import *
import numpy as np
import torch

In [3]:
# parser = argparse.ArgumentParser()

# # dataset and model dependent args
# parser.add_argument('--data', type=str, default='amazon', help='The dataset name. [Amazon_demo, Yelp_demo, amazon,yelp]')
# parser.add_argument('--batch-size', type=int, default=100, help='Batch size 1024 for yelp, 256 for amazon.')
# parser.add_argument('--lr', type=float, default=0.1, help='Initial learning rate. [0.1 for amazon and 0.001 for yelp]')
# parser.add_argument('--lambda_1', type=float, default=1e-4, help='Weight decay (L2 loss weight).')
# parser.add_argument('--embed_dim', type=int, default=64, help='Node embedding size at the first layer.')
# parser.add_argument('--num_epochs', type=int, default=61, help='Number of epochs.')
# parser.add_argument('--test_epochs', type=int, default=10, help='Epoch interval to run test set.')
# parser.add_argument('--seed', type=int, default=123, help='Random seed.')
# parser.add_argument('--no_cuda', action='store_true', default=False, help='Disables CUDA training.')

# if(torch.cuda.is_available()):
# 	print("cuda is available")

# args = parser.parse_args()
# args.cuda = not args.no_cuda and torch.cuda.is_available()

# if(args.cuda):
# 	print("runing with GPU")

# print(f'run on {args.data}')

In [3]:
class Args:
    def __init__(self):
        self.data = 'amazon'  # The dataset name. [Amazon_demo, Yelp_demo, amazon, yelp]
        self.batch_size = 100  # Batch size 1024 for yelp, 256 for amazon.
        self.lr = 0.1  # Initial learning rate. [0.1 for amazon and 0.001 for yelp]
        self.lambda_1 = 1e-4  # Weight decay (L2 loss weight).
        self.embed_dim = 64  # Node embedding size at the first layer.
        self.num_epochs = 61  # Number of epochs.
        self.test_epochs = 10  # Epoch interval to run test set.
        self.seed = 123  # Random seed.
        self.no_cuda = False  # Disables CUDA training.

args = Args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

if torch.cuda.is_available():
    print("CUDA is available")

# Access the arguments like this:
print(f"Dataset: {args.data}")
print(f"Batch size: {args.batch_size}")
print(f"Learning rate: {args.lr}")
print(f"Weight decay: {args.lambda_1}")
print(f"Embedding dimension: {args.embed_dim}")
print(f"Number of epochs: {args.num_epochs}")
print(f"Test epochs: {args.test_epochs}")
print(f"Random seed: {args.seed}")
print(f"CUDA enabled: {args.cuda}")


Dataset: amazon
Batch size: 100
Learning rate: 0.1
Weight decay: 0.0001
Embedding dimension: 64
Number of epochs: 61
Test epochs: 10
Random seed: 123
CUDA enabled: False


In [4]:
# load topology, feature, and label
homo, relation1, relation2, relation3, feat_data, labels = load_data(args.data)

# set seed
np.random.seed(args.seed)
random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)


# train_test split
if args.data == 'yelp':

	index = list(range(len(labels)))
	idx_train, idx_test, y_train, y_test = train_test_split(index, labels, stratify = labels, test_size = 0.80,
															random_state = 2, shuffle = True)

	# set prior
	num_1= len(np.where(y_train==1)[0])
	num_2= len(np.where(y_train==0)[0])
	p0 = (num_1/(num_1+num_2))
	p1 = 1- p0
	prior = np.array([p1, p0])

	if args.cuda:
		prior = (torch.from_numpy(prior +1e-8)).cuda()
	else:
		prior = (torch.from_numpy(prior +1e-8))

elif args.data == 'amazon':

	# 0-3304 are unlabeled nodes
	index = list(range(3305, len(labels)))
	idx_train, idx_test, y_train, y_test = train_test_split(index, labels[3305:], stratify = labels[3305:],
															test_size = 0.90, random_state = 2, shuffle = True)

	num_1 = len(np.where(y_train == 1)[0])
	num_2 = len(np.where(y_train == 0)[0])
	p0 = (num_1 / (num_1 + num_2))
	p1 = 1 - p0
	prior = np.array([p1, p0])
	if args.cuda:
		prior = (torch.from_numpy(prior +1e-8)).cuda()
	else:
		prior = (torch.from_numpy(prior +1e-8))
	#prior = np.array([0.9, 0.1])

In [5]:
# initialize model input
features = nn.Embedding(feat_data.shape[0], feat_data.shape[1])
feat_data = normalize(feat_data) 
features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad = False)
if args.cuda:
	features.cuda()

# set input graph topology
adj_lists = [relation1, relation2, relation3]


In [6]:
# build model

# the first neural network layer (ego-feature embedding module)
mlp = MLP_(features, feat_data.shape[1], args.embed_dim, cuda = args.cuda)

#first convolution layer
intra1_1 = IntraAgg(cuda = args.cuda)
intra1_2 = IntraAgg(cuda = args.cuda)
intra1_3 = IntraAgg(cuda = args.cuda)
agg1 = InterAgg(lambda nodes: mlp(nodes), args.embed_dim, adj_lists, [intra1_1, intra1_2, intra1_3], cuda = args.cuda)


#second convolution layer
intra2_1 = IntraAgg(cuda = args.cuda)
intra2_2 = IntraAgg(cuda = args.cuda)
intra2_3 = IntraAgg(cuda = args.cuda)

#def __init__(self, features, embed_dim, adj_lists, intraggs, cuda = False):
agg2 = InterAgg(lambda nodes: agg1(nodes), args.embed_dim*2, adj_lists, [intra2_1, intra2_2, intra2_3], cuda = args.cuda)
gnn_model = MODEL(2, 2, args.embed_dim, agg2, prior)
# gnn_model in one convolution layer
#gnn_model = MODEL(1, 2, args.embed_dim, agg1, prior, cuda = args.cuda)

In [7]:
if args.cuda:
	gnn_model.cuda()

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, gnn_model.parameters()), lr=args.lr, weight_decay=args.lambda_1)
performance_log = []

# train the model

overall_time = 0
for epoch in range(args.num_epochs):

	# gnn_model.train()
	# shuffle
	random.shuffle(idx_train)
	num_batches = int(len(idx_train) / args.batch_size) +1

	loss = 0.0
	epoch_time = 0

	#mini-batch training
	for batch in range(num_batches):

		print(f'Epoch: {epoch}, batch: {batch}')

		i_start = batch * args.batch_size
		i_end = min((batch + 1) * args.batch_size, len(idx_train))

		batch_nodes = idx_train[i_start:i_end]

		batch_label = labels[np.array(batch_nodes)]

		optimizer.zero_grad()

		start_time = time.time()

		if args.cuda:
			loss = gnn_model.loss(batch_nodes, Variable(torch.cuda.LongTensor(batch_label)))
		else:
			loss = gnn_model.loss(batch_nodes, Variable(torch.LongTensor(batch_label)))

		end_time = time.time()

		epoch_time += end_time - start_time

		loss.backward()
		optimizer.step()
		loss += loss.item()

	print(f'Epoch: {epoch}, loss: {loss.item() / num_batches}, time: {epoch_time}s')
	overall_time += epoch_time


Epoch: 0, batch: 0
Epoch: 0, batch: 1
Epoch: 0, batch: 2
Epoch: 0, batch: 3
Epoch: 0, batch: 4
Epoch: 0, batch: 5
Epoch: 0, batch: 6
Epoch: 0, batch: 7
Epoch: 0, batch: 8
Epoch: 0, loss: 0.14122858996600757, time: 2.466630458831787s
GNN auc: 0.8921
GNN precision: 0.9519
GNN a_precision: 0.7695
GNN Recall: 0.7357
GNN f1: 0.8027
Epoch: 1, batch: 0
Epoch: 1, batch: 1
Epoch: 1, batch: 2
Epoch: 1, batch: 3
Epoch: 1, batch: 4
Epoch: 1, batch: 5
Epoch: 1, batch: 6
Epoch: 1, batch: 7
Epoch: 1, batch: 8
Epoch: 1, loss: 0.10129061832607324, time: 3.1055986881256104s
Epoch: 2, batch: 0
Epoch: 2, batch: 1
Epoch: 2, batch: 2
Epoch: 2, batch: 3
Epoch: 2, batch: 4
Epoch: 2, batch: 5
Epoch: 2, batch: 6
Epoch: 2, batch: 7
Epoch: 2, batch: 8
Epoch: 2, loss: 0.06852556800007704, time: 2.7235987186431885s
Epoch: 3, batch: 0
Epoch: 3, batch: 1
Epoch: 3, batch: 2
Epoch: 3, batch: 3
Epoch: 3, batch: 4
Epoch: 3, batch: 5
Epoch: 3, batch: 6
Epoch: 3, batch: 7
Epoch: 3, batch: 8
Epoch: 3, loss: 0.03152889068511

In [8]:
#testing the model for every $test_epoch$ epoch
if epoch % args.test_epochs == 0:

		#gnn_model.eval()
		auc, precision, a_p, recall, f1 = test_model(idx_test, y_test, gnn_model)
		performance_log.append([auc, precision, a_p, recall, f1])

print("The training time per epoch")
print(overall_time/args.num_epochs)

GNN auc: 0.9280
GNN precision: 0.8836
GNN a_precision: 0.8317
GNN Recall: 0.8873
GNN f1: 0.8854
The training time per epoch
2.7540428130353085
