## Data Preparation

Import the required libraries, and load edge list data into a graph object.

In [1]:
import torch
import torch.nn as nn

import numpy as np
import scipy.sparse as sp

import dgl
from dgl.nn import SAGEConv
import dgl.function as fn

In [2]:
with open('../data/edge.csv', 'r') as f:
    edges = f.readlines()

edges = [edge.strip().split(',') for edge in edges]
categories = edges[0] #['Source', 'Source_name', 'Target', 'Target_name', 'Type', 'Id', 'Label', 'Weight']

# identify source and target nodes and make them tensors
u = torch.tensor([int(edge[0]) for edge in edges[1:]])
v = torch.tensor([int(edge[2]) for edge in edges[1:]])
u_name = [edge[1] for edge in edges[1:]]
v_name = [edge[3] for edge in edges[1:]]
g = dgl.graph((u, v))

# load features
with open('../data/count.csv', 'r') as f:
    count = f.readlines()
count = [c.strip().split(',') for c in count][1:]
count = {c[0]: int(c[1]) for c in count}
count = {f"#{key}": value for key, value in count.items()}
print(count)
exit()

# create a tensor of count values based on the given node IDs
count_tensor = torch.zeros(g.num_nodes())
for i in range(g.num_nodes()):
    print(u_name[i])
    # if node_id in u:
    #     count_tensor[i] = count[]
    # elif node_id in v:
    #     count_tensor[i] = count[v_name[v == node_id][0]]
        
print(count_tensor)     
        
# features = []
# for u, v in zip(u_name, v_name):
#     features.append([count[u], count[v]])
    
# g.ndata['feat'] = 
print(len(torch.eye(g.number_of_nodes())))

{'#explorepage': 9, '#atlantamua': 1, '#studdedeyes': 1, '#makeupartist': 48, '#makeuptutorial': 46, '#bookingavailable': 1, '#glam': 10, '#crayoncase': 1, '#toofaced': 11, '#urbandecay': 13, '#linedlips': 1, '#makeupideas': 18, '#makeuplooks': 17, '#makeupslaves': 1, '#makemoney': 1, '#makeuplovers': 4, '#atlantamakeupartist': 1, '#exploremore': 2, '#makeupoftheday': 11, '#naturalface': 1, '#naturalmakeup': 3, '#naturalmakeuplook': 2, '#viral': 4, '#·¥ç·¥Ä·¥ã·¥á·¥ú·¥ò·¥õ·¥ú·¥õ·¥è Ä…™·¥Ä ü': 1, '#fullglam': 2, '#softgirl': 1, '#softbeat': 2, '#concealer': 2, '#snow': 1, '#toofacedmakeup': 1, '#pakistan': 2, '#Letsbuythat': 2, '#pakistani': 2, '#pakistaniblogger': 2, '#makeup': 74, '#mac': 15, '#smashbox': 2, '#skincare': 13, '#tarte': 3, '#ulta': 2, '#esteelauder': 10, '#colourpop': 10, '#shein': 2, '#forever21': 2, '#morphe': 13, '#thebodyshop': 2, '#makeuplook': 20, '#makeupjunkie': 16, '#instagood': 10, '#makeupblogger': 5, '#karachi': 2, '#islamabad': 3, '#lahore': 2, '#anastasiabe

: 

In [None]:
# g.ndata['feat'] = torch.tensor([int(edge[-1]) for edge in edges[1:]])

## Prepare training, validation and test data
We split the dataset into the three sets with the ratio of 0.70, 0.15 and 0.15 with random sampling.

In [None]:
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)

# split into train, val, test
test_size = val_size = int(len(eids) * 0.15)
train_size = len(eids) - val_size - test_size

test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
val_pos_u, val_pos_v = u[eids[test_size:test_size+val_size]], v[eids[test_size:test_size+val_size]]
train_pos_u, train_pos_v = u[eids[test_size+val_size:]], v[eids[test_size+val_size:]]

# find negative edges by randomly sampling from all possible edges.
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(len(u), len(v)))
adj_neg = 1 - adj.todense() - np.eye(len(eids))
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), len(eids))
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
val_neg_u, val_neg_v = neg_u[neg_eids[test_size:test_size+val_size]], neg_v[neg_eids[test_size:test_size+val_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size+val_size:]], neg_v[neg_eids[test_size+val_size:]]

When training, we need to remove the edges in the test set from the original graph via `dgl.remove_edges`. 

In [None]:
train_g = dgl.remove_edges(g, eids[:test_size])

## Define the model

The model consists of two GraphSAGE layers, each computes new node representations by averaging neighbor information. DGL provides ``dgl.nn.SAGEConv`` that conveniently creates a GraphSAGE layer.

In [None]:
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
    
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

## Create positive and negative edges for training

-  Treat the edges in the graph as *positive examples*.
-  Sample a number of non-existent edges (i.e.¬†node pairs with no edges
   between them) as *negative* examples.
-  Divide the positive examples and negative examples into a training, validation and test set.

In [None]:
# Note: num_nodes was g.number_of_nodes() in the original code
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.number_of_edges())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.number_of_edges())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.number_of_edges())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.number_of_edges())

The following code computes new edge features based on the incident nodes' features and the original edge features (if applicable) via `DGLGraph.apply_edges`.

DGL provides a set of optimized builtin functions to compute new
edge features based on the original node/edge features. For example,
``dgl.function.u_dot_v`` computes a dot product of the incident nodes‚Äô
representations for each edge.

In [None]:
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

## Training loop

After defining the node representation and edge score computation, the training loop can thus be defined. The evaluation metric is the area under the ROC curve (AUC).

In [None]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)