In [37]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, GATConv, GINConv, MLP
import torch_geometric.transforms as T
from sklearn.model_selection import train_test_split

import os 

cwd = os.getcwd()
cwd = os.path.join(cwd, 'data')

dataset = Planetoid(root=cwd, name='Cora')
dataset1 = Planetoid(root=cwd, name='CiteSeer')
dataset2 = Planetoid(root = cwd, name= 'PubMed')

datasets = [dataset, dataset1, dataset2]


In [38]:

# class GCN(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels):
#         super().__init__()
#         # Pre-process normalization to avoid CPU communication/graph breaks:
#         self.conv1 = GCNConv(in_channels, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, out_channels)

#     def forward(self, x, edge_index):
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv1(x, edge_index)
#         x = x.relu()
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index)
#         return x

class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        return x

    
class GIN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super(GIN, self).__init__()
        self.convs = torch.nn.ModuleList()
        for _ in range(2):
            mlp = MLP([num_features, hidden_channels, hidden_channels])
            self.convs.append(GINConv(nn=mlp, train_eps=False))
            num_features = hidden_channels

        self.mlp = MLP([hidden_channels, hidden_channels, num_classes], norm=None, dropout=0.5)

    def forward(self, x, edge_index):
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        return self.mlp(x) 

    

class GAT(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes, heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_features, hidden_channels, heads=heads, dropout= 0.6)
        self.conv2 = GATConv(hidden_channels * heads, num_classes, heads=heads, dropout=0.6)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [43]:


# Training and evaluation functions
def train(model, optimizer, data, train_mask):
    model.train()
    optimizer.zero_grad()
    if model == GCN:
        out = model(data.x, data.edge_index, data.edge_weight)
    else: 
        out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    model.eval()
    logits = model(data.x, data.edge_index)
    loss = F.cross_entropy(logits[mask], data.y[mask])
    pred = logits[mask].max(1)[1]
    correct = pred.eq(data.y[mask]).sum().item()
    accuracy = correct / mask.sum().item()
    return accuracy, loss.item()

def split_indices(data, train_ratio=0.1, val_ratio=0.1, test_ratio=0.8):
    indices = np.arange(data.num_nodes)
    train_size = int(train_ratio * data.num_nodes)
    val_size = int(val_ratio * data.num_nodes)
    test_size = int(test_ratio * data.num_nodes)
    
    train_indices, temp_indices = train_test_split(indices, train_size=train_size, random_state=42)
    val_indices, test_indices = train_test_split(temp_indices, test_size=test_size, random_state=42)
    
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    
    train_mask[train_indices] = True
    val_mask[val_indices] = True
    test_mask[test_indices] = True
    
    return train_mask, val_mask, test_mask


In [40]:
def reduce_mask(mask, reduction_ratio):
    num_true = mask.sum().item()
    num_to_keep = int(num_true * (reduction_ratio))
    
    true_indices = mask.nonzero(as_tuple=True)[0].numpy()
    np.random.shuffle(true_indices)
    
    selected_indices = true_indices[:num_to_keep]
    
    new_mask = torch.zeros_like(mask)
    new_mask[selected_indices] = True
    
    return new_mask

def adjust_masks(data, train_reduction=0.1, val_reduction=0.1, test_reduction=0.8):
    train_mask = data.train_mask
    val_mask = data.val_mask
    test_mask = data.test_mask
    
    train_mask = reduce_mask(train_mask, train_reduction)
    val_mask = reduce_mask(val_mask, val_reduction)
    test_mask = reduce_mask(test_mask, test_reduction)
    
    return train_mask, val_mask, test_mask

In [46]:

def run_experiment(model_class, num_features, num_classes, data, num_runs=5, epochs=300):
    for dataset in datasets:
        accuracies = []
        for _ in range(num_runs):
            print(f"Run {_ + 1}/{num_runs} on dataset {dataset.name}")
            train_mask, val_mask, test_mask = split_indices(data)
            
            if model_class == GCN:
                model = model_class(num_features, hidden_channels=256, num_classes=num_classes).to(device)
            model = model_class(num_features, hidden_channels=256, num_classes=num_classes).to(device)
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            
            best_val_acc = 0.0
            best_model_state = None
            
            for epoch in range(epochs):
                train_loss = train(model, optimizer, data, train_mask)
                val_acc, val_loss = evaluate(model, data, val_mask)
                test_acc, test_loss = evaluate(model, data, test_mask)

                if epoch % 200 == 0:
                
                    print(f"Epoch: {epoch + 1}/{epochs} | Train Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f} | Test Acc: {test_acc:.4f}")
                
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_model_state = model.state_dict()
            
            if best_model_state:
                model.load_state_dict(best_model_state)
            
            test_acc, _ = evaluate(model, data, test_mask)
            accuracies.append(test_acc)
        
        mean_acc = np.mean(accuracies)
        std_acc = np.std(accuracies)
        
        print(f"\nMean Accuracy over {num_runs} runs: {mean_acc:.4f}, Std Deviation: {std_acc:.4f}")
        print()

In [47]:


def main():
    print(f'Dataset: {dataset.name}')
    data = dataset[0]  # Assuming dataset is a tuple where [0] is the data object

    global device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_features = data.num_features
    num_classes = dataset.num_classes

    for model_class in [GCN, GAT, GIN]:
        print('')
        print(f'{model_class}')
        run_experiment(model_class, num_features, num_classes, data)

if __name__ == '__main__':
    main()

Dataset: Cora

<class '__main__.GCN'>
Run 1/5 on dataset Cora
Epoch: 1/300 | Train Loss: 1.9586 | Val Acc: 0.4191 | Test Acc: 0.4501
Epoch: 201/300 | Train Loss: 0.0000 | Val Acc: 0.7904 | Test Acc: 0.8273
Run 2/5 on dataset Cora
Epoch: 1/300 | Train Loss: 1.9685 | Val Acc: 0.4485 | Test Acc: 0.4783
Epoch: 201/300 | Train Loss: 0.0000 | Val Acc: 0.7941 | Test Acc: 0.8283
Run 3/5 on dataset Cora
Epoch: 1/300 | Train Loss: 1.9507 | Val Acc: 0.4559 | Test Acc: 0.4751
Epoch: 201/300 | Train Loss: 0.0000 | Val Acc: 0.7904 | Test Acc: 0.8269
Run 4/5 on dataset Cora
Epoch: 1/300 | Train Loss: 1.9513 | Val Acc: 0.4926 | Test Acc: 0.4834
Epoch: 201/300 | Train Loss: 0.0000 | Val Acc: 0.7941 | Test Acc: 0.8269
Run 5/5 on dataset Cora
Epoch: 1/300 | Train Loss: 1.9336 | Val Acc: 0.4706 | Test Acc: 0.4894
Epoch: 201/300 | Train Loss: 0.0000 | Val Acc: 0.7941 | Test Acc: 0.8269

Mean Accuracy over 5 runs: 0.8260, Std Deviation: 0.0005

Run 1/5 on dataset CiteSeer
Epoch: 1/300 | Train Loss: 1.9395 |

In [33]:
data = datasets[2][0]
print(data)

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])


In [45]:
train_mask = data.train_mask
val_mask = data.val_mask
test_mask = data.test_mask
print(train_mask.data, train_mask.shape, train_mask.sum().item())
print(val_mask.data, val_mask.shape, val_mask.sum().item())
print(test_mask.data, test_mask.shape, test_mask.sum().item())

tensor([ True,  True,  True,  ..., False, False, False]) torch.Size([19717]) 60
tensor([False, False, False,  ..., False, False, False]) torch.Size([19717]) 500
tensor([False, False, False,  ...,  True,  True,  True]) torch.Size([19717]) 1000


In [44]:
train_mask, val_mask, test_mask = split_indices(data)

print(train_mask.data, train_mask.shape, train_mask.sum().item())
print(val_mask.data, val_mask.shape, val_mask.sum().item())
print(test_mask.data, test_mask.shape, test_mask.sum().item())

tensor([False, False, False,  ...,  True, False,  True]) torch.Size([19717]) 1971
tensor([False, False, False,  ..., False, False, False]) torch.Size([19717]) 1973
tensor([ True,  True,  True,  ..., False,  True, False]) torch.Size([19717]) 15773


In [67]:
import numpy as np
from collections import defaultdict
import scipy.sparse as sp
import torch as th
from sklearn.preprocessing import OneHotEncoder
from torch_geometric.data import HeteroData

data_folder = "data/"


def encode_onehot(labels):

    # reshapes the numpy array to one column and whatever rows is required. 
    labels = labels.reshape(-1, 1)
    print(f'labels inside encode_oneshot {labels}')

    # tranfromss the labels or categorical array into a matrix of 0 and 1 that encodes where the data is presenst
    # this is used to pass in models for understanding where the catgories are. 
    enc = OneHotEncoder()

    #This method is used to fit the encoder to the data, learning the unique categories for each feature that will
    #  be transformed during the encoding process.
    enc.fit(labels)

    # converst categoriacal data into a binary matrix. 
    labels_onehot = enc.transform(labels).toarray()

    # returns this oneshot binary matrix. 
    return labels_onehot


def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation"""

    # gets a matrix plugged in and sums up all the rows of each the matrix and stores them in a tensor which gest turned into an array. 
    rowsum = np.array(features.sum(1))


    r_inv = np.power(rowsum, -1).flatten() # this performs an element-wise inverse on rowsum and then flatten the results to a 1-d array
    r_inv[np.isinf(r_inv)] = 0. # checks if any of the values are infinity prompting them to be equal to zero. 
    r_mat_inv = sp.diags(r_inv) # construct a digonal sparse matrix using the array of r_inv
    features = r_mat_inv.dot(features) # this multiples the new digonal matrix by the original features matrix. 
    if isinstance(features, np.ndarray):
        return features # if features was a numpy array it returns the new matrix 
    else:
        return features.todense() # if features matrix is not a numpy array it turns it into a dense matrix


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""


    # the adj is a matrix that tells us which nodes are connecte dwith each other. 
    adj = sp.coo_matrix(adj) # this turns the adj matrix into a coo matrix which is used to save memory and better fro computation 
    # only saves the none zero objects in the matrix. 
    print(adj)
    rowsum = np.array(adj.sum(1))
    # 
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = th.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = th.from_numpy(sparse_mx.data)
    shape = th.Size(sparse_mx.shape)
    return th.sparse.FloatTensor(indices, values, shape)



def process_data_in_pyg(neigs):
    d = defaultdict(dict)
    metapaths = []
    for mp_i, nei1 in enumerate(neigs):
        dst_array_concat = np.concatenate(nei1)
        src_array_concat = []
        for src_id, dst_array in enumerate(nei1):
            src_array_concat.extend([src_id] * len(dst_array))
        src_array_concat = np.array(src_array_concat)
        src_name = f"target"
        dst_name = f"dst_{mp_i}"
        relation = f"relation_{mp_i}"
        d[(src_name, relation + "-->", dst_name)]["edge_index"] = th.LongTensor(np.vstack([src_array_concat, dst_array_concat]))
        metapaths.append((src_name, relation + "-->", dst_name))
        d[(dst_name, "<--" + relation, src_name)]["edge_index"] = th.LongTensor(np.vstack([dst_array_concat, src_array_concat]))
        metapaths.append((dst_name, "<--" + relation, src_name))
    g = HeteroData(d)
    return g, metapaths



def load_aminer(ratio, type_num):
    # The order of node types: 0 p 1 a 2 \r

    # creates the path to aminer dataset 
    path = data_folder + "aminer/"

    #loads the labels.npy into a np array 
    label = np.load(path + "labels.npy").astype('int32')
    print(f'lable:{label}')


    label = encode_onehot(label)
    print(f'lable after encode oneshot {label}')
    # load object that are seralized inside a numpy array 
    # these objects are from the authors. 
    nei_a = np.load(path + "nei_a.npy", allow_pickle=True)\

    # load object that are searilized inside a numpy array 
    # this is for relationships. 
    nei_r = np.load(path + "nei_r.npy", allow_pickle=True)

    # Because none of P, A or R has features, we assign one-hot encodings to all of them.
    # the fatures are added to the papers, authors and relationships 
    # the type-num is "type_num": [6564, 13329, 35890],
    # so paper -> 6564, author -> 13329 -> relationships -> 35890
    # spicy.sparse.eye create a identity matrix of size type_num 
    feat_p = sp.eye(type_num[0]) #return a tuple of the indices connected witht he value for example (2,2) 1, means there is a 1 in row 2, column2, this goes down to the size of the array. 

    feat_a = sp.eye(type_num[1])
    feat_r = sp.eye(type_num[2])

    #loads sparse matrix that have been stored in paper-author-paper, paper-relationship-paper, paper-
    pap = sp.load_npz(path + "pap.npz")
    prp = sp.load_npz(path + "prp.npz")
    pos = sp.load_npz(path + "pos.npz")

    # contains the indices of the nodes for the train, test, val
    # .npy is a way to store numpy arrays into files
    train = [np.load(path + "train_" + str(i) + ".npy") for i in ratio]
    test = [np.load(path + "test_" + str(i) + ".npy") for i in ratio]
    val = [np.load(path + "val_" + str(i) + ".npy") for i in ratio]


    label = th.FloatTensor(label) # make label a float tensor 
    nei_a = [th.LongTensor(i) for i in nei_a] # make neighboring area a tensor instead of a numpy array
    nei_r = [th.LongTensor(i) for i in nei_r] # make neighboring area of relationships a float tensor intstead of a numpy array. 

    # go into the preporccess features. 
    print("")
    print(feat_a, feat_a, feat_r)

    # passes the feat_a p, r sparse matrices with are idneitity matrices. there are tuples designenating row and column with a value next to it 
    # representing the value that is in the matrix. 
    feat_p = th.FloatTensor(preprocess_features(feat_p)) # this turns the matrix that is returned int a float tensor 
    print('')
    print(feat_p)
    feat_a = th.FloatTensor(preprocess_features(feat_a))
    feat_r = th.FloatTensor(preprocess_features(feat_r))

    print('')
    print(pap)
    pap = sparse_mx_to_torch_sparse_tensor(normalize_adj(pap))
    prp = sparse_mx_to_torch_sparse_tensor(normalize_adj(prp))
    pos = sparse_mx_to_torch_sparse_tensor(pos)
    train = [th.LongTensor(i) for i in train]
    val = [th.LongTensor(i) for i in val]
    test = [th.LongTensor(i) for i in test]
    print(train)
    return [nei_a, nei_r], [feat_p, feat_a, feat_r], [pap, prp], pos, label, train, val, test

def load_data(ratio=[20,40,60], type_num=[6564, 13329, 35890]):
    data = load_aminer(ratio, type_num)
    g, metapaths = process_data_in_pyg(data[0])
    return data, g, metapaths


data, g, metapaths = load_data()

lable:[0 0 0 ... 3 3 3]
labels inside encode_oneshot [[0]
 [0]
 [0]
 ...
 [3]
 [3]
 [3]]
lable after encode oneshot [[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 8)	1.0
  (9, 9)	1.0
  (10, 10)	1.0
  (11, 11)	1.0
  (12, 12)	1.0
  (13, 13)	1.0
  (14, 14)	1.0
  (15, 15)	1.0
  (16, 16)	1.0
  (17, 17)	1.0
  (18, 18)	1.0
  (19, 19)	1.0
  (20, 20)	1.0
  (21, 21)	1.0
  (22, 22)	1.0
  (23, 23)	1.0
  (24, 24)	1.0
  :	:
  (13304, 13304)	1.0
  (13305, 13305)	1.0
  (13306, 13306)	1.0
  (13307, 13307)	1.0
  (13308, 13308)	1.0
  (13309, 13309)	1.0
  (13310, 13310)	1.0
  (13311, 13311)	1.0
  (13312, 13312)	1.0
  (13313, 13313)	1.0
  (13314, 13314)	1.0
  (13315, 13315)	1.0
  (13316, 13316)	1.0
  (13317, 13317)	1.0
  (13318, 13318)	1.0
  (13319, 13319)	1.0
  (13320, 13320)	1.0
  (13321, 13321)	1.0
  (13322, 13322)	1.0
  (13323, 13323)	1.0
  (133

In [57]:
print(data[7])


[tensor([2368, 2138, 5156, 3491, 2714, 6142,  488, 6265, 2353, 4638, 5611, 2293,
        2356, 2688, 3396, 6465, 1085, 6375, 5318, 1407, 5205, 4700, 1697, 1913,
         993, 1450,  370, 3744, 1906, 4568, 2407,   82, 5625, 2398, 4886,  793,
        4599, 1141, 4467,  485, 5496, 2393, 6158, 1453, 4327,   97, 1211,  607,
        1259, 5593,  814, 2971, 6206, 1223, 4948, 1839, 4755, 3838, 2901, 1411,
        3423, 5619, 5056, 2908, 2675, 6281,  994, 3573, 4653,  557, 5775, 1562,
         646, 3245, 1143, 1156, 4832, 6038, 3346, 2009, 4786, 5483, 6300, 6459,
         621, 3055,   65, 4078,  445, 6000,  641, 6262, 5322,  379, 1728, 1640,
        4464,  602,  909, 1663, 1599, 1852, 1833, 5684, 6388, 4768, 4843, 2399,
         693, 4873, 3494, 4677, 3725, 4290, 5874, 5860, 1078, 3182, 5598, 2886,
        1519, 2119, 6517, 3577, 4108,  567, 4872, 3407,  952, 6165, 2389, 1419,
        5913, 5965, 5060, 2254, 4064, 5552,  729,  321, 2352, 4060,  713, 1521,
        2877, 1651, 3467, 2283, 3324,  

In [58]:
print(g)

HeteroData(
  (target, relation_0-->, dst_0)={ edge_index=[2, 18007] },
  (dst_0, <--relation_0, target)={ edge_index=[2, 18007] },
  (target, relation_1-->, dst_1)={ edge_index=[2, 58831] },
  (dst_1, <--relation_1, target)={ edge_index=[2, 58831] }
)


In [59]:
print(metapaths)

[('target', 'relation_0-->', 'dst_0'), ('dst_0', '<--relation_0', 'target'), ('target', 'relation_1-->', 'dst_1'), ('dst_1', '<--relation_1', 'target')]
