## Experiment 1: 

In [4]:
import argparse
import csv
import os
import sys
from IPython.display import display, Javascript
import json
import os

sys.path.insert(0, '/hkfs/work/workspace/scratch/cc7738-subgraph_training/Universal-MP')

import numpy as np
import scipy.sparse as ssp
import torch
import torch.nn.functional as F
import time

from matplotlib import pyplot as plt
from yacs.config import CfgNode

from graphgps.utils.ogbdataset import loaddataset
from graphgps.utils.heuristic import CN, AA, RA
from graphgps.models.GNN import GAT_Variant, GCN_Variant, SAGE_Variant, GIN_Variant, GAE_forall, InnerProduct, mlp_score


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sample data
categories = ['C.ele', 'USAir', 'E.coli', 'Yeast', 'PB', 'NS', 'Power', 'Router']
methods = ['Baseline', 'GCN', 'SAGE', 'SEAL']
metrics = ['CN', 'AA', 'RA']
data = {
    'CN': np.random.rand(4, len(categories)) * 100,  # Simulated data
    'AA': np.random.rand(4, len(categories)) * 10,
    'RA': np.random.rand(4, len(categories)) * 1
}
errors = {
    'CN': np.random.rand(4, len(categories)) * 10,
    'AA': np.random.rand(4, len(categories)) * 1,
    'RA': np.random.rand(4, len(categories)) * 0.1
}

# Plot settings
fig, axes = plt.subplots(3, 1, figsize=(12, 6), sharex=True)
x = np.arange(len(categories))  # x-axis positions for each category
width = 0.2  # Width of each bar

# Colors for each method
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

for i, metric in enumerate(metrics):
    ax = axes[i]
    for j, method in enumerate(methods):
        ax.bar(x + j * width, data[metric][j], width, label=method, yerr=errors[metric][j], color=colors[j])
    ax.set_yscale('log')
    ax.set_ylabel('MSE')
    ax.set_title(metric, loc='left', fontsize=12, backgroundcolor='wheat', fontweight='bold')
    ax.grid(True, which="both", linestyle='--', linewidth=0.5)

# Set labels and legend
axes[-1].set_xticks(x + width * 1.5)
axes[-1].set_xticklabels(categories)
axes[0].legend(methods, loc='upper right', bbox_to_anchor=(1.15, 1))

plt.tight_layout()
plt.show()


## Create GNNs 

In [5]:
def create_GAE_model(cfg_model: CN,
                     cfg_score: CN,
                     model_name: str):
    if model_name in {'GAT', 'VGAE', 'GAE', 'GraphSage'}:
        raise NotImplementedError('Current model does not exist')
        # model = create_model(cfg_model)

    elif model_name == 'GAT_Variant':
        encoder = GAT_Variant(cfg_model.in_channels,
                              cfg_model.hidden_channels,
                              cfg_model.out_channels,
                              cfg_model.num_layers,
                              cfg_model.dropout,
                              cfg_model.heads,
                              )
    elif model_name == 'GCN_Variant':
        encoder = GCN_Variant(cfg_model.in_channels,
                              cfg_model.hidden_channels,
                              cfg_model.out_channels,
                              cfg_model.num_layers,
                              cfg_model.dropout,
                              )
    elif model_name == 'SAGE_Variant':
        encoder = SAGE_Variant(cfg_model.in_channels,
                               cfg_model.hidden_channels,
                               cfg_model.out_channels,
                               cfg_model.num_layers,
                               cfg_model.dropout,
                               )
    elif model_name == 'GIN_Variant':
        encoder = GIN_Variant(cfg_model.in_channels,
                              cfg_model.hidden_channels,
                              cfg_model.out_channels,
                              cfg_model.num_layers,
                              cfg_model.dropout,
                              cfg_model.mlp_layer
                              )
    if cfg_score.product == 'dot':
        decoder = mlp_score(cfg_model.out_channels,
                            cfg_score.score_hidden_channels,
                            cfg_score.score_out_channels,
                            cfg_score.score_num_layers,
                            cfg_score.score_dropout,
                            cfg_score.product)
    elif cfg_score.product == 'inner':
        decoder = InnerProduct()

    else:
        # Without this else I got: UnboundLocalError: local variable 'model' referenced before assignment
        raise ValueError('Current model does not exist')

    return GAE_forall(encoder=encoder, decoder=decoder)


### Analysis Tool 

In [None]:
def save_to_csv(file_path, model_name, heuristic, test_loss):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    file_exists = os.path.isfile(file_path)
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(['Model', 'Heuristic', 'Test_Loss'])
        writer.writerow([model_name, heuristic, test_loss])
        
def visualize(pred, true_label, save_path = './visualization.png'):

    pred = pred.cpu().detach().numpy()
    true_label = true_label.cpu().detach().numpy()
    plt.figure(figsize=(10, 6))
    plt.scatter(np.arange(len(true_label)), true_label, color='blue', label='True Score', alpha=0.6)
    plt.scatter(np.arange(len(pred)), pred, color='red', label='Prediction', alpha=0.6)

    plt.title('Predictions vs True Score')
    plt.xlabel('Sample Index')
    plt.ylabel('Value')
    plt.ylim(0, 1.5)
    plt.legend()

    plt.savefig(save_path)
    plt.close()

    print(f"Visualization saved at {save_path}")


In [None]:
def train(model, optimizer, data, splits, device, epoch):
    model.train()
    optimizer.zero_grad()

    # Positive and negative edges for training
    pos_edge_index = splits['train']['pos_edge_label_index'].to(device)
    neg_edge_index = splits['train']['neg_edge_label_index'].to(device)

    # Labels for positive and negative edges (continuous regression labels)
    pos_edge_label = splits['train']['pos_edge_score'].to(device)
    neg_edge_label = splits['train']['neg_edge_score'].to(device)

    # Forward pass
    z = model.encode(data.x, data.edge_index)

    # Compute predictions for both positive and negative edges
    pos_pred = model.decode(z[pos_edge_index[0]], z[pos_edge_index[1]])
    neg_pred = model.decode(z[neg_edge_index[0]], z[neg_edge_index[1]])

    # Compute regression loss (MSE for continuous labels)
    pos_loss = F.mse_loss(pos_pred, pos_edge_label)
    neg_loss = F.mse_loss(neg_pred, neg_edge_label)
    loss = pos_loss + neg_loss
    loss.backward()

    # Optimizer step
    optimizer.step()
    visualize(pos_pred, pos_edge_label, save_path='./visualization_pos_train.png')
    visualize(neg_pred, neg_edge_label, save_path='./visualization_neg_train.png')

    return loss.item()



@torch.no_grad()
def valid(model, data, splits, device, epoch):
    model.eval()

    # Positive and negative edges for validation
    pos_edge_index = splits['valid']['pos_edge_label_index'].to(device)
    neg_edge_index = splits['valid']['neg_edge_label_index'].to(device)

    # Labels for positive and negative edges (continuous regression labels)
    pos_edge_label = splits['valid']['pos_edge_score'].to(device)
    neg_edge_label = splits['valid']['neg_edge_score'].to(device)

    # Forward pass
    z = model.encode(data.x, data.edge_index)

    # Predict scores for both positive and negative edges
    pos_pred = model.decode(z[pos_edge_index[0]], z[pos_edge_index[1]])
    neg_pred = model.decode(z[neg_edge_index[0]], z[neg_edge_index[1]])

    # Compute regression loss (MSE)
    pos_loss = F.mse_loss(pos_pred, pos_edge_label)
    neg_loss = F.mse_loss(neg_pred, neg_edge_label)
    loss = pos_loss + neg_loss
    return loss.item()


@torch.no_grad()
def test(model, data, splits, device):
    model.eval()

    # Positive and negative edges for test
    pos_edge_index = splits['test']['pos_edge_label_index'].to(device)
    neg_edge_index = splits['test']['neg_edge_label_index'].to(device)

    # Labels for positive and negative edges (continuous regression labels)
    pos_edge_label = splits['test']['pos_edge_score'].to(device)
    neg_edge_label = splits['test']['neg_edge_score'].to(device)

    # Forward pass
    z = model.encode(data.x, data.edge_index)

    # Predict scores for both positive and negative edges
    pos_pred = model.decode(z[pos_edge_index[0]], z[pos_edge_index[1]])
    neg_pred = model.decode(z[neg_edge_index[0]], z[neg_edge_index[1]])
    visualize(pos_pred, pos_edge_label, save_path = './visualization_pos.png')
    visualize(neg_pred, neg_edge_label, save_path = './visualization_neg.png')

    # Compute regression loss (MSE)
    pos_loss = F.mse_loss(pos_pred, pos_edge_label)
    neg_loss = F.mse_loss(neg_pred, neg_edge_label)
    loss = pos_loss + neg_loss

    return loss.item()

In [None]:
class Config:
    def __init__(self):
        self.epochs = 30
        self.dataset = "Cora"
        self.batch_size = 512
        self.heuristic = "AA"
        self.gnn = "gcn"
        self.model = "GIN_Variant"
        self.use_feature = False


In [11]:

args = Config()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, splits = loaddataset(args.dataset, True)
data = data.to(device)


with open('/hkfs/work/workspace/scratch/cc7738-subgraph_training/Universal-MP/yamls/cora/heart_gnn_models.yaml', "r") as f:
    cfg = CfgNode.load_cfg(f)
    
cfg_model = eval(f'cfg.model.{args.model}')
if not hasattr(splits['train'], 'x') or splits['train'].x is None:
    cfg_model.in_channels = 1024
else:
    cfg_model.in_channels = data.num_nodes
cfg_score = eval(f'cfg.score.{args.model}')
cfg.model.type = args.model
edge_weight = torch.ones(data.edge_index.size(1), dtype=float)
A = ssp.csr_matrix(
    (edge_weight, (data.edge_index[0].cpu(), data.edge_index[1].cpu())),
    shape=(data.num_nodes, data.num_nodes)
)
method_dict = {
    "CN": CN,
    "AA": AA,
    "RA": RA
}
for split in splits:
    pos_edge_score, _ = method_dict[args.heuristic](A, splits[split]['pos_edge_label_index'],
                                                    batch_size=args.batch_size)
    neg_edge_score, _ = method_dict[args.heuristic](A, splits[split]['neg_edge_label_index'],
                                                    batch_size=args.batch_size)
    splits[split]['pos_edge_score'] = torch.sigmoid(pos_edge_score)
    splits[split]['neg_edge_score'] = torch.sigmoid(neg_edge_score)
if not args.use_feature:
    A_dense = A.toarray()
    A_tensor = torch.tensor(A_dense)
    data.x = A_tensor.float().to(device)

model = create_GAE_model(cfg_model, cfg_score, args.model).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, args.epochs + 1):
    start = time.time()
    loss = train(model, optimizer, data, splits, device, args.batch_size)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
test_loss = test(model, data, splits, device)
save_to_csv(f'./results/test_results_{args.dataset}.csv', args.model, args.heuristic, test_loss)
print('Saved results.')



  multiplier = 1 / np.log(A.sum(axis=0))
  multiplier = 1 / np.log(A.sum(axis=0))


layers in gin:  1
layers in mlp:  1


  pos_loss = F.mse_loss(pos_pred, pos_edge_label)
  neg_loss = F.mse_loss(neg_pred, neg_edge_label)


Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 001, Loss: 0.0191
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 002, Loss: 0.0188
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 003, Loss: 0.0185
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 004, Loss: 0.0182
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 005, Loss: 0.0180
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 006, Loss: 0.0178
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visualization_neg_train.png
Epoch: 007, Loss: 0.0177
Visualization saved at ./visualization_pos_train.png
Visualization saved at ./visua

  pos_loss = F.mse_loss(pos_pred, pos_edge_label)
  neg_loss = F.mse_loss(neg_pred, neg_edge_label)
