In [1]:
from pprint import pprint
from torch_geometric.data import Data, DataListLoader, Dataset, InMemoryDataset, Batch
from torch_geometric.loader import DataListLoader, DataLoader
from torch_geometric.nn import *
from torch_geometric.utils import to_dense_adj, to_dense_batch, add_self_loops, remove_self_loops
from torch_geometric.nn.conv import MessagePassing
import torch
from torch import nn
import rdkit
from tqdm.auto import tqdm
import itertools
from rdkit import Chem
import pandas as pd
from importlib import reload
import matplotlib.pyplot as plt
from rdkit import RDLogger

from copy import deepcopy
#from torch.utils.data import Dataset, DataLoader
from typing import Tuple, List, Dict, Union
from torch import Tensor
from torch_geometric.nn import MessagePassing, radius_graph
from torch_geometric.utils import add_self_loops, degree
from torch_scatter import scatter
from torch_geometric.typing import (
    Adj,
    OptTensor,
    SparseTensor,
    pyg_lib,
    torch_sparse,
)
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_zip,
)
from rdkit import Chem
import os
# Suppress RDKit warnings
RDLogger.DisableLog('rdApp.*')
cuda=torch.device('cuda') if torch.cuda.is_available() else 'cpu'
import sascorer
#torch.set_default_dtype(torch.float64)
from models import *
from rdkit.Chem.Crippen import MolLogP
from typing import List
#from prolog import *
#import torchlens as tl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import KarateClub
from torch_geometric.utils import negative_sampling
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics import roc_auc_score

In [3]:
from torch_geometric.datasets import Planetoid

In [4]:
def enumerate_last(iterable):
    """Yield ``(is_last, item)`` pairs for every item of ``iterable``.

    ``is_last`` is ``True`` only for the final item and ``False`` for all
    others. An empty iterable produces nothing. The input is consumed
    lazily with a one-item look-ahead, so it works on any iterator.
    """
    exhausted = object()  # unique marker; cannot collide with a real item
    source = iter(iterable)

    pending = next(source, exhausted)
    if pending is exhausted:
        # Nothing to yield for an empty iterable.
        return

    while True:
        upcoming = next(source, exhausted)
        if upcoming is exhausted:
            # No successor exists, so `pending` is the final element.
            yield (True, pending)
            return
        yield (False, pending)
        pending = upcoming

In [6]:

class GCNLinkPredictor(nn.Module):
    """Stack of ``GCNConv`` layers used as an encoder, plus a dot-product decoder.

    Parameters
    ----------
    in_channels : int
        Input size of the first convolution.
    hidden_channels : int
        Output size of every convolution except the last.
    out_channels : int
        Output size of the last convolution.
    num_layers : int, default 6
        Number of ``GCNConv`` layers. The default reproduces the original
        six-layer stack (in -> hidden, 4 x hidden -> hidden, hidden -> out).
        With ``num_layers=1`` the single layer maps ``in_channels`` directly
        to ``out_channels``.

    Raises
    ------
    ValueError
        If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=6):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        # Channel size at every layer boundary, e.g. [in, h, h, h, h, h, out].
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = nn.ModuleList(
            [GCNConv(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def encode(self, x, edge_index):
        """Apply every convolution in order, with ReLU after all but the last."""
        for is_last, conv in enumerate_last(self.convs):
            x = conv(x, edge_index)
            if not is_last:
                # The final layer's output is left un-activated.
                x = x.relu()
        return x

    def decode(self, z, edge_index):
        """Score each edge as the dot product of its two endpoint rows of ``z``.

        ``edge_index[0]`` and ``edge_index[1]`` index rows of ``z``; the
        result has one score per column of ``edge_index``.
        """
        return (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1)

In [7]:
# Import the `models.m307b_clean` module and reload it, so that the version
# used below is re-executed from source even if it was already imported in
# this kernel session.
from models import m307b_clean
from importlib import reload
# Assign the return value to `_` so the module object is not echoed as cell output.
_=reload(m307b_clean)

In [13]:
from models.m307b_clean import M307B
class M307BLinkPredictor(M307B):
    """``M307B`` with an input projection and a dot-product edge decoder.

    All constructor arguments are forwarded unchanged to ``M307B``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fixed projection from 1433 input features down to 16 dimensions.
        # NOTE(review): 1433 matches the Cora feature width and 16 matches the
        # `node_dim` used when this class is instantiated below; both are
        # hard-coded here, so other datasets/widths need this line edited.
        self.map = nn.Linear(1433, 16)

    def encode(self, h, edge_index):
        """Project ``h`` with ``self.map`` and run the parent model's forward.

        The parent is called with a zero-valued float edge attribute of shape
        ``(num_edges, 1)`` as its second argument.
        """
        projected = self.map(h)
        # One zero per edge (one per column of `edge_index`), as a float column.
        dummy_edge_attr = torch.zeros_like(edge_index[0]).float().unsqueeze(-1)
        return self(projected, dummy_edge_attr, edge_index)

    def decode(self, z, edge_index):
        """Return, per edge, the dot product of the two endpoint rows of ``z``."""
        src, dst = edge_index[0], edge_index[1]
        return torch.sum(z[src] * z[dst], dim=1)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import KarateClub
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import negative_sampling
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

# Load the Cora dataset through the Planetoid loader (root dir: datasets/planetoid-cora)
# and take its first (index 0) graph.
ds=Planetoid('datasets/planetoid-cora',name='Cora',split='full')
data = ds[0]

# If no node features are provided, use an identity matrix as one-hot encoding per node.
if data.x is None:
    data.x = torch.eye(data.num_nodes)

# Use RandomLinkSplit to create training, validation, and test splits
# (10% of edges for validation, 20% for test).
# Note: is_undirected=True is set for an undirected graph.
# NOTE(review): no random seed is set in the visible cells, so the split
# (and every metric printed below) will differ between runs.
transform = RandomLinkSplit(num_val=0.1, num_test=0.2, is_undirected=True)
train_data, val_data, test_data = transform(data)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move all three splits to the selected device.
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

def train(model, opetimizer, data):
    """Run one full-batch optimisation step for link prediction.

    Parameters
    ----------
    model
        Module exposing ``encode(x, edge_index)`` and ``decode(z, edge_index)``.
    opetimizer
        Optimizer whose ``zero_grad()`` and ``step()`` are called. The
        misspelt parameter name (sic) is kept so the signature is unchanged;
        previously the body ignored this argument and silently used whatever
        global ``optimizer`` happened to exist in the kernel.
    data
        Object providing ``x``, ``edge_index`` and ``num_nodes``. Every edge
        in ``data.edge_index`` is treated as a positive example.

    Returns
    -------
    float
        Sum of the mean positive-edge loss and the mean negative-edge loss.
    """
    model.train()
    opetimizer.zero_grad()

    # Encode nodes using only the graph passed in (its edges are the positives).
    z = model.encode(data.x, data.edge_index)

    pos_edge = data.edge_index
    pos_scores = model.decode(z, pos_edge)

    # Request as many sampled negatives as there are positive edges.
    neg_edge = negative_sampling(
        edge_index=pos_edge,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge.size(1),
    )
    neg_scores = model.decode(z, neg_edge)

    # Binary cross entropy on logits. -log(sigmoid(s)) == -logsigmoid(s) and
    # -log(1 - sigmoid(s)) == -logsigmoid(-s); logsigmoid avoids the float32
    # saturation (and vanishing gradient) of log(sigmoid(.) + eps) for
    # large-magnitude scores.
    logsigmoid = torch.nn.functional.logsigmoid
    pos_loss = -logsigmoid(pos_scores).mean()
    neg_loss = -logsigmoid(-neg_scores).mean()

    loss = pos_loss + neg_loss
    loss.backward()
    opetimizer.step()
    return loss.item()

@torch.no_grad()
def test(model, train_data, val_data, test_data):
    """Compute ROC-AUC on the validation and test splits.

    Node embeddings are produced once from ``train_data`` (features and
    edges); each split is then scored on its own ``edge_label_index``
    against its ``edge_label``.

    Returns
    -------
    tuple
        ``(auc_val, auc_test)``.
    """
    model.eval()
    # Embeddings come from the training graph only; relies on the
    # notebook-level `device`.
    embeddings = model.encode(train_data.x, train_data.edge_index).to(device)

    def _split_auc(split):
        # Move scores and labels to the CPU before handing them to roc_auc_score.
        predicted = model.decode(embeddings, split.edge_label_index).cpu()
        truth = split.edge_label.cpu()
        return roc_auc_score(truth, predicted)

    return _split_auc(val_data), _split_auc(test_data)

In [16]:

# Baseline: GCN link predictor whose input width is taken from the training features.
model = GCNLinkPredictor(
    in_channels=train_data.x.shape[1],
    hidden_channels=16,
    out_channels=16,
).to(device)
print("#params:", sum(param.numel() for param in model.parameters()))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for 100 epochs and report validation/test AUC on every 5th epoch.
for epoch in range(100):
    loss = train(model, optimizer, train_data)
    if epoch % 5 != 0:
        continue
    auc_val, auc_test = test(model, train_data, val_data, test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {auc_val:.4f}, Test AUC: {auc_test:.4f}')


#params: 24304
Epoch: 000, Loss: 1.3846, Val AUC: 0.6473, Test AUC: 0.6623
Epoch: 005, Loss: 1.3795, Val AUC: 0.6435, Test AUC: 0.6570
Epoch: 010, Loss: 1.3206, Val AUC: 0.6511, Test AUC: 0.6739
Epoch: 015, Loss: 1.1939, Val AUC: 0.7236, Test AUC: 0.7083
Epoch: 020, Loss: 1.1098, Val AUC: 0.7344, Test AUC: 0.7337
Epoch: 025, Loss: 1.0920, Val AUC: 0.7305, Test AUC: 0.7292
Epoch: 030, Loss: 1.0858, Val AUC: 0.7302, Test AUC: 0.7310
Epoch: 035, Loss: 1.0775, Val AUC: 0.7328, Test AUC: 0.7320
Epoch: 040, Loss: 1.0892, Val AUC: 0.7313, Test AUC: 0.7350
Epoch: 045, Loss: 1.0831, Val AUC: 0.7256, Test AUC: 0.7309
Epoch: 050, Loss: 1.0816, Val AUC: 0.7258, Test AUC: 0.7296
Epoch: 055, Loss: 1.0720, Val AUC: 0.7275, Test AUC: 0.7278
Epoch: 060, Loss: 1.0707, Val AUC: 0.7294, Test AUC: 0.7258
Epoch: 065, Loss: 1.0673, Val AUC: 0.7326, Test AUC: 0.7250
Epoch: 070, Loss: 1.0686, Val AUC: 0.7330, Test AUC: 0.7283
Epoch: 075, Loss: 1.0643, Val AUC: 0.7368, Test AUC: 0.7318
Epoch: 080, Loss: 1.0262,

In [18]:
# Display a summary of the loaded graph object (tensor shapes per attribute).
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [17]:
# Layer widths for the M307B link predictor.
node_dim = 16
hidden_dim = 16
inner_dim = 16

# Six blocks: the first consumes `node_dim` features, the remaining five
# consume `hidden_dim`; each block has one [1, hidden_dim] edge spec.
model = M307BLinkPredictor(
    node_dimses=[
        [node_dim if block == 0 else hidden_dim, inner_dim, hidden_dim]
        for block in range(6)
    ],
    edge_dimses=[[1, hidden_dim] for _ in range(6)],
    activation=nn.SiLU,
    dropout_rate=0,
).to(device)
print("#params:", sum(param.numel() for param in model.parameters()))
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train for 100 epochs and report validation/test AUC on every 5th epoch.
for epoch in range(100):
    loss = train(model, optimizer, train_data)
    if epoch % 5 != 0:
        continue
    auc_val, auc_test = test(model, train_data, val_data, test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {auc_val:.4f}, Test AUC: {auc_test:.4f}')


#params: 27142
Epoch: 000, Loss: 17.0587, Val AUC: 0.5611, Test AUC: 0.5662
Epoch: 005, Loss: 3.7842, Val AUC: 0.6129, Test AUC: 0.6284
Epoch: 010, Loss: 1.9487, Val AUC: 0.6415, Test AUC: 0.6533
Epoch: 015, Loss: 1.4802, Val AUC: 0.6474, Test AUC: 0.6743
Epoch: 020, Loss: 1.3741, Val AUC: 0.6715, Test AUC: 0.7009
Epoch: 025, Loss: 1.2585, Val AUC: 0.6802, Test AUC: 0.7099
Epoch: 030, Loss: 1.1877, Val AUC: 0.6967, Test AUC: 0.7288
Epoch: 035, Loss: 1.1588, Val AUC: 0.7124, Test AUC: 0.7446
Epoch: 040, Loss: 1.1031, Val AUC: 0.7185, Test AUC: 0.7537
Epoch: 045, Loss: 1.0742, Val AUC: 0.7245, Test AUC: 0.7614
Epoch: 050, Loss: 1.0402, Val AUC: 0.7303, Test AUC: 0.7677
Epoch: 055, Loss: 1.0148, Val AUC: 0.7365, Test AUC: 0.7747
Epoch: 060, Loss: 1.0599, Val AUC: 0.7393, Test AUC: 0.7812
Epoch: 065, Loss: 1.0141, Val AUC: 0.7437, Test AUC: 0.7855
Epoch: 070, Loss: 1.0078, Val AUC: 0.7489, Test AUC: 0.7898
Epoch: 075, Loss: 0.9793, Val AUC: 0.7488, Test AUC: 0.7936
Epoch: 080, Loss: 1.0242

In [34]:
# Print the `alpha` parameter of each layer held in `model.convs`.
# NOTE(review): relies on whichever `model` is currently bound in the kernel
# having layers with an `alpha` attribute — presumably the M307B-based model;
# confirm before re-running after the GCN cell.
for conv in model.convs:
    print(conv.alpha)

Parameter containing:
tensor([1.0061, 1.0141, 1.0158, 1.1479, 1.1511, 1.1472, 1.1271, 0.8646, 0.9467,
        0.8907, 0.8993, 0.7285, 1.0994, 0.9746, 0.7081, 0.9881, 1.1249, 1.0201,
        1.0170, 0.7510, 0.9181, 0.8399, 0.9558, 1.1762, 0.7181, 0.7015, 0.6753,
        1.0228, 1.0947, 1.0132, 0.8804, 0.9982, 1.0818, 1.1061, 0.9119, 1.0503,
        0.9111, 0.8308, 0.9412, 0.8230, 0.7303, 1.2109, 0.7824, 1.0068, 0.9572,
        1.0117, 0.9358, 0.9009, 0.9428, 0.7858, 1.0028, 1.0819, 0.7838, 1.1019,
        1.0545, 1.0808, 0.7992, 0.7195, 0.9679, 0.9222, 0.7677, 0.7603, 0.9260,
        0.9416, 1.0698, 0.8621, 0.8129, 0.8895, 0.6565, 0.9638, 0.8874, 0.8962,
        0.7377, 0.6436, 0.7694, 0.9658, 0.7946, 0.9556, 0.9460, 0.8951, 0.7375,
        0.8055, 0.9803, 0.9196, 1.0044, 0.9176, 0.7754, 0.8766, 0.9366, 0.9999,
        0.8318, 0.8912, 1.0089, 0.8411, 0.7546, 0.9442, 0.9385, 0.9093, 0.7587,
        0.8542], requires_grad=True)
Parameter containing:
tensor([1.0074, 1.1171, 0.8210, 1.0191,

In [24]:
# Display the node-feature tensor of the validation split.
val_data.x

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

Number of graphs in the dataset: 100
Example graph: Data(x=[108, 3], edge_index=[2, 4884], y=[108])
Epoch: 020, Loss: 1.2436, Val AUC: 0.5012, Test AUC: 0.5017
Epoch: 040, Loss: 1.1926, Val AUC: 0.5012, Test AUC: 0.5018
Epoch: 060, Loss: 1.1647, Val AUC: 0.5014, Test AUC: 0.5018
Epoch: 080, Loss: 1.1799, Val AUC: 0.5016, Test AUC: 0.5018
Epoch: 100, Loss: 1.1447, Val AUC: 0.5016, Test AUC: 0.5019
Epoch: 120, Loss: 1.1428, Val AUC: 0.5016, Test AUC: 0.5019


KeyboardInterrupt: 

In [10]:
# NOTE(review): `batch` is not defined in any visible cell and the recorded
# output for this cell is a NameError — looks like leftover scratch work;
# consider removing this cell.
batch

NameError: name 'batch' is not defined