In [14]:
!pip install --upgrade pip
!pip install torch_geometric
!pip install torch
!pip install matplotlib
!pip install h5py

Collecting h5py
  Downloading h5py-3.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Downloading h5py-3.10.0-cp311-cp311-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: h5py
Successfully installed h5py-3.10.0


In [15]:
import torch 
from torch_geometric.nn import GCNConv, SAGEConv, to_hetero
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from sklearn.metrics import roc_auc_score
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils.convert import to_networkx
from torch_geometric.transforms import ToUndirected
import torch_geometric.data as dt

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import h5py

import scipy.sparse as sp

In [16]:
# Build dataset

# Load data from .mat files
gf_data = h5py.File("../data/GeneFeatures.mat", 'r')
gp_data= h5py.File("../data/genes_phenes.mat", 'r')
cf_data = h5py.File("../data/clinicalfeatures_tfidf.mat", 'r')

gene_ids = gp_data["geneIds"]
phene_ids = gp_data["pheneIds"]

# GeneGene_Hs --> "data", "ir", "jc"
print("GeneGene data: ", gp_data["GeneGene_Hs"]["data"].shape)
print("GeneGene ir: ", gp_data["GeneGene_Hs"]["ir"].shape)
print("GeneGene jc: ", gp_data["GeneGene_Hs"]["jc"].shape)
print("Gene Features: ", gf_data["GeneFeatures"].shape) # 4536x12331
print("Clinical Features: ", cf_data["F"].shape) # 3215
print("GeneIds: ", gp_data["geneIds"].shape)
print("PheneIds: ", gp_data["pheneIds"].shape)
print(gp_data[gp_data["pheneIds"][0][0]][0]) # 1x3215
print("GenePhenes: ", gp_data["GenePhene"].shape)


gene_network_adj = sp.csc_matrix((np.array(gp_data['GeneGene_Hs']['data']),
    np.array(gp_data['GeneGene_Hs']['ir']), np.array(gp_data['GeneGene_Hs']['jc'])),
    shape=(12331,12331)).tocoo()

disease_network_adj = sp.csc_matrix((np.array(gp_data['PhenotypeSimilarities']['data']),
    np.array(gp_data['PhenotypeSimilarities']['ir']), np.array(gp_data['PhenotypeSimilarities']['jc'])),
    shape=(3215, 3215)).tocoo()

disease_offset = gene_network_adj.shape[0]+1


dg_ref = gp_data['GenePhene'][0][0]
gene_disease_adj = sp.csc_matrix((np.array(gp_data[dg_ref]['data']),
    np.array(gp_data[dg_ref]['ir']), np.array(gp_data[dg_ref]['jc'])),
    shape=(12331, 3215)).tocoo()

print(disease_offset)


print(gene_network_adj.shape)   # which genes are linked to each other
print(disease_network_adj.shape) # which diseases are linked to each other
print(gene_disease_adj.shape) # which genes are linked to which diseases

GeneGene data:  (733836,)
GeneGene ir:  (733836,)
GeneGene jc:  (12332,)
Gene Features:  (4536, 12331)
Clinical Features:  (16592, 3215)
GeneIds:  (1, 12331)
PheneIds:  (9, 1)
[2.00000e+00 5.00000e+00 1.00100e+05 ... 1.61550e+05 6.10805e+05
 6.14485e+05]
GenePhenes:  (9, 1)
12332
(12331, 12331)
(3215, 3215)
(12331, 3215)


In [17]:
# load up Gene Features into a tensor
gene_nodes = torch.tensor(gf_data["GeneFeatures"][:]).T
disease_nodes = torch.tensor(cf_data["F"][:]).T

gene_rows = gene_network_adj.row
gene_cols = gene_network_adj.col
gene_data = gene_network_adj.data

disease_rows = disease_network_adj.row
disease_cols = disease_network_adj.col
disease_data = disease_network_adj.data

gene_disease_rows = gene_disease_adj.row
gene_disease_cols =  gene_disease_adj.col
gene_disease_data = gene_disease_adj.data



gm_graph = dt.HeteroData()
gm_graph["gene"].x = gene_nodes
gm_graph["gene"].node_id = torch.arange(len(gene_nodes))

gm_graph["gene", "gene_gene", "gene"].edge_index = torch.tensor([gene_rows, gene_cols])
gm_graph["gene", "gene_gene", "gene"].edge_attr = torch.tensor(gene_data)

gm_graph["disease"].x = disease_nodes
gm_graph["gene"].node_id = torch.arange(len(disease_nodes))

gm_graph["disease", "dis_dis", "disease"].edge_index = torch.tensor([disease_rows, disease_cols])
gm_graph["disease", "dis_dis", "disease"].edge_attr = torch.tensor(disease_data)


gm_graph["gene", "gda", "disease"].edge_index = torch.tensor([gene_disease_rows, gene_disease_cols])
gm_graph["gene", "gda", "disease"].edge_attr = torch.tensor(gene_disease_data)



# gm_graph.add_edge_index(edge_index=[gene_rows, gene_cols], edge_attr=gene_data, source="gene", target="gene")
# gm_graph.add_edge_index(edge_index=[disease_rows, disease_cols], edge_attr=disease_data, source="disease", target="disease")
# gm_graph.add_edge_index(edge_index=[gene_disease_rows, gene_disease_cols], edge_attr=gene_disease_data, source="gene", target="disease")


gene_mutations = [gm_graph]

  gm_graph["gene", "gene_gene", "gene"].edge_index = torch.tensor([gene_rows, gene_cols])


In [18]:
class GeneMutations(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, data_list=None):
        super(GeneMutations, self).__init__(root, transform, pre_transform)
        self.data, self.slices = self.collate(data_list)


transform = T.Compose([
        T.ToUndirected(),
        T.RandomLinkSplit(
            num_val=0.05,
            num_test=0.1,
            is_undirected=True,
            neg_sampling_ratio=2.0,
            edge_types=gm_graph.edge_types,
            # rev_edge_types=("disease", "gda", "gene"),
            add_negative_train_samples=False
        )
    ]
)


gm = GeneMutations(".", transform=transform, data_list=gene_mutations)
train_data, val_data, test_data = gm[0]

print(train_data)
print(val_data)
print(test_data)

HeteroData(
  gene={ x=[12331, 4536] },
  disease={ x=[3215, 16592] },
  (gene, gene_gene, gene)={
    edge_index=[2, 623764],
    edge_attr=[623764],
    edge_label=[311882],
    edge_label_index=[2, 311882],
  },
  (disease, dis_dis, disease)={
    edge_index=[2, 2704872],
    edge_attr=[2704872],
    edge_label=[1352436],
    edge_label_index=[2, 1352436],
  },
  (gene, gda, disease)={
    edge_index=[2, 3362],
    edge_attr=[3362],
    edge_label=[3362],
    edge_label_index=[2, 3362],
  },
  (disease, rev_gda, gene)={
    edge_index=[2, 3954],
    edge_attr=[3954],
  }
)
HeteroData(
  gene={ x=[12331, 4536] },
  disease={ x=[3215, 16592] },
  (gene, gene_gene, gene)={
    edge_index=[2, 623764],
    edge_attr=[623764],
    edge_label=[55035],
    edge_label_index=[2, 55035],
  },
  (disease, dis_dis, disease)={
    edge_index=[2, 2704872],
    edge_attr=[2704872],
    edge_label=[238662],
    edge_label_index=[2, 238662],
  },
  (gene, gda, disease)={
    edge_index=[2, 3362],
   

In [5]:
!pip install -U executing



In [33]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels=0):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels) # out_channels
    
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x



class Classifier(torch.nn.Module):
    def forward(self, x_gene, x_disease, edge_label_index):
        edge_feat_gene = x_gene[edge_label_index[0]]
        edge_feat_disease = x_disease[edge_label_index[1]]
        return (edge_feat_gene * edge_feat_disease).sum(dim=-1)

    
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin = torch.nn.Linear(20, hidden_channels)
        self.gene_emb = torch.nn.Embedding(gm_graph["gene"].num_nodes, hidden_channels)
        self.disease_emb = torch.nn.Embedding(gm_graph["disease"].num_nodes, hidden_channels)

        self.gnn = GNN(hidden_channels)
        self.gnn = to_hetero(self.gnn, metadata=gm_graph.metadata())

        self.classifier = Classifier()

    def forward(self, data):
        x_dict = {
            "gene": self.gene_emb(data["gene"].node_id),
            "disease": self.disease_emb(data["disease".node_id])
        }

        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["gene"],
            x_dict["disease"],
            data["gene", "gda", "disease"].edge_label_index
        )

        return pred


# model = GNN(hidden_channels=64, out_channels=len(gm_graph.num_node_features))
model = Model(hidden_channels=64)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()
# gm_graph["gene"].num_nodes

In [34]:
model(gm_graph)


AttributeError: 'NodeStorage' object has no attribute 'node_id'