In [1]:
import torch
import numpy as np
import pandas as pd
import utils.cancer_data as pathway
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder

In [2]:
# Build the project's pathway / cancer data object from the two source files.
# NOTE(review): `pathway` comes from `import utils.cancer_data as pathway`,
# which normally binds a module object -- confirm that it is really callable.
pathwayPATH = './Gene_DATA/sourcePathway.txt'
cancerPATH = './BreastCancer/Data_RNASeq2.mat'

data = pathway(
    pthwayPATH=pathwayPATH,
    cancerPATH=cancerPATH,
)

In [3]:
# Assemble the torch_geometric Data object for the whole pathway graph.
# Node features: the two activation arrays joined along axis 1; a trailing
# singleton dimension is appended when they are handed to Data.
x = torch.from_numpy(
    np.concatenate((data.activ_free, data.activ_cancer), axis=1)
)
# data.edge_index is transposed before being converted to a tensor.
edge_index = torch.from_numpy(data.edge_index.T)

pathway_data = Data(x=x.unsqueeze(-1), edge_index=edge_index)
# Extra attribute carried on the graph object: ids from data.remained_protein_id.
pathway_data.protein_id = torch.from_numpy(data.remained_protein_id)

In [6]:
# Keep only the protein ids that have enough distinct neighbouring nodes to be
# used as seeds for the hop-based sub-sampling below.
# Original author's note: only 1435 protein nodes satisfy this requirement.
least_size = 5
save_ids = []
edge_index = pathway_data.edge_index.numpy()
# First row / second row of edge_index: one entry per edge in each.
node_i, node_j = edge_index
for elem in pathway_data.protein_id.numpy():
    # Distinct node_i entries whose paired node_j entry is `elem`,
    # with `elem` itself removed (drops self-loops).
    # np.isin replaces np.in1d, which is deprecated in NumPy 2.x.
    level_node = np.setdiff1d(node_i[np.isin(node_j, elem)], elem)
    # Strictly more than `least_size` distinct neighbours are required.
    if level_node.size > least_size:
        save_ids.append(elem)

In [7]:
# Sub-sampling settings: number of hops to expand and number of seed nodes
# placed in each batch.
num_hops = 2
batch_size = 3
# Test of sub-sampling by number of hops starts here.
# First, chunk the retained seed node ids into groups of `batch_size`
# (the final group is smaller when the count is not divisible).
batch_splits = torch.tensor(save_ids).split(batch_size)

In [14]:
# For every batch of seed nodes, repeat the neighbour lookup `num_hops` times
# and record the resulting set of node ids.
batched_node_list = []
for batch in batch_splits:
    # the ids to keep in the later sub-sampling step
    this_batch_ids = batch.numpy()
    for hop in range(num_hops):
        # node_i entries whose paired node_j entry is already collected.
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.x.
        parent_ids = node_i[np.isin(node_j, this_batch_ids)]
        # union1d keeps the running id set sorted and duplicate-free, so it
        # does not grow with repeated ids from hop to hop.
        this_batch_ids = np.union1d(this_batch_ids, parent_ids)
    # np.unique is kept so the stored array is sorted/unique even if
    # num_hops is 0 and the loop above never runs.
    batched_node_list.append(np.unique(this_batch_ids))

In [15]:
# From here on, extract the remaining graph based on batched_node_list.
# Fit a label encoder on data.remained_protein (presumably the retained
# protein names -- verify) so names can be mapped to integer codes with
# le.transform. The bare le.fit(...) expression is what the cell displays.
le = LabelEncoder()
le.fit(data.remained_protein)

LabelEncoder()

In [16]:

def sampler_generater(batch, le):
    """
    Build a sub-graph ``Data`` object for the name-list rows selected by ``batch``.

    Reads the module-level ``data`` object (it is not passed in) and fills a
    fresh ``Data`` container with: the selected name list, the activations of
    the selected proteins, the edges whose two endpoints are both in the
    selection, and integer encodings of edge endpoints, edge types and node
    types.

    Parameters
    ----------
    batch
        Positional row selector handed to ``data.pthway_NameList.iloc``.
    le
        Fitted encoder; ``le.transform`` is applied to the selected protein
        names and the result is used to index ``data.activ_free`` and
        ``data.activ_cancer``.

    NOTE(review): no ``return`` statement appears in the lines shown here --
    confirm how the caller receives ``deep_pthway``.
    """
    deep_pthway = Data()
    # Pick the requested rows by position and renumber them from 0.
    newpthway_Namelist = data.pthway_NameList.iloc[batch,:].reset_index(drop=True)
    # Names of the selected rows whose GenomeType is 'protein'.
    deep_pthway.genome_Namelist = newpthway_Namelist[newpthway_Namelist['GenomeType'] == 'protein']['GenomeName'].values
    # Encode the protein names and use the codes to pick activation rows.
    activ_id = le.transform(deep_pthway.genome_Namelist)
    deep_pthway.activ_free = data.activ_free[activ_id]
    deep_pthway.activ_cancer = data.activ_cancer[activ_id]

    deep_pthway.pth_Namelist = newpthway_Namelist
    Edgelist = data.Edgelist
    # Selected names, plus the first and second columns of the edge table
    # (used below as the two endpoints of each edge).
    # NOTE(review): these are Python lists, so each `in` test below is a
    # linear scan; a set of names would be faster for large tables.
    Namelist_l = list(newpthway_Namelist['GenomeName'].values)
    Edgelist_l = list(Edgelist.iloc[:,0].values)
    Edgelist_ll = list(Edgelist.iloc[:,1].values)
    # Positions of edges with at least one endpoint outside the selection.
    exclude_list = []
    for idx, (elem, elem2) in enumerate(zip(Edgelist_l, Edgelist_ll)):
        if ((elem not in Namelist_l) or (elem2 not in Namelist_l)):
            exclude_list.append(idx)

    # NOTE(review): drop() interprets exclude_list as index *labels*, while
    # the values are enumerate() positions -- this assumes data.Edgelist has
    # a default 0..n-1 index; TODO confirm.
    newpthway_Edgelist = Edgelist.drop(exclude_list).reset_index(drop=True)
    deep_pthway.Edgelist = newpthway_Edgelist

    # Encode every selected name as an integer, then translate the two
    # endpoint columns (flattened, encoded, reshaped back to two columns).
    le2 = LabelEncoder()
    le2.fit(deep_pthway.pth_Namelist['GenomeName'].values)
    deep_pthway.edge_index = le2.transform(deep_pthway.Edgelist.iloc[:,:2].values.reshape(-1)).reshape(-1,2)
    deep_pthway.all_elem_className = list(le2.classes_)

    # Label edge_class
    # (integer code per remaining edge, plus the list of edge-type names)
    le2 = LabelEncoder()
    le2.fit(deep_pthway.Edgelist['edgeType'])
    deep_pthway.edge_class = le2.transform(deep_pthway.Edgelist['edgeType'])
    deep_pthway.edge_className = list(le2.classes_)

    # Label node class
    # (integer code per selected row, plus the list of node-type names)
    le2 = LabelEncoder()
    le2.fit(deep_pthway.pth_Namelist['GenomeType'])
    deep_pthway.node_class = le2.transform(deep_pthway.pth_Namelist['GenomeType'])
    deep_pthway.node_className = list(le2.classes_) 