In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local project files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2


In [2]:
# NOTE: update the dataset.py file in your local folder.

def generate_anomaly_type_data(workflows, anomaly_type, sample_size=30, **kwargs):
    """ Sample from workflows with a given anomaly type.

    Args:
        workflows (str | list): workflow name(s). A single name is wrapped
            into a one-element list before being passed on.
        anomaly_type (str): anomaly type, forwarded as ``anomaly_cat``.
        sample_size (int, optional): sample size. Defaults to 30.
        **kwargs: optional settings.
            node_level (bool): forwarded to the dataset. Defaults to True.
            root (str): dataset root directory. Defaults to
                ``/tmp/data/psd/all``.
            force_reprocess (bool): forwarded to the dataset.
                Defaults to True.

    Returns:
        dataset: the constructed ``Merge_PSD_Dataset`` (described as an
            InMemoryDataset in the original notes).
    """
    # Local imports keep this helper self-contained inside the notebook cell.
    from psd_gnn.dataset import Merge_PSD_Dataset
    import os.path as osp

    node_level = kwargs.get("node_level", True)
    force_reprocess = kwargs.get("force_reprocess", True)
    # Fall back to the historical location when no root (or None) is given.
    root = kwargs.get("root") or osp.join("/tmp", "data", "psd", "all")

    # The dataset is always given a list of names, even for a single workflow.
    if isinstance(workflows, str):
        workflows = [workflows]

    # Labels are always requested in binary form.
    dataset = Merge_PSD_Dataset(root, name=workflows,
                                anomaly_cat=anomaly_type,
                                node_level=node_level,
                                binary_labels=True,
                                sample_size=sample_size,
                                force_reprocess=force_reprocess)
    return dataset


def generate_large_graphs(dataset, **kwargs):
    """ Build a loader over the given dataset.

    In node-level mode only the first graph (``dataset[0]``) is used and it
    is handed to a ``RandomNodeLoader``; ``batch_size`` is ignored in that
    mode. Otherwise the whole dataset is wrapped in a ``DataLoader``.

    Args:
        dataset (InMemoryDataset): Dataset object.
        **kwargs: optional settings.
            node_level (bool): choose the node-level loader. Defaults to True.
            batch_size (int): graphs per batch (graph-level mode only).
                Defaults to 1.
            num_parts (int): ``num_parts`` for the node-level loader.
                Defaults to 5.
            num_workers (int): ``num_workers`` for the node-level loader.
                Defaults to 5.
            shuffle (bool): forwarded to whichever loader is built.
                Defaults to True.

    Returns:
        RandomNodeLoader | DataLoader: the constructed loader.
    """
    from torch_geometric.loader import DataLoader, RandomNodeLoader

    node_level = kwargs.get("node_level", True)
    shuffle = kwargs.get("shuffle", True)

    if node_level:
        # Only the first graph of the dataset is used for node-level loading.
        return RandomNodeLoader(dataset[0],
                                num_parts=kwargs.get("num_parts", 5),
                                num_workers=kwargs.get("num_workers", 5),
                                shuffle=shuffle)

    return DataLoader(dataset,
                      batch_size=kwargs.get("batch_size", 1),
                      shuffle=shuffle)

In [3]:
# Sample the "cpu" anomaly data for both workflows, then wrap the result in a
# node-level loader. `dataset` and `loader` are reused by the cells below.
dataset = generate_anomaly_type_data(
    workflows=["1000genome_new_2022", "montage"],
    anomaly_type="cpu",
    sample_size=30,
)
loader = generate_large_graphs(dataset=dataset, node_level=True)

Processing...
  norm_feat = (all_feat - v_min) / (v_max - v_min)
Done!
Processing...
  norm_feat = (all_feat - v_min) / (v_max - v_min)
Done!


In [4]:
# Inspect the first graph of the sampled dataset (its repr is the cell output).
dataset[0]

Data(x=[20280, 25], edge_index=[2, 93810], y=[20280])

In [5]:
# Access the node features (x) and edge index of the graph held by the loader.
loader.data.x, loader.data.edge_index

(tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2500],
         [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2500],
         ...,
         [0.0000, 0.0000, 1.0000,  ..., 1.0000, 0.0000, 0.7500],
         [1.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 1.0000],
         [1.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 1.0000]]),
 tensor([[    0,     0,     0,  ..., 20275, 20277, 20277],
         [    1,     2,     3,  ..., 20277, 20278, 20279]]))