In [1]:
import pandas as pd
from src import DatasetManager, MinMaxScaling, TensorLineFeaturizer, IdentityDrugFeaturizer, IdentityPipeline
from GDSC import GDSCPreprocessingPipeline, GDSCRawPreprocessingPipeline
from PRISM import PRISMPreprocessingPipeline
from CTRPv2 import CTRPv2PreprocessingPipeline
from NI60 import NI60PreprocessingPipeline
import os
import numpy as np
from GraphCreator import GraphCreator



In [2]:
# Gene list used below as `gene_subset` for the preprocessing pipeline.
# The file is read without a header row; presumably one gene symbol per line — confirm.
# NOTE(review): the URL tracks the `main` branch, so the list can change upstream;
# consider pinning to a commit hash for reproducibility.
PACCMANN_GENES_URL = "https://raw.githubusercontent.com/prassepaul/mlmed_ranking/main/data/gdsc_data/paccmann_gene_list.txt"

# Take the first column as a plain Python list. Unlike
# `.to_numpy().squeeze().tolist()`, this still returns a list when the file
# contains a single row (squeeze would collapse that to a 0-d array and
# `tolist()` would then return a bare scalar instead of a one-element list).
paccmann_genes = (
    pd.read_csv(PACCMANN_GENES_URL, index_col=None, header=None)
    .iloc[:, 0]
    .tolist()
)

In [3]:
# Preprocessing pipeline for the NI60 data, restricted to the PaccMann gene list.
# NOTE(review): "TGI" is presumably the NCI-60 total-growth-inhibition endpoint — confirm.
ni60_pipeline = NI60PreprocessingPipeline(
    target="TGI",
    gene_subset=paccmann_genes,
)

# Dataset manager: targets are passed through IdentityPipeline, drugs are
# featurized with GraphCreator and cell lines with TensorLineFeaturizer.
# NOTE(review): `k` and `partition_column` presumably request 25 partitions
# grouped by DRUG_ID — verify against DatasetManager.
manager = DatasetManager(
    processing_pipeline=ni60_pipeline,
    target_processor=IdentityPipeline(),
    partition_column="DRUG_ID",
    k=25,
    drug_featurizer=GraphCreator(),
    line_featurizer=TensorLineFeaturizer(),
)

In [4]:
# Request partition index 0 from the manager and unpack the three returned
# objects as the train / validation / test splits.
# NOTE(review): presumably index 0 is the first of the k=25 partitions
# configured on the manager — confirm against DatasetManager.get_partition.
train, val, test = manager.get_partition(0)

In [5]:
# Show the training split; the recorded output is a truncated table with
# CELL_ID, DRUG_ID and Y columns.
train

Unnamed: 0,CELL_ID,DRUG_ID,Y
0,ACH-000201,740,-7.6021
1,ACH-000201,752,-5.8269
3,ACH-000201,3053,-8.0765
4,ACH-000201,3088,-6.9898
5,ACH-000201,6396,-3.7542
...,...,...,...
814290,ACH-000828,26980,-5.1412
814291,ACH-000828,27640,-2.8850
814292,ACH-000828,82151,-6.9236
814293,ACH-000828,125066,-4.6953


In [7]:
# Fetch the cell-line lookup from the manager; it is indexed by cell-line ID
# strings such as "ACH-000201" (the same IDs shown in the CELL_ID column).
line_dict = manager.get_cell_lines()
# Spot-check one entry; the recorded output is a 1-D float tensor.
line_dict["ACH-000201"]

tensor([6.5590, 2.9561, 1.7181,  ..., 5.2638, 0.0000, 0.0000])

In [8]:
# Fetch the drug lookup from the manager.
# NOTE(review): presumably keyed by DRUG_ID with GraphCreator output as
# values — confirm against DatasetManager.get_drugs.
drug_dict = manager.get_drugs()

In [9]:
# Spot-check the entry stored under key 1; the recorded output is a graph
# `Data` object carrying x, edge_index and edge_attr.
# NOTE(review): key 1 is presumably a DRUG_ID — confirm it exists in every run.
drug_dict[1]

Data(x=[9, 79], edge_index=[2, 18], edge_attr=[18, 10])

In [10]:
from TorchDatasets import TorchGraphsDataset

In [11]:
# Wrap the training split together with the drug and cell-line lookups in a
# TorchGraphsDataset so that individual samples can be fetched by index.
train_dataset = TorchGraphsDataset(
    data=train,
    drug_dict=drug_dict,
    line_dict=line_dict,
)

In [12]:
# Spot-check the sample at index 1; the recorded output is a graph `Data`
# object with x, edge_index and edge_attr plus `y` and `cell` fields.
train_dataset[1]

Data(x=[11, 79], edge_index=[2, 24], edge_attr=[24, 10], y=[1], cell=[2087])