In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator

from statsmodels.distributions.empirical_distribution import ECDF

real_200_descr = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207]

In [2]:
toxcast = pd.read_csv("datasets/toxcast_data.csv")

print('Shape: ', toxcast.shape)

vals = toxcast.values.flatten()
print('# meas.: ', len([v for v in vals if str(v) != 'nan']))

toxcast.head() 

Shape:  (8615, 618)
# meas.:  1547010


Unnamed: 0,smiles,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellLoss_24h_dn,APR_HepG2_CellLoss_72h_dn,APR_HepG2_MicrotubuleCSK_24h_dn,APR_HepG2_MicrotubuleCSK_24h_up,...,Tanguay_ZF_120hpf_OTIC_up,Tanguay_ZF_120hpf_PE_up,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up
0,[O-][N+](=O)C1=CC=C(Cl)C=C1,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C,,,,,,,,,,...,,,,,,,,,,
2,CN1CCN(CC1)C(=O)C1CCCCC1,,,,,,,,,,...,,,,,,,,,,
3,NC1=CC=C(C=C1)[N+]([O-])=O,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OC1=CC=C(C=C1)[N+]([O-])=O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
tox21 = pd.read_csv("datasets/tox21.csv")

print('Shape: ', tox21.shape)

vals = tox21.values.flatten()
print('# meas.: ', len([v for v in vals if str(v) != 'nan']))

tox21.head() 

Shape:  (7831, 14)
# meas.:  93608


Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [4]:
tox21_X = tox21.iloc[:, -1]
tox21_Y = tox21.iloc[:, 0:-1]
print('tox21_X:', tox21_X.shape, ' | tox21_Y:', tox21_Y.shape)

tox21_X: (7831,)  | tox21_Y: (7831, 13)


**Delete rows with missing SMILES strings, replace incorrect strings (toxcast)**

In [5]:
row_ids_fail = toxcast[toxcast.smiles == 'FAIL'].index
row_ids_fail

Int64Index([4182, 4788, 5105, 6003, 6171, 6625, 6639, 6640, 6923, 7199, 7353,
            7877, 7965, 8064, 8200],
           dtype='int64')

In [6]:
mols_to_drop = [
    '[NH4+].[NH4+].F[Si--](F)(F)(F)(F)F', 
    '[Na+].[Na+].F[Si--](F)(F)(F)(F)F', 
    '[Cl-][Pt]1([Cl-])[NH2+]CC[NH2+]1', 
    '[NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-]', 
    '[F-][B+3]([F-])([F-])[F-].CC[N+]1(C)CCCC1'
]

row_ids_mol = toxcast[toxcast['smiles'].isin(mols_to_drop)].index

row_ids_mol

Int64Index([1041, 1792, 1884, 2460, 2467], dtype='int64')

In [7]:
toxcast = toxcast.drop(row_ids_fail)
toxcast = toxcast.drop(row_ids_mol)

In [8]:
toxcast_X = toxcast.iloc[:, 0]
toxcast_Y = toxcast.iloc[:, 1:]
print('toxcast_X:', toxcast_X.shape, ' | toxcast_Y:', toxcast_Y.shape)

toxcast_X: (8595,)  | toxcast_Y: (8595, 617)


**Train-val-test split**

Train : Validation : Test = 60 : 20 : 20

In [9]:
def train_val_test_split(X, y):
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=42)
    
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

tox21_X_train, tox21_X_val, tox21_X_test, tox21_y_train, tox21_y_val, tox21_y_test = train_val_test_split(tox21_X, tox21_Y)

toxcast_X_train, toxcast_X_val, toxcast_X_test, toxcast_y_train, toxcast_y_val, toxcast_y_test = train_val_test_split(toxcast_X, toxcast_Y)

In [10]:
[i.shape for i in [tox21_X_train, tox21_X_val, tox21_X_test, tox21_y_train, tox21_y_val, tox21_y_test]]

[(4698,), (1566,), (1567,), (4698, 13), (1566, 13), (1567, 13)]

In [11]:
[i.shape for i in [toxcast_X_train, toxcast_X_val, toxcast_X_test, toxcast_y_train, toxcast_y_val, toxcast_y_test]]

[(5157,), (1719,), (1719,), (5157, 617), (1719, 617), (1719, 617)]

**Separate the mol_id column (tox21)**

In [12]:
tox21_y_train_ids = tox21_y_train.iloc[:, -1]
tox21_y_val_ids = tox21_y_val.iloc[:, -1]
tox21_y_test_ids = tox21_y_test.iloc[:, -1]

tox21_y_train = tox21_y_train.iloc[:, 0:-1]
tox21_y_val = tox21_y_val.iloc[:, 0:-1]
tox21_y_test = tox21_y_test.iloc[:, 0:-1]

In [13]:
[i.shape for i in [tox21_y_train, tox21_y_val, tox21_y_test]]

[(4698, 12), (1566, 12), (1567, 12)]

**Replace NaN values with -1 (tox21)** 

In [14]:
tox21_y_train = tox21_y_train.fillna(-1).to_numpy()
tox21_y_val = tox21_y_val.fillna(-1).to_numpy()
tox21_y_test = tox21_y_test.fillna(-1).to_numpy()

In [15]:
tox21_y_train

array([[-1.,  0.,  0., ...,  0., -1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1., -1., -1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

**Replace NaN values with -1 (toxcast)** 

In [16]:
toxcast_y_train = toxcast_y_train.fillna(-1).to_numpy()
toxcast_y_val = toxcast_y_val.fillna(-1).to_numpy()
toxcast_y_test = toxcast_y_test.fillna(-1).to_numpy()

In [17]:
toxcast_y_train

array([[-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ...,
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.]])

**Create mol objects**

In [18]:
def create_mols(smiles_strings):
    mols = list()

    for smiles in smiles_strings:
        mol = Chem.MolFromSmiles(smiles)
        mols.append(mol)
        if mol is None:
            print(smiles)
    return mols

In [19]:
tox21_X_train_mols, tox21_X_val_mols, tox21_X_test_mols = [create_mols(smiles_strings) for smiles_strings in [tox21_X_train, tox21_X_val, tox21_X_test]]



In [20]:
[np.array(i).shape for i in [tox21_X_train_mols, 
                             tox21_X_val_mols, 
                             tox21_X_test_mols, 
                             tox21_y_train, 
                             tox21_y_val, tox21_y_test]]

[(4698,), (1566,), (1567,), (4698, 12), (1566, 12), (1567, 12)]

In [21]:
toxcast_X_train_mols, toxcast_X_val_mols, toxcast_X_test_mols = [create_mols(smiles_strings) for smiles_strings in [toxcast_X_train, 
                                                                                                                    toxcast_X_val, 
                                                                                                                    toxcast_X_test]]



In [22]:
[np.array(i).shape for i in [toxcast_X_train_mols, toxcast_X_val_mols, toxcast_X_test_mols, toxcast_y_train, toxcast_y_val, toxcast_y_test]]

[(5157,), (1719,), (1719,), (5157, 617), (1719, 617), (1719, 617)]

**Compute ECFPs**

In [23]:
def compute_ecfps(mols):
    ecfps = list()

    for mol in mols:
        fp_sparseVec = rdFingerprintGenerator.GetCountFPs(
                        [mol], fpType=rdFingerprintGenerator.MorganFP
                       )[0]
        fp = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp_sparseVec, fp)

        ecfps.append(fp)

    ecfps = np.array(ecfps)
    return ecfps

In [24]:
tox21_X_train_ecfps, tox21_X_val_ecfps, tox21_X_test_ecfps = [compute_ecfps(mols) for mols in [tox21_X_train_mols, 
                                                                                               tox21_X_val_mols, 
                                                                                               tox21_X_test_mols]]

In [25]:
'tox21', [np.array(i).shape for i in [tox21_X_train_ecfps, tox21_X_val_ecfps, tox21_X_test_ecfps]]

('tox21', [(4698, 2048), (1566, 2048), (1567, 2048)])

In [26]:
toxcast_X_train_ecfps, toxcast_X_val_ecfps, toxcast_X_test_ecfps = [compute_ecfps(mols) for mols in [toxcast_X_train_mols, 
                                                                                                     toxcast_X_val_mols, 
                                                                                                     toxcast_X_test_mols]]

In [27]:
'toxcast', [np.array(i).shape for i in [toxcast_X_train_ecfps, toxcast_X_val_ecfps, toxcast_X_test_ecfps]]

('toxcast', [(5157, 2048), (1719, 2048), (1719, 2048)])

**Compute descriptors**

In [28]:
def compute_descriptors(mols):
    rdkit_descriptors = list()

    for mol in mols:
        descrs = list()
        for descr in Descriptors._descList:
            _, descr_calc_fn = descr
            descrs.append(descr_calc_fn(mol))

        descrs = np.array(descrs)
        descrs = descrs[real_200_descr]
        rdkit_descriptors.append(descrs)

    rdkit_descriptors = np.array(rdkit_descriptors)
    return rdkit_descriptors

In [29]:
%%time
tox21_X_train_descr, tox21_X_val_descr, tox21_X_test_descr = [compute_descriptors(mols) for mols in [tox21_X_train_mols, 
                                                                                                     tox21_X_val_mols, 
                                                                                                     tox21_X_test_mols]]



CPU times: total: 1min 2s
Wall time: 1min 6s


In [30]:
'tox21', [np.array(i).shape for i in [tox21_X_train_descr, tox21_X_val_descr, tox21_X_test_descr]]

('tox21', [(4698, 200), (1566, 200), (1567, 200)])

In [31]:
%%time
toxcast_X_train_descr, toxcast_X_val_descr, toxcast_X_test_descr = [compute_descriptors(mols) for mols in [toxcast_X_train_mols, 
                                                                                                           toxcast_X_val_mols, 
                                                                                                           toxcast_X_test_mols]]



CPU times: total: 1min 9s
Wall time: 1min 13s


In [32]:
'toxcast', [np.array(i).shape for i in [toxcast_X_train_descr, 
                                        toxcast_X_val_descr, 
                                        toxcast_X_test_descr]]

('toxcast', [(5157, 200), (1719, 200), (1719, 200)])

**Compute quantiles for descriptors**

In [33]:
def compute_quantiles(rdkit_descriptors_train, rdkit_descriptors_val, rdkit_descriptors_test):

    rdkit_descriptors_quantiles_train = np.zeros_like(rdkit_descriptors_train)
    rdkit_descriptors_quantiles_val = np.zeros_like(rdkit_descriptors_val)
    rdkit_descriptors_quantiles_test = np.zeros_like(rdkit_descriptors_test)
    
    descriptors_list =  [rdkit_descriptors_train, rdkit_descriptors_val, rdkit_descriptors_test]
    quantiles_list = [rdkit_descriptors_quantiles_train, rdkit_descriptors_quantiles_val, rdkit_descriptors_quantiles_test]
    
    for descriptors, quantiles in zip(descriptors_list, quantiles_list):
        for column in range(descriptors.shape[1]):
            raw_values_ecdf = rdkit_descriptors_train[:,column].reshape(-1)
            raw_values = descriptors[:,column].reshape(-1)

            ecdf = ECDF(raw_values_ecdf)
            feature_quantiles = ecdf(raw_values)
            quantiles[:,column] = feature_quantiles
    return rdkit_descriptors_quantiles_train, rdkit_descriptors_quantiles_val, rdkit_descriptors_quantiles_test

In [34]:
tox21_descr_quantiles_X_train, tox21_descr_quantiles_X_val, tox21_descr_quantiles_X_test = compute_quantiles(tox21_X_train_descr, 
                                                                                                             tox21_X_val_descr, 
                                                                                                             tox21_X_test_descr)

In [35]:
'tox21', [np.array(i).shape for i in [tox21_descr_quantiles_X_train, tox21_descr_quantiles_X_val, tox21_descr_quantiles_X_test]]

('tox21', [(4698, 200), (1566, 200), (1567, 200)])

In [36]:
toxcast_descr_quantiles_X_train, toxcast_descr_quantiles_X_val, toxcast_descr_quantiles_X_test = compute_quantiles(toxcast_X_train_descr, 
                                                                                                                   toxcast_X_val_descr, 
                                                                                                                   toxcast_X_test_descr)

In [37]:
'toxcast', [np.array(i).shape for i in [toxcast_descr_quantiles_X_train, 
                             toxcast_descr_quantiles_X_val, 
                             toxcast_descr_quantiles_X_test]]

('toxcast', [(5157, 200), (1719, 200), (1719, 200)])

**Concatenate ecfps and rdkit descriptor quantiles**

In [38]:
def concatenate_arrays(ecfp_train, ecfp_val, ecfp_test, descr_q_train, descr_q_val, descr_q_test):
    concatenated_arrays = []
    
    for ecfp, desc in zip([ecfp_train, ecfp_val, ecfp_test], [descr_q_train, descr_q_val, descr_q_test]):
    
        concat_array = np.concatenate((ecfp, desc), axis=1)
        concatenated_arrays.append(concat_array)
    
    return concatenated_arrays

In [39]:
tox21_ecfp_descr_quantiles_X_train, tox21_ecfp_descr_quantiles_X_val, tox21_ecfp_descr_quantiles_X_test = concatenate_arrays(tox21_X_train_ecfps, 
                                                                                                                             tox21_X_val_ecfps, 
                                                                                                                             tox21_X_test_ecfps, 
                                                                                                                             tox21_descr_quantiles_X_train, 
                                                                                                                             tox21_descr_quantiles_X_val, 
                                                                                                                             tox21_descr_quantiles_X_test)

In [40]:
'tox21', [np.array(i).shape for i in [tox21_ecfp_descr_quantiles_X_train, tox21_ecfp_descr_quantiles_X_val, tox21_ecfp_descr_quantiles_X_test]]

('tox21', [(4698, 2248), (1566, 2248), (1567, 2248)])

In [41]:
toxcast_ecfp_descr_quantiles_X_train, toxcast_ecfp_descr_quantiles_X_val, toxcast_ecfp_descr_quantiles_X_test = concatenate_arrays(toxcast_X_train_ecfps, 
                                                                                                                             toxcast_X_val_ecfps, 
                                                                                                                             toxcast_X_test_ecfps, 
                                                                                                                             toxcast_descr_quantiles_X_train, 
                                                                                                                             toxcast_descr_quantiles_X_val, 
                                                                                                                             toxcast_descr_quantiles_X_test)

In [42]:
'toxcast', [np.array(i).shape for i in [toxcast_ecfp_descr_quantiles_X_train, toxcast_ecfp_descr_quantiles_X_val, toxcast_ecfp_descr_quantiles_X_test]]

('toxcast', [(5157, 2248), (1719, 2248), (1719, 2248)])

**Feature standartization**

In [43]:
scaler = StandardScaler()

scaler.fit(tox21_ecfp_descr_quantiles_X_train)

tox21_ecfp_descr_quantiles_scaled_X_train, tox21_ecfp_descr_quantiles_scaled_X_val, tox21_ecfp_descr_quantiles_scaled_X_test = [scaler.transform(dataset) for dataset in [tox21_ecfp_descr_quantiles_X_train, 
                                                                                      tox21_ecfp_descr_quantiles_X_val, 
                                                                                      tox21_ecfp_descr_quantiles_X_test]]

In [44]:
'tox21', [np.array(i).shape for i in [tox21_ecfp_descr_quantiles_scaled_X_train, 
                             tox21_ecfp_descr_quantiles_scaled_X_val, 
                             tox21_ecfp_descr_quantiles_scaled_X_test]]

('tox21', [(4698, 2248), (1566, 2248), (1567, 2248)])

In [45]:
scaler = StandardScaler()

scaler.fit(toxcast_ecfp_descr_quantiles_X_train)

toxcast_ecfp_descr_quantiles_scaled_X_train, toxcast_ecfp_descr_quantiles_scaled_X_val, toxcast_ecfp_descr_quantiles_scaled_X_test = [scaler.transform(dataset) for dataset in [toxcast_ecfp_descr_quantiles_X_train, 
                                                                                      toxcast_ecfp_descr_quantiles_X_val, 
                                                                                      toxcast_ecfp_descr_quantiles_X_test]]

In [46]:
'toxcast', [np.array(i).shape for i in [toxcast_ecfp_descr_quantiles_scaled_X_train, 
                             toxcast_ecfp_descr_quantiles_scaled_X_val, 
                             toxcast_ecfp_descr_quantiles_scaled_X_test]]

('toxcast', [(5157, 2248), (1719, 2248), (1719, 2248)])

**Save data (tox21)**

In [47]:
tox21_features_train_val_test = {
    "train": tox21_ecfp_descr_quantiles_scaled_X_train,
    "validation": tox21_ecfp_descr_quantiles_scaled_X_val,
    "test": tox21_ecfp_descr_quantiles_scaled_X_test
}

np.save('preprocessed_data/tox21_features_train_val_test.npy', tox21_features_train_val_test, allow_pickle=True)

In [48]:
tox21_labels_train_val_test = {
    "train": tox21_y_train,
    "validation": tox21_y_val,
    "test": tox21_y_test
}

np.save('preprocessed_data/tox21_labels_train_val_test.npy', tox21_labels_train_val_test, allow_pickle=True)

In [49]:
tox21_ids = {
    "train": tox21_y_train_ids,
    "validation": tox21_y_val_ids,
    "test": tox21_y_test_ids
}

np.save('preprocessed_data/tox21_ids.npy', tox21_ids, allow_pickle=True)

**Save data (toxcast)**

In [50]:
toxcast_features_train_val_test = {
    "train": toxcast_ecfp_descr_quantiles_scaled_X_train,
    "validation": toxcast_ecfp_descr_quantiles_scaled_X_val,
    "test": toxcast_ecfp_descr_quantiles_scaled_X_test
}

np.save('preprocessed_data/toxcast_features_train_val_test.npy', toxcast_features_train_val_test, allow_pickle=True)

In [51]:
toxcast_labels_train_val_test = {
    "train": toxcast_y_train,
    "validation": toxcast_y_val,
    "test": toxcast_y_test
}

np.save('preprocessed_data/toxcast_labels_train_val_test.npy', toxcast_labels_train_val_test, allow_pickle=True)

**Load data (tox21)**

In [52]:
loaded_tox21_features_train_val_test = np.load('preprocessed_data/tox21_features_train_val_test.npy', allow_pickle=True).item()

loaded_tox21_ecfp_descr_quantiles_scaled_X_train = loaded_tox21_features_train_val_test['train']
loaded_tox21_ecfp_descr_quantiles_scaled_X_val = loaded_tox21_features_train_val_test['validation']
loaded_tox21_ecfp_descr_quantiles_scaled_X_test = loaded_tox21_features_train_val_test['test']

In [53]:
loaded_tox21_ecfp_descr_quantiles_scaled_X_train.shape, loaded_tox21_ecfp_descr_quantiles_scaled_X_val.shape, loaded_tox21_ecfp_descr_quantiles_scaled_X_test.shape

((4698, 2248), (1566, 2248), (1567, 2248))

In [54]:
loaded_tox21_labels_train_val_test = np.load('preprocessed_data/tox21_labels_train_val_test.npy', allow_pickle=True).item()

loaded_tox21_y_train = loaded_tox21_labels_train_val_test['train']
loaded_tox21_y_val = loaded_tox21_labels_train_val_test['validation']
loaded_tox21_y_test = loaded_tox21_labels_train_val_test['test']

In [55]:
loaded_tox21_y_train.shape, loaded_tox21_y_val.shape, loaded_tox21_y_test.shape

((4698, 12), (1566, 12), (1567, 12))

In [56]:
loaded_tox21_ids = np.load('preprocessed_data/tox21_ids.npy', allow_pickle=True).item()

loaded_tox21_ids_train = loaded_tox21_ids['train']
loaded_tox21_ids_val = loaded_tox21_ids['validation']
loaded_tox21_ids_test = loaded_tox21_ids['test']

In [57]:
loaded_tox21_ids_train.shape, loaded_tox21_ids_val.shape, loaded_tox21_ids_test.shape

((4698,), (1566,), (1567,))

**Load data (toxcast)**

In [58]:
loaded_toxcast_features_train_val_test = np.load('preprocessed_data/toxcast_features_train_val_test.npy', allow_pickle=True).item()

loaded_toxcast_ecfp_descr_quantiles_scaled_X_train = loaded_toxcast_features_train_val_test['train']
loaded_toxcast_ecfp_descr_quantiles_scaled_X_val = loaded_toxcast_features_train_val_test['validation']
loaded_toxcast_ecfp_descr_quantiles_scaled_X_test = loaded_toxcast_features_train_val_test['test']

In [59]:
loaded_toxcast_ecfp_descr_quantiles_scaled_X_train.shape, loaded_toxcast_ecfp_descr_quantiles_scaled_X_val.shape, loaded_toxcast_ecfp_descr_quantiles_scaled_X_test.shape

((5157, 2248), (1719, 2248), (1719, 2248))

In [60]:
loaded_toxcast_labels_train_val_test = np.load('preprocessed_data/toxcast_labels_train_val_test.npy', allow_pickle=True).item()

loaded_toxcast_y_train = loaded_toxcast_labels_train_val_test['train']
loaded_toxcast_y_val = loaded_toxcast_labels_train_val_test['validation']
loaded_toxcast_y_test = loaded_toxcast_labels_train_val_test['test']

In [61]:
loaded_toxcast_y_train.shape, loaded_toxcast_y_val.shape, loaded_toxcast_y_test.shape

((5157, 617), (1719, 617), (1719, 617))