In [1]:
import deepchem as dc
import numpy as np

In [2]:
# Toy data: a 4x5 feature matrix and a 4x1 label column of random floats in [0, 1)
x = np.random.random(size=(4, 5))
y = np.random.random(size=(4, 1))

In [7]:
# Bundle the feature array (X) and label array (y) into a DeepChem NumpyDataset
dataset = dc.data.NumpyDataset(X=x, y=y)
print(dataset.X)

[[0.50899223 0.72634202 0.02337236 0.77227925 0.89666579]
 [0.16947571 0.38010428 0.67691202 0.63866104 0.87445125]
 [0.03928106 0.15390206 0.96344381 0.87637195 0.05069557]
 [0.17783445 0.16433022 0.42520067 0.52185761 0.93478816]]


In [8]:
# Show the label array stored in the dataset
print(dataset.y)

[[0.73325444]
 [0.42978761]
 [0.21188741]
 [0.2418708 ]]


In [9]:
# The feature array stored in the dataset is still equal to the original x
np.array_equal(x, dataset.X)

True

In [10]:
# The label array stored in the dataset is still equal to the original y
np.array_equal(y, dataset.y)

True

# Training a Model to Predict Toxicity of Molecules

In [11]:
# Load the Tox21 dataset through MoleculeNet. The loader returns three values:
# the task names, the datasets, and a third value bound to `transformers`
# (presumably the transformers applied to the data -- confirm in the DeepChem docs).
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()



In [12]:
# Names of the Tox21 tasks
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [13]:
# Number of tasks in Tox21
len(tox21_tasks)

12

Each of the 12 tasks here corresponds to a particular biological experiment.

In [14]:
# Tuple holding the training, validation, and test datasets, in that order
tox21_datasets

(<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (783, 1024), y.shape: (783, 12), w.shape: (783, 12), ids: ['N#C[C@@H]1CC(F)(F)CN1C(=O)CNC1CC2CCC(C1)N2c1ncccn1'
  'CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
  'CSc1nnc(C(C)(C)C)c(=O)n1N' ...
  'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
  'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
  'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (784, 1024), y.shape: (784, 12), w.shape: (784, 12), ids: ['CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.c1ccc(CNCCNCc2ccccc2)cc1'
  'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1'
  'Cc1cc(C(C)(C)C)c(O)c(C)c1

In [15]:
# Unpack the three datasets into separately named splits
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [16]:
# Shape of the training split's feature matrix X
train_dataset.X.shape

(6264, 1024)

In [17]:
# Shape of the validation split's feature matrix X
valid_dataset.X.shape

(783, 1024)

In [18]:
# Shape of the test split's feature matrix X
test_dataset.X.shape

(784, 1024)

In [19]:
# Shape of the training split's label array y
train_dataset.y.shape

(6264, 12)

In [20]:
# Shape of the validation split's label array y
valid_dataset.y.shape

(783, 12)

In [21]:
# Shape of the test split's label array y
test_dataset.y.shape

(784, 12)