In [2]:
import deepchem as dc
from datasail.sail import datasail

In [6]:
import pandas as pd

url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz"

# Build the working table in one pipeline:
#   1. download the Tox21 CSV,
#   2. give every row a synthetic, zero-padded identifier (Comp000001, ...),
#   3. keep only the identifier, the SMILES string and the SR-ARE label,
#   4. drop rows that have no SR-ARE label.
# Identifiers are assigned *before* unlabeled rows are dropped, so the
# surviving IDs (and the index) contain gaps.
df = (
    pd.read_csv(url)
    .assign(ID=lambda frame: [f"Comp{pos + 1:06d}" for pos in range(len(frame))])
    .loc[:, ["ID", "smiles", "SR-ARE"]]
    .rename(columns={"smiles": "SMILES"})
    .dropna(subset=["SR-ARE"])
)
df

Unnamed: 0,ID,SMILES,SR-ARE
0,Comp000001,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,1.0
2,Comp000003,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,0.0
4,Comp000005,CC(O)(P(=O)(O)O)P(=O)(O)O,0.0
6,Comp000007,O=S(=O)(Cl)c1ccccc1,0.0
7,Comp000008,O=C(O)Cc1cc(I)c(Oc2ccc(O)c(I)c2)c(I)c1,1.0
...,...,...,...
7819,Comp007820,Cc1ccc(=O)n(-c2ccccc2)c1,0.0
7826,Comp007827,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,0.0
7828,Comp007829,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,1.0
7829,Comp007830,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,0.0


In [8]:
%%capture
e_splits, _, _ = datasail(
    techniques=["C1e"],
    splits=[8, 2],
    names=["train", "test"],
    runs=3,
    solver="SCIP",
    e_type="M",
    e_data=dict(df[["ID", "SMILES"]].values.tolist()),
    e_strat=dict(df[["ID", "SR-ARE"]].values.tolist()),
)

(CVXPY) Feb 21 01:07:46 AM: Your problem has 100 variables, 56 constraints, and 0 parameters.
(CVXPY) Feb 21 01:07:46 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Feb 21 01:07:46 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Feb 21 01:07:46 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Feb 21 01:07:46 AM: Your problem is compiled with the CPP canonicalization backend.
(CVXPY) Feb 21 01:07:46 AM: Compiling problem (target solver=SCIP).
(CVXPY) Feb 21 01:07:46 AM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> SCIP
(CVXPY) Feb 21 01:07:46 AM: Applying reduction Dcp2Cone
(CVXPY) Feb 21 01:07:46 AM: Applying reduction CvxAttr2Constr
(CVXPY) Feb 21 01:07:46 AM: Applying reduction ConeMatrixStuffing
(CVXPY) Feb 21 01:07:47 AM: Applying reduction SCIP
(CVXPY) Feb 21 01:07:47 AM: Finished problem compilation (took 1

In [9]:
# Summarize the structure of the split result: a dict keyed by technique
# name, where each value is a list holding one ID -> split-name dict per run.
print(type(e_splits))
for key, runs in e_splits.items():
    print(f"{key} - Type: {type(runs)} - Length: {len(runs)}")
    for run_number, assignment in enumerate(runs, start=1):
        print(f"\tRun {run_number} - Type: {type(assignment)} - {len(assignment)} assignments")
    # Preview the first five assignments of this technique's first run.
    # This lives inside the loop so it never relies on the loop variable
    # leaking out of the `for` statement (which would raise NameError for an
    # empty result and only show the last technique when several are used).
    if runs:
        preview = list(runs[0].items())[:5]
        print("\n" + "\n".join(f"ID: {idx} - Split: {split}" for idx, split in preview))


<class 'dict'>
C1e - Type: <class 'list'> - Length: 3
	Run 1 - Type: <class 'dict'> - 5825 assignments
	Run 2 - Type: <class 'dict'> - 5825 assignments
	Run 3 - Type: <class 'dict'> - 5825 assignments

ID: Comp000001 - Split: train
ID: Comp000003 - Split: train
ID: Comp000005 - Split: test
ID: Comp000007 - Split: train
ID: Comp000008 - Split: train
