In [13]:
import pandas as pd
import numpy as np
from rdkit.Chem import inchi
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

In [14]:
def smiles2ecfp(smiles, radius=4, bits=2048):
    """Convert a SMILES string into a Morgan (ECFP-style) bit string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Radius passed to ``GetMorganFingerprintAsBitVect``.
        bits: Length of the folded fingerprint (``nBits``).

    Returns:
        A string of ``bits`` characters, each ``"0"`` or ``"1"``, or ``None``
        when ``smiles`` is not a string (e.g. a missing value) or RDKit
        cannot parse it.
    """
    # Missing entries arrive as float NaN, which RDKit's parser rejects.
    if not isinstance(smiles, str):
        return None
    mol = MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than
    # raising; pass that through as a missing fingerprint so downstream
    # pd.notna() filtering can drop the row.
    if mol is None:
        return None
    fp = GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
    return fp.ToBitString()

In [2]:
# Load the Tox21 table (one row per compound) from the MolNet data directory.
df = pd.read_csv(filepath_or_buffer="../data/MolNet/tox21.csv")

In [11]:
# Names of the twelve assay columns that hold the per-task labels.
tasks = [
    "NR-AR",
    "NR-AR-LBD",
    "NR-AhR",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
]
# Single-character codes for each label value; a missing measurement becomes "_".
na2us = {1.0: "1", 0.0: "0", np.nan: "_"}
for task in tasks:
    # Only encode columns that are still numeric. Once a column holds the
    # "0"/"1"/"_" strings, mapping it again would find no matching keys and
    # turn every entry into NaN, so re-running this cell must be a no-op.
    if pd.api.types.is_numeric_dtype(df[task]):
        df[task] = df[task].map(na2us)

In [15]:
# Start every row with an empty label string; the per-task codes are appended to it in a later cell.
df["Label"] = ""

In [17]:
# Build each row's label by concatenating its per-task codes in `tasks` order.
# Reset the column first: appending to whatever "Label" already holds would
# double the string every time this cell is re-run.
df["Label"] = ""
for task in tasks:
    df["Label"] = df["Label"] + df[task]

In [19]:
# Fingerprint every compound: one bit string per SMILES entry.
df["ECFP"] = df["smiles"].apply(smiles2ecfp)



In [23]:
# Show only the rows that have a non-missing ECFP value.
df[df["ECFP"].notna()]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,Label,ECFP
0,0,0,1,_,_,0,0,1,0,0,0,0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,001__0010000,0000000000000000010000000000000000000000000000...
1,0,0,0,0,0,0,0,_,0,_,0,0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,0000000_0_00,0000010000000000000000000101000000100000000000...
2,_,_,_,_,_,_,_,0,_,0,_,_,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,_______0_0__,0000000000010000000000000000000000000000000000...
3,0,0,0,0,0,0,0,_,0,_,0,0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0000000_0_00,0100100000000000000000000000000000000100000000...
4,0,0,0,0,0,0,0,0,0,0,0,0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,000000000000,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7826,_,_,_,_,_,_,_,0,_,0,_,_,TOX2725,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,_______0_0__,0000000001000000000000000000000000000000010000...
7827,1,1,0,0,1,0,_,_,0,0,_,0,TOX2370,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,110010__00_0,0000000100000000100000000000000100000000000000...
7828,1,1,0,0,1,1,0,1,0,0,0,0,TOX2371,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,110011010000,0000001100000000100000000000000000000000000000...
7829,1,1,0,_,1,1,0,0,0,0,1,1,TOX2377,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,110_11000011,0000001000000000000000000000000000000000000000...


In [20]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,Label,ECFP
0,0,0,1,_,_,0,0,1,0,0,0,0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,001__0010000,0000000000000000010000000000000000000000000000...
1,0,0,0,0,0,0,0,_,0,_,0,0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,0000000_0_00,0000010000000000000000000101000000100000000000...
2,_,_,_,_,_,_,_,0,_,0,_,_,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,_______0_0__,0000000000010000000000000000000000000000000000...
3,0,0,0,0,0,0,0,_,0,_,0,0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,0000000_0_00,0100100000000000000000000000000000000100000000...
4,0,0,0,0,0,0,0,0,0,0,0,0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,000000000000,0000000000000000000000000000000000000000000000...


In [9]:
# Scratch preview of the label encoding on a single column: 1.0 -> 1, 0.0 -> 0, missing -> "_".
# NOTE(review): this cell's execution count (9) is lower than that of the cell
# which re-encodes the task columns as strings, so the output shown here appears
# to come from the raw numeric column. Run after that cell, none of these keys
# would match the string values — confirm whether this cell is still needed.
df["NR-AR"].map({1.0: 1, 0.0: 0, np.nan: "_"})

0       0
1       0
2       _
3       0
4       0
       ..
7826    _
7827    1
7828    1
7829    1
7830    0
Name: NR-AR, Length: 7831, dtype: object