In [17]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem import AllChem
import torch
import numpy as np
from tqdm import tqdm

In [None]:
tqdm.pandas()

In [52]:
df = pd.read_csv('data_100k.csv')

In [None]:
# decided to take them as target
# df.drop(columns=['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'])

In [53]:
ATOM_LIST = list(range(1,119))
CHIRALITY_LIST = [
    Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    Chem.rdchem.ChiralType.CHI_OTHER
]
BOND_LIST = [
    BT.SINGLE,
    BT.DOUBLE,
    BT.TRIPLE,
    BT.AROMATIC
]
BONDDIR_LIST = [
    Chem.rdchem.BondDir.NONE,
    Chem.rdchem.BondDir.ENDUPRIGHT,
    Chem.rdchem.BondDir.ENDDOWNRIGHT
]
BONDTYPES_LIST = [Chem.rdchem.BondType.names,
                  Chem.rdchem.BondType.values
                  ]

def get_graph_columns(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return torch.tensor([[], []], dtype=torch.long).tolist(), \
                torch.tensor(np.array([]), dtype=torch.long).tolist(), \
                torch.tensor(np.array([]), dtype=torch.long).tolist(), \
                0

    N = mol.GetNumAtoms()
    M = mol.GetNumBonds()

    type_idx = []
    chirality_idx = []
    atomic_number = []
    
    for atom in mol.GetAtoms():
        type_idx.append(ATOM_LIST.index(atom.GetAtomicNum()))
        chirality_idx.append(CHIRALITY_LIST.index(atom.GetChiralTag()))
        atomic_number.append(atom.GetAtomicNum())
    
    x1 = torch.tensor(type_idx, dtype=torch.long).view(-1,1)
    x2 = torch.tensor(chirality_idx, dtype=torch.long).view(-1,1)
    x = torch.cat([x1, x2], dim=-1).tolist()

    row, col, edge_feat = [], [], []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [start, end]
        col += [end, start]
        
        edge_feat.append([
            bond.GetBondTypeAsDouble(),
            BONDDIR_LIST.index(bond.GetBondDir())
        ])
        edge_feat.append([
            bond.GetBondTypeAsDouble(),
            BONDDIR_LIST.index(bond.GetBondDir())
        ])

    edge_index = torch.tensor([row, col], dtype=torch.long).tolist()
    edge_attr = torch.tensor(edge_feat, dtype=torch.long).tolist()
    num_nodes = N

    return x, edge_index, edge_attr, num_nodes

In [54]:
df[["node_feat", "edge_index", 'edge_attr', 'num_nodes']] = df.progress_apply(lambda row: get_graph_columns(row.Smiles), axis='columns', result_type='expand')

100%|████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [01:15<00:00, 1318.24it/s]


In [55]:
df

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,CX Acidic pKa,CX Basic pKa,Smiles,ecfp1,ecfp2,ecfp3,node_feat,edge_index,edge_attr,num_nodes
0,415.99,6.0,4.09,56.15,,,CC(Sc1nn(-c2ccc(Cl)cc2)c(=S)s1)C(=O)NCC1CCCO1,"['2246728737', '2245273601', '1026928756', '32...","['3537119515', '2417640586', '1609928163', '34...","['932877461', '2488184240', '2075550874', '338...","[[5, 0], [5, 0], [15, 0], [5, 0], [6, 0], [6, ...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
1,215.25,51.0,2.79,42.23,3.97,,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3288032115', '951226070', '9512...","['1023969374', '2742742175', '2742742175', '10...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",16
2,475.94,2.0,6.00,37.39,,7.76,Fc1cc(F)c(F)c(/C=C/c2cc(NCCCCN3CCOCC3)c3cc(Cl)...,"['882399112', '3217380708', '3218693969', '321...","['3337745083', '1637836422', '994485099', '185...","['2775257855', '1365279474', '636391830', '186...","[[8, 0], [5, 0], [5, 0], [5, 0], [8, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 3, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",33
3,548.59,1.0,6.37,186.56,6.08,2.29,Cc1cc2c(C(C)C)c(O)c(O)c(/C=N/O)c2c(O)c1-c1c(C)...,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3124581743', '994485099', '3983...","['1303619485', '960947942', '1702835203', '356...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 4,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",40
4,314.35,23.0,1.33,92.92,8.77,5.70,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,"['2246728737', '3217380708', '2041434490', '32...","['422715066', '4033380444', '1101907775', '553...","['298300585', '1959239016', '2090946584', '410...","[[5, 0], [5, 0], [6, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,483.48,2.0,4.17,123.59,12.67,2.80,CC(=O)c1c(N2CCOCC2)c2ccc(Nc3nccc(-c4ccc5ocnc5c...,"['2246728737', '2246699815', '864942730', '321...","['3545365497', '1822332700', '1510328189', '23...","['1362960850', '1490705805', '2175439447', '34...","[[5, 0], [5, 0], [7, 0], [5, 0], [5, 0], [6, 0...","[[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [2, 0], [2, 0], [1, 0], [1, 0...",36
99996,417.92,10.0,3.01,74.43,12.96,5.22,O=C(NCc1ccc(Cl)cc1)c1c[nH]c2sc(CN3CCOCC3)cc2c1=O,"['864942730', '2246699815', '847961216', '2245...","['1510328189', '1054767590', '717512901', '390...","['2843970853', '2024004397', '1272192549', '16...","[[7, 0], [5, 0], [6, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",28
99997,454.89,7.0,1.86,122.24,8.49,,O=C(COC(=O)c1ccc(O)cc1)Nc1cc(S(=O)(=O)N2CCOCC2...,"['864942730', '2246699815', '2245384272', '864...","['1510328189', '3315826729', '3995043796', '22...","['1517842599', '859821531', '381695949', '1554...","[[7, 0], [5, 0], [5, 0], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",30
99998,976.02,4.0,5.80,192.83,12.67,7.94,CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...,"['2246728737', '2245384272', '2976033787', '31...","['3542456614', '3594356142', '1916236386', '26...","['3834501247', '3352858976', '4210593593', '29...","[[5, 0], [5, 0], [5, 1], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",66


In [56]:
df.rename(columns={'Molecular Weight': 'y'}, inplace=True)

In [58]:
df = df.drop(['Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'], axis=1)

In [59]:
df 

Unnamed: 0,y,Smiles,ecfp1,ecfp2,ecfp3,node_feat,edge_index,edge_attr,num_nodes
0,415.99,CC(Sc1nn(-c2ccc(Cl)cc2)c(=S)s1)C(=O)NCC1CCCO1,"['2246728737', '2245273601', '1026928756', '32...","['3537119515', '2417640586', '1609928163', '34...","['932877461', '2488184240', '2075550874', '338...","[[5, 0], [5, 0], [15, 0], [5, 0], [6, 0], [6, ...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
1,215.25,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3288032115', '951226070', '9512...","['1023969374', '2742742175', '2742742175', '10...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",16
2,475.94,Fc1cc(F)c(F)c(/C=C/c2cc(NCCCCN3CCOCC3)c3cc(Cl)...,"['882399112', '3217380708', '3218693969', '321...","['3337745083', '1637836422', '994485099', '185...","['2775257855', '1365279474', '636391830', '186...","[[8, 0], [5, 0], [5, 0], [5, 0], [8, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 3, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",33
3,548.59,Cc1cc2c(C(C)C)c(O)c(O)c(/C=N/O)c2c(O)c1-c1c(C)...,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3124581743', '994485099', '3983...","['1303619485', '960947942', '1702835203', '356...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 4,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",40
4,314.35,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,"['2246728737', '3217380708', '2041434490', '32...","['422715066', '4033380444', '1101907775', '553...","['298300585', '1959239016', '2090946584', '410...","[[5, 0], [5, 0], [6, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",23
...,...,...,...,...,...,...,...,...,...
99995,483.48,CC(=O)c1c(N2CCOCC2)c2ccc(Nc3nccc(-c4ccc5ocnc5c...,"['2246728737', '2246699815', '864942730', '321...","['3545365497', '1822332700', '1510328189', '23...","['1362960850', '1490705805', '2175439447', '34...","[[5, 0], [5, 0], [7, 0], [5, 0], [5, 0], [6, 0...","[[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [2, 0], [2, 0], [1, 0], [1, 0...",36
99996,417.92,O=C(NCc1ccc(Cl)cc1)c1c[nH]c2sc(CN3CCOCC3)cc2c1=O,"['864942730', '2246699815', '847961216', '2245...","['1510328189', '1054767590', '717512901', '390...","['2843970853', '2024004397', '1272192549', '16...","[[7, 0], [5, 0], [6, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",28
99997,454.89,O=C(COC(=O)c1ccc(O)cc1)Nc1cc(S(=O)(=O)N2CCOCC2...,"['864942730', '2246699815', '2245384272', '864...","['1510328189', '3315826729', '3995043796', '22...","['1517842599', '859821531', '381695949', '1554...","[[7, 0], [5, 0], [5, 0], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",30
99998,976.02,CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...,"['2246728737', '2245384272', '2976033787', '31...","['3542456614', '3594356142', '1916236386', '26...","['3834501247', '3352858976', '4210593593', '29...","[[5, 0], [5, 0], [5, 1], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",66


In [60]:
df.to_csv("data_100k_graph.csv")