In [1]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem import AllChem
import torch
import numpy as np
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
df = pd.read_csv('data_10k.csv')

In [None]:
# decided to take them as target
# df.drop(columns=['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'])

In [5]:
ATOM_LIST = list(range(1,119))
CHIRALITY_LIST = [
    Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    Chem.rdchem.ChiralType.CHI_OTHER
]
BOND_LIST = [
    BT.SINGLE,
    BT.DOUBLE,
    BT.TRIPLE,
    BT.AROMATIC
]
BONDDIR_LIST = [
    Chem.rdchem.BondDir.NONE,
    Chem.rdchem.BondDir.ENDUPRIGHT,
    Chem.rdchem.BondDir.ENDDOWNRIGHT
]
BONDTYPES_LIST = [Chem.rdchem.BondType.names,
                  Chem.rdchem.BondType.values
                  ]

def get_graph_columns(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return torch.tensor([[], []], dtype=torch.long).tolist(), \
                torch.tensor(np.array([]), dtype=torch.long).tolist(), \
                torch.tensor(np.array([]), dtype=torch.long).tolist(), \
                0

    N = mol.GetNumAtoms()
    M = mol.GetNumBonds()

    type_idx = []
    chirality_idx = []
    atomic_number = []
    
    for atom in mol.GetAtoms():
        type_idx.append(ATOM_LIST.index(atom.GetAtomicNum()))
        chirality_idx.append(CHIRALITY_LIST.index(atom.GetChiralTag()))
        atomic_number.append(atom.GetAtomicNum())
    
    x1 = torch.tensor(type_idx, dtype=torch.long).view(-1,1)
    x2 = torch.tensor(chirality_idx, dtype=torch.long).view(-1,1)
    node_feat = torch.cat([x1, x2], dim=-1).tolist()

    row, col, edge_feat = [], [], []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [start, end]
        col += [end, start]
        
        edge_feat.append([
            bond.GetBondTypeAsDouble(),
            BONDDIR_LIST.index(bond.GetBondDir())
        ])
        edge_feat.append([
            bond.GetBondTypeAsDouble(),
            BONDDIR_LIST.index(bond.GetBondDir())
        ])

    edge_index = torch.tensor([row, col], dtype=torch.long).tolist()
    edge_attr = torch.tensor(edge_feat, dtype=torch.long).tolist()
    num_nodes = N

    return node_feat, edge_index, edge_attr, num_nodes

In [6]:
df[["node_feat", "edge_index", 'edge_attr', 'num_nodes']] = df.progress_apply(lambda row: get_graph_columns(row.Smiles), axis='columns', result_type='expand')

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:08<00:00, 1224.67it/s]


In [6]:
df

Unnamed: 0,Molecular Weight,Bioactivities,AlogP,Polar Surface Area,CX Acidic pKa,CX Basic pKa,Smiles,ecfp1,ecfp2,ecfp3,node_feat,edge_index,edge_attr,num_nodes
0,478.57,12.0,5.88,52.83,,4.26,COc1cc(C2(C)CCCc3nc(SCc4ncccn4)n(-c4ccc(F)cc4)...,"['2246728737', '864674487', '3217380708', '321...","['3975275337', '2076190208', '1135286194', '99...","['932712697', '2628046163', '3102147921', '424...","[[5, 0], [7, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",34
1,437.47,17.0,4.51,90.93,10.14,,COC(=O)c1sc(NC(=O)C2c3ccccc3Oc3ccccc32)c(C(=O)...,"['2246728737', '864674487', '2246699815', '864...","['3975275337', '2154935424', '2827868305', '15...","['1673980810', '3163669616', '2541195453', '15...","[[5, 0], [7, 0], [5, 0], [7, 0], [5, 0], [15, ...","[[0, 1, 1, 2, 2, 3, 2, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [2, 0], [2, 0...",31
2,1010.29,4.0,,,,,CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...,"['2246728737', '2245384272', '2976033787', '31...","['3542456614', '3594356142', '1916236386', '26...","['3834501247', '3352858976', '4210593593', '29...","[[5, 0], [5, 0], [5, 1], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",73
3,337.41,2.0,1.67,70.69,7.32,,Cc1cccc(-n2cc(C(=O)N3CCC[C@@H]([n+]4cc[nH]c4)C...,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3207567135', '951226070', '9851...","['950023157', '4244175903', '3692055567', '390...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
4,504.41,1.0,5.48,83.91,4.23,,CCOC(=O)[C@H](C1CC1)N1C(=O)[C@@H](CC(=O)O)C[C@...,"['2246728737', '2245384272', '864674487', '224...","['3542456614', '3994088662', '2222715027', '40...","['2072239802', '3899543322', '3540073353', '17...","[[5, 0], [5, 0], [7, 0], [5, 0], [7, 0], [5, 2...","[[0, 1, 1, 2, 2, 3, 3, 4, 3, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,313.24,24.0,2.12,26.71,,7.95,CCN1CCN(CC(O)c2ccc(Br)cc2)CC1,"['2246728737', '2245384272', '2092489639', '29...","['3542456614', '2251845666', '1634606847', '28...","['3665875809', '53971451', '434423882', '43442...","[[5, 0], [5, 0], [6, 0], [5, 0], [5, 0], [6, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",18
9996,367.38,,-1.01,141.67,3.64,,O=C(O)CNC(=O)CNC(=O)CNC(=O)CSC(=O)c1ccccc1,"['864942730', '2246699815', '864662311', '2245...","['1510328189', '4278941385', '1533864325', '77...","['2423896454', '1210173779', '2718878803', '12...","[[7, 0], [5, 0], [7, 0], [5, 0], [6, 0], [5, 0...","[[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
9997,349.41,1.0,3.49,54.88,,3.56,O=C(N[C@@]12CCC[C@@](C#Cc3ccccn3)(CC1)C2)c1ccc...,"['864942730', '2246699815', '847961216', '2976...","['1510328189', '1054767590', '1693589848', '36...","['1981852759', '3356387992', '1715017697', '28...","[[7, 0], [5, 0], [6, 0], [5, 1], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",26
9998,362.43,6.0,3.77,51.66,,1.42,CCOc1ccccc1-c1cc(C(=O)N2CCOCC2)c2ccccc2n1,"['2246728737', '2245384272', '864674487', '321...","['3542456614', '3994088662', '2115476908', '11...","['2677858541', '3088822697', '1573444561', '20...","[[5, 0], [5, 0], [7, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",27


In [7]:
# df.rename(columns={'Molecular Weight': 'y'}, inplace=True)

In [11]:
# df['y'] = df['y'].apply(lambda row: [x]) # for graphormer

In [8]:
# df = df.drop(['Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'], axis=1)

In [None]:
df['y'] = df.progress_apply(lambda row: [df['Molecular Weight']], axis='columns', result_type='expand')

In [12]:
df 

Unnamed: 0,y,Smiles,ecfp1,ecfp2,ecfp3,node_feat,edge_index,edge_attr,num_nodes
0,[478.57],COc1cc(C2(C)CCCc3nc(SCc4ncccn4)n(-c4ccc(F)cc4)...,"['2246728737', '864674487', '3217380708', '321...","['3975275337', '2076190208', '1135286194', '99...","['932712697', '2628046163', '3102147921', '424...","[[5, 0], [7, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",34
1,[437.47],COC(=O)c1sc(NC(=O)C2c3ccccc3Oc3ccccc32)c(C(=O)...,"['2246728737', '864674487', '2246699815', '864...","['3975275337', '2154935424', '2827868305', '15...","['1673980810', '3163669616', '2541195453', '15...","[[5, 0], [7, 0], [5, 0], [7, 0], [5, 0], [15, ...","[[0, 1, 1, 2, 2, 3, 2, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [2, 0], [2, 0...",31
2,[1010.29],CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...,"['2246728737', '2245384272', '2976033787', '31...","['3542456614', '3594356142', '1916236386', '26...","['3834501247', '3352858976', '4210593593', '29...","[[5, 0], [5, 0], [5, 1], [7, 0], [5, 0], [7, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 4, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",73
3,[337.41],Cc1cccc(-n2cc(C(=O)N3CCC[C@@H]([n+]4cc[nH]c4)C...,"['2246728737', '3217380708', '3218693969', '32...","['422715066', '3207567135', '951226070', '9851...","['950023157', '4244175903', '3692055567', '390...","[[5, 0], [5, 0], [5, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
4,[504.41],CCOC(=O)[C@H](C1CC1)N1C(=O)[C@@H](CC(=O)O)C[C@...,"['2246728737', '2245384272', '864674487', '224...","['3542456614', '3994088662', '2222715027', '40...","['2072239802', '3899543322', '3540073353', '17...","[[5, 0], [5, 0], [7, 0], [5, 0], [7, 0], [5, 2...","[[0, 1, 1, 2, 2, 3, 3, 4, 3, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",34
...,...,...,...,...,...,...,...,...,...
9995,[313.24],CCN1CCN(CC(O)c2ccc(Br)cc2)CC1,"['2246728737', '2245384272', '2092489639', '29...","['3542456614', '2251845666', '1634606847', '28...","['3665875809', '53971451', '434423882', '43442...","[[5, 0], [5, 0], [6, 0], [5, 0], [5, 0], [6, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",18
9996,[367.38],O=C(O)CNC(=O)CNC(=O)CNC(=O)CSC(=O)c1ccccc1,"['864942730', '2246699815', '864662311', '2245...","['1510328189', '4278941385', '1533864325', '77...","['2423896454', '1210173779', '2718878803', '12...","[[7, 0], [5, 0], [7, 0], [5, 0], [6, 0], [5, 0...","[[0, 1, 1, 2, 1, 3, 3, 4, 4, 5, 5, 6, 5, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",25
9997,[349.41],O=C(N[C@@]12CCC[C@@](C#Cc3ccccn3)(CC1)C2)c1ccc...,"['864942730', '2246699815', '847961216', '2976...","['1510328189', '1054767590', '1693589848', '36...","['1981852759', '3356387992', '1715017697', '28...","[[7, 0], [5, 0], [6, 0], [5, 1], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[2, 0], [2, 0], [1, 0], [1, 0], [1, 0], [1, 0...",26
9998,[362.43],CCOc1ccccc1-c1cc(C(=O)N2CCOCC2)c2ccccc2n1,"['2246728737', '2245384272', '864674487', '321...","['3542456614', '3994088662', '2115476908', '11...","['2677858541', '3088822697', '1573444561', '20...","[[5, 0], [5, 0], [7, 0], [5, 0], [5, 0], [5, 0...","[[0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,...","[[1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0...",27


In [14]:
df.to_csv("data_10k_graph.csv", index=False)