In [None]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from rdkit.Chem import AllChem
import torch
import numpy as np
from tqdm import tqdm

In [None]:
# Register tqdm with pandas so that `progress_apply` (apply with a progress bar),
# which later cells call, is available.
tqdm.pandas()

In [None]:
# Load the input table; later cells read its `Smiles` and `Molecular Weight` columns.
df = pd.read_csv('data_10k.csv')

In [None]:
# These property columns are kept as prediction targets, so the drop below stays disabled.
# df.drop(columns=['Molecular Weight', 'Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'])

In [None]:
# Lookup tables for molecular graph features.

# Atomic numbers 1..118 (so an element's position is its atomic number minus one).
ATOM_LIST = [*range(1, 119)]

# Atom chirality tags.
CHIRALITY_LIST = [
    Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    Chem.rdchem.ChiralType.CHI_OTHER,
]

# Bond types (BT is the imported alias for rdkit's BondType).
BOND_LIST = [BT.SINGLE, BT.DOUBLE, BT.TRIPLE, BT.AROMATIC]

# Bond direction tags.
BONDDIR_LIST = [
    Chem.rdchem.BondDir.NONE,
    Chem.rdchem.BondDir.ENDUPRIGHT,
    Chem.rdchem.BondDir.ENDDOWNRIGHT,
]

# The `names` and `values` attributes of the BondType enum, in that order.
BONDTYPES_LIST = [BT.names, BT.values]

def get_graph_columns(smiles):
    """Convert a SMILES string into plain-list graph features.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single molecule. Anything that is not a
        string (e.g. a NaN from a missing CSV cell) is treated as unparsable.

    Returns
    -------
    tuple
        ``(node_feat, edge_index, edge_attr, num_nodes)``:

        * ``node_feat`` -- one ``[atom_type_idx, chirality_idx]`` pair per
          atom, indices into ``ATOM_LIST`` and ``CHIRALITY_LIST``.
        * ``edge_index`` -- ``[sources, targets]``; every bond is listed in
          both directions.
        * ``edge_attr`` -- one ``[bond_type_idx, bond_dir_idx]`` pair per
          directed edge, indices into ``BOND_LIST`` and ``BONDDIR_LIST``.
        * ``num_nodes`` -- number of atoms.

        For unparsable input the result is ``([], [[], []], [], 0)``.

    Raises
    ------
    ValueError
        If an atomic number, chirality tag, bond type or bond direction is
        not present in the corresponding lookup list.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Placeholders in the same order as the normal return value:
        # node_feat, edge_index, edge_attr, num_nodes.
        return [], [[], []], [], 0

    node_feat = [
        [
            ATOM_LIST.index(atom.GetAtomicNum()),
            CHIRALITY_LIST.index(atom.GetChiralTag()),
        ]
        for atom in mol.GetAtoms()
    ]

    sources, targets, edge_attr = [], [], []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # The graph is undirected: store each bond once per direction.
        sources += [start, end]
        targets += [end, start]

        # Encode the bond type as a categorical index rather than through
        # GetBondTypeAsDouble(): the aromatic value 1.5 was truncated to 1 by
        # the integer cast, making aromatic and single bonds indistinguishable.
        features = [
            BOND_LIST.index(bond.GetBondType()),
            BONDDIR_LIST.index(bond.GetBondDir()),
        ]
        # Both directions carry the same features (as independent lists).
        edge_attr.append(features)
        edge_attr.append(list(features))

    edge_index = [sources, targets]
    num_nodes = mol.GetNumAtoms()

    return node_feat, edge_index, edge_attr, num_nodes

In [None]:
# Featurize each row's SMILES; the 4-tuple returned per row is expanded into four columns.
df[["node_feat", "edge_index", "edge_attr", "num_nodes"]] = df.progress_apply(
    lambda record: get_graph_columns(record.Smiles),
    axis="columns",
    result_type="expand",
)

In [None]:
# Inspect the frame after the graph feature columns were added.
df

In [None]:
# df.rename(columns={'Molecular Weight': 'y'}, inplace=True)

In [None]:
# df['y'] = df['y'].apply(lambda row: [x]) # for graphormer

In [None]:
# df = df.drop(['Bioactivities', 'AlogP', 'Polar Surface Area', 'CX Acidic pKa', 'CX Basic pKa'], axis=1)

In [None]:
# Target column: wrap each row's own molecular weight in a one-element list
# (the commented-out draft above notes this list form is "for graphormer").
# The previous lambda ignored its row and returned the entire
# 'Molecular Weight' column for every row.
df['y'] = df['Molecular Weight'].progress_apply(lambda weight: [weight])

In [None]:
# Inspect the frame after the target column `y` was added.
df

In [None]:
# Persist the featurized table. NOTE: list-valued cells are written to CSV as
# their string representation, so they must be parsed when read back.
df.to_csv("data_10k_graph.csv", index=False)