In [1]:
import torch,os
from torch.utils.data import TensorDataset,random_split
import pandas as pd
from rdkit import Chem,DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
from dgllife.data import MoleculeCSVDataset
from functools import partial
from dgllife.utils import smiles_to_bigraph, ConsecutiveSplitter

# Obtain the features of atoms and bonds
# Coefficient tables already read from disk, keyed by file path, so the
# file is deserialized once instead of once per featurized molecule.
_COEFF_CACHE = {}


def _read_coeff_table(path):
    """Return the object stored at `path`, loading it with torch.load only once."""
    if path not in _COEFF_CACHE:
        _COEFF_CACHE[path] = torch.load(path)
    return _COEFF_CACHE[path]


def load_coeff(mol, coeff_path='orb_coeff.t'):
    """Look up the coefficient entry that belongs to `mol`.

    The molecule is converted to canonical SMILES and located in the
    module-level list `sms`; the position found there is used to index
    the table stored in `coeff_path`.

    Args:
        mol: RDKit molecule to look up.
        coeff_path: File readable by ``torch.load`` holding one entry per
            element of `sms` (assumed to be in the same order as `sms` —
            TODO confirm against the code that wrote the file).

    Returns:
        The entry of the loaded table at the molecule's position in `sms`.

    Raises:
        ValueError: If the canonical SMILES of `mol` is not present in `sms`.
    """
    smiles = Chem.MolToSmiles(mol, canonical=True)
    try:
        idx = sms.index(smiles)
    except ValueError:
        # Same exception type as list.index, but say which molecule failed.
        raise ValueError(
            "SMILES %r not found in the reference list `sms`" % smiles
        ) from None
    return _read_coeff_table(coeff_path)[idx]


def featurize_atoms(mol):
    """Build node features for `mol` from its coefficient entry.

    Every atom contributes a single feature: the element of the value
    returned by ``load_coeff(mol)`` at the atom's index.

    Args:
        mol: RDKit molecule; its canonical SMILES must be resolvable by
            ``load_coeff``.

    Returns:
        dict: ``{'atomic': tensor}`` with one row per atom. Each row holds
        one value (assuming each coefficient element is a scalar — the
        recorded notebook output shows a per-node shape of ``(1,)``).
    """
    coo = load_coeff(mol)
    # The hybridisation one-hot that used to be computed here was never
    # added to the features, so it has been dropped as dead work.
    feats = [[coo[atom.GetIdx()]] for atom in mol.GetAtoms()]
    return {'atomic': torch.tensor(feats)}


def featurize_edges(mol, self_loop=False):
    """Build edge features for every ordered pair of bonded atoms in `mol`.

    For each ordered atom pair (i, j) that shares a bond, one row is
    produced: a one-hot over (None, SINGLE, DOUBLE, TRIPLE, AROMATIC)
    followed by the product of the two atoms' coefficient elements.
    Pairs without a bond produce no row, so the leading ``None`` slot is
    effectively always 0.0.

    Args:
        mol: RDKit molecule; its canonical SMILES must be resolvable by
            ``load_coeff``.
        self_loop: Accepted but not used by this function.

    Returns:
        dict: ``{'edgic': tensor}`` with one row per bonded ordered pair.

    NOTE(review): rows are emitted in (i, j) scan order; confirm this
    matches the edge order of the graph constructor before enabling this
    featurizer.
    """
    bond_kinds = (
        None,
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    )
    coeffs = load_coeff(mol)
    atom_count = mol.GetNumAtoms()

    rows = []
    for src in range(atom_count):
        for dst in range(atom_count):
            bond = mol.GetBondBetweenAtoms(src, dst)
            if bond is None:
                # Unbonded pair: contributes nothing.
                continue
            kind = bond.GetBondType()
            one_hot = [float(kind == candidate) for candidate in bond_kinds]
            rows.append(one_hot + [coeffs[src] * coeffs[dst]])
    return {'edgic': torch.tensor(rows)}


if __name__ == "__main__":

    # Training table; the 'smiles' column holds one molecule per row.
    df = pd.read_csv('train_set_5631.csv')

    # Canonical SMILES for every row. load_coeff() searches this
    # module-level list, so the name `sms` must stay as it is.
    sms = [
        Chem.MolToSmiles(Chem.MolFromSmiles(sm), canonical=True)
        for sm in df['smiles'].tolist()
    ]

    # Build the DGL-Life graph dataset. Only node features are attached;
    # the edge featurizer (featurize_edges) is deliberately switched off.
    graph_builder = partial(smiles_to_bigraph, add_self_loop=False)
    dataset = MoleculeCSVDataset(
        df=df,
        smiles_to_graph=graph_builder,
        node_featurizer=featurize_atoms,
        edge_featurizer=None,
        smiles_column='smiles',
        cache_file_path='graph.pt',
        log_every=1000,
    )

    # Consecutive 80/10/10 split, then persist the three subsets together.
    train_set, val_set, test_set = ConsecutiveSplitter.train_val_test_split(
        dataset,
        frac_train=0.8,
        frac_val=0.1,
        frac_test=0.1,
    )
    torch.save([train_set, val_set, test_set], "opv_graph-onlycoeff100.pt")


ModuleNotFoundError: No module named 'torch'

In [2]:
# Set the `load_full` attribute on the dataset built above, then display its
# first item (the recorded output is a SMILES string, a graph and two tensors).
# NOTE(review): whether MoleculeCSVDataset actually reads `load_full` is not
# visible in this notebook — confirm the assignment has an effect.
dataset.load_full = True
dataset[0]

('c1cc2[nH]c3c4ncc5ccCc5c4Cc3c2o1',
 Graph(num_nodes=18, num_edges=44,
       ndata_schemes={'atomic': Scheme(shape=(1,), dtype=torch.float32)}
       edata_schemes={}),
 tensor([246.9197]),
 tensor([1.]))

In [3]:
# Spot check: run the node featurizer by hand on the SMILES shown for dataset[0].
# Requires the global `sms` from the first cell, because load_coeff() looks the
# molecule up there.
a=featurize_atoms(Chem.MolFromSmiles('c1cc2[nH]c3c4ncc5ccCc5c4Cc3c2o1'))


In [8]:
# Reload the file written by the first cell and show its first element, i.e.
# the training subset (the list was saved as [train_set, val_set, test_set]).
torch.load('opv_graph-onlycoeff100.pt')[0]


<dgl.data.utils.Subset at 0x1ec7a488e48>