Process the csv information into RDKit

In [2]:
from rdkit import Chem
import pandas as pd
import os

DATA_FILE_PATH = os.path.join(os.getcwd(), 'data', 'HIV.csv')
OUTPUT_FILE_PATH = os.path.join(os.getcwd(), 'processed')

def load_data(file_path):
    df = pd.read_csv(file_path)
    return df

def process_data(df):
    for i, row in df.iterrows():
        smiles = row['smiles']
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            df.drop(i, inplace=True)
        else:
            df.at[i, 'mol'] = mol
    return df
            
    

In [2]:
from torch_geometric.data import Dataset, Data
import torch
import numpy as np
import os
from tqdm import tqdm

class HIVDataset(Dataset):

    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        self.test = test
        self.filename = filename
        super(HIVDataset, self).__init__(root, transform, pre_transform)

    def process_data(self):
        self.data = pd.read_csv(os.path.join(self.root, self.filename))
        self.data = process_data(self.data)

        for i, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            self.node_features = self._get_node_features(mol['mol'])
            self.edge_features = self._get_edge_features(mol['mol'])
            self.labels = self._get_labels(mol)
            self.adjacency_info = self._get_adjacency_info(mol['mol'])

            data = Data(x=self.node_features, edge_features=self.adjacency_info, edge_attr=self.edge_features, y=self.labels, smiles=df['smiles'])

            if self.test:
                    torch.save(data, 
                        os.path.join(self.processed_dir, 
                                    f'data_test_{i}.pt'))
            else:
                torch.save(data, 
                    os.path.join(self.processed_dir, 
                                    f'data_{i}.pt'))


    def _get_node_features(self, mol):
        all_node_feats = []

        for atom in mol.GetAtoms():
            node_feats = []
            # Feature 1: Atomic number        
            node_feats.append(atom.GetAtomicNum())
            # Feature 2: Atom degree
            node_feats.append(atom.GetDegree())
            # Feature 3: Formal charge
            node_feats.append(atom.GetFormalCharge())
            # Feature 4: Hybridization
            node_feats.append(atom.GetHybridization())
            # Feature 5: Aromaticity
            node_feats.append(atom.GetIsAromatic())
            # Feature 6: Total Num Hs
            node_feats.append(atom.GetTotalNumHs())
            # Feature 7: Radical Electrons
            node_feats.append(atom.GetNumRadicalElectrons())
            # Feature 8: In Ring
            node_feats.append(atom.IsInRing())
            # Feature 9: Chirality
            node_feats.append(atom.GetChiralTag())

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)
    
    def _get_edge_features(self, mol):
        edge_feats = []

        for bond in mol.GetBonds():
            bond_feats = []
            # Feature 1: Bond type
            bond_feats.append(bond.GetBondTypeAsDouble())
            # Feature 2: Is conjugated
            bond_feats.append(bond.GetIsConjugated())
            # Feature 3: Is in ring
            bond_feats.append(bond.IsInRing())
            # Append edge features to matrix
            edge_feats.append(bond_feats)

        edge_feats = np.asarray(edge_feats)
        return torch.tensor(edge_feats, dtype=torch.float)
    
    def _get_labels(self, df):
        return torch.tensor(df['HIV_active'], dtype=torch.float)

    def _get_adjacency_info(self, mol):
        # return torch.tensor(Chem.rdmolops.GetAdjacencyMatrix(mol), dtype=torch.float)
        # We don't use the above function because it needs to match the order of the edge features
        
        adj = [] 

        for bond in mol.GetBonds():
            start_idx = bond.GetBeginAtomIdx()
            end_idx = bond.GetEndAtomIdx()
            adj += ([start_idx, end_idx], [end_idx, start_idx])

        return torch.tensor(adj, dtype=torch.long)
    
    def len(self):
        return len(os.listdir(self.processed_dir))
    
    def get(self, idx):
        return torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))


In [3]:
ds = HIVDataset(os.getcwd(), "HIV.csv", test=False)
ds.get(0)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/oliviali/Development/generative-hiv/processed/data_0.pt'