In [6]:
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd

In [7]:
"Ref: https://github.com/chemprop/chemprop/blob/93e0aefda97e7153da7715120ff55a66863948f9/chemprop/features/features_generators.py"

'Ref: https://github.com/chemprop/chemprop/blob/93e0aefda97e7153da7715120ff55a66863948f9/chemprop/features/features_generators.py'

In [8]:
# Default Morgan fingerprint settings, used as default arguments by the
# Morgan feature generators defined in this notebook.
MORGAN_RADIUS = 2  # radius passed to the RDKit Morgan fingerprint calls
MORGAN_NUM_BITS = 1024  # fingerprint length; also the number of MF*/cMF* columns written out


#@register_features_generator('morgan')
def morgan_binary_features_generator(mol,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
    """
    Generates a binary Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the binary Morgan fingerprint.
    :raises ValueError: If ``mol`` is None or is a SMILES string that RDKit cannot parse.
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending SMILES instead of an opaque error inside the fingerprint call.
        if mol is None:
            raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    elif mol is None:
        raise ValueError("mol must be a SMILES string or an RDKit molecule, got None")

    features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    # The initial shape is only a placeholder: ConvertToNumpyArray resizes the
    # destination array to the fingerprint length.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [9]:
def morgan_counts_features_generator(mol,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS) -> np.ndarray:
    """
    Generates a counts-based Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
    :param radius: Morgan fingerprint radius.
    :param num_bits: Number of bits in Morgan fingerprint.
    :return: A 1D numpy array containing the counts-based Morgan fingerprint.
    :raises ValueError: If ``mol`` is None or is a SMILES string that RDKit cannot parse.
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending SMILES instead of an opaque error inside the fingerprint call.
        if mol is None:
            raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    elif mol is None:
        raise ValueError("mol must be a SMILES string or an RDKit molecule, got None")

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    # The initial shape is only a placeholder: ConvertToNumpyArray resizes the
    # destination array to the fingerprint length.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [10]:
def MACCS_features_generator(mol) -> np.ndarray:
    """
    Generates a MACCS keys fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
    :return: A 1D numpy array containing the MACCS keys fingerprint
             (167 entries with RDKit's MACCS implementation).
    :raises ValueError: If ``mol`` is None or is a SMILES string that RDKit cannot parse.
    """
    # isinstance (instead of an exact type comparison) also accepts str subclasses.
    if isinstance(mol, str):
        smiles = mol
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending SMILES instead of an opaque error inside the fingerprint call.
        if mol is None:
            raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    elif mol is None:
        raise ValueError("mol must be a SMILES string or an RDKit molecule, got None")

    features_vec = AllChem.GetMACCSKeysFingerprint(mol)
    # The initial shape is only a placeholder: ConvertToNumpyArray resizes the
    # destination array to the fingerprint length.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [14]:
# Input/output directories. Both end with "/" because file paths are built
# from them by plain string concatenation.
load_path = "../Data02-different_descriptors/SMILES/"
save_path = "../Data02-different_descriptors/graph_descriptor/"

# File names of the pickled train/test tables that hold the SMILES column.
train_data, test_data = "SMILES_train.pkl", "SMILES_test.pkl"

In [18]:
# Read the pickled train/test tables (train first, then test) and pull out
# the SMILES column of each for fingerprint generation.
train_df, test_df = (
    pd.read_pickle(load_path + file_name)
    for file_name in (train_data, test_data)
)
train_SMILES, test_SMILES = train_df['SMILES'], test_df['SMILES']

In [28]:
# Binary Morgan fingerprints: one row per molecule, one column per bit.
Morgan_train = list(map(morgan_binary_features_generator, train_SMILES))
Morgan_test = list(map(morgan_binary_features_generator, test_SMILES))

# Column labels MF1 .. MF<MORGAN_NUM_BITS>.
Column = ["MF" + str(bit) for bit in range(1, MORGAN_NUM_BITS + 1)]

# The generators return float arrays; store the fingerprints as integers.
df_MFtrain = pd.DataFrame(np.asarray(Morgan_train, dtype=int), columns=Column)
df_MFtest = pd.DataFrame(np.asarray(Morgan_test, dtype=int), columns=Column)

df_MFtrain.to_pickle(save_path + "MF_train.pkl")
df_MFtest.to_pickle(save_path + "MF_test.pkl")

In [40]:
# Counts-based Morgan fingerprints: one row per molecule, one column per hashed bit.
cMorgan_train = list(map(morgan_counts_features_generator, train_SMILES))
cMorgan_test = list(map(morgan_counts_features_generator, test_SMILES))

# Column labels cMF1 .. cMF<MORGAN_NUM_BITS>.
Column1 = ["cMF" + str(bit) for bit in range(1, MORGAN_NUM_BITS + 1)]

# The generators return float arrays; store the counts as integers.
df_cMFtrain = pd.DataFrame(np.asarray(cMorgan_train, dtype=int), columns=Column1)
df_cMFtest = pd.DataFrame(np.asarray(cMorgan_test, dtype=int), columns=Column1)

df_cMFtrain.to_pickle(save_path + "cMF_train.pkl")
df_cMFtest.to_pickle(save_path + "cMF_test.pkl")

In [41]:
# MACCS keys fingerprints: one row per molecule, one column per key.
MACCS_train = list(map(MACCS_features_generator, train_SMILES))
MACCS_test = list(map(MACCS_features_generator, test_SMILES))

# Column labels MACCS1 .. MACCS167 (167 is the length of RDKit's MACCS fingerprint).
# NOTE: this rebinds `Column`, which previously held the MF* labels.
Column = ["MACCS" + str(key) for key in range(1, 167 + 1)]

# The generator returns float arrays; store the fingerprints as integers.
df_MACCStrain = pd.DataFrame(np.asarray(MACCS_train, dtype=int), columns=Column)
df_MACCStest = pd.DataFrame(np.asarray(MACCS_test, dtype=int), columns=Column)

df_MACCStrain.to_pickle(save_path + "MACCS_train.pkl")
df_MACCStest.to_pickle(save_path + "MACCS_test.pkl")