In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys, RDKFingerprint
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("combined_data.csv")

Descriptors

In [3]:
def extract_features(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Physicochemical descriptors
    features = {
        "MolWt": Descriptors.MolWt(mol),
        "LogP": Descriptors.MolLogP(mol),
        "HDonors": Descriptors.NumHDonors(mol),
        "HAcceptors": Descriptors.NumHAcceptors(mol),
        "TPSA": Descriptors.TPSA(mol),
        "RotatableBonds": Descriptors.NumRotatableBonds(mol),
        "AromaticProportion": sum([1 for atom in mol.GetAromaticAtoms()]) / mol.GetNumAtoms()
    }

    # Constitutional descriptors
    features.update({
        "HeavyAtoms": Descriptors.HeavyAtomCount(mol),
        "NumRings": Descriptors.RingCount(mol),
        "FractionCSP3": Descriptors.FractionCSP3(mol),
        "ValenceElectrons": Descriptors.NumValenceElectrons(mol),
        "Heteroatoms": Descriptors.NumHeteroatoms(mol)
    })

    # Topological descriptors
    features.update({
        "Chi0": Descriptors.Chi0(mol),
        "Chi1": Descriptors.Chi1(mol),
        "Kappa1": Descriptors.Kappa1(mol),
        "BalabanJ": Descriptors.BalabanJ(mol),
        "HallKierAlpha": Descriptors.HallKierAlpha(mol)
    })

    return features

In [4]:
desc_df = df['Smiles'].apply(extract_features)
desc_df = pd.DataFrame(desc_df.tolist())

In [5]:
desc_df.info

<bound method DataFrame.info of          MolWt    LogP  HDonors  HAcceptors    TPSA  RotatableBonds  \
0      977.123 -1.0608       13          13  396.07              30   
1      620.541  2.9198        6           6  165.06              14   
2      805.926  1.9623        9           9  260.56              22   
3      732.807 -1.1643        9          13  300.67              18   
4      396.401  2.0840        1           6   89.40               5   
...        ...     ...      ...         ...     ...             ...   
12657  477.048  5.7980        2           3   52.57              11   
12658  477.048  5.7980        2           3   52.57              11   
12659  320.368  3.0886        1           5   64.16               2   
12660  284.790  4.4664        1           2   38.91               0   
12661  338.403  4.6160        1           4   55.76               4   

       AromaticProportion  HeavyAtoms  NumRings  FractionCSP3  \
0                0.086957          69         2   

Structural fingerprints

In [6]:
USE_MORGAN = True
USE_MACCS = True
USE_RDKIT = True

fingerprints_list = []

for i, row in df.iterrows():
    smiles = row['Smiles']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue

    fp_data = {"SMILES": smiles}

    if USE_MORGAN:
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        for j in range(morgan_fp.GetNumBits()):
            fp_data[f'Morgan_{j}'] = int(morgan_fp[j])

    if USE_MACCS:
        maccs_fp = MACCSkeys.GenMACCSKeys(mol)
        for j in range(1, maccs_fp.GetNumBits()): 
            fp_data[f'MACCS_{j}'] = int(maccs_fp[j])

    if USE_RDKIT:
        rdkit_fp = RDKFingerprint(mol)
        for j in range(rdkit_fp.GetNumBits()):
            fp_data[f'RDKitFP_{j}'] = int(rdkit_fp[j])

    fingerprints_list.append(fp_data)





In [8]:
fingerprints_df = pd.DataFrame(fingerprints_list)

In [9]:
mergerd_df = pd.concat([df, desc_df, fingerprints_df], axis=1)
remove = mergerd_df.pop('SMILES')

mergerd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12662 entries, 0 to 12661
Columns: 4284 entries, Smiles to RDKitFP_2047
dtypes: float64(11), int64(4270), object(3)
memory usage: 413.8+ MB


In [10]:
mergerd_df.isnull().any().any()

np.False_

In [11]:
mergerd_df.to_csv('compiled_data.csv', index=False)