In [1]:
pip install rdkit biopython

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np

In [3]:
# Load your dataset
df = pd.read_csv('no_split_kd.csv')  # adjust filename if needed

In [4]:
df.isna().sum()

SMILES             0
target_sequence    0
Kd                 0
dtype: int64

In [5]:
# --- Molecular descriptors from SMILES ---
def compute_molecular_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol)
    ]

mol_desc = df['SMILES'].apply(compute_molecular_descriptors)
mol_desc = mol_desc.dropna()
mol_desc_df = pd.DataFrame(mol_desc.tolist(), columns=[
    'MolWt', 'LogP', 'RotatableBonds', 'HDonors', 'HAcceptors'
])

[18:07:25] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:07:25] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:07:27] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:07:27] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:07:27] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:07:27] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:08:57] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 25


In [6]:
# --- Sequence descriptors from target protein ---
def compute_protein_descriptors(seq):
    try:
        analyzed_seq = ProteinAnalysis(seq)
        return [
            analyzed_seq.molecular_weight(),
            analyzed_seq.aromaticity(),
            analyzed_seq.instability_index(),
            analyzed_seq.gravy()
        ]
    except:
        return None

prot_desc = df.loc[mol_desc.index, 'target_sequence'].apply(compute_protein_descriptors)
prot_desc = prot_desc.dropna()
prot_desc_df = pd.DataFrame(prot_desc.tolist(), columns=[
    'Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy'
])

In [7]:
# --- Final dataset ---
kd_filtered = df.loc[prot_desc.index, 'Kd'].reset_index(drop=True)
final_df = pd.concat([mol_desc_df.reset_index(drop=True), prot_desc_df.reset_index(drop=True), kd_filtered], axis=1)
final_df.columns = list(final_df.columns[:-1]) + ['Kd']

In [8]:
final_df.isna().sum()

MolWt              0
LogP               0
RotatableBonds     0
HDonors            0
HAcceptors         0
Prot_MW           25
Aromaticity       25
Instability       25
Hydropathy        25
Kd                25
dtype: int64

In [14]:
final_df = final_df.dropna()

In [15]:
len(final_df)

61092

In [17]:
# Save to CSV
final_df.to_csv("descriptor_based_dataset.csv", index=False)