In [28]:
pip install rdkit biopython

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np

In [30]:
# Load your dataset
df = pd.read_csv('test_dataset.csv')  # adjust filename if needed

In [31]:
df.isna().sum()

SMILES             0
target_sequence    0
Kd                 0
dtype: int64

In [32]:
def compute_selected_smiles_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return [
        Descriptors.SlogP_VSA5(mol),
        Descriptors.BCUT2D_CHGHI(mol),
        Descriptors.SMR_VSA5(mol),
        Descriptors.MinEStateIndex(mol),
        Descriptors.Chi4v(mol)
    ]

# Apply to SMILES column
mol_desc = df['SMILES'].apply(compute_selected_smiles_descriptors)
mol_desc = mol_desc.dropna()

# Create DataFrame
mol_desc_df = pd.DataFrame(mol_desc.tolist(), columns=[
    'SlogP_VSA5', 'BCUT2D_CHGHI', 'SMR_VSA5', 'MinEStateIndex', 'Chi4v'
])

In [33]:
def compute_protein_descriptors(seq):
    try:
        analyzed_seq = ProteinAnalysis(seq)
        return [
            analyzed_seq.molecular_weight(),
            analyzed_seq.aromaticity(),
            analyzed_seq.instability_index(),
            analyzed_seq.gravy(),
            analyzed_seq.isoelectric_point()
        ]
    except:
        return None

# Only apply on rows with valid SMILES descriptors
prot_desc = df.loc[mol_desc.index, 'target_sequence'].apply(compute_protein_descriptors)
prot_desc = prot_desc.dropna()

# Create DataFrame
prot_desc_df = pd.DataFrame(prot_desc.tolist(), columns=[
    'Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy', 'IsoelectricPoint'
])

In [34]:
# --- Final dataset ---
kd_filtered = df.loc[prot_desc.index, 'Kd'].reset_index(drop=True)
final_df = pd.concat([mol_desc_df.reset_index(drop=True), prot_desc_df.reset_index(drop=True), kd_filtered], axis=1)
final_df.columns = list(final_df.columns[:-1]) + ['Kd']

In [35]:
final_df.isna().sum()

SlogP_VSA5           0
BCUT2D_CHGHI        13
SMR_VSA5             0
MinEStateIndex       0
Chi4v                0
Prot_MW              1
Aromaticity          1
Instability          1
Hydropathy           1
IsoelectricPoint     1
Kd                   1
dtype: int64

In [36]:
final_df = final_df.dropna()

In [37]:
len(final_df)

6099

In [38]:
# Save to CSV
final_df.to_csv("descriptor_test.csv", index=False)