In [None]:
# Load sdf files and merge

from rdkit import Chem
import pandas as pd

def load_sdf_to_df(sdf_path):
    """Read an SDF file into a DataFrame with one row per readable molecule.

    Parameters
    ----------
    sdf_path : str
        Path of the SDF file; passed unchanged to ``Chem.SDMolSupplier``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``smiles``.

        * ``name`` is the ``Name`` SD property when the record has one.
          Otherwise it falls back to the record's title line, which RDKit
          exposes as the ``_Name`` property, and finally to ``None``.
        * ``smiles`` is ``Chem.MolToSmiles(mol)``.

        Both columns are present even when no molecule could be read, so
        callers can always select or de-duplicate on ``smiles``.
    """
    suppl = Chem.SDMolSupplier(sdf_path)
    records = []
    for mol in suppl:
        # Entries the supplier hands back as None (RDKit does this for
        # records it cannot parse) carry no structure; skip them.
        if mol is None:
            continue
        if mol.HasProp('Name'):
            name = mol.GetProp('Name')
        elif mol.HasProp('_Name') and mol.GetProp('_Name').strip():
            # No explicit 'Name' tag: use the non-blank title line instead.
            name = mol.GetProp('_Name').strip()
        else:
            name = None
        records.append({'name': name, 'smiles': Chem.MolToSmiles(mol)})
    # Fix the column set explicitly: an empty `records` list would otherwise
    # produce a frame with no columns at all.
    return pd.DataFrame(records, columns=['name', 'smiles'])

# Enamine source libraries, listed in the order they are merged.
enamine_sdf_paths = (
    '../data/enamine_kinase_library.sdf',
    '../data/enamine_hit_locator_library_460k.sdf',
    '../data/enamine_dds_50k.sdf',
    '../data/enamine_dds_10k.sdf',
)
df1, df2, df3, df4 = [load_sdf_to_df(path) for path in enamine_sdf_paths]

# Stack the four tables, then keep the first row seen for each SMILES string.
df_enamine = pd.concat(
    [df1, df2, df3, df4], ignore_index=True
).drop_duplicates(subset='smiles')

# Quick look at the first rows of the merged table.
print(df_enamine.head())

# Write the merged table to CSV (index column omitted).
csv_path = '../data/enamine_extracted.csv'
df_enamine.to_csv(csv_path, index=False)
print(f"Saved extracted Enamine data to {csv_path}")



In [7]:
# Sanity check: (rows, columns) of the merged, de-duplicated Enamine table.
df_enamine.shape

(561590, 2)

In [2]:
## The name column is empty, so fill it with sequential IDs of the form "EN######": the constant prefix "EN" (for Enamine) followed by the zero-padded row number


# Read the merged compound table back from disk.
df = pd.read_csv('../data/enamine_extracted.csv')

# Minimum width of the numeric part of an ID; shorter numbers are
# left-padded with zeros (e.g. 6 -> EN000042).
num_digits = 6

# Overwrite 'name' with "EN" followed by the zero-padded row position,
# counting from 0.
df['name'] = [f"EN{row:0{num_digits}d}" for row in range(len(df))]

# Store the table with IDs in a separate CSV file (index column omitted).
df.to_csv('../data/enamine_extracted_with_ID.csv', index=False)

In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from tqdm import tqdm
import joblib

# --- Step 1: Load dataset with SMILES ---
# Compound table carrying the generated IDs and one SMILES string per row.
df = pd.read_csv('../data/enamine_extracted_with_ID.csv')

# --- Step 2: Initialize Morgan fingerprint generator ---
# A single generator (radius 2, 2048 bits) is reused for every molecule.
generator = GetMorganGenerator(radius=2, fpSize=2048)

# --- Step 3: Define descriptor names and calculation functions ---
# Listed in the same order as the values returned by calc_descriptors.
descriptor_names = [
    'MolWt',
    'MolLogP',
    'NumRotatableBonds',
    'NumHAcceptors',
    'NumHDonors',
    'TPSA',
    'RingCount',
]

def calc_descriptors(smiles):
    """Compute the descriptors named in ``descriptor_names`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string. A value that is not a string (for example the float
        NaN pandas produces for an empty CSV cell, or ``None``) is handled
        like a SMILES that cannot be parsed.

    Returns
    -------
    list
        ``[MolWt, MolLogP, NumRotatableBonds, NumHAcceptors, NumHDonors,
        TPSA, RingCount]`` for a parsable molecule; otherwise a list of
        ``np.nan`` with ``len(descriptor_names)`` entries.
    """
    # MolFromSmiles raises on non-string arguments instead of returning None,
    # so screen those out first; one missing value must not abort a long run.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [np.nan] * len(descriptor_names)
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.TPSA(mol),
        Descriptors.RingCount(mol)
    ]

def smiles_to_morgan_fp(smiles):
    """Turn one SMILES string into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES string. A value that is not a string (for example the float
        NaN pandas produces for an empty CSV cell, or ``None``) is handled
        like a SMILES that cannot be parsed.

    Returns
    -------
    numpy.ndarray or None
        ``np.array`` of the fingerprint produced by the module-level
        ``generator``, or ``None`` when no molecule could be built.
    """
    # MolFromSmiles raises on non-string arguments instead of returning None,
    # so screen those out first and report them as "no fingerprint".
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return None
    fp = generator.GetFingerprint(mol)
    return np.array(fp)

# --- Step 4: Generate features ---

# Parallel lists: entry k of each list belongs to the same kept molecule.
morgan_fps = []
descriptor_features = []
valid_indices = []

print("Generating Morgan fingerprints and descriptors...")
for row_pos, smiles_str in enumerate(tqdm(df['smiles'], desc='Processing molecules')):
    fingerprint = smiles_to_morgan_fp(smiles_str)
    descriptors = calc_descriptors(smiles_str)
    # Drop the molecule when it has no fingerprint or any descriptor is NaN.
    if fingerprint is None or np.isnan(descriptors).any():
        continue
    morgan_fps.append(fingerprint)
    descriptor_features.append(descriptors)
    valid_indices.append(row_pos)

# Rows of `df` that produced a complete feature vector, re-indexed from 0.
df_valid = df.iloc[valid_indices].reset_index(drop=True)

# Fingerprint bits stored as uint8; descriptors keep numpy's inferred dtype.
X_fp = np.array(morgan_fps, dtype=np.uint8)
X_desc = np.array(descriptor_features)

# Feature matrix: fingerprint columns first, descriptor columns after them.
X_combined = np.concatenate((X_fp, X_desc), axis=1)

print(f"Feature matrix shape: {X_combined.shape}")

Generating Morgan fingerprints and descriptors...


Processing molecules: 100%|██████████| 561590/561590 [27:19<00:00, 342.44it/s]


Feature matrix shape: (561590, 2055)


In [4]:
# --- Step 5: Load trained Random Forest model ---
# NOTE: joblib.load unpickles the file, so only load model files you trust.
rf_model = joblib.load('../models/random_forest_model.pkl')  # Update path accordingly

# --- Step 6: Predict and filter actives ---
print("Predicting activity...")
predictions = rf_model.predict(X_combined)

# A prediction equal to 0 is labelled 'inactive'; anything else 'active'.
labels = ['active' if p != 0 else 'inactive' for p in predictions]

# Attach the labels to the rows the feature matrix was built from.
df_valid['predicted_activity'] = labels

# Keep the predicted actives, then only the columns needed downstream.
df_actives = df_valid.loc[df_valid['predicted_activity'].eq('active')]
df_actives_filtered = df_actives.loc[:, ['name', 'smiles', 'predicted_activity']]

# --- Step 7: Save results ---
output_csv = '../data/predicted_enamine_actives.csv'
df_actives_filtered.to_csv(output_csv, index=False)

print(f"Saved {len(df_actives_filtered)} predicted active compounds to '{output_csv}'")

Predicting activity...
Saved 181136 predicted active compounds to '../data/predicted_enamine_actives.csv'


In [5]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm


# --- Step 4: Calculate descriptors ---
# Descriptor column names, in the same order as the values returned by
# calc_descriptors below.
descriptor_names = [
    'MolWt', 'MolLogP', 'NumRotatableBonds',
    'NumHAcceptors', 'NumHDonors', 'TPSA', 'RingCount'
]

def calc_descriptors(smiles):
    """Compute the descriptors named in ``descriptor_names`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string. A value that is not a string (for example the float
        NaN pandas produces for an empty CSV cell, or ``None``) is handled
        like a SMILES that cannot be parsed.

    Returns
    -------
    list
        ``[MolWt, MolLogP, NumRotatableBonds, NumHAcceptors, NumHDonors,
        TPSA, RingCount]`` for a parsable molecule; otherwise a list of
        ``np.nan`` with ``len(descriptor_names)`` entries.
    """
    # MolFromSmiles raises on non-string arguments instead of returning None,
    # so screen those out first; one missing value must not abort the run.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [np.nan] * len(descriptor_names)
    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.TPSA(mol),
        Descriptors.RingCount(mol)
    ]

print("Calculating descriptors for filtered compounds...")
desc_list = [calc_descriptors(smi) for smi in tqdm(df_actives_filtered['smiles'])]

# One descriptor row per predicted active, in the same order as the input.
df_desc = pd.DataFrame(desc_list, columns=descriptor_names)

# Put each compound next to its descriptors. The compound table is re-indexed
# from 0 so that it lines up row-by-row with df_desc in the concat.
df_filtered = df_actives_filtered.reset_index(drop=True)
df_final = pd.concat([df_filtered, df_desc], axis=1)

# Lipinski's Rule of 5: keep only compounds that satisfy all four limits
#   molecular weight <= 500, LogP <= 5,
#   hydrogen bond donors <= 5, hydrogen bond acceptors <= 10
lipinski_filter = (
    df_final['MolWt'].le(500)
    & df_final['MolLogP'].le(5)
    & df_final['NumHDonors'].le(5)
    & df_final['NumHAcceptors'].le(10)
)

df_lipinski = df_final.loc[lipinski_filter].reset_index(drop=True)

print(f"Count after Lipinski's Rule of 5 filter: {len(df_lipinski)}")

# Write the filtered compounds and their descriptors to CSV (index omitted).
output_path = '../data/filtered_enamine_actives.csv'
df_lipinski.to_csv(output_path, index=False)
print(f"Final filtered dataset saved to: {output_path}")


Calculating descriptors for filtered compounds...


100%|██████████| 181136/181136 [03:16<00:00, 920.95it/s] 


Count after Lipinski's Rule of 5 filter: 180467
Final filtered dataset saved to: ../data/filtered_enamine_actives.csv
