In [43]:
## Load libraries
import pandas as pd
from rdkit import Chem

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from tqdm import tqdm
import joblib


In [38]:
#### Import datasets
# Each source is reduced to an identifier column and a SMILES column, renamed to
# the shared headers 'ID' / 'SMILES', and finally stacked into one DataFrame.

### COCONUT database
coco = pd.read_csv("../../coconut_csv-06-2025.csv", usecols=['identifier', 'canonical_smiles'])

## SuperNaT 3.0 database (semicolon-separated); rows without a SMILES are removed.
## Bound to `supernat` rather than `super` so the Python builtin `super` is not shadowed.
supernat = pd.read_csv("../../full_data_download.csv", sep=';', usecols=['id', 'smiles'])
supernat = supernat[supernat['smiles'].notnull()]

## Argentinian database
Argen = pd.read_csv("../../NaturAr_query.csv", usecols=['NatID', 'SMILES'])

## Afrodatabase: shipped as a .smi file ("<SMILES> <name>" per line), so it is parsed
## by hand into a DataFrame that keeps only the ID and the SMILES.
file_path = '../../smiles_unique_all.smi'

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Split on the first run of whitespace only, so the name may itself contain spaces
        parts = line.strip().split(maxsplit=1)
        if not parts:
            # Blank line: there is no SMILES to record (parts[0] would raise IndexError)
            continue
        smiles = parts[0]
        name = parts[1] if len(parts) > 1 else ''  # Handle lines with no name
        data.append({'SMILES': smiles, 'ID': name})

# `columns=` fixes the column order and keeps both headers even if the file is empty
afro = pd.DataFrame(data, columns=['ID', 'SMILES'])


## Standardize the headers.
# Rename by source column name, not by position: read_csv returns the selected
# columns in file order regardless of the order given in `usecols`, so a positional
# rename could silently swap ID and SMILES. The trailing selection pins the order.
coco = coco.rename(columns={'identifier': 'ID', 'canonical_smiles': 'SMILES'})[['ID', 'SMILES']]
supernat = supernat.rename(columns={'id': 'ID', 'smiles': 'SMILES'})[['ID', 'SMILES']]
Argen = Argen.rename(columns={'NatID': 'ID'})[['ID', 'SMILES']]
# `afro` already carries the standard headers.


## Combine all sources into a single DataFrame with a fresh 0..n-1 index
df = pd.concat([coco, supernat, Argen, afro], ignore_index=True)




In [None]:
## Canonicalize the SMILES, then drop entries that could not be canonicalized and duplicates to obtain the final compound list

def canonicalize_smiles(smiles):
    """Return the RDKit canonical SMILES for `smiles`, or None if it is unusable.

    Parameters
    ----------
    smiles : object
        Normally a SMILES string. Missing cells of a DataFrame column reach this
        function as float NaN (or None) when it is applied to the column.

    Returns
    -------
    str or None
        The canonical SMILES written by ``Chem.MolToSmiles``; None when the
        input is not a string, is blank, or cannot be parsed by RDKit.
    """
    # Non-string values (NaN / None from missing cells) are not valid input for
    # Chem.MolFromSmiles, and a blank string carries no structure; treat both
    # the same way as an unparsable SMILES so callers can drop them uniformly.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # invalid SMILES, skip
    return Chem.MolToSmiles(mol, canonical=True)

# Canonicalize every SMILES; entries that cannot be canonicalized come back as None
df['Canonical_SMILES'] = df['SMILES'].map(canonicalize_smiles)

# Keep only the rows for which canonicalization produced a value
has_canonical = df['Canonical_SMILES'].notna()
filtered_df = df[has_canonical].copy()

# The canonical form replaces the SMILES as originally supplied by the source
filtered_df['SMILES'] = filtered_df['Canonical_SMILES']

# Keep the first occurrence of each canonical SMILES, then discard the helper column
is_repeat = filtered_df.duplicated(subset=['Canonical_SMILES'])
final_df = filtered_df[~is_repeat].drop(columns=['Canonical_SMILES'])

print(final_df)


In [42]:
## Write the de-duplicated compound table to CSV (the DataFrame index is not written)
final_df.to_csv(path_or_buf='../data/Natural_product_cpds.csv', index=False)
print('saved to Natural_product_cpds.csv')

saved to Natural_product_cpds.csv


In [45]:
## it is time to generate descriptors and morgan fingerprints

# --- Step 1: Load dataset with SMILES ---
# Reads the compound table from the same path the save step above writes to.
df = pd.read_csv(filepath_or_buffer='../data/Natural_product_cpds.csv')

# --- Step 2: Initialize Morgan fingerprint generator ---
# One shared generator: Morgan fingerprints of radius 2 with a size of 2048.
generator = GetMorganGenerator(fpSize=2048, radius=2)

# --- Step 3: Define descriptor names and calculation functions ---

# Names of the RDKit `Descriptors` functions to evaluate. The order of this list
# is the order of the values returned by `calc_descriptors`.
descriptor_names = [
    'MolWt', 'MolLogP', 'NumRotatableBonds',
    'NumHAcceptors', 'NumHDonors', 'TPSA', 'RingCount'
]

def calc_descriptors(smiles):
    """Compute the descriptors listed in `descriptor_names` for one SMILES.

    Parameters
    ----------
    smiles : object
        Normally a SMILES string. A missing value (e.g. the float NaN pandas
        uses for an empty CSV cell) is accepted and handled as invalid.

    Returns
    -------
    list
        One value per entry of `descriptor_names`, in the same order. Every
        value is NaN when the input is not a string or RDKit cannot parse it.
    """
    # Non-string input is not valid for Chem.MolFromSmiles; report it the same
    # way as an unparsable SMILES instead of letting the call fail.
    if not isinstance(smiles, str):
        return [np.nan] * len(descriptor_names)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [np.nan] * len(descriptor_names)
    # Look each function up by name so the values can never drift out of sync
    # with `descriptor_names`.
    return [getattr(Descriptors, name)(mol) for name in descriptor_names]

def smiles_to_morgan_fp(smiles):
    """Convert one SMILES into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : object
        Normally a SMILES string. A missing value (e.g. the float NaN pandas
        uses for an empty CSV cell) is accepted and handled as invalid.

    Returns
    -------
    numpy.ndarray or None
        ``np.array`` of the fingerprint produced by the module-level
        `generator`; None when the input is not a string or RDKit cannot
        parse it.
    """
    # Non-string input is not valid for Chem.MolFromSmiles; treat it like an
    # unparsable SMILES so the caller's `fp is not None` check skips the row.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = generator.GetFingerprint(mol)
    return np.array(fp)

# --- Step 4: Generate features ---

# Three parallel accumulators: entry k of each list belongs to the same molecule.
morgan_fps, descriptor_features, valid_indices = [], [], []

print("Generating Morgan fingerprints and descriptors...")
progress = tqdm(df['SMILES'], desc='Processing molecules')
for row_pos, smiles_str in enumerate(progress):
    fingerprint = smiles_to_morgan_fp(smiles_str)
    descriptors = calc_descriptors(smiles_str)
    # Skip the molecule if the fingerprint is missing or any descriptor is NaN
    if fingerprint is None or any(np.isnan(descriptors)):
        continue
    morgan_fps.append(fingerprint)
    descriptor_features.append(descriptors)
    valid_indices.append(row_pos)

# Rows of `df` that produced features, renumbered so they line up with the matrices
df_valid = df.iloc[valid_indices].reset_index(drop=True)

X_fp = np.array(morgan_fps)
X_desc = np.array(descriptor_features)

# Feature matrix: fingerprint columns first, descriptor columns after them
X_combined = np.concatenate((X_fp, X_desc), axis=1)

print(f"Feature matrix shape: {X_combined.shape}")

Generating Morgan fingerprints and descriptors...


Processing molecules:  30%|██▉       | 398573/1329645 [20:42<1:40:19, 154.68it/s][03:06:04] Unusual charge on atom 37 number of radical electrons set to zero
[03:06:04] Unusual charge on atom 37 number of radical electrons set to zero
Processing molecules: 100%|██████████| 1329645/1329645 [1:10:46<00:00, 313.14it/s]


Feature matrix shape: (1329645, 2055)


In [48]:
# --- Step 5: Load trained Random Forest model ---

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
rf_model = joblib.load('../models/random_forest_model.pkl')  # Update path accordingly

# --- Step 6: Predict and filter actives ---

print("Predicting activity...")
predictions = rf_model.predict(X_combined)

# A prediction equal to 0 is labelled 'inactive'; every other value is 'active'
labels = ['active' if p != 0 else 'inactive' for p in predictions]

# Attach the labels to the valid-molecule table (adds a column to df_valid)
df_valid['predicted_activity'] = labels

# Keep only the rows predicted active
is_active = df_valid['predicted_activity'].eq('active')
df_actives = df_valid[is_active]

# Restrict the result to the identifier, structure and label columns
wanted_columns = ['ID', 'SMILES', 'predicted_activity']
df_actives_filtered = df_actives[wanted_columns]

# --- Step 7: Save results ---
# Saving of the predicted actives is currently disabled (commented out below).

##output_csv = '../data/predicted_actives.csv'
##df_actives_filtered.to_csv(output_csv, index=False)

##print(f"Saved {len(df_actives_filtered)} predicted active compounds to '{output_csv}'")

Predicting activity...


In [49]:
# --- Step 4: Calculate descriptors ---
# NOTE: `descriptor_names` and `calc_descriptors` are also defined in the earlier
# feature-generation cell. This later definition is the one in effect from here
# on, so the two copies must be kept identical.
descriptor_names = [
    'MolWt', 'MolLogP', 'NumRotatableBonds',
    'NumHAcceptors', 'NumHDonors', 'TPSA', 'RingCount'
]

def calc_descriptors(smiles):
    """Compute the descriptors listed in `descriptor_names` for one SMILES.

    Parameters
    ----------
    smiles : object
        Normally a SMILES string. A missing value (e.g. the float NaN pandas
        uses for an empty CSV cell) is accepted and handled as invalid.

    Returns
    -------
    list
        One value per entry of `descriptor_names`, in the same order. Every
        value is NaN when the input is not a string or RDKit cannot parse it.
    """
    # Non-string input is not valid for Chem.MolFromSmiles; report it the same
    # way as an unparsable SMILES instead of letting the call fail.
    if not isinstance(smiles, str):
        return [np.nan] * len(descriptor_names)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [np.nan] * len(descriptor_names)
    # Look each function up by name so the values can never drift out of sync
    # with `descriptor_names`.
    return [getattr(Descriptors, name)(mol) for name in descriptor_names]

print("Calculating descriptors for filtered compounds...")
# One descriptor row per predicted-active compound, in the order of the table
desc_list = [calc_descriptors(smi) for smi in tqdm(df_actives_filtered['SMILES'])]

df_desc = pd.DataFrame(desc_list, columns=descriptor_names)

# --- Step 5: Combine descriptors with filtered dataframe ---
# Renumber the actives from 0 so their rows line up with df_desc's default index
df_filtered = df_actives_filtered.reset_index(drop=True)
df_final = pd.concat([df_filtered, df_desc], axis='columns')

# --- Step 6: Apply Lipinski's Rule of 5 filter ---
# A compound is kept only if it satisfies all four limits at once:
#   molecular weight        <= 500
#   LogP                    <= 5
#   hydrogen bond donors    <= 5
#   hydrogen bond acceptors <= 10
# A NaN descriptor compares as False, so such rows are removed as well.
lipinski_filter = (
    df_final['MolWt'].le(500)
    & df_final['MolLogP'].le(5)
    & df_final['NumHDonors'].le(5)
    & df_final['NumHAcceptors'].le(10)
)

df_lipinski = df_final.loc[lipinski_filter].reset_index(drop=True)

print(f"Count after Lipinski's Rule of 5 filter: {len(df_lipinski)}")

Calculating descriptors for filtered compounds...


 24%|██▍       | 82138/343875 [01:29<08:44, 499.38it/s][05:15:39] Unusual charge on atom 37 number of radical electrons set to zero
100%|██████████| 343875/343875 [07:01<00:00, 816.44it/s] 


Count after Lipinski's Rule of 5 filter: 139743


In [51]:
# --- Step 7: Save the final filtered dataset ---
# Writes the Lipinski-filtered actives to CSV; the DataFrame index is not written.
output_path = '../data/filtered_Natural_Product_actives.csv'
df_lipinski.to_csv(path_or_buf=output_path, index=False)
print(f"Final filtered dataset saved to: {output_path}")

Final filtered dataset saved to: ../data/filtered_Natural_Product_actives.csv
