In [39]:
## Load libraries
import pandas as pd
from rdkit import Chem


In [38]:
#### Import datasets

### COCONUT database: only the identifier and canonical SMILES columns are read
coco = pd.read_csv("../../coconut_csv-06-2025.csv", usecols=['identifier', 'canonical_smiles'])

## SuperNaT 3.0 database (semicolon-separated); rows without a SMILES are dropped.
## Bound to `supernat` instead of `super` so the built-in `super` is not shadowed.
supernat = pd.read_csv("../../full_data_download.csv", sep=';', usecols=['id', 'smiles'])
supernat = supernat[supernat['smiles'].notnull()]

## Argentinian database
Argen = pd.read_csv("../../NaturAr_query.csv", usecols=['NatID', 'SMILES'])

## Afro database: shipped as a .smi file, so it is parsed by hand into a frame
## holding only the IDs and SMILES. Each line is read as a SMILES followed by
## an optional name.
file_path = '../../smiles_unique_all.smi'

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Split on the first run of whitespace only, so names containing spaces stay intact
        parts = line.strip().split(maxsplit=1)
        if not parts:
            continue  # blank line: nothing to record (parts[0] would raise IndexError)
        smiles = parts[0]
        name = parts[1] if len(parts) > 1 else ''  # lines with no name get an empty ID
        data.append({'SMILES': smiles, 'ID': name})

# Passing `columns` fixes the column order and still yields both columns
# when the file contained no usable lines.
afro = pd.DataFrame(data, columns=['ID', 'SMILES'])


## Standardize the headers to ID / SMILES before concatenating.
# Columns are renamed by name, not by position: `usecols` does not control the
# column order of the frame read_csv returns (the file's own order is kept), so
# a positional rename could swap the ID and SMILES labels.
coco = coco.rename(columns={'identifier': 'ID', 'canonical_smiles': 'SMILES'})[['ID', 'SMILES']]
supernat = supernat.rename(columns={'id': 'ID', 'smiles': 'SMILES'})[['ID', 'SMILES']]
Argen = Argen.rename(columns={'NatID': 'ID'})[['ID', 'SMILES']]
# afro was built with the ID / SMILES headers already, so it needs no renaming.


## Combine the four sources into a single dataframe
df = pd.concat([coco, supernat, Argen, afro], ignore_index=True)




In [None]:
## Canonicalize the SMILES, then drop the entries that could not be canonicalized (NaN) and the duplicates to get the final compound list

def canonicalize_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or None if it is unusable.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize. Non-string values (for example the
        float NaN pandas uses for missing cells) and empty or whitespace-only
        strings are treated as invalid.

    Returns
    -------
    str or None
        The canonical SMILES, or None when the input is missing, blank, or
        cannot be parsed by RDKit.
    """
    # Missing cells reach this function as float NaN. RDKit rejects a
    # non-string argument with an error instead of returning None, which would
    # abort the whole `.apply`, so such values are filtered out up front.
    # Blank strings describe no compound and are treated the same way.
    if not isinstance(smiles, str) or not smiles.strip():
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # RDKit could not parse the SMILES
    return Chem.MolToSmiles(mol, canonical=True)

# Canonicalize every SMILES; entries that cannot be canonicalized come back as None
df['Canonical_SMILES'] = df['SMILES'].apply(canonicalize_smiles)

# Keep only the rows that have a canonical form, working on an independent copy
has_canonical = df['Canonical_SMILES'].notna()
filtered_df = df[has_canonical].copy()

# Overwrite the source SMILES with its canonical form
filtered_df['SMILES'] = filtered_df['Canonical_SMILES']

# One row per unique canonical SMILES (the first occurrence is kept),
# then discard the helper column
final_df = (
    filtered_df
    .drop_duplicates(subset=['Canonical_SMILES'])
    .drop(columns=['Canonical_SMILES'])
)

print(final_df)


In [42]:
## Write the final compound list to CSV, leaving out the dataframe index
final_df.to_csv(path_or_buf='../data/Natural_product_cpds.csv', index=False)
print('saved to Natural_product_cpds.csv')

saved to Natural_product_cpds.csv
