### Derive Chemistry
- Project: DSA103 derive annotation chemistry from R script
- Author: Julian Stoerr
- Date: 30 November 2025

In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from rdkit.Chem import Descriptors

Load data and inspect head

In [None]:
tropical_df = pd.read_csv("../data/mtbs_tropical_annotations.tsv",sep="\t")
tropical_df.head(3)

Unnamed: 0,feature_id,component_id,libname,structure_inchikey,structure_smiles,structure_molecular_formula,structure_taxonomy_npclassifier_01pathway,structure_taxonomy_npclassifier_02superclass,structure_taxonomy_npclassifier_03class
0,64000,2173,MS1_match,SRBFZHDQGSBBOR-HWQSCIPKSA-N,OC1OC[C@H](O)[C@H](O)[C@H]1O,C5H10O5,Carbohydrates,Saccharides,Monosaccharides
1,64000,2173,MS1_match,PYMYPHUHKUWMLA-WDCZJNDASA-N,O=C[C@@H](O)[C@H](O)[C@H](O)CO,C5H10O5,Carbohydrates,Saccharides,Monosaccharides
2,63994,2138,MS1_match,GLDOVTGHNKAZLK-UHFFFAOYSA-N,CCCCCCCCCCCCCCCCCCO,C18H38O,Fatty acids,Fatty acyls,Fatty alcohols


Create mask for is smiles ok

In [24]:
mask = (
    tropical_df["structure_smiles"].notna() & 
    tropical_df["structure_smiles"].ne("")
)

compounds_ok = tropical_df[mask]


In [25]:
compounds_ok.groupby("structure_smiles")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002877E4E7490>

In [26]:
def most_common(series):
    counts = series.value_counts(dropna=True)
    if len(counts) == 0:
        return None
    else:
        return counts.idxmax()

In [27]:
compReady = (
    compounds_ok
    .groupby("structure_smiles")["structure_taxonomy_npclassifier_01pathway"]
    .agg(most_common)
    .reset_index()
)


In [28]:
print(compReady.shape)
compReady.head(5)


(24046, 2)


Unnamed: 0,structure_smiles,structure_taxonomy_npclassifier_01pathway
0,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids
1,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids
2,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids
3,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids
4,C#CC#CC=CC=CC=CCCO,Fatty acids


In [29]:
compReady["SID"] = "S" + (compReady.index + 1).astype(str)

In [30]:
compReady.head(5)

Unnamed: 0,structure_smiles,structure_taxonomy_npclassifier_01pathway,SID
0,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids,S1
1,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids,S2
2,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids,S3
3,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids,S4
4,C#CC#CC=CC=CC=CCCO,Fatty acids,S5


In [42]:
#do not use these descriptors:   redundant <- c(2, 7, 8, 11, 15, 17, 18, 20, 21, 24, 29, 33:38, 41, 43:45)

redundant = (
    [2, 7, 8, 11, 15, 17, 18, 20, 21, 24, 29] +
    list(range(33, 39)) +
    [41] +
    list(range(43, 46))
)
redundant = [i - 1 for i in redundant]

all_desc = list(Descriptors.descList)

desc_used = []

for id, name in enumerate(all_desc):
    if id not in redundant:
        desc_used.append(name)
print(desc_used)



[('MaxAbsEStateIndex', <function MaxAbsEStateIndex at 0x00000287052D3600>), ('MinAbsEStateIndex', <function MinAbsEStateIndex at 0x00000287052D36A0>), ('MinEStateIndex', <function MinEStateIndex at 0x00000287052D3560>), ('qed', <function qed at 0x0000028704752480>), ('SPS', <function SPS at 0x0000028704752A20>), ('ExactMolWt', <function <lambda> at 0x0000028704753240>), ('NumValenceElectrons', <function NumValenceElectrons at 0x00000287047532E0>), ('MaxPartialCharge', <function MaxPartialCharge at 0x00000287047534C0>), ('MinPartialCharge', <function MinPartialCharge at 0x0000028704753560>), ('MaxAbsPartialCharge', <function MaxAbsPartialCharge at 0x0000028704753600>), ('FpDensityMorgan1', <function FpDensityMorgan1 at 0x0000028704753880>), ('BCUT2D_MWHI', <function BCUT2D_MWHI at 0x0000028704753A60>), ('BCUT2D_CHGLO', <function BCUT2D_CHGLO at 0x0000028704753C40>), ('BCUT2D_LOGPHI', <function BCUT2D_LOGPHI at 0x0000028704753CE0>), ('BCUT2D_MRHI', <function BCUT2D_MRHI at 0x000002870475

In [44]:
from rdkit import Chem
compReady["mol"] = compReady["structure_smiles"].apply(Chem.MolFromSmiles)

In [None]:
rows = []

for mol in compReady["mol"]:
    row = {}
    for name, func in desc_used:
        row[name] = func(mol)

    rows.append(row)

desc_df = pd.DataFrame(rows)
#desc_df.head(5)


Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,ExactMolWt,NumValenceElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,5.138392,0.937871,0.937871,0.332032,10.636364,292.219101,116,0.009513,-0.115306,0.115306,...,0,0,0,2,0,0,0,0,8,0
1,5.223961,0.943318,0.943318,0.309396,9.809524,282.234751,114,0.009159,-0.120109,0.120109,...,0,0,0,2,0,0,0,0,11,0
2,8.490439,0.043231,-0.043231,0.492037,9.043478,308.214016,122,0.10373,-0.383656,0.383656,...,0,0,0,1,0,0,0,0,8,0
3,11.886906,0.190474,0.190474,0.285907,14.772727,297.209264,118,0.245728,-0.339176,0.339176,...,0,0,0,1,0,0,0,0,4,0
4,8.439692,0.188318,0.188318,0.504961,10.461538,172.088815,66,0.046513,-0.396054,0.396054,...,0,0,0,1,0,0,0,0,0,0


In [49]:
compOut = pd.concat(
    [compReady.reset_index(drop=True), desc_df],
    axis=1
)
print(compOut.shape)
compOut.head(5)


(24046, 200)


Unnamed: 0,structure_smiles,structure_taxonomy_npclassifier_01pathway,SID,mol,MaxAbsEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C#C/C=C\CCCC#C/C=C/CCCCCCC/C=C\C#C,Fatty acids,S1,<rdkit.Chem.rdchem.Mol object at 0x00000287082...,5.138392,0.937871,0.937871,0.332032,10.636364,292.219101,...,0,0,0,2,0,0,0,0,8,0
1,C#C/C=C\CCCC#CCCCCCCCCCCC#C,Fatty acids,S2,<rdkit.Chem.rdchem.Mol object at 0x00000287082...,5.223961,0.943318,0.943318,0.309396,9.809524,282.234751,...,0,0,0,2,0,0,0,0,11,0
2,C#C/C=C\CCCCC#CCCCCC#CCCCC#CCO,Fatty acids,S3,<rdkit.Chem.rdchem.Mol object at 0x00000287082...,8.490439,0.043231,-0.043231,0.492037,9.043478,308.214016,...,0,0,0,1,0,0,0,0,8,0
3,C#CC#CC/C=C/CCCCC/C=C/C(=O)N1CCCCC1,Alkaloids,S4,<rdkit.Chem.rdchem.Mol object at 0x00000287082...,11.886906,0.190474,0.190474,0.285907,14.772727,297.209264,...,0,0,0,1,0,0,0,0,4,0
4,C#CC#CC=CC=CC=CCCO,Fatty acids,S5,<rdkit.Chem.rdchem.Mol object at 0x00000287082...,8.439692,0.188318,0.188318,0.504961,10.461538,172.088815,...,0,0,0,1,0,0,0,0,0,0
