In [2]:
from rdkit import Chem
import pandas as pd
from rdkit import RDConfig
import os

In [3]:
# SMARTS pattern for ester
ester_smarts = '[#6][C](=O)O[#6]'
ester_pattern = Chem.MolFromSmarts(ester_smarts)

# Load your CSV file (make sure it has a 'smiles' column)
df = pd.read_csv('C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/(Saturated)SoS_Full.csv', encoding='ISO-8859-1')

# Function to check if SMILES has an ester
def has_ester(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    return mol.HasSubstructMatch(ester_pattern)

# Apply check and collect results
df['has_ester'] = df['SMILES'].apply(has_ester)
ester_df = df[df['has_ester']]

# Print summary
print(f"Number of molecules with ester: {len(ester_df)}\n")
print("Molecules containing ester groups:\n")
for idx, row in ester_df.iterrows():
    print(row['SMILES'])




Number of molecules with ester: 1199

Molecules containing ester groups:

C/C(=C/CC[C@@]1(C2C[C@H]3C1([C@H]3C2)C)C)/COC(=O)C
C/C=C(/C)\C(=O)OCC(C)C
C/C=C(/C)\C(=O)OCCC(C)C=C
C/C=C(/C)C(=O)OCCC(C)C
C/C=C(/C)C(=O)OCCc1ccccc1
C/C=C(\C)/C(=O)OCC=C(C)C
C/C=C(\C)/COC(=O)/C(=C/C)/C
C/C=C/C(=O)OC(C)(C)CC1=CC=CC=C1
C/C=C/C(=O)OC(C)C1CCCCC1
C/C=C/C(=O)OC(C)CC(C)C
C/C=C/C(=O)OC/C=C(C)/CCC=C(C)C
C/C=C/C(=O)OC1CCCC1C2CCCC2
C/C=C/C(=O)OC1CCCCC1
C/C=C/C(=O)OCC(C)C
C/C=C/C(=O)OCc1ccccc1
C/C=C/C=C/C(=O)OCC=C
C/C=C/C=C/C(=O)OCC=C
C/C=C/C=C/COC(=O)C
C/C=C/C=C/COC(=O)C(C)C
C/C=C/C=C/COC(C)=O
C/C=C/C=C\COC(=O)C(C)C
C/C=C/C1=CC(=C(C=C1)OC(=O)C)OC
C/C=C/CCC1CCCC(=O)O1
C[C@@H]1CC[C@H]([C@@H](C1)OC(C)=O)C(C)=C
C[C@@H]1CC[C@H]2C(C)(C)[C@H]3C[C@@]12CC=C3COC(C)=O
C[C@@H]1CC[C@H]2C(C)(C)C3CC12CCC3(C)OC(C)=O
C[C@@H]1CCC2=C(C(=O)OC2=C1)C
C[C@H](C(=O)OCCC(C)CCC=C(C)C)O
C[C@H](C1=CC=CC=C1)OC(=O)C
C[C@H]1[C@@H]2CCC(=C[C@@H]2OC1=O)C
C[C@H]1CC[C@H](CC2=C1CC[C@@H]2C)C(C)(C)OC(C)=O
C[C@H]1CCC(=O)O1
C\C=C(/C)C(=O)OC/C=C/c1c

In [None]:
def parse_rdkit_functionalgroups():
    fg_path = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    fg_dict = {}

    with open(fg_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip comments, empty lines, or lines starting with '//'
            if not line or line.startswith('//') or line.startswith('#'):
                continue
            
            parts = line.split('\t')
            if len(parts) < 2:
                print(f"Skipping malformed line: {line}")
                continue
            
            label = parts[0].strip()
            smarts = parts[1].strip().lstrip('*-')  # Remove any leading '*-' from SMARTS
            
            fg_dict[label] = smarts
    
    return fg_dict

# Run the parser and show results
fg_dict = parse_rdkit_functionalgroups()
print(f"Total functional groups parsed: {len(fg_dict)}\n")

print("First 10 functional groups and their SMARTS:")
for name, smarts in list(fg_dict.items())[:38]:
    print(f"{name}: {smarts}")


Total functional groups parsed: 38

First 10 functional groups and their SMARTS:
-NC(=O)CH3: [N;D2]-[C;D3](=O)-[C;D1;H3]
-C(=O)O: C(=O)[O;D1]
-C(=O)OMe: C(=O)[O;D2]-[C;D1;H3]
-C(=O)H: C(=O)-[C;D1]
-C(=O)N: C(=O)-[N;D1]
-C(=O)CH3: C(=O)-[C;D1;H3]
-N=C=O: [N;D2]=[C;D2]=[O;D1]
-N=C=S: [N;D2]=[C;D2]=[S;D1]
-NO2: [N;D3](=[O;D1])[O;D1]
-N=O: [N;R0]=[O;D1]
=N-O: =[N;R0]-[O;D1]
=NCH3: =[N;R0]-[C;D1;H3]
-N=CH2: [N;R0]=[C;D1;H2]
-N=NCH3: [N;D2]=[N;D2]-[C;D1;H3]
-N=N: [N;D2]=[N;D1]
-N#N: [N;D2]#[N;D1]
-C#N: [C;D2]#[N;D1]
-SO2NH2: [S;D4](=[O;D1])(=[O;D1])-[N;D1]
-NHSO2CH3: [N;D2]-[S;D4](=[O;D1])(=[O;D1])-[C;D1;H3]
-SO3H: [S;D4](=O)(=O)-[O;D1]
-SO3CH3: [S;D4](=O)(=O)-[O;D2]-[C;D1;H3]
-SO2CH3: [S;D4](=O)(=O)-[C;D1;H3]
-SO2Cl: [S;D4](=O)(=O)-[Cl]
-SOCH3: [S;D3](=O)-[C;D1]
-SCH3: [S;D2]-[C;D1;H3]
-S: [S;D1]
=S: =[S;D1]
-X: 
-tBu: [C;D4]([C;D1])([C;D1])-[C;D1]
-CF3: 
-C#CH: [C;D2]#[C;D1;H]
-cPropyl: [C;D3]1-[C;D2]-[C;D2]1
-OEt: [O;D2]-[C;D2]-[C;D1;H3]
-OMe: [O;D2]-[C;D1;H3]
-O: [O;D1]
=O: =[O;D1]
-N: [

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

df = pd.read_csv('C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/(Saturated)SoS_Full.csv')

if 'SMILES' not in df.columns:
    raise ValueError("The CSV file must contain a 'SMILES' column.")
molecules = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]

FG_SMARTS_PATTERNS = {
    'CarboxylicAcid': Chem.MolFromSmarts('C(=O)[O,H]'),
    'Alcohol': Chem.MolFromSmarts('[#6][OH]'),
    'Amine': Chem.MolFromSmarts('[NX3;H2,H1;!$(NC=O)]'),
    'Methoxy': Chem.MolFromSmarts('CO'),
    'Aldehyde': Chem.MolFromSmarts('[CX3H1](=O)[#6]'),
    'Acetyl': Chem.MolFromSmarts('CC(=O)'),
    'Nitrile': Chem.MolFromSmarts('C#N'),
    'Tert-butyl': Chem.MolFromSmarts('C(C)(C)C'),
    'Thiol': Chem.MolFromSmarts('[#6][SH]'),
    'Thioether': Chem.MolFromSmarts('[#6][S][#6]'),
    'Carbonyl': Chem.MolFromSmarts('C=O'),
    'Ethoxy': Chem.MolFromSmarts('CCO'),
    'Ester': Chem.MolFromSmarts('C(=O)O'),
    'Terpenes': Chem.MolFromSmarts('C=C(C)C'),
    'Halogen': Chem.MolFromSmarts('[F,Cl,Br,I]')
}

FG_NAMES = list(FG_SMARTS_PATTERNS.keys())

def count_molecules_with_functional_groups(mols):
    fg_molecule_counts = {name: 0 for name in FG_NAMES}
    for mol in mols:
        if mol is None:
            continue
        for name in FG_NAMES:
            pattern = FG_SMARTS_PATTERNS[name]
            if mol.HasSubstructMatch(pattern):
                fg_molecule_counts[name] += 1
    return fg_molecule_counts

fg_counts = count_molecules_with_functional_groups(molecules)

print("Functional Group Counts (number of molecules containing each FG):")
for fg, count in fg_counts.items():
    print(f"{fg}: {count}")

df['Mol'] = molecules
def has_any_functional_group(mol):
    if mol is None:
        return False
    return any(mol.HasSubstructMatch(pattern) for pattern in FG_SMARTS_PATTERNS.values())

df['HasFG'] = df['Mol'].apply(has_any_functional_group)

df_with_fg = df[df['HasFG']]
df_without_fg = df[~df['HasFG']]

print("Functional Group Summary:")
print(f"Total molecules: {len(df)}")
print(f"Molecules with at least one functional group: {len(df_with_fg)}")
print(f"Molecules with no functional group: {len(df_without_fg)}")

print("\nMolecules with at least one functional group:")
print(df_with_fg['SMILES'].to_string(index=False))

print("\nMolecules with NO functional group:")
print(df_without_fg['SMILES'].to_string(index=False))



Functional Group Counts (number of molecules containing each FG):
CarboxylicAcid: 1740
Alcohol: 813
Amine: 75
Methoxy: 2358
Aldehyde: 331
Acetyl: 1986
Nitrile: 40
Tert-butyl: 1390
Thiol: 110
Thioether: 191
Carbonyl: 2252
Ethoxy: 2165
Ester: 1379
Terpenes: 729
Halogen: 14
Functional Group Summary:
Total molecules: 3687
Molecules with at least one functional group: 3417
Molecules with no functional group: 270

Molecules with at least one functional group:
                                  [Br].CCSc1ccccn1
              [Fe++].CC(O)C([O-])=O.CC(O)C([O-])=O
                                [H+].[Cl-].C1CCNC1
                         [H+].[Cl-].NCCCC(N)C(O)=O
                  [K+].CC1=CC(=O)N=[S]([O-])(=O)O1
                  [Na+].[Na+].[O-]C(=O)CCC([O-])=O
                        [Na+].CC(C)CC(=O)C([O-])=O
                 [Na+].COc1cc(\C=C\C([O-])=O)ccc1O
                       [Na+].NC(CCC(O)=O)C([O-])=O
                                   Br\C=C\c1ccccc1
              Brc1ccc(NC(=O)c2cc

In [3]:
from rdkit import Chem
from rdkit.Chem import Draw

# SMILES string for propanoic acid
smiles = "CCC(=O)C(=O)O"

# Convert SMILES to molecule
mol = Chem.MolFromSmiles(smiles)
Chem.rdDepictor.Compute2DCoords(mol)

# Create drawing options with transparent background
drawer = Draw.MolDraw2DCairo(300, 300)  # width, height
drawer.drawOptions().setBackgroundColour((1, 1, 1, 0))  # RGBA where A=0 means fully transparent

drawer.DrawMolecule(mol)
drawer.FinishDrawing()

# Save image with transparency
with open("propanal.png", "wb") as f:
    f.write(drawer.GetDrawingText())

print("Transparent molecule image saved as 'propanal.png'")


Transparent molecule image saved as 'propanal.png'
