In [2]:
import pandas as pd
from rdflib import Graph, RDFS, URIRef, RDF
from collections import defaultdict

#### Saturate Data with RDF Graph

In [None]:
# Load RDF Graph
g = Graph()
g.parse("graph_odor.ttl", format="ttl")

# Build Subclass Hierarchy: child -> set of parents
subclass_map = defaultdict(set)
instance_map = defaultdict(set)

for child, _, parent in g.triples((None, RDFS.subClassOf, None)):
    child_label = child.split('#')[-1].split('/')[-1].lower()
    parent_label = parent.split('#')[-1].split('/')[-1].lower()
    subclass_map[child_label].add(parent_label)

# Map individuals to classes
for instance, _, cls in g.triples((None, RDF.type, None)):
    if cls != RDFS.Class:  # <-- skip RDF class declarations
        instance_label = instance.split('#')[-1].split('/')[-1].lower()
        class_label = cls.split('#')[-1].split('/')[-1].lower()
        instance_map[instance_label].add(class_label)

# Recursively find all ancestors
def get_all_parents(term, visited=None):
    if visited is None:
        visited = set()
    for parent in subclass_map.get(term, []):
        if parent not in visited:
            visited.add(parent)
            get_all_parents(parent, visited)
    return visited

# Load CSV
df = pd.read_csv("PreprocessData/FrequentOdorExtraction/(Raw)SoS_Full.csv")
df.columns = [col.lower() for col in df.columns]

# Get list of descriptor columns (excluding 'cas_number')
descriptor_cols = list(df.columns)
descriptor_cols.remove("cas_number")

# Track Update
all_new_descriptors = set()
modifications_log = []
total_added_descriptors = 0
total_modified_cas = 0
excluded_descriptors = {"odeur"}

# Enrich Descriptors Per Row
for index, row in df.iterrows():
    cas_number = row["cas_number"]
    active_descriptors = {col for col in descriptor_cols if row[col] == 1}
    enriched_descriptors = set()

    # Get all parent descriptors for each active descriptor
    for desc in active_descriptors:
        # 1. Add class parents
        enriched_descriptors.update(get_all_parents(desc))

        # 2. Add rdf:type class if desc is an instance
        for parent_class in instance_map.get(desc, []):
            enriched_descriptors.add(parent_class)
            enriched_descriptors.update(get_all_parents(parent_class))

    # List of newly added descriptors
    added_descriptors = []
    for new_desc in enriched_descriptors:
        if new_desc not in excluded_descriptors:  # Skip excluded descriptors
            if new_desc not in df.columns:
                all_new_descriptors.add(new_desc)
                df[new_desc] = 0 
                
            # If the descriptor wasn't already set to 1, set it to 1
            if df.at[index, new_desc] == 0:
                df.at[index, new_desc] = 1
                added_descriptors.append(new_desc)

    # Log modification if any new descriptors were added
    if added_descriptors:
        modifications_log.append({
            "cas_number": cas_number,
            "new_descriptors_added": added_descriptors
        })
        total_added_descriptors += len(added_descriptors)
        total_modified_cas += 1

# Save the updated CSV
df.to_csv("(Saturated)SoS_Full.csv", index=False)
print("Updated '(Raw)SoS_Full.csv'")

# Show the modification summary
print(f"\nNumber of CAS modified: {total_modified_cas}")
print(f"Number of descriptors added: {total_added_descriptors}")

# Show and save modification log if any changes were made
if modifications_log:
    print("\n ----- Modification Summary -----")
    for entry in modifications_log:
        print(f"CAS {entry['cas_number']} ➜ added descriptors: {', '.join(entry['new_descriptors_added'])}")
    
    # Save the log to a file
    log_df = pd.DataFrame(modifications_log)
    log_df.to_csv("LOG_label_added.csv", index=False)
    print("\nLog saved.")
else:
    print("\nNo changes were made to descriptors.")
    
if all_new_descriptors:
    print("\n ---- New Columns Added to the CSV ----")
    for col in sorted(all_new_descriptors):
        print(col)
    print(f"\nTotal new descriptor columns added: {len(all_new_descriptors)}")
else:
    print("\nNo new columns were added to the CSV.")

new_descriptor_list = sorted(all_new_descriptors)
print("List of new descriptor columns:\n", new_descriptor_list)


Updated '(Raw)SoS_Full.csv'

Number of CAS modified: 3497
Number of descriptors added: 23077

 ----- Modification Summary -----
CAS 6313-76-4 ➜ added descriptors: vegetal_vert, frais, malte, vegetal
CAS 85993-25-5 ➜ added descriptors: metal, mineral
CAS 123-75-1 ➜ added descriptors: chimique, amine, frais, marin, fruit_de_mer
CAS 3184-13-2 ➜ added descriptors: vegetal_vert, frais, vegetal
CAS 55589-62-3 ➜ added descriptors: empyreumatique, cuit, sucre_cuit
CAS 150-90-3 ➜ added descriptors: cuit, empyreumatique
CAS 4502-00-5 ➜ added descriptors: lactique, gras
CAS 24276-84-4 ➜ added descriptors: chimique, medicinal
CAS 142-47-2 ➜ added descriptors: cuit, empyreumatique
CAS 120-72-9 ➜ added descriptors: fermentaire, produit_petrolier
CAS 109-97-7 ➜ added descriptors: fruite, fruit_sec, chimique, solvant, gras, frais, oleagineux
CAS 12135-76-1 ➜ added descriptors: animal
CAS 103-64-0 ➜ added descriptors: vegetal_vert, frais, fleur_du_jardin, floral, vegetal
CAS 7511-41-3 ➜ added descripto

#### Saturate mapped OpenPOM dataset

In [None]:
from rdflib import Graph, RDF, RDFS
from collections import defaultdict
import pandas as pd

# Load RDF Graph
g = Graph()
g.parse("graph_odor.ttl", format="ttl")

# Build Subclass Hierarchy: child → parents
subclass_map = defaultdict(set)
instance_map = defaultdict(set)

for child, _, parent in g.triples((None, RDFS.subClassOf, None)):
    child_label = child.split('#')[-1].split('/')[-1].lower()
    parent_label = parent.split('#')[-1].split('/')[-1].lower()
    subclass_map[child_label].add(parent_label)

# Map individuals to classes
for instance, _, cls in g.triples((None, RDF.type, None)):
    if cls != RDFS.Class:
        instance_label = instance.split('#')[-1].split('/')[-1].lower()
        class_label = cls.split('#')[-1].split('/')[-1].lower()
        instance_map[instance_label].add(class_label)

# Build label mapping: French name → English label
french_to_english_label = {}

for s, _, label in g.triples((None, RDFS.label, None)):
    if label.language == 'en':
        name = s.split('#')[-1].split('/')[-1].lower()
        french_to_english_label[name] = label.lower()

# Reverse mapping: English label → RDF class name (French)
english_to_french_class = {v: k for k, v in french_to_english_label.items()}

# Recursive Ancestor Finder
def get_all_parents(term, visited=None):
    if visited is None:
        visited = set()
    for parent in subclass_map.get(term, []):
        if parent not in visited:
            visited.add(parent)
            get_all_parents(parent, visited)
    return visited

# Load CSV
df = pd.read_csv("PreprocessData/FrequentOdorExtraction/Mapped_OpenPOM + unmapped_odor.csv")
df.columns = [col.strip().lower() for col in df.columns]
print("Columns:", df.columns.tolist())

# First column is assumed to be SMILES
id_column = df.columns[0] 
descriptor_cols = [col for col in df.columns if col != id_column]

all_new_descriptors = set()
modifications_log = []
total_added_descriptors = 0
total_modified_rows = 0
excluded_descriptors = {"odeur"}

for index, row in df.iterrows():
    molecule_id = row[id_column]
    active_descriptors = {col for col in descriptor_cols if row[col] == 1}
    enriched_descriptors = set()

    for desc in active_descriptors:
        rdf_name = english_to_french_class.get(desc.lower())
        if rdf_name:
            enriched_descriptors.update(get_all_parents(rdf_name))
            for parent_class in instance_map.get(rdf_name, []):
                enriched_descriptors.add(parent_class)
                enriched_descriptors.update(get_all_parents(parent_class))

    added_descriptors = []
    for new_desc in enriched_descriptors:
        if new_desc not in excluded_descriptors:
            english_label = french_to_english_label.get(new_desc, new_desc)
            if english_label not in df.columns:
                all_new_descriptors.add(english_label)
                df[english_label] = 0
            if df.at[index, english_label] == 0:
                df.at[index, english_label] = 1
                english_label = french_to_english_label.get(new_desc, new_desc)
                added_descriptors.append(english_label)

    if added_descriptors:
        modifications_log.append({
            id_column: molecule_id,
            "new_descriptors_added": added_descriptors
        })
        total_added_descriptors += len(added_descriptors)
        total_modified_rows += 1

df.to_csv("(sat)openpom_data.csv", index=False)
print("Saved enriched CSV as '(sat)openpom_data.csv'")

print(f"\nNumber of molecules modified: {total_modified_rows}")
print(f"Number of descriptors added: {total_added_descriptors}")

if modifications_log:
    print("\nModification Summary:")
    for entry in modifications_log:
        print(f"{entry[id_column]} ➜ added: {', '.join(entry['new_descriptors_added'])}")
    log_df = pd.DataFrame(modifications_log)
    log_df.to_csv("LOG_label_added.csv", index=False)
    print("\nSaved log to 'LOG_label_added.csv'")
else:
    print("\nNo descriptors were added to any rows.")

if all_new_descriptors:
    print("\nNew Descriptor Columns Added:")
    for col in sorted(all_new_descriptors):
        print(f"- {col}")
    print(f"\nTotal new columns: {len(all_new_descriptors)}")

print("\nFinal list of new descriptor columns:\n", sorted(all_new_descriptors))


Columns: ['smiles', 'descriptors', 'alcohol', 'aldehydic', 'alliaceous', 'bitter almond', 'amber', 'animal', 'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy', 'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt', 'butter', 'cabbage', 'camphor', 'caramel', 'cedarwood', 'celery', 'chamomile', 'cheese', 'maraschino cherry', 'dark chocolate', 'cinnamon', 'citrus peel', 'clean', 'clove', 'cocoa', 'coconut', 'roasted coffee', 'cognac', 'cooked', 'cooling', 'cortex', 'coumarin', 'cream', 'cucumber', 'dairy', 'dry', 'earthy', 'ethereal', 'fatty', 'fermented', 'fish', 'floral', 'fresh', 'fruit skin', 'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grass', 'green', 'bush flower', 'hay', 'hazelnut', 'grass.1', 'honey', 'hyacinth', 'jasmine', 'juicy', 'ketonic', 'lactone', 'lavender', 'leafy', 'leathery', 'lemony', 'lily', 'malt', 'cooked meat', 'medicinal', 'melon', 'metallic', 'milky', 'minty', 'muguet', 'mushroom', 'musky', 'musty', 'natural'