In [1]:
import pandas as pd
from rdflib import Graph, RDFS, URIRef, RDF
from collections import defaultdict

In [4]:
# === Load RDF Graph ===
g = Graph()
g.parse("graph_odor.ttl", format="ttl")

# === Build Subclass Hierarchy: child -> set of parents ===
subclass_map = defaultdict(set)
instance_map = defaultdict(set)

for child, _, parent in g.triples((None, RDFS.subClassOf, None)):
    child_label = child.split('#')[-1].split('/')[-1].lower()
    parent_label = parent.split('#')[-1].split('/')[-1].lower()
    subclass_map[child_label].add(parent_label)

# Map individuals to classes
for instance, _, cls in g.triples((None, RDF.type, None)):
    if cls != RDFS.Class:  # <-- skip RDF class declarations
        instance_label = instance.split('#')[-1].split('/')[-1].lower()
        class_label = cls.split('#')[-1].split('/')[-1].lower()
        instance_map[instance_label].add(class_label)

# === Recursively find all ancestors ===
def get_all_parents(term, visited=None):
    if visited is None:
        visited = set()
    for parent in subclass_map.get(term, []):
        if parent not in visited:
            visited.add(parent)
            get_all_parents(parent, visited)
    return visited

# === Load CSV ===
df = pd.read_csv("C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/raw_data/OdorCAS_preprocessed.csv")
df.columns = [col.lower() for col in df.columns]

# Get list of descriptor columns (excluding 'cas_number')
descriptor_cols = list(df.columns)
descriptor_cols.remove("cas_number")

# Track Update
all_new_descriptors = set()
modifications_log = []
total_added_descriptors = 0
total_modified_cas = 0
excluded_descriptors = {"odeur"}

# === Enrich Descriptors Per Row ===
for index, row in df.iterrows():
    cas_number = row["cas_number"]
    active_descriptors = {col for col in descriptor_cols if row[col] == 1}
    enriched_descriptors = set()

    # Get all parent descriptors for each active descriptor
    for desc in active_descriptors:
        # 1. Add class parents
        enriched_descriptors.update(get_all_parents(desc))

        # 2. Add rdf:type class if desc is an instance
        for parent_class in instance_map.get(desc, []):
            enriched_descriptors.add(parent_class)
            enriched_descriptors.update(get_all_parents(parent_class))

    # List of newly added descriptors
    added_descriptors = []
    for new_desc in enriched_descriptors:
        if new_desc not in excluded_descriptors:  # Skip excluded descriptors
            if new_desc not in df.columns:
                all_new_descriptors.add(new_desc)
                df[new_desc] = 0  # Initialize new column with 0
                
            # If the descriptor wasn't already set to 1, set it to 1
            if df.at[index, new_desc] == 0:
                df.at[index, new_desc] = 1
                added_descriptors.append(new_desc)

    # Log modification if any new descriptors were added
    if added_descriptors:
        modifications_log.append({
            "cas_number": cas_number,
            "new_descriptors_added": added_descriptors
        })
        total_added_descriptors += len(added_descriptors)
        total_modified_cas += 1

# === Save the updated CSV ===
df.to_csv("Hierarchy_OdorCAS.csv", index=False)
print("Updated 'OdorCAS_preprocessed.csv'")

# Show the modification summary
print(f"\nNumber of CAS modified: {total_modified_cas}")
print(f"Number of descriptors added: {total_added_descriptors}")

# Show and save modification log if any changes were made
if modifications_log:
    print("\n=== Modification Summary ===")
    for entry in modifications_log:
        print(f"CAS {entry['cas_number']} ➜ added descriptors: {', '.join(entry['new_descriptors_added'])}")
    
    # Save the log to a file
    log_df = pd.DataFrame(modifications_log)
    log_df.to_csv("LOG_label_added.csv", index=False)
    print("\nLog saved.")
else:
    print("\nNo changes were made to descriptors.")


if all_new_descriptors:
    print("\n=== New Columns Added to the CSV ===")
    for col in sorted(all_new_descriptors):
        print(col)
    print(f"\nTotal new descriptor columns added: {len(all_new_descriptors)}")
else:
    print("\nNo new columns were added to the CSV.")

new_descriptor_list = sorted(all_new_descriptors)
print("List of new descriptor columns:\n", new_descriptor_list)


Updated 'OdorCAS_preprocessed.csv'

Number of CAS modified: 3627
Number of descriptors added: 23863

=== Modification Summary ===
CAS 50-70-4 ➜ added descriptors: empyreumatique, cuit, sucre_cuit
CAS 51-67-2 ➜ added descriptors: empyreumatique, chimique, vegetal, soufre
CAS 56-12-2 ➜ added descriptors: cuit, empyreumatique
CAS 56-85-9 ➜ added descriptors: empyreumatique, lactique, malte, creme, torrefie, gras
CAS 56-86-0 ➜ added descriptors: empyreumatique, cereale, malte, cuit, fermentaire
CAS 57-06-7 ➜ added descriptors: racine, crucifere, terreux, vegetal, soufre, legume
CAS 57-10-3 ➜ added descriptors: animal, graisse_animale, aldehyde, vegetal_vert, vegetal
CAS 57-50-1 ➜ added descriptors: empyreumatique, cuit, sucre_cuit
CAS 58-08-2 ➜ added descriptors: mineral, metal
CAS 58-86-6 ➜ added descriptors: empyreumatique, cuit, fume, phenole, sucre_cuit
CAS 60-01-5 ➜ added descriptors: lactique, fermentaire
CAS 60-12-8 ➜ added descriptors: empyreumatique, fleur_d_arbuste, malte, cuit, 

### Print newly added Descriptors Columns

In [9]:
import textwrap

# Load the files for comaparision
original_df = pd.read_csv("C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/raw_data/OdorCAS_preprocessed.csv")
enriched_df = pd.read_csv("Hierarchy_OdorCAS.csv")

# Get descriptor columns (excluding the cas_number column)
original_cols = set(original_df.columns) - {"cas_number"}
enriched_cols = set(enriched_df.columns) - {"cas_number"}

# Find newly added descriptors
new_descriptors = sorted(enriched_cols - original_cols)

# Output
print(f"Number of new descriptors added: {len(new_descriptors)}")
print("Newly added descriptor columns:")

output = textwrap.fill(", ".join(new_descriptors), width=100)
print(output)

Number of new descriptors added: 42
Newly added descriptor columns:
alcool_blanc, alcool_de_fut, aldehyde_frais, amine, biscuit, bois, boisson_fermentee,
boissons_alcoolisees, cereale_torrefiee, citronne, crucifere, cucurbitacee, empyreumatique,
fleur_d_arbre, fleur_d_arbuste, fleur_du_jardin, fruit_a_coque_torrefie, fruit_a_noyau,
fruit_a_pepins, fruit_cuit, fruit_noir, fruit_rouge, fume, graisse_animale, legume_vert,
legumes_cuits, metal, mielle, mineral, musque, oleagineux, patisserie, peponide, pied,
plante_aromatique, pyrazique, sucre_cuit, vegetal, vegetal_sec, vegetal_vert, vert_pyrazique, zeste
