In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [58]:
import pandas as pd

# Load the file with tab ('\t') separator
file_path = "OdorCAS.csv"
df = pd.read_csv(file_path, sep="\t", dtype=str)

# Step 1: Rename CAS column
df.rename(columns={'Unnamed: 0': 'Cas_Number'}, inplace=True)
print("CAS column renamed.")

# Identify duplicate CAS numbers
duplicate_cas = df[df.duplicated(subset=['Cas_Number'], keep=False)]
print(f"Total duplicate CAS values: {duplicate_cas['Cas_Number'].nunique()}")

# Step 2: Check if duplicates have different odor descriptors
def has_different_odor_descriptors(sub_df):
    return not sub_df.iloc[:, 1:].duplicated().all()  # Check if all odor descriptors are identical

# Group by CAS and filter out the ones with identical odor descriptors
unique_cas = duplicate_cas.groupby('Cas_Number').filter(has_different_odor_descriptors)
removed_cas = set(duplicate_cas['Cas_Number']) - set(unique_cas['Cas_Number'])
print(f"Removed {len(removed_cas)} CAS values where all odor descriptors were identical.")

# Keep only unique CAS entries where duplicates had different descriptors
df = pd.concat([df[~df['Cas_Number'].isin(removed_cas)], unique_cas]).drop_duplicates()

# Step 3: Standardize column names
df.columns = df.columns.str.lower().str.replace(" ", "_")
print("Column names standardized.")

# Step 4: Check for missing values
missing_values = df.isnull().sum().sum()
print(f"Total missing values: {missing_values}")

# Step 5: Remove duplicate odor descriptor columns only if all values are identical
def has_identical_values(col):
    return df[col].nunique() == 1  # Checks if the column has only one unique value

# Identify duplicate columns
duplicate_columns = df.T.duplicated(keep=False)
identical_columns = [col for col in df.columns[1:] if duplicate_columns[col] and has_identical_values(col)]

# Drop identical duplicate columns
df = df.drop(columns=identical_columns)
print(f"Removed {len(identical_columns)} duplicate odor descriptor columns with identical values.")

# Step 6: Analyze distribution of odor descriptors
odor_distribution = df.iloc[:, 1:].sum().sort_values(ascending=False)
print("Odor descriptor distribution analyzed.")

# Step 7: Save the cleaned dataset to a single final CSV
df.to_csv("OdorCAS_preprocessed.csv", index=False)
print("Preprocessing complete. Cleaned dataset saved as 'OdorCAS_preprocessed.csv'.")


CAS column renamed.
Total duplicate CAS values: 75
Removed 0 CAS values where all odor descriptors were identical.
Column names standardized.
Total missing values: 0
Removed 0 duplicate odor descriptor columns with identical values.
Odor descriptor distribution analyzed.
Preprocessing complete. Cleaned dataset saved as 'OdorCAS_preprocessed.csv'.


In [45]:
odor_cas_df = pd.read_csv("OdorCAS_Eng.csv")

In [46]:
odor_cas_df.head()

Unnamed: 0,Cas_Number,acetic,citrus,aldehyde,alliace,Amande_amere,animal,anise,bay,balsamic,...,verbena,detect,wine,vinegar,violet,wasabi,whiskey,yogurt,ylang_ylang,zest_citron
0,50-70-4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,51-67-2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56-12-2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,56-40-6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,56-41-7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
odor_cas_df.info()
#<class 'pandas.core.frame.DataFrame'>
#RangeIndex: 3841 entries, 0 to 3840
#Columns: 395 entries, cas_number to zeste_citron
#dtypes: int64(394), object(1)
#memory usage: 11.6+ MBNMM<>?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3841 entries, 0 to 3840
Columns: 395 entries, Cas_Number to zest_citron
dtypes: int64(394), object(1)
memory usage: 11.6+ MB


In [17]:
# Load the dataset
file_path = "OdorCAS_preprocessed.csv"
df = pd.read_csv(file_path)

In [None]:
# Count the occurrences of each odor descriptor (excluding 'cas_number')
odor_counts = df.iloc[:, 1:].sum().sort_values(ascending=False)

# Plot the top 20 most common odors
plt.figure(figsize=(12, 6))
sns.barplot(x=odor_counts[:20].index, y=odor_counts[:20].values, palette="viridis")
plt.xticks(rotation=90)
plt.xlabel("Odor Descriptor")
plt.ylabel("Count")
plt.title("Top 20 Most Common Odors in the Dataset")
plt.show()


### Co-occurrence Analysis: Which odors frequently appear together?

In [None]:
# Load the dataset
file_path = "OdorCAS_preprocessed.csv"
df = pd.read_csv(file_path)

# Compute the co-occurrence matrix for odor descriptors (excluding 'cas_number')
co_occurrence_matrix = df.iloc[:, 1:].T.dot(df.iloc[:, 1:])

# Set diagonal to zero to remove self-co-occurrence
np.fill_diagonal(co_occurrence_matrix.values, 0)

# Select the top 20 most co-occurring odors
top_odors = co_occurrence_matrix.sum(axis=1).sort_values(ascending=False).index[:20]

# Plot the co-occurrence heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(co_occurrence_matrix.loc[top_odors, top_odors], cmap="coolwarm", annot=True, fmt=".0f")
plt.title("Co-occurrence Heatmap of Top 20 Odors")
plt.xlabel("Odor Descriptor")
plt.ylabel("Odor Descriptor")
plt.show()


- Brighter colors (red) indicate stronger co-occurrence.
- Darker colors (blue) indicate less frequent co-occurrence.

### Correlation Analysis: Which odors are highly correlated?

In [None]:
# Compute the correlation matrix (excluding 'cas_number')
correlation_matrix = df.iloc[:, 1:].corr(method="pearson")

# Select the top 20 odors based on their overall correlation strength
top_odors_corr = correlation_matrix.abs().sum(axis=1).sort_values(ascending=False).index[:20]

# Plot the correlation heatmap for the top 20 most correlated odors
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix.loc[top_odors_corr, top_odors_corr], cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation Heatmap of Top 20 Most Correlated Odors")
plt.xlabel("Odor Descriptor")
plt.ylabel("Odor Descriptor")
plt.show()


- Red indicates strong positive correlation (odors often appear together).
- Blue indicates strong negative correlation (rare, since it's binary data).

### Highly correlated odor pairs extracted from the matrix

In [14]:
# Extract highly correlated odor pairs
threshold = 0.5  # Define a strong correlation threshold
correlated_pairs = (correlation_matrix.abs() > threshold).stack()
high_corr_pairs = correlated_pairs[correlated_pairs].reset_index()
high_corr_pairs.columns = ["Odor 1", "Odor 2", "Correlation"]
high_corr_pairs = high_corr_pairs[high_corr_pairs["Odor 1"] != high_corr_pairs["Odor 2"]]  # Remove self-correlations

# Keep only unique pairs (since correlation matrix is symmetric)
high_corr_pairs = high_corr_pairs.loc[high_corr_pairs["Odor 1"] < high_corr_pairs["Odor 2"]]
high_corr_pairs = high_corr_pairs.sort_values(by="Correlation", ascending=False)

# Display top 10 most correlated odor pairs
print(high_corr_pairs.head(10))


          Odor 1   Odor 2  Correlation
86           ail   oignon         True
95    ammoniaque  poisson         True
154        chene    sapin         True
158     chicoree  praline         True
173   citrouille   persil         True
191        datte  pruneau         True
194  dinde_cuite   huitre         True


### Hanling Missing values

In [4]:
import pandas as pd
from rdflib import Graph, RDFS, URIRef, RDF
from collections import defaultdict

In [9]:
# === Load RDF Graph ===
g = Graph()
g.parse("graph_odor.ttl", format="ttl")

# === Build Subclass Hierarchy: child -> set of parents ===
subclass_map = defaultdict(set)
instance_map = defaultdict(set)

for child, _, parent in g.triples((None, RDFS.subClassOf, None)):
    child_label = child.split('#')[-1].split('/')[-1].lower()
    parent_label = parent.split('#')[-1].split('/')[-1].lower()
    subclass_map[child_label].add(parent_label)

# Map individuals to classes
for instance, _, cls in g.triples((None, RDF.type, None)):
    if cls != RDFS.Class:  # <-- skip RDF class declarations
        instance_label = instance.split('#')[-1].split('/')[-1].lower()
        class_label = cls.split('#')[-1].split('/')[-1].lower()
        instance_map[instance_label].add(class_label)

# === Recursively find all ancestors ===
def get_all_parents(term, visited=None):
    if visited is None:
        visited = set()
    for parent in subclass_map.get(term, []):
        if parent not in visited:
            visited.add(parent)
            get_all_parents(parent, visited)
    return visited

# === Load CSV ===
df = pd.read_csv("OdorCAS_preprocessed.csv")
df.columns = [col.lower() for col in df.columns]

# Get list of descriptor columns (excluding 'cas_number')
descriptor_cols = list(df.columns)
descriptor_cols.remove("cas_number")

# Track Update
all_new_descriptors = set()
modifications_log = []
total_added_descriptors = 0
total_modified_cas = 0
excluded_descriptors = {"odeur"}

# === Enrich Descriptors Per Row ===
for index, row in df.iterrows():
    cas_number = row["cas_number"]
    active_descriptors = {col for col in descriptor_cols if row[col] == 1}
    enriched_descriptors = set()

    # Get all parent descriptors for each active descriptor
    for desc in active_descriptors:
        # 1. Add class parents
        enriched_descriptors.update(get_all_parents(desc))

        # 2. Add rdf:type class if desc is an instance
        for parent_class in instance_map.get(desc, []):
            enriched_descriptors.add(parent_class)
            enriched_descriptors.update(get_all_parents(parent_class))
    # List of newly added descriptors
    added_descriptors = []
    for new_desc in enriched_descriptors:
        if new_desc not in excluded_descriptors:  # Skip excluded descriptors
            if new_desc not in df.columns:
                all_new_descriptors.add(new_desc)
                df[new_desc] = 0  # Initialize new column with 0
            # If the descriptor wasn't already set to 1, set it to 1
            if df.at[index, new_desc] == 0:
                df.at[index, new_desc] = 1
                added_descriptors.append(new_desc)

    # Log modification if any new descriptors were added
    if added_descriptors:
        modifications_log.append({
            "cas_number": cas_number,
            "new_descriptors_added": added_descriptors
        })
        total_added_descriptors += len(added_descriptors)
        total_modified_cas += 1

# === Save the updated CSV ===
df.to_csv("OdorCAS_Updated.csv", index=False)
print("Enriched CSV saved as 'OdorCAS_Updated.csv'")

# Show the modification summary
print(f"\nNumber of CAS modified: {total_modified_cas}")
print(f"Number of descriptors added: {total_added_descriptors}")

# Show and save modification log if any changes were made
if modifications_log:
    print("\n=== Modification Summary ===")
    for entry in modifications_log:
        print(f"CAS {entry['cas_number']} ➜ added descriptors: {', '.join(entry['new_descriptors_added'])}")
    
    # Optional: Save the log to a file
    log_df = pd.DataFrame(modifications_log)
    log_df.to_csv("LOG_label_added.csv", index=False)
    print("\nLog saved.")
else:
    print("\nNo changes were made to descriptors.")


if all_new_descriptors:
    print("\n=== New Columns Added to the CSV ===")
    for col in sorted(all_new_descriptors):
        print(col)
    print(f"\nTotal new descriptor columns added: {len(all_new_descriptors)}")
else:
    print("\nNo new columns were added to the CSV.")

new_descriptor_list = sorted(all_new_descriptors)
print("List of new descriptor columns:\n", new_descriptor_list)


Enriched CSV saved as 'OdorCAS_Updated.csv'

Number of CAS modified: 3627
Number of descriptors added: 23863

=== Modification Summary ===
CAS 50-70-4 ➜ added descriptors: sucre_cuit, empyreumatique, cuit
CAS 51-67-2 ➜ added descriptors: soufre, chimique, empyreumatique, vegetal
CAS 56-12-2 ➜ added descriptors: empyreumatique, cuit
CAS 56-85-9 ➜ added descriptors: lactique, empyreumatique, torrefie, malte, creme, gras
CAS 56-86-0 ➜ added descriptors: cuit, fermentaire, empyreumatique, cereale, malte
CAS 57-06-7 ➜ added descriptors: soufre, terreux, legume, vegetal, racine, crucifere
CAS 57-10-3 ➜ added descriptors: animal, vegetal, aldehyde, graisse_animale, vegetal_vert
CAS 57-50-1 ➜ added descriptors: sucre_cuit, empyreumatique, cuit
CAS 58-08-2 ➜ added descriptors: mineral, metal
CAS 58-86-6 ➜ added descriptors: sucre_cuit, empyreumatique, fume, phenole, cuit
CAS 60-01-5 ➜ added descriptors: lactique, fermentaire
CAS 60-12-8 ➜ added descriptors: empyreumatique, malte, fleur_d_arbust

### Get newly added descriptors columns

In [None]:
import pandas as pd

# Load the original and enriched CSV files
original_df = pd.read_csv("OdorCAS_preprocessed.csv")
enriched_df = pd.read_csv("OdorCAS_Updated.csv")

# Get descriptor columns (excluding the cas_number column)
original_cols = set(original_df.columns) - {"cas_number"}
enriched_cols = set(enriched_df.columns) - {"cas_number"}

# Find newly added descriptors
new_descriptors = sorted(enriched_cols - original_cols)

# Output
print(f"Number of new descriptors added: {len(new_descriptors)}")
print("Newly added descriptor columns:\n")
for desc in new_descriptors:
    print(f"- {desc}")


Number of new descriptors added: 15
Newly added descriptor columns:

- amine
- bois
- crucifere
- empyreumatique
- fleur_d_arbre
- fleur_d_arbuste
- fruit_a_pepins
- fruit_rouge
- fume
- mielle
- oleagineux
- sucre_cuit
- vegetal
- vegetal_sec
- vegetal_vert


### Convert CAS -> SMILES

#### Try 1

In [11]:
import polars as pl
import pubchempy as pcp
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Function to resolve SMILES for CAS numbers in parallel
def resolve_all_smiles(cas_list):
    def cas_to_smiles_cir(cas_str):
        try:
            compounds = pcp.get_compounds(cas_str, 'name')
            return compounds[0].isomeric_smiles if compounds else None
        except Exception:
            return None
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(cas_to_smiles_cir, cas_list), total=len(cas_list)))
    return results

# Load CSV with polars
df = pl.read_csv("OdorCAS_Updated.csv")

# Identify the CAS column (case-insensitive) and cast it to string
cas_column = next((col for col in df.columns if 'cas' in col.lower()), None)
if not cas_column:
    raise ValueError("No column containing CAS numbers found.")

# Ensure the CAS column is of string type for consistency
df = df.with_columns(pl.col(cas_column).cast(pl.Utf8))
cas_list = df[cas_column].to_list()

# Extract sensory label columns dynamically (assuming these columns contain 'alcoholic', 'aldehydic', etc.)
sensory_columns = [col for col in df.columns if col.lower() not in ['cas', 'updated_desc', 'isomericsmiles']]

# Resolve SMILES for the CAS numbers in parallel
smiles_list = resolve_all_smiles(cas_list)

# Prepare for the resulting data
smiles_with_labels = []
for i, smiles in enumerate(smiles_list):
    # Prepare the entry for each SMILES with its corresponding labels
    if smiles:
        updated_desc = df["Updated_Desc"][i] if "Updated_Desc" in df.columns else ""
        
        # Generate sensory labels (1 or 0) based on whether the value in the sensory columns is 1
        labels = [1 if df[sensory_columns[j]][i] == 1 else 0 for j in range(len(sensory_columns))]
        
        # Append data as a dictionary (to handle consistency)
        smiles_with_labels.append({
            "IsomericSMILES": smiles,
            "Updated_Desc": updated_desc,
            **{sensory_columns[j]: labels[j] for j in range(len(sensory_columns))}
        })
    else:
        # Handle missing SMILES (None value) by assigning empty strings for SMILES and empty description
        smiles_with_labels.append({
            "IsomericSMILES": None,
            "Updated_Desc": "",
            **{sensory_columns[j]: 0 for j in range(len(sensory_columns))}
        })

# Create the resulting DataFrame using a list of dictionaries
smiles_df = pl.DataFrame(smiles_with_labels)

# Write the new DataFrame with SMILES, descriptions, and labels to CSV
smiles_df.write_csv("OdorCAS_SMILES_with_labels.csv")


100%|██████████| 3841/3841 [03:07<00:00, 20.52it/s]


#### Try 2

In [None]:
import pandas as pd
import pubchempy as pcp
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm

# Load file
df = pd.read_csv("OdorCAS_Updated.csv")

# Create a set of unique CAS numbers to reduce redundant lookups
unique_cas_numbers = df['cas_number'].dropna().unique()

# Caching dictionary
cas_to_smiles_cache = {}

# Function to fetch SMILES for one CAS number
def fetch_smiles(cas):
    try:
        compounds = pcp.get_compounds(cas, 'name')
        if compounds:
            return cas, compounds[0].isomeric_smiles
        else:
            return cas, None
    except Exception:
        return cas, None

# Multithreading + caching
print("Fetching SMILES from PubChem...")
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {executor.submit(fetch_smiles, cas): cas for cas in unique_cas_numbers}

    for future in tqdm(as_completed(futures), total=len(futures)):
        cas, smiles = future.result()
        cas_to_smiles_cache[cas] = smiles
        time.sleep(0.1)  # small delay to avoid server issues

# Add a new 'SMILES' column using the cache
df['SMILES'] = df['cas_number'].map(lambda cas: cas_to_smiles_cache.get(cas, None))

# Reorder columns to have 'cas_number' and 'SMILES' first
ordered_columns = ['cas_number', 'SMILES'] + [col for col in df.columns if col not in ['cas_number', 'SMILES']]
df = df[ordered_columns]

# Save to new CSV
df.to_csv("Odor_SMILES_with_CAS.csv", index=False)
print("Conversion complete. Saved as 'Odor_SMILES_with_CAS.csv'.")


#### Try 3 with Cirpy https://sharifsuliman.medium.com/convert-a-list-of-cas-numbers-to-smiles-using-pubmed-cirpy-or-a-recurrent-neural-network-0154b74397de

In [4]:
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def cas_to_smiles(cas_number):
    url = f"https://cactus.nci.nih.gov/chemical/structure/{cas_number}/smiles"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        smiles = response.text.strip()
        return cas_number, smiles if smiles else None
    except requests.exceptions.RequestException:
        return cas_number, None

if __name__ == "__main__":
    # Load CSV
    df = pd.read_csv("OdorCAS_Updated.csv")
    df = df.dropna(subset=["cas_number"])

    # Get unique CAS numbers
    unique_cas_numbers = df["cas_number"].unique()

    # Fetch SMILES in parallel
    print("Fetching SMILES using multithreading...")
    cas_to_smiles_dict = {}

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(cas_to_smiles, cas): cas for cas in unique_cas_numbers}
        for future in tqdm(as_completed(futures), total=len(futures)):
            cas, smiles = future.result()
            cas_to_smiles_dict[cas] = smiles

    # Map SMILES to all rows (including duplicates)
    df["SMILES"] = df["cas_number"].map(cas_to_smiles_dict)

    # Reorder columns: CAS + SMILES first
    first_cols = ["cas_number", "SMILES"]
    other_cols = [col for col in df.columns if col not in first_cols]
    df = df[first_cols + other_cols]

    # Save result
    df.to_csv("Odor_SMILES_with_CAS.csv", index=False)
    print("✅ Done! Saved as 'Odor_SMILES_with_CAS.csv'")


Fetching SMILES using multithreading...


100%|██████████| 3839/3839 [01:39<00:00, 38.71it/s]


✅ Done! Saved as 'Odor_SMILES_with_CAS.csv'


In [None]:
import cirpy
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def cas_to_smiles(cas_number):
    try:
        # Use CIRpy to get SMILES
        smiles = cirpy.query(cas_number, 'smiles')
        return cas_number, smiles if smiles else None
    except Exception:
        return cas_number, None

if __name__ == "__main__":
    # Load CSV
    df = pd.read_csv("OdorCAS_Updated.csv")
    df = df.dropna(subset=["cas_number"])

    # Get unique CAS numbers
    unique_cas_numbers = df["cas_number"].unique()

    # Fetch SMILES in parallel
    print("Fetching SMILES using multithreading...")
    cas_to_smiles_dict = {}

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(cas_to_smiles, cas): cas for cas in unique_cas_numbers}
        for future in tqdm(as_completed(futures), total=len(futures)):
            cas, smiles = future.result()
            cas_to_smiles_dict[cas] = smiles

    # Map SMILES to all rows (including duplicates)
    df["SMILES"] = df["cas_number"].map(cas_to_smiles_dict)

    # Reorder columns: CAS + SMILES first
    first_cols = ["cas_number", "SMILES"]
    other_cols = [col for col in df.columns if col not in first_cols]
    df = df[first_cols + other_cols]

    # Save result
    df.to_csv("Odor_SMILES_with_CAS.csv", index=False)
    print("✅ Done! Saved as 'Odor_SMILES_with_CAS.csv'")
