In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict, deque

### Generate Unsaturated Data

In [None]:
# Read the raw label table from disk
df = pd.read_csv('PreprocessData/FrequentOdorExtraction/(Raw)SoS_Full.csv')

# The first two columns are kept aside as metadata; every other column is a label
smiles_df = df.iloc[:, :2]
label_columns = df.columns[2:]
label_df = df[label_columns]
label_array = label_df.values.astype(int)
n_labels = len(label_columns)

# Detect child → parent links: label i is a child of label j when at least
# `threshold` of the rows in which i is active also have j active.
threshold = 0.95
child_to_parents = defaultdict(set)

for i, child_name in enumerate(label_columns):
    # Mask and count of the rows where the candidate child is active
    child_present = label_array[:, i] == 1
    n_child = np.sum(child_present)
    if n_child == 0:
        # Label never active: there is no containment ratio to compute
        continue
    for j, parent_name in enumerate(label_columns):
        if j == i:
            continue
        both_present = child_present & (label_array[:, j] == 1)
        # The 1e-6 term makes the ratio marginally smaller than the exact fraction.
        # NOTE(review): a fraction exactly equal to `threshold` may therefore be rejected — confirm intended.
        containment_ratio = np.sum(both_present) / (n_child + 1e-6)
        if containment_ratio >= threshold:
            child_to_parents[child_name].add(parent_name)  # child → parent

# Recursive parent graph
def build_parent_closure(child_to_parents, labels=None):
    """Expand direct child → parent links into the full set of ancestors per label.

    A breadth-first walk is started from every label and follows the
    child → parent links transitively.

    Parameters
    ----------
    child_to_parents : mapping of label -> iterable of labels
        Direct parents of each label. Labels without an entry have no parents.
    labels : iterable of labels, optional
        Labels to compute the closure for. Defaults to the notebook-level
        ``label_columns``.

    Returns
    -------
    collections.defaultdict(set)
        Maps each label that has at least one ancestor to the set of all its
        ancestors. A label is never listed as its own ancestor, even when the
        links form a cycle.
    """
    if labels is None:
        labels = label_columns

    closure_map = defaultdict(set)
    for child in labels:
        # Seeding `visited` with the child itself keeps it out of its own
        # ancestor set when the links loop back (e.g. A → B → A); otherwise a
        # label would later be treated as a parent of itself.
        visited = {child}
        queue = deque(child_to_parents.get(child, ()))
        while queue:
            parent = queue.popleft()
            if parent in visited:
                continue
            visited.add(parent)
            closure_map[child].add(parent)
            queue.extend(child_to_parents.get(parent, ()))
    return closure_map

# Per-label tally updated by unsaturate_row each time it strips a parent label
removed_counts = defaultdict(int)

# Transitive ancestors of every label, derived from the direct child → parent links
child_to_all_parents = build_parent_closure(child_to_parents)

def unsaturate_row(row, labels=None, parent_map=None, counts=None):
    """Clear every active label that is an ancestor of another active label.

    Parameters
    ----------
    row : sequence
        Label values of one sample, aligned with ``labels``; a value equal to
        1 means the label is active.
    labels : sequence, optional
        Label names. Defaults to the notebook-level ``label_columns``.
    parent_map : mapping of label -> set of labels, optional
        Ancestors of each label. Defaults to the notebook-level
        ``child_to_all_parents``.
    counts : mutable mapping of label -> int, optional
        Removal tally, incremented once for each label cleared in this row.
        Defaults to the notebook-level ``removed_counts``.

    Returns
    -------
    list
        The row values with every removed label set to 0 and all other
        values left untouched.
    """
    if labels is None:
        labels = label_columns
    if parent_map is None:
        parent_map = child_to_all_parents
    if counts is None:
        counts = removed_counts

    active_labels = {label for label, val in zip(labels, row) if val == 1}

    to_remove = set()
    for label in active_labels:
        for parent in parent_map.get(label, ()):
            # A label must never be removed because of itself (possible when
            # the ancestor map contains a cycle).
            if parent != label and parent in active_labels:
                to_remove.add(parent)
    # NOTE(review): two labels that are ancestors of each other still remove
    # one another when both are active — confirm this is the intended outcome.

    # Count each cleared label once per row, no matter how many of its
    # descendants were active, so the tally matches the number of removals.
    for parent in to_remove:
        counts[parent] += 1

    return [0 if label in to_remove else val for label, val in zip(labels, row)]

# Apply unsaturation to every row of the label matrix
unsaturated_array = np.array([unsaturate_row(row) for row in label_array])
unsaturated_df = pd.DataFrame(unsaturated_array, columns=label_columns)

# Save result: metadata columns first, then the unsaturated labels.
# index=False keeps the two metadata columns in positions 0-1, which is what
# the following cells rely on when they take `columns[2:]` as the labels.
final_df = pd.concat([smiles_df, unsaturated_df], axis=1)
output_path = 'PreprocessData/dataUnsaturate/Unsaturated_SoS_Full.csv'
final_df.to_csv(output_path, index=False)

# Summary stats
total_labels_active_before = (label_array == 1).sum()
total_labels_active_after = (unsaturated_array == 1).sum()
total_removed = total_labels_active_before - total_labels_active_after

print("Unsaturation complete and saved.")
print(f"Total label columns: {len(label_columns)}")
print(f"Total labels before unsaturation: {total_labels_active_before}")
print(f"Total labels after unsaturation:  {total_labels_active_after}")
print(f"Total removed labels:             {total_removed}")
print()

# Print top 20 removed parent labels
print("Top 20 parent labels removed (most frequently):")
for label, count in sorted(removed_counts.items(), key=lambda x: x[1], reverse=True)[:20]:
    print(f"{label}: {count} removals")


Unsaturation complete and saved.
Total label columns: 394
Total labels before unsaturation: 19695
Total labels after unsaturation:  19118
Total removed labels:             577

Top 20 parent labels removed (most frequently):
vert: 63 removals
fruite: 53 removals
doux: 42 removals
floral: 34 removals
epice: 30 removals
herbe: 23 removals
fruit_a_coque: 19 removals
gras: 15 removals
viande_cuite: 14 removals
balsamique: 14 removals
animal: 12 removals
soufre: 11 removals
boise: 11 removals
oignon: 10 removals
fruit_exotique: 8 removals
huile: 7 removals
pain: 7 removals
agrume: 7 removals
cacao: 7 removals
frais: 6 removals


#### Check if any Parent -> Child relationship still exists

In [None]:
# Load unsaturated dataset
unsat_df = pd.read_csv('PreprocessData/dataUnsaturate/Unsaturated_SoS_Full.csv')

# Extract label columns (everything after the first two columns)
label_columns = unsat_df.columns[2:]

# Boolean activity matrix (rows x labels) and number of active rows per label
active = (unsat_df[label_columns].to_numpy() == 1).astype(np.int64)
child_counts = active.sum(axis=0)

# co_occurrence[p, c] = number of rows in which labels p and c are both active
co_occurrence = active.T @ active

# p is a parent of c when every row with c active also has p active.
# Unlike the detection step, this is a strict (100 %) implication.
# Children with no active row are excluded: the implication would hold
# vacuously and report them as a child of every other label.
implies = (co_occurrence == child_counts[np.newaxis, :]) & (child_counts[np.newaxis, :] > 0)
np.fill_diagonal(implies, False)  # a label is not its own parent

# np.argwhere walks the matrix row by row, i.e. parent-major then child,
# which matches the order of a nested parent/child loop over the columns.
parent_child_pairs = [
    (label_columns[p], label_columns[c]) for p, c in np.argwhere(implies)
]

# Output result
if parent_child_pairs:
    print("Parent-child relationships still exist in the unsaturated data:")
    for parent, child in parent_child_pairs:
        print(f"Parent: {parent} → Child: {child}")
else:
    print("No parent-child relationships found. The dataset is properly unsaturated.")


Parent-child relationships still exist in the unsaturated data:
Parent: boise → Child: praline
Parent: gras → Child: dinde_cuite
Parent: viande_cuite → Child: dinde_cuite
Parent: viande_cuite → Child: palourde
Parent: aiguille_de_sapin → Child: sapin
Parent: poulet → Child: dinde_cuite


#### Perform Second-Level Unsaturation

The previous cell shows that the unsaturation is not complete: the following Parent → Child relationships still exist.<br>
Parent: boise → Child: praline<br>
Parent: gras → Child: dinde_cuite<br>
Parent: viande_cuite → Child: dinde_cuite<br>
Parent: viande_cuite → Child: palourde<br>
Parent: aiguille_de_sapin → Child: sapin<br>
Parent: poulet → Child: dinde_cuite

In [None]:
# Input (first-pass result) and output (corrected result) locations
unsat_path = 'PreprocessData/dataUnsaturate/Unsaturated_SoS_Full.csv'
final_path = 'PreprocessData/dataUnsaturate/Unsaturated_SoS_Final.csv'

unsat_df = pd.read_csv(unsat_path)
label_columns = unsat_df.columns[2:]
# Snapshot of the label matrix taken before any parent is cleared below
label_array = unsat_df[label_columns].values.astype(int)

# (parent, child) pairs to clean up, listed by hand; processed in this order
remaining_pairs = [
    ('viande_cuite', 'palourde'),
    ('gras', 'dinde_cuite'),
    ('viande_cuite', 'dinde_cuite'),
    ('poulet', 'dinde_cuite'),
    ('aiguille_de_sapin', 'sapin'),
    ('boise', 'praline'),
]

# For each pair, zero the parent column on the rows where the child is active
# and measure the removal as the drop in the parent's column sum
removed_count = 0
for parent, child in remaining_pairs:
    child_active = unsat_df[child].eq(1)
    parent_before = unsat_df[parent].sum()
    unsat_df.loc[child_active, parent] = 0
    parent_after = unsat_df[parent].sum()
    removed_count = removed_count + (parent_before - parent_after)
    print(f"Removed {parent_before - parent_after} '{parent}' labels where '{child}' was active.")

# Write the corrected table without the index column
unsat_df.to_csv(final_path, index=False)

print("\nSecond-level unsaturation complete and saved.")
print(f"Total additional labels removed: {removed_count}")
print(f"Output file: {final_path}")


Removed 4 'viande_cuite' labels where 'palourde' was active.
Removed 2 'gras' labels where 'dinde_cuite' was active.
Removed 2 'viande_cuite' labels where 'dinde_cuite' was active.
Removed 2 'poulet' labels where 'dinde_cuite' was active.
Removed 2 'aiguille_de_sapin' labels where 'sapin' was active.
Removed 1 'boise' labels where 'praline' was active.

Second-level unsaturation complete and saved.
Total additional labels removed: 13
Output file: C:/Users/suman/OneDrive/Bureau/Internship_Study/GNN_On_OdorPrediction/data/Data_Sampling/Unsaturate_data/Unsaturated_SoS_Full_Final.csv


#### Roll back to cell [21] to validate the unsaturation with the new file.