In [60]:
from __future__ import annotations
import json, re, sys
from pathlib import Path
from typing import List, Dict, Any, Optional
import pandas as pd
import yaml
from numbers_parser import Document  # pip install numbers-parser


In [62]:
# Convert an Apple Numbers (.numbers) file to one or more CSV files
# ----------------------------------------------------------------
# Requirements (install once in your environment):
#   pip install numbers-parser pandas
#
# Usage (Python):
#   numbers_to_csv("data/Lab Material.numbers", out_dir="data")                     # export every table -> multiple CSVs
#   numbers_to_csv("data/Lab Material.numbers", out_dir="data", sheet="Sheet 1")   # export all tables from a specific sheet
#   numbers_to_csv("data/Lab Material.numbers", out_dir="data", sheet=0, table=0)  # export a single table by index
#
# Notes:
# - The code detects the first row as headers when it looks like text; otherwise
#   it generates Column_1, Column_2, ...
# - Output filenames: "<basename> - <sheet> - <table>.csv"
# - Safe to run on macOS, Linux, or Windows. No AppleScript needed.

from __future__ import annotations
from pathlib import Path
from typing import Union, Optional, List
import re
import pandas as pd

def _safe_name(s: str) -> str:
    s = re.sub(r"[\\/:*?\"<>|]+", "-", str(s))  # strip illegal path chars
    return s.strip().replace("\n", " ").replace("\r", " ")

def _rows_to_dataframe(table) -> pd.DataFrame:
    # Extract raw grid of values
    grid: List[List[object]] = []
    for row in table.rows():
        grid.append([cell.value for cell in row])

    if not grid:
        return pd.DataFrame()

    # Heuristic: treat first row as header if most cells are strings
    first = grid[0]
    num_text = sum(isinstance(x, str) for x in first)
    header_like = num_text >= max(1, len(first) // 2)  # at least half strings

    if header_like:
        headers = [str(h).strip() if h is not None else "" for h in first]
        # Deduplicate empty/duplicate headers
        seen = {}
        for i, h in enumerate(headers):
            base = h or f"Column_{i+1}"
            n = seen.get(base, 0)
            headers[i] = base if n == 0 else f"{base}_{n+1}"
            seen[base] = n + 1
        data = grid[1:]
        df = pd.DataFrame(data, columns=headers)
    else:
        n_cols = max(len(r) for r in grid)
        headers = [f"Column_{i+1}" for i in range(n_cols)]
        data = [r + [None] * (n_cols - len(r)) for r in grid]
        df = pd.DataFrame(data, columns=headers)

    return df

def numbers_to_csv(
    numbers_path: Union[str, Path],
    out_dir: Union[str, Path] = ".",
    sheet: Optional[Union[int, str]] = None,
    table: Optional[Union[int, str]] = None,
    encoding: str = "utf-8-sig",
) -> None:
    """
    Convert an Apple Numbers file to CSV(s).

    numbers_path : path to .numbers file
    out_dir      : folder where CSV(s) will be saved
    sheet        : optional sheet selector (index or name)
    table        : optional table selector within the chosen sheet (index or name)
    encoding     : CSV encoding; utf-8-sig helps Excel recognize UTF-8
    """
    try:
        from numbers_parser import Document  # pip install numbers-parser
    except ModuleNotFoundError as e:
        raise RuntimeError(
            "numbers-parser is required. Install with: pip install numbers-parser"
        ) from e

    numbers_path = Path(numbers_path)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    doc = Document(numbers_path)
    sheets = doc.sheets

    # Choose sheet(s)
    target_sheets = []
    if sheet is None:
        target_sheets = list(sheets)
    elif isinstance(sheet, int):
        target_sheets = [sheets[sheet]]
    else:
        # name contains match (case-insensitive)
        for sh in sheets:
            if sheet.lower() in sh.name.lower():
                target_sheets.append(sh)
        if not target_sheets:
            raise ValueError(f"Sheet '{sheet}' not found in {numbers_path.name}")

    base = numbers_path.stem
    exported = 0

    for sh in target_sheets:
        tables = sh.tables

        # Choose table(s)
        target_tables = []
        if table is None:
            target_tables = list(tables)
        elif isinstance(table, int):
            target_tables = [tables[table]]
        else:
            for tb in tables:
                if table.lower() in tb.name.lower():
                    target_tables.append(tb)
            if not target_tables:
                raise ValueError(f"Table '{table}' not found in sheet '{sh.name}'")

        for tb in target_tables:
            df = _rows_to_dataframe(tb)
            sheet_name = _safe_name(sh.name or f"Sheet_{getattr(sh, 'index', '')}")
            table_name = _safe_name(tb.name or f"Table_{getattr(tb, 'index', '')}")
            out_file = out_dir / f"{base} - {sheet_name} - {table_name}.csv"
            df.to_csv(out_file, index=False, encoding=encoding)
            exported += 1

    if exported == 0:
        raise RuntimeError("No tables exported; the Numbers file may be empty.")

# Example call (uncomment to run locally):
# numbers_to_csv("data/Lab Material.numbers", out_dir="data")


In [72]:
materials_csv = '/Users/ranykhirbawi/Desktop/LunarAIccord/data/Lab Material.numbers'


In [73]:
numbers_to_csv(materials_csv, out_dir="/Users/ranykhirbawi/Desktop/LunarAIccord/data")

In [None]:

# A) Lab materials: prefer CSV; .numbers is optional fallback
materials_csv = '/Users/ranykhirbawi/Desktop/LunarAIccord/data/Lab_Material.csv'
lab_numbers = DATA_IN / "Lab Material.numbers"
lab_df = None

if lab_csv.exists():
    lab_df = pd.read_csv(lab_csv)
else:
    if lab_numbers.exists():
        try:
            from numbers_parser import Document  # pip install numbers-parser
            doc = Document(lab_numbers)
            sheets = doc.sheets
            tbl = sheets[0].tables[0]
            lab_df = pd.DataFrame(tbl.rows())
            lab_df.columns = [str(c) for c in tbl.columns()]
        except Exception as e:
            print("⚠️ Could not parse .numbers file. Install `numbers-parser` or provide Lab_Material.csv.", e)
            lab_df = pd.DataFrame()
    else:
        lab_df = pd.DataFrame()

lab_df = norm_cols(lab_df)

d


In [74]:
materials_csv = '/Users/ranykhirbawi/Desktop/LunarAIccord/data/Lab_Materials.csv'


In [75]:
lab_df = pd.read_csv(materials_csv)
lab_df

Unnamed: 0,NAME,IAO SUPPLEIR,TENACITY @ 100%,REC. % SOLUTION,TOP,MID,BASE,FAMILY,AROMA PRIMARY,AROMA SECONDARY,...,Column_16,Column_17,Column_18,Column_19,Column_20,Column_21,Column_22,Column_23,Column_24,Column_25
0,Adoxal,Perfumers Apprentice,168hrs,0.1,,,x,aldehydic,fresh,marine,...,,,,,,,,,,
1,Agarwood,Eden Botanicals,,,,,,woody,oud,dry,...,,,,,,,,,,
2,Agrumen Aldehyde,Vigon,276hrs,,,,x,Agrestic,hay,aldehydic,...,,,,,,,,,,
3,Aldehyde C-10 (decanal),Perfumers Apprentice,224hrs,,,,x,aldehydic,orange peel,green,...,,,,,,,,,,
4,Aldehyde C-11 Lenic,Perfumers Apprentice,380hrs,,performs as top,,x,aldehydic,fatty,waxy,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,Trans-2-Decen-1-Al FCC,Bedoukian,57 hours @ 100%,,,,,citrus,waxy,aldehydic,...,,,,,,,,,,
1076,Vionil (10% DPG),Bedoukian,400 hour(s) at 100.00 %,,,,,floral,green,violet,...,,,,,,,,,,
1077,Honeyflor,Bedoukian,> 1 hour(s) at 100.00 %,,,,,green,sweet,sharp,...,,,,,,,,,,
1078,Nuezate,Bedoukian,,0.01,,,,nutty,sweet,walnut,...,,,,,,,,,,


In [31]:
lab_df.columns

Index(['NAME', 'IAO SUPPLEIR', 'TENACITY @ 100%', 'REC. % SOLUTION', 'TOP',
       'MID', 'BASE', 'FAMILY', 'AROMA PRIMARY', 'AROMA SECONDARY',
       'AROMA TERTIARY', 'IAO STOCK', 'NOTES ON SMELLING', 'Column_14',
       'Column_15', 'Column_16', 'Column_17', 'Column_18', 'Column_19',
       'Column_20', 'Column_21', 'Column_22', 'Column_23', 'Column_24',
       'Column_25'],
      dtype='object')

In [32]:
lab_df["FAMILY"].unique()

array(['aldehydic', 'woody', 'Agrestic', 'aldehydic / lactonic',
       'lactonic', 'musk', ' ', 'fruity', 'gourmand', 'citrus', 'amber',
       'animalic', 'Amber', 'amber ', nan, 'amber/marine', 'fruity?',
       'anisic', 'anisic?', 'leathery', 'marine', 'oud',
       'woody / aromatic', 'floral?', 'terpenic', 'balsamic', 'green',
       'honey', 'gourmand ', 'terpenic?', 'floral', 'incense',
       'fruity (sulphur)', 'woody?', 'smoky', 'herbaceous',
       'camphoraceous', 'spice', 'powdery', 'minty?', 'fantasy',
       'floral / agrestic', 'aromatic', 'gourmand  ', 'aquatic', 'fresh',
       'earthy', 'woody/ balsamic', 'hebaceous', 'resinous', 'herbal ',
       'herbal', 'ozonic?', 'resinous ', 'green ', 'latonic', 'vegetal',
       'Industrial', 'wood', 'ozonic', 'Balsamic', 'woods', 'bready',
       'floral/green', 'floral ', 'musty', 'leather', 'licorice', 'foral',
       'Floral', 'watery', 'ozone', 'minty', 'spicy',
       'animalic / gourmand', 'woody mossy', 'woody amber'

In [34]:
FAMILY_DESCRIPTORS = {
    "aldehydic": ["fatty","waxy","soapy","sparkling"],
    "amber":     ["sweet","resinous","warm","vanillic","oriental"],
    "animalic":  ["musky","civet","leathery","dirty"],
    "aromatic":  ["herbal","camphoraceous","clean","green"],
    "aquatic":   ["marine","watery","ozonic","fresh"],
    "balsamic":  ["resinous","sweet","vanillic","ambery"],
    "camphoraceous": ["cool","eucalyptus","minty","herbal"],
    "citrus":    ["zesty","sparkling","bright","juicy"],
    "earthy":    ["soil","mossy","rooty","dark"],
    "fantasy":   ["abstract","accord","synthetic","conceptual"],
    "floral":    ["petal","powdery","headspace","opulent"],
    "fruity":    ["juicy","lactonic","jammy","sweet"],
    "fresh":     ["clean","airy","bright","crisp"],
    "gourmand":  ["edible","dessert-like","chocolate","caramel"],
    "green":     ["leafy","stemmy","dewy","cut grass"],
    "honey":     ["sweet","syrupy","animalic","warm"],
    "herbal":    ["aromatic","spicy","camphor","leafy"],
    "incense":   ["church","resinous","smoky","spiritual"],
    "leather":   ["tarry","suede","smoky","animalic"],
    "licorice":  ["anise","sweet","spicy","dark"],
    "marine":    ["sea","salty","fresh","watery"],
    "minty":     ["cool","fresh","menthol","sharp"],
    "mossy":     ["forest","earthy","green","woody"],
    "musk":      ["soft","clean","round","fixative"],
    "nutty":     ["toasted","warm","oily","gourmand"],
    "oud":       ["resinous","woody","smoky","medicinal"],
    "ozonic":    ["airy","watery","fresh","light"],
    "powdery":   ["soft","dry","makeup","cosmetic"],
    "resinous":  ["sticky","ambery","incense","pine"],
    "smoky":     ["burnt","tar","incense","charcoal"],
    "spice":     ["warm","piquant","exotic","dry"],
    "terpenic":  ["piney","herbal","camphor","sharp"],
    "tobacco":   ["dry","sweet","leafy","ambery"],
    "vegetal":   ["leafy","green","earthy","stemmy"],
    "watery":    ["aqua","fresh","transparent","thin"],
    "woody":     ["cedar","sandalwood","dry","warm"],

}


In [76]:
import numpy as np

def assign_role(row):
    if pd.notna(row["TOP"]):
        return "top"
    elif pd.notna(row["MID"]):
        return "mid"
    elif pd.notna(row["BASE"]):
        return "base"
    else:
        return np.nan

lab_df["role"] = lab_df.apply(assign_role, axis=1)


In [77]:
lab_df.drop(columns=["TOP", "MID", "BASE"], inplace=True)
lab_df

Unnamed: 0,NAME,IAO SUPPLEIR,TENACITY @ 100%,REC. % SOLUTION,FAMILY,AROMA PRIMARY,AROMA SECONDARY,AROMA TERTIARY,IAO STOCK,NOTES ON SMELLING,...,Column_17,Column_18,Column_19,Column_20,Column_21,Column_22,Column_23,Column_24,Column_25,role
0,Adoxal,Perfumers Apprentice,168hrs,0.1,aldehydic,fresh,marine,floral,x,fresh linen,...,,,,,,,,,,base
1,Agarwood,Eden Botanicals,,,woody,oud,dry,autumnal,x,,...,,,,,,,,,,
2,Agrumen Aldehyde,Vigon,276hrs,,Agrestic,hay,aldehydic,fizzy,,like chanel no. 5 in a barn,...,,,,,,,,,,base
3,Aldehyde C-10 (decanal),Perfumers Apprentice,224hrs,,aldehydic,orange peel,green,waxy,x,,...,,,,,,,,,,base
4,Aldehyde C-11 Lenic,Perfumers Apprentice,380hrs,,aldehydic,fatty,waxy,limey,x,"dried seaweed, sour part of lime, dryer",...,,,,,,,,,,top
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,Trans-2-Decen-1-Al FCC,Bedoukian,57 hours @ 100%,,citrus,waxy,aldehydic,cilantro,x,waxy fatty earthy green cilantro mushroom alde...,...,,,,,,,,,,
1076,Vionil (10% DPG),Bedoukian,400 hour(s) at 100.00 %,,floral,green,violet,cucumber,x,watery note. Slight walnut aspect. violet,...,,,,,,,,,,
1077,Honeyflor,Bedoukian,> 1 hour(s) at 100.00 %,,green,sweet,sharp,rot,x,green honey fruity hyacinth cortex papaya gua...,...,,,,,,,,,,
1078,Nuezate,Bedoukian,,0.01,nutty,sweet,walnut,hazelnut,x,"Characteristic black walnut aroma, accompanied...",...,,,,,,,,,,


In [80]:
lab_df = lab_df.drop(columns=[
    "Column_17","Column_18","Column_19","Column_20","Column_21","Column_22","Column_23","Column_24","Column_25"
])
lab_df


Unnamed: 0,NAME,IAO SUPPLEIR,TENACITY @ 100%,REC. % SOLUTION,FAMILY,AROMA PRIMARY,AROMA SECONDARY,AROMA TERTIARY,NOTES ON SMELLING,role
0,Adoxal,Perfumers Apprentice,168hrs,0.1,aldehydic,fresh,marine,floral,fresh linen,base
1,Agarwood,Eden Botanicals,,,woody,oud,dry,autumnal,,
2,Agrumen Aldehyde,Vigon,276hrs,,Agrestic,hay,aldehydic,fizzy,like chanel no. 5 in a barn,base
3,Aldehyde C-10 (decanal),Perfumers Apprentice,224hrs,,aldehydic,orange peel,green,waxy,,base
4,Aldehyde C-11 Lenic,Perfumers Apprentice,380hrs,,aldehydic,fatty,waxy,limey,"dried seaweed, sour part of lime, dryer",top
...,...,...,...,...,...,...,...,...,...,...
1075,Trans-2-Decen-1-Al FCC,Bedoukian,57 hours @ 100%,,citrus,waxy,aldehydic,cilantro,waxy fatty earthy green cilantro mushroom alde...,
1076,Vionil (10% DPG),Bedoukian,400 hour(s) at 100.00 %,,floral,green,violet,cucumber,watery note. Slight walnut aspect. violet,
1077,Honeyflor,Bedoukian,> 1 hour(s) at 100.00 %,,green,sweet,sharp,rot,green honey fruity hyacinth cortex papaya gua...,
1078,Nuezate,Bedoukian,,0.01,nutty,sweet,walnut,hazelnut,"Characteristic black walnut aroma, accompanied...",


In [81]:
lab_df.to_csv("/Users/ranykhirbawi/Desktop/LunarAIccord/data/lab_materials.csv", index=False)


In [42]:
perfumes_path = "/Users/ranykhirbawi/Desktop/LunarAIccord/data/dataset for the paper _NLP-based perfume notes estimation_ - 시트1.csv"
IN  = Path("data")                             # your input folder
print(IN)

data


In [82]:
# Lunar AIccord — build minimal V0.1 data pack (no family→role mapping, no family remap)
# --------------------------------------------------------------------------------------
# What this cell does (copy–paste and run locally in your repo root):
#   • Reads your real datasets under ./data
#   • Writes:
#       - scentlab_data_pack_v01/data/materials_catalog.jsonl
#       - scentlab_data_pack_v01/data/notes_taxonomy.yaml
#   • Edits you asked for:
#       - Families are taken **exactly as-is** from lab_materials.csv (no FAM_MAP)
#       - No ROLE_BY_FAMILY inference; role comes from the lab file (fallback "mid" if missing)
#       - descriptors = FAMILY_DESCRIPTORS[family] (when available)
#                      + AROMA PRIMARY/SECONDARY/TERTIARY
#                      + NOTES ON SMELLING (split on ; or ,)
#       - usage_hint contains:
#             "recommended_%solution"  ← raw "REC. % SOLUTION"
#             "tenacity_for_100%"     ← raw "TENACITY @ 100%"
#         (keeps simple usage_hint_pct heuristics, using the role if present)
#       - pairs_with is computed from co-occurrence across formulas.jsonl and fra_cleaned.csv
#         (optional; if those files are missing, pairs_with will just be empty)

from __future__ import annotations
from pathlib import Path
from typing import List, Dict, Any, Optional
import pandas as pd, json, yaml, re, ast
from collections import defaultdict, Counter

# ---------------- paths ----------------
IN  = Path("data")                             # your input folder
OUT = Path("scentlab_data_pack_v01") / "data"  # outputs go here
OUT.mkdir(parents=True, exist_ok=True)

PATH_LAB   = "/Users/ranykhirbawi/Desktop/LunarAIccord/data/lab_materials.csv"
PATH_FRA   = "/Users/ranykhirbawi/Desktop/LunarAIccord/data/fra_cleaned.csv"   # optional (Fragrantica-style)
PATH_FORM  = "/Users/ranykhirbawi/Desktop/LunarAIccord/data/formulas.jsonl"    # optional (structured formulas)

# ---------------- helpers ----------------
def slug(s: Any) -> str:
    return re.sub(r"[^a-z0-9]+","-", str(s).strip().lower()).strip("-")

def norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy(); df.columns = [slug(c).replace("-", "_") for c in df.columns]; return df

def to_list(x) -> List[str]:
    if x is None or (isinstance(x, float) and pd.isna(x)): return []
    if isinstance(x, list): return [str(v).strip() for v in x if str(v).strip()]
    s = str(x)
    if s.startswith("[") and s.endswith("]"):
        try:
            arr = ast.literal_eval(s); return [str(v).strip() for v in arr if str(v).strip()]
        except Exception: pass
    return [t.strip() for t in re.split(r"[;,/|]+", s) if t.strip()]

def read_concat_json(path: Path) -> List[dict]:
    """Parse a file containing multiple JSON objects back-to-back (common in scraped dumps)."""
    s = path.read_text(encoding="utf-8")
    dec = json.JSONDecoder(); i = 0; objs = []
    while i < len(s):
        m = re.search(r"\S", s[i:])
        if not m: break
        j = i + m.start()
        obj, end = dec.raw_decode(s, j)
        objs.append(obj); i = end
    return objs

# ---------------- your FAMILY_DESCRIPTORS (as provided) ----------------
FAMILY_DESCRIPTORS = {
    "aldehydic": ["fatty","waxy","soapy","sparkling"],
    "amber":     ["sweet","resinous","warm","vanillic","oriental"],
    "animalic":  ["musky","civet","leathery","dirty"],
    "aromatic":  ["herbal","camphoraceous","clean","green"],
    "aquatic":   ["marine","watery","ozonic","fresh"],
    "balsamic":  ["resinous","sweet","vanillic","ambery"],
    "camphoraceous": ["cool","eucalyptus","minty","herbal"],
    "citrus":    ["zesty","sparkling","bright","juicy"],
    "earthy":    ["soil","mossy","rooty","dark"],
    "fantasy":   ["abstract","accord","synthetic","conceptual"],
    "floral":    ["petal","powdery","headspace","opulent"],
    "fruity":    ["juicy","lactonic","jammy","sweet"],
    "fresh":     ["clean","airy","bright","crisp"],
    "gourmand":  ["edible","dessert-like","chocolate","caramel"],
    "green":     ["leafy","stemmy","dewy","cut grass"],
    "honey":     ["sweet","syrupy","animalic","warm"],
    "herbal":    ["aromatic","spicy","camphor","leafy"],
    "incense":   ["church","resinous","smoky","spiritual"],
    "leather":   ["tarry","suede","smoky","animalic"],
    "licorice":  ["anise","sweet","spicy","dark"],
    "marine":    ["sea","salty","fresh","watery"],
    "minty":     ["cool","fresh","menthol","sharp"],
    "mossy":     ["forest","earthy","green","woody"],
    "musk":      ["soft","clean","round","fixative"],
    "nutty":     ["toasted","warm","oily","gourmand"],
    "oud":       ["resinous","woody","smoky","medicinal"],
    "ozonic":    ["airy","watery","fresh","light"],
    "powdery":   ["soft","dry","makeup","cosmetic"],
    "resinous":  ["sticky","ambery","incense","pine"],
    "smoky":     ["burnt","tar","incense","charcoal"],
    "spice":     ["warm","piquant","exotic","dry"],
    "terpenic":  ["piney","herbal","camphor","sharp"],
    "tobacco":   ["dry","sweet","leafy","ambery"],
    "vegetal":   ["leafy","green","earthy","stemmy"],
    "watery":    ["aqua","fresh","transparent","thin"],
    "woody":     ["cedar","sandalwood","dry","warm"],
}

# ---------------- 1) load lab materials ----------------
lab = norm_cols(pd.read_csv(PATH_LAB))

# detect key columns (by name fragments to be robust)
c_name   = "name" if "name" in lab.columns else next(c for c in lab.columns if "name" in c)
c_family = "family" if "family" in lab.columns else next(c for c in lab.columns if "famil" in c)
c_role   = "role" if "role" in lab.columns else None

c_p1     = next((c for c in lab.columns if "aroma_primary"   in c), None)
c_p2     = next((c for c in lab.columns if "aroma_secondary" in c), None)
c_p3     = next((c for c in lab.columns if "aroma_tertiary"  in c), None)
c_notes  = next((c for c in lab.columns if "notes_on_smelling" in c or "smelling" in c), None)
c_ten    = next((c for c in lab.columns if "tenacity" in c), None)                    # "TENACITY @ 100%"
c_rec    = next((c for c in lab.columns if "rec" in c and "solution" in c), None)     # "REC. % SOLUTION"

# Build base materials (family kept EXACTLY as in file; role not inferred)
materials: Dict[str, Dict[str, Any]] = {}
for _, r in lab.iterrows():
    name = str(r[c_name]).strip()
    if not name: continue
    nid   = slug(name)
    fam   = str(r.get(c_family, "") or "").strip().lower() or "other"
    role  = (str(r.get(c_role, "") or "").strip().lower() if c_role and pd.notna(r.get(c_role)) else "mid")

    # descriptors = FAMILY_DESCRIPTORS[family] + AROMA P/S/T + NOTES ON SMELLING
    desc = []
    desc += FAMILY_DESCRIPTORS.get(fam, [])
    for c in (c_p1, c_p2, c_p3):
        if c and pd.notna(r.get(c)):
            desc.append(str(r[c]).strip().lower())
    if c_notes and pd.notna(r.get(c_notes)):
        desc += [w.strip().lower() for w in re.split(r"[;,]+", str(r[c_notes])) if w.strip()]
    # unique & tidy
    seen=set(); descriptors=[]
    for d in desc:
        d = d.strip().lower()
        if d and d not in seen:
            descriptors.append(d); seen.add(d)
    descriptors = descriptors[:16]

    usage_hint = {
        "recommended_%solution": (str(r[c_rec]).strip() if c_rec and pd.notna(r.get(c_rec)) else None),
        "tenacity_for_100%":     (str(r[c_ten]).strip() if c_ten and pd.notna(r.get(c_ten)) else None),
    }

    materials[nid] = {
        "id": nid,
        "name": name.strip().lower(),
        "family": fam,              # <-- kept as-is
        "role": role,               # <-- no inference from family
        "descriptors": descriptors,
        "aliases": [],
        "allergens": [],            # populate later if you add columns
        "usage_hint": usage_hint,
        "usage_hint_pct": {         # simple heuristics still ok
            "edp_min": 0.1 if role=="top" else (0.2 if role=="mid" else 0.5),
            "edp_max": 3.0 if role=="top" else (5.0 if role=="mid" else 12.0),
        },
    }

# ---------------- 2) pairs_with from optional datasets ----------------


pairs_with_ids = {nid: [k for k,_ in co[nid].most_common(6)] for nid in materials}

# ---------------- 3) notes_taxonomy.yaml ----------------
taxonomy = {"schema_version":"0.1","families":{}}
# families that actually appear in your lab file (kept as-is)
for nid, m in sorted(materials.items(), key=lambda kv: kv[1]["name"]):
    fam = m["family"]
    taxonomy["families"].setdefault(
        fam,
        {"description": " / ".join(FAMILY_DESCRIPTORS.get(fam, [])) or fam, "notes":[]}
    )
    taxonomy["families"][fam]["notes"].append({
        "name": m["name"],
        "role": m["role"],
        "aliases": m.get("aliases", []),
        "pairs_with": [materials[p]["name"] for p in pairs_with_ids.get(nid, []) if p in materials]
    })

(OUT / "notes_taxonomy.yaml").write_text(
    yaml.safe_dump(taxonomy, sort_keys=False, allow_unicode=True), encoding="utf-8"
)

# ---------------- 4) materials_catalog.jsonl ----------------
with (OUT / "materials_catalog.jsonl").open("w", encoding="utf-8") as f:
    for nid, m in sorted(materials.items(), key=lambda kv: kv[1]["name"]):
        fam, role = m["family"], m["role"]
        doc = {
            "schema_version": "0.1",
            "id": m["id"],
            "name": m["name"],
            "family": fam,            # kept as-is
            "role": role,             # from lab or fallback "mid"
            "descriptors": m["descriptors"],
            "pairs_with": [materials[p]["name"] for p in pairs_with_ids.get(nid, []) if p in materials],
            "aliases": m.get("aliases", []),
            "compliance": {"material_key": m["id"], "allergen_keys": m.get("allergens", [])},
            "usage_hint": m.get("usage_hint"),
            "usage_hint_pct": m.get("usage_hint_pct"),
            "embedding_text": f"{m['name']} ({fam}, {role}) – {', '.join(m['descriptors'])}"
        }
        f.write(json.dumps(doc, ensure_ascii=False) + "\n")

print("✅ Wrote:", ("/Users/ranykhirbawi/Desktop/LunarAIccord/data/materials_catalog.jsonl"))
print("✅ Wrote:", ("/Users/ranykhirbawi/Desktop/LunarAIccord/data/notes_taxonomy.yaml"))


✅ Wrote: /Users/ranykhirbawi/Desktop/LunarAIccord/data/materials_catalog.jsonl
✅ Wrote: /Users/ranykhirbawi/Desktop/LunarAIccord/data/notes_taxonomy.yaml


In [84]:
# Normalize formulas.jsonl -> accords_examples.jsonl WITHOUT changing grams
# Absolute paths (macOS):
from pathlib import Path
import json, re

BASE      = Path("/Users/ranykhirbawi/Desktop/LunarAIccord/data")
IN_FORM   = BASE / "formulas_1.jsonl "
MAT_FILE  = BASE / "materials_catalog.jsonl"   # used only to attach roles / canonical names
OUT_FILE  = BASE / "formulas.jsonl"    # agent-facing, grams preserved

def slug(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "-", str(s).strip().lower()).strip("-")

# --- 1) Build name→(canonical_name, role) map from materials_catalog.jsonl ---
name_to_canon = {}   # e.g., "menthone" -> "menthone"
name_to_role  = {}   # e.g., "menthone" -> "mid"

if MAT_FILE.exists():
    with MAT_FILE.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            d = json.loads(line)
            canon = str(d.get("name", "")).strip().lower()
            role  = d.get("role", "mid")
            _id   = str(d.get("id", "")).strip().lower()
            aliases = [str(a).strip().lower() for a in (d.get("aliases") or []) if str(a).strip()]

            for key in filter(None, [canon, _id, *aliases]):
                name_to_canon[key] = canon or key
                name_to_role[key]  = role

def canon_and_role(raw_name: str):
    nm = (raw_name or "").strip().lower()
    if not nm:
        return "", "mid"
    return name_to_canon.get(nm, nm), name_to_role.get(nm, "mid")

# --- 2) Read formulas: supports JSONL and concatenated JSON objects ---
def read_json_records(path: Path):
    # Try JSONL first
    recs = []
    with path.open("r", encoding="utf-8") as f:
        for ln in f:
            s = ln.strip()
            if not s:
                continue
            try:
                recs.append(json.loads(s))
            except json.JSONDecodeError:
                recs = []  # not JSONL; fallback to concatenated
                break
    if recs:
        return recs

    # Fallback: concatenated JSON
    text = path.read_text(encoding="utf-8")
    dec = json.JSONDecoder()
    out, i = [], 0
    while i < len(text):
        m = re.search(r"\S", text[i:])
        if not m:
            break
        j = i + m.start()
        obj, end = dec.raw_decode(text, j)
        out.append(obj)
        i = end
    return out

records = read_json_records(IN_FORM)

# --- 3) Coerce without touching grams ---
def coerce_no_pct(rec: dict):
    title = rec.get("formula_name") or rec.get("formula_id") or "untitled"
    rid   = slug(rec.get("formula_id") or title)
    mats  = rec.get("materials") or []
    if not isinstance(mats, list) or not mats:
        return None

    # Preserve original order; preserve original amounts exactly
    items = []
    for m in mats:
        raw_name = m.get("material_name") or m.get("name") or m.get("material") or ""
        canon, role = canon_and_role(raw_name)
        item = {
            "material": canon,                     # canonical if known; else lowercased original
            "material_original": str(raw_name),    # preserve exact original label
            "grams": m.get("amount_grams"),        # <-- unchanged
        }
        # attach optional fields if present (no transformations)
        if "line_index" in m:        item["line_index"] = m["line_index"]
        if "supplier" in m:          item["supplier"] = m["supplier"]
        if "dilution_percent" in m:  item["dilution_percent"] = m["dilution_percent"]
        if role:                     item["role"] = role   # role from catalog (no math)
        items.append(item)

    out = {
        "schema_version": "0.1",
        "id": rid,
        "title": title,
        "season": "winter",      # defaults you can change later
        "mood": [],
        "style": "unisex",
        "formula": items,
        "meta": {
            # carry through useful provenance as-is
            "author": rec.get("author"),
            "co_author": rec.get("co_author"),
            "for_whom": rec.get("for_whom"),
            "year": rec.get("year"),
            "license": rec.get("license"),
            "total_grams_reported": rec.get("total_grams_reported"),
            "source_image": rec.get("source_image"),
            "further_comments": rec.get("further_comments"),
        }
    }
    return out

# --- 4) Write accords_examples.jsonl (grams preserved) ---
count = 0
with OUT_FILE.open("w", encoding="utf-8") as f:
    for rec in records:
        a = coerce_no_pct(rec)
        if a:
            f.write(json.dumps(a, ensure_ascii=False) + "\n")
            count += 1

print(f"✅ Wrote {count} formulas (grams preserved) -> {OUT_FILE}")


✅ Wrote 10 formulas (grams preserved) -> /Users/ranykhirbawi/Desktop/LunarAIccord/data/formulas.jsonl


In [85]:
# Change season for a single record (id == "blue-ja-2018") to "summer"
from pathlib import Path
import json, tempfile, shutil

p = Path("/Users/ranykhirbawi/Desktop/LunarAIccord/data/formulas.jsonl")
tmp = Path(str(p) + ".tmp")

changed = 0
with p.open("r", encoding="utf-8") as fin, tmp.open("w", encoding="utf-8") as fout:
    for line in fin:
        s = line.strip()
        if not s:
            continue
        try:
            obj = json.loads(s)
        except json.JSONDecodeError:
            fout.write(line)  # pass through any non-JSON lines
            continue
        if obj.get("id") == "blue-chacha-sikes-2018":
            obj["season"] = "summer"   # <- your change
            changed += 1
        elif obj.get("id") == "blue-rubia-chaudri-2018":
            obj["season"] = "summer"
            changed += 1
        elif obj.get("id") == "p22-mr-2018":
            obj["season"] = "summer"
            changed += 1
        elif obj.get("id") == "possession-rubia-chaudri-2018":
            obj["season"] = "summer"
            changed += 1
        else:
            obj = obj
        fout.write(json.dumps(obj, ensure_ascii=False) + "\n")

# Atomically replace the original file
shutil.move(tmp, p)
print(f"Done. Updated {changed} record(s).")


Done. Updated 4 record(s).
