In [1]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [5]:
# Metadata, Embedding, CosSim (NSAI)

# pip install openai scikit-learn pandas numpy

import os, json, hashlib, getpass
import numpy as np
import pandas as pd
from openai import OpenAI

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Input folders on the mounted Drive: CSV tables are read from csv_folder,
# .json knowledge files from json_folder.
csv_folder  = "/content/drive/MyDrive/Geochem_data_060"
json_folder = "/content/drive/MyDrive/Geochem_Knowledge_060"

# Metadata columns that are rendered to text and embedded per sample.
# They are also excluded when numeric feature columns are inferred.
META_COLS = [
    "SYSTEM_TYPE",
    "PRIMARY_CLASS", "SECONDARY_CLASS", "SPECIFIC_NAME",
    "HOST_ROCK_TYPE",
    "STRATIGRAPHY",
    "MINERALIZATION",
    "ALTERATION",
    "IGNEOUS_FORM",
    "METAMORPHISM", "FACIES_GRADE",
    "GEOLOGIC_AGE", "GEOLOGIC_AGE_DEPOSIT", "GEOLOGIC_AGE_HOST",
    "HOST_NAME",
]


# Explicit list of numeric feature columns; None means "infer from column dtypes".
NUMERIC_COLS = None

# Maximum number of rows kept per DEPOSIT_TYPE; None disables downsampling.
MAX_PER_CLASS = 200

# Embedding caches: JSON files, paths relative to the current working directory.
KB_CACHE_PATH     = "kb_embedding_cache.json"
SAMPLE_CACHE_PATH = "sample_meta_embedding_cache.json"

# OpenAI embedding model; EMB_DIM is the length of the zero vector used for empty text.
EMB_MODEL = "text-embedding-3-small"
EMB_DIM   = 1536

# Keyword arguments passed to every RandomForestClassifier (random_state is set per run).
RF_PARAMS = dict(
    n_estimators=600,
    max_depth=None,
    min_samples_split=5,
    max_features="sqrt",
    n_jobs=-1
)

# Prefer the OPENAI_API_KEY environment variable so the notebook can be re-run
# unattended; otherwise fall back to an interactive prompt (input is not echoed).
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("🔐 Enter your OpenAI API key: ")
client = OpenAI(api_key=api_key)

def sha16(s: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``s`` (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    full_hex = digest.hexdigest()
    return full_hex[:16]

def load_cache(path: str) -> dict:
    """Load a cache stored as a JSON object in ``path``.

    Returns an empty dict when the file does not exist, cannot be read, is not
    valid JSON, or does not hold a JSON object. Callers can therefore always
    treat the result as a mutable mapping. An unusable file is reported on
    stdout instead of being ignored silently.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            cached = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"Cache file '{path}' could not be read ({exc}); starting with an empty cache.")
        return {}
    if not isinstance(cached, dict):
        # A list or scalar would break the `cache[key] = ...` writes done by callers.
        print(f"Cache file '{path}' does not contain a JSON object; starting with an empty cache.")
        return {}
    return cached

def save_cache(path: str, obj: dict):
    """Write ``obj`` to ``path`` as UTF-8 JSON, replacing any previous file atomically.

    The JSON is written to a sibling ``<path>.tmp`` file first and then moved
    into place with ``os.replace``. A failed or interrupted dump therefore
    leaves the previous cache file untouched instead of truncated.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False)
        os.replace(tmp_path, path)
    finally:
        # The temporary file only still exists if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def get_embedding(text: str) -> list:
    """Embed ``text`` with the OpenAI model named by ``EMB_MODEL``.

    Newlines are replaced by spaces and surrounding whitespace is stripped
    before the request. Falsy or blank input returns a zero vector of length
    ``EMB_DIM`` without calling the API.
    """
    cleaned = text.replace("\n", " ").strip() if text else ""
    if cleaned == "":
        return [0.0] * EMB_DIM
    response = client.embeddings.create(input=[cleaned], model=EMB_MODEL)
    first_item = response.data[0]
    return first_item.embedding

def safe_load_json_as_text(path: str) -> str:
    """Read ``path`` (UTF-8) and return its content as one string.

    The stripped content is returned as follows:
      * empty file -> ``""``
      * valid JSON -> the document re-serialized with ``json.dumps``
        (``ensure_ascii=False``), which normalizes whitespace
      * anything else -> the stripped raw text
    """
    # Context manager closes the handle; the bare open().read() left that to the GC.
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read().strip()
    if not raw:
        return ""
    try:
        return json.dumps(json.loads(raw), ensure_ascii=False)
    except (ValueError, RecursionError):
        # Not parseable as JSON (or too deeply nested): fall back to the raw text.
        return raw

def cosine_sim(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Pairwise cosine similarity between the rows of ``A`` (n, d) and ``B`` (m, d).

    Returns an (n, m) matrix. A small epsilon is added to each row norm so that
    all-zero rows give similarity 0 instead of dividing by zero.
    """
    eps = 1e-12
    a_norms = np.linalg.norm(A, axis=1, keepdims=True)
    b_norms = np.linalg.norm(B, axis=1, keepdims=True)
    a_unit = A / (a_norms + eps)
    b_unit = B / (b_norms + eps)
    return np.matmul(a_unit, b_unit.T)

# load data

def load_all_csv(csv_dir: str) -> pd.DataFrame:
    """Concatenate every CSV in ``csv_dir`` that has a ``DEPOSIT_TYPE`` column.

    Files are matched on a case-insensitive ``.csv`` suffix and read in sorted
    file-name order, so the row order of the result is deterministic.
    The ``DEPOSIT_TYPE`` column of the result is cast to ``str``.

    Raises:
        FileNotFoundError: if no CSV with a ``DEPOSIT_TYPE`` column is found.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and anything seeded that depends on row positions) reproducible.
    for fn in sorted(os.listdir(csv_dir)):
        if not fn.lower().endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(csv_dir, fn))
        if "DEPOSIT_TYPE" in df.columns:
            frames.append(df)
    if not frames:
        raise FileNotFoundError(f"No CSV with DEPOSIT_TYPE found in {csv_dir}")
    data = pd.concat(frames, ignore_index=True)
    # NOTE(review): astype(str) turns missing labels into the literal string "nan",
    # which then acts as a class of its own. Confirm this is intended.
    data["DEPOSIT_TYPE"] = data["DEPOSIT_TYPE"].astype(str)
    return data

def downsample_per_class(df: pd.DataFrame, max_per_class: int | None) -> pd.DataFrame:
    if max_per_class is None:
        return df
    return (df.groupby("DEPOSIT_TYPE", group_keys=False)
              .apply(lambda x: x.sample(n=min(len(x), max_per_class), random_state=42))
              .reset_index(drop=True))

def infer_numeric_cols(df: pd.DataFrame, meta_cols: list[str]) -> list[str]:
    """Return the numeric-dtype columns of ``df`` in column order.

    ``DEPOSIT_TYPE`` and every name in ``meta_cols`` are skipped.
    """
    excluded = {"DEPOSIT_TYPE", *meta_cols}
    return [
        col
        for col in df.columns
        if col not in excluded and pd.api.types.is_numeric_dtype(df[col])
    ]

# deposit knowledge embeddings

def build_kb_embeddings(json_dir: str) -> tuple[list[str], np.ndarray]:
    """Embed every ``.json`` knowledge file in ``json_dir``.

    The deposit-type name is the file name without its ``_knowledge.json``
    suffix, or without ``.json`` when that suffix is absent. Suffixes are
    matched case-insensitively, like the ``.json`` filter. Embeddings are
    cached in ``KB_CACHE_PATH`` under ``"<name>|<sha16 of the file text>"``,
    so a file is re-embedded only when its content changes.

    Returns:
        ``(names, embs)``: names sorted alphabetically and the row-aligned
        embedding matrix, one row per name.

    Raises:
        FileNotFoundError: if ``json_dir`` contains no ``.json`` file.
    """
    suffixes = ("_knowledge.json", ".json")  # longest first
    cache = load_cache(KB_CACHE_PATH)
    entries = []  # (deposit_type, embedding vector)

    try:
        for fn in sorted(os.listdir(json_dir)):
            lowered = fn.lower()
            if not lowered.endswith(".json"):
                continue
            suffix = next(s for s in suffixes if lowered.endswith(s))
            deposit_type = fn[: len(fn) - len(suffix)].strip()
            text = safe_load_json_as_text(os.path.join(json_dir, fn))
            key = f"{deposit_type}|{sha16(text)}"
            if key not in cache:
                cache[key] = get_embedding(text)
            entries.append((deposit_type, np.array(cache[key], dtype=float)))
    finally:
        # Persist what was embedded so far even if a later request fails.
        save_cache(KB_CACHE_PATH, cache)

    if not entries:
        raise FileNotFoundError(f"No .json found in {json_dir}")

    entries.sort(key=lambda entry: entry[0])
    names = [name for name, _ in entries]
    embs = np.vstack([vec for _, vec in entries])
    return names, embs



def row_to_meta_text(row: pd.Series, meta_cols: list[str]) -> str:
    """Render the metadata fields of one row as ``"col: value | col: value"``.

    Columns are taken in ``meta_cols`` order. ``DEPOSIT_TYPE`` is never included,
    and columns absent from ``row`` are skipped. Values that are missing, blank,
    or one of the placeholders ``nan`` / ``none`` / ``null`` (case-insensitive)
    are skipped too. Kept values have whitespace collapsed and are lower-cased,
    as are the column names. Returns ``""`` when nothing is kept.
    """

    def _normalized(value):
        # None marks a value that must not appear in the output text.
        if pd.isna(value):
            return None
        text = str(value).strip()
        if text.lower() in ("", "nan", "none", "null"):
            return None
        return " ".join(text.split()).lower()

    wanted = [c for c in meta_cols if c != "DEPOSIT_TYPE" and c in row.index]
    rendered = ((c, _normalized(row[c])) for c in wanted)
    return " | ".join(f"{c.lower()}: {v}" for c, v in rendered if v is not None)


def build_sample_meta_embeddings(df: pd.DataFrame, meta_cols: list[str]) -> np.ndarray:
    """Embed the metadata text of every row of ``df``.

    Each row is rendered with ``row_to_meta_text`` and embedded with
    ``get_embedding``. Results are cached in ``SAMPLE_CACHE_PATH`` under the
    ``sha16`` of the text, so identical texts are embedded once.

    Returns:
        Array of shape ``(len(df), embedding length)``, rows in ``df`` order.
        An empty ``df`` gives an array of shape ``(0, EMB_DIM)``.
    """
    cache = load_cache(SAMPLE_CACHE_PATH)
    out = []

    try:
        for i in range(len(df)):
            text = row_to_meta_text(df.iloc[i], meta_cols)
            key = sha16(text)
            if key not in cache:
                cache[key] = get_embedding(text)
            out.append(np.array(cache[key], dtype=float))
    finally:
        # One request is made per uncached row. Save whatever was obtained even
        # if a request fails midway, so a re-run does not repeat those calls.
        save_cache(SAMPLE_CACHE_PATH, cache)

    if not out:
        # np.vstack raises on an empty list.
        return np.empty((0, EMB_DIM), dtype=float)
    return np.vstack(out)

# drop-in variants for K

def permute_rows(K: np.ndarray, seed: int) -> np.ndarray:
    """Return a copy of ``K`` with whole rows reordered by a permutation seeded with ``seed``."""
    row_order = np.random.RandomState(seed).permutation(len(K))
    return np.take(K, row_order, axis=0)

def col_shuffle(K: np.ndarray, seed: int) -> np.ndarray:
    """Return a copy of ``K`` in which every column is shuffled independently.

    A single ``RandomState(seed)`` draws one row permutation per column, in
    column order. ``K`` itself is not modified.
    """
    rng = np.random.RandomState(seed)
    n_rows, n_cols = len(K), K.shape[1]
    shuffled = K.copy()
    for col in range(n_cols):
        row_order = rng.permutation(n_rows)
        # Column `col` of the copy is still untouched here, so reading from K is equivalent.
        shuffled[:, col] = K[row_order, col]
    return shuffled


def fit_eval_rf(Xtr, ytr, Xte, yte, seed: int):
    """Fit a random forest on the training split and score it on the test split.

    The classifier uses ``RF_PARAMS`` with ``random_state=seed``.

    Returns:
        dict with keys ``"acc"`` (accuracy), ``"macro_f1"`` and ``"weighted_f1"``.
    """
    forest = RandomForestClassifier(random_state=seed, **RF_PARAMS)
    forest.fit(Xtr, ytr)
    y_hat = forest.predict(Xte)

    scores = {"acc": accuracy_score(yte, y_hat)}
    for label, averaging in (("macro_f1", "macro"), ("weighted_f1", "weighted")):
        scores[label] = f1_score(yte, y_hat, average=averaging)
    return scores

def summarize(metrics_list):
    """Aggregate per-run metric dicts into ``{metric: (mean, sample std)}``.

    Covers the keys ``acc``, ``macro_f1`` and ``weighted_f1``; the standard
    deviation uses ``ddof=1``.
    """
    metric_names = ("acc", "macro_f1", "weighted_f1")
    columns = {
        name: np.array([run[name] for run in metrics_list], float)
        for name in metric_names
    }
    summary = {}
    for name, values in columns.items():
        summary[name] = (values.mean(), values.std(ddof=1))
    return summary

# MAIN

if __name__ == "__main__":
    # Load all labelled CSV rows and cap the number of rows kept per deposit type.
    df = load_all_csv(csv_folder)
    df = downsample_per_class(df, MAX_PER_CLASS)

    for c in META_COLS:
        if c not in df.columns:
            print(f"⚠️ META_COL '{c}' not found in df columns. It will be treated as missing for all rows.")

    y = df["DEPOSIT_TYPE"].values
    if NUMERIC_COLS is None:
        numeric_cols = infer_numeric_cols(df, META_COLS)
    else:
        numeric_cols = NUMERIC_COLS

    print(f"Samples={len(df)} | Classes={pd.Series(y).nunique()} | Numeric={len(numeric_cols)} | MetaCols={META_COLS}")

    # 1) KB embeddings: one vector per knowledge file
    kb_names, kb_emb = build_kb_embeddings(json_folder)
    print(f"KB types loaded: {len(kb_names)}")

    # 2) Sample metadata embeddings: one vector per row
    print("Building sample metadata embeddings (once) …")
    S_meta = build_sample_meta_embeddings(df, META_COLS)  # (n, embedding length)

    # 3) K_all: cosine similarity of every sample to every KB entry
    K_all = cosine_sim(S_meta, kb_emb)  # (n, number of KB entries)

    # 4) Quick validation: one stratified 80/20 split per seed
    seeds = list(range(10))
    variant_names = ["baseline", "nsai", "drop_perm", "drop_col", "drop_zero"]
    res = {k: [] for k in variant_names}

    idx_all = np.arange(len(df))
    for seed in seeds:
        tr_idx, te_idx = train_test_split(idx_all, test_size=0.2, random_state=seed, stratify=y)

        # Encode labels on the training split only.
        le = LabelEncoder()
        ytr = le.fit_transform(y[tr_idx])

        # Drop test rows whose class never appears in training; le.transform would fail on them.
        yte_raw = y[te_idx]
        mask = np.isin(yte_raw, le.classes_)
        te_idx2 = te_idx[mask]
        yte = le.transform(y[te_idx2])

        Xtr_raw = df.iloc[tr_idx][numeric_cols].values
        Xte_raw = df.iloc[te_idx2][numeric_cols].values

        # Imputer and scaler are fitted on the training split and only applied to the test split.
        imputer = SimpleImputer(strategy="mean")
        scaler  = StandardScaler()

        Xtr_imp = imputer.fit_transform(Xtr_raw)
        Xte_imp = imputer.transform(Xte_raw)

        Xtr_num = scaler.fit_transform(Xtr_imp)
        Xte_num = scaler.transform(Xte_imp)

        # ---- K (computed from metadata text, not from labels) ----
        Ktr = K_all[tr_idx, :]      # (n_train, number of KB entries)
        Kte = K_all[te_idx2, :]

        # baseline: numeric features only
        res["baseline"].append(fit_eval_rf(Xtr_num, ytr, Xte_num, yte, seed))

        # All other variants append a (train, test) K block to the numeric features:
        #   nsai      - the real similarities
        #   drop_perm - whole K rows permuted, train and test independently
        #   drop_col  - every K column shuffled, train and test independently
        #   drop_zero - K replaced by zeros
        k_variants = {
            "nsai": (Ktr, Kte),
            "drop_perm": (permute_rows(Ktr, seed), permute_rows(Kte, seed + 100)),
            "drop_col": (col_shuffle(Ktr, seed), col_shuffle(Kte, seed + 100)),
            "drop_zero": (np.zeros_like(Ktr), np.zeros_like(Kte)),
        }
        for name, (Ktr_v, Kte_v) in k_variants.items():
            res[name].append(
                fit_eval_rf(
                    np.hstack([Xtr_num, Ktr_v]), ytr,
                    np.hstack([Xte_num, Kte_v]), yte,
                    seed
                )
            )

    # The seed count is taken from `seeds` so the header cannot drift from the loop.
    print(f"\n===== SUMMARY (mean ± std, {len(seeds)} seeds) =====")
    for key in variant_names:
        s = summarize(res[key])
        print(f"\n{key.upper()}")
        print(f"  Acc     : {s['acc'][0]:.3f} ± {s['acc'][1]:.3f}")
        print(f"  MacroF1 : {s['macro_f1'][0]:.3f} ± {s['macro_f1'][1]:.3f}")
        print(f"  WeightF1: {s['weighted_f1'][0]:.3f} ± {s['weighted_f1'][1]:.3f}")

    # Sanity check: the KB entries most similar to one sample's metadata text.
    i0 = 0
    topk = 5
    sims = K_all[i0]
    order = np.argsort(-sims)[:topk]
    print(f"\n[Sanity check] Sample {i0} meta text = {row_to_meta_text(df.iloc[i0], META_COLS)}")
    print(f"Top-{topk} KB similarity:")
    for j in order:
        print(f"  {kb_names[j]} : {sims[j]:.3f}")


üîê Enter your OpenAI API key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


  .apply(lambda x: x.sample(n=min(len(x), max_per_class), random_state=42))


Samples=2281 | Classes=17 | Numeric=76 | MetaCols=['SYSTEM_TYPE', 'PRIMARY_CLASS', 'SECONDARY_CLASS', 'SPECIFIC_NAME', 'HOST_ROCK_TYPE', 'STRATIGRAPHY', 'MINERALIZATION', 'ALTERATION', 'IGNEOUS_FORM', 'METAMORPHISM', 'FACIES_GRADE', 'GEOLOGIC_AGE', 'GEOLOGIC_AGE_DEPOSIT', 'GEOLOGIC_AGE_HOST', 'HOST_NAME']
KB types loaded: 17
Building sample metadata embeddings (once) …





===== SUMMARY (mean ± std, 10 seeds) =====

BASELINE
  Acc     : 0.979 ± 0.007
  MacroF1 : 0.969 ± 0.011
  WeightF1: 0.979 ± 0.007

NSAI
  Acc     : 0.984 ± 0.004
  MacroF1 : 0.973 ± 0.011
  WeightF1: 0.984 ± 0.005

DROP_PERM
  Acc     : 0.977 ± 0.007
  MacroF1 : 0.965 ± 0.013
  WeightF1: 0.977 ± 0.007

DROP_COL
  Acc     : 0.978 ± 0.008
  MacroF1 : 0.967 ± 0.012
  WeightF1: 0.977 ± 0.008

DROP_ZERO
  Acc     : 0.978 ± 0.006
  MacroF1 : 0.968 ± 0.010
  WeightF1: 0.978 ± 0.006

[Sanity check] Sample 0 meta text = system_type: magmatic ree | primary_class: mineral | specific_name: calcite | host_rock_type: carbonatite | geologic_age_host: eocene | host_name: bear lodge carbonatite
Top-5 KB similarity:
  Carbonatite : 0.559
  Polymetallic sulfide skarn : 0.520
  Polymetallic sulfide skarn_replacement : 0.515
  High sulfidation Au-Ag : 0.482
  Porphyry Cu (Au) : 0.481
