In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the table stored in dr6_list.csv (path is relative to the working
# directory); its 'field' column supplies the field names processed below.
df = pd.read_csv("dr6_list.csv")

In [3]:
# Plain Python list of the values in the 'field' column, iterated field by
# field further down.
fields = df['field'].to_list()

In [4]:
# Per-field NPZ path templates (not folders): `{field}` is filled in with
# str.format(field=...) before the file is opened with np.load. Paths are
# relative to the current working directory.
image_codecs = "codecs/image_tokens/datacube_{field}_tokens.npz"
scalar_codecs = "scalar_tokenizers/scalar_tokens/datacube_{field}_scalar_tokens.npz"
spectrum_tokens = "spectrum_tokenizers/spectrum_tokens/datacube_{field}_spectrum_tokens.npz"

In [5]:
# Absolute path template of the per-field parquet datacube; `{field}` is
# filled in with str.format(field=...) before pd.read_parquet is called.
datacube = "/home/astrodados4/downloads/hypercube/datacube_{field}.parquet"

In [23]:
import numpy as np
import pandas as pd
from pathlib import Path

# ---------------------------
# ID normalizers (same idea)
# ---------------------------

def norm_splus_id(x) -> str:
    """
    Normalize an S-PLUS identifier to a plain ``str``.

    Byte strings are decoded as UTF-8 (undecodable bytes are dropped),
    missing values (``None`` or a float NaN) become ``""``, and every other
    value goes through ``str()``.
    """
    if isinstance(x, (bytes, np.bytes_)):
        return x.decode("utf-8", errors="ignore")
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    return "" if is_missing else str(x)

def norm_gaia_id(x):
    """
    Normalize a Gaia source identifier to a Python ``int``.

    Returns ``None`` for missing values (``None`` or a float NaN) and for
    anything ``int()`` cannot convert (non-numeric text, infinities,
    unsupported types).

    NOTE: a float input is converted as-is; floats cannot represent every
    64-bit integer, so an id that already arrived as a float may have lost
    precision before reaching this function.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return None
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures mean "no usable id"; any other
        # exception is unexpected and should surface instead of being hidden.
        return None

# ---------------------------
# Tokenizer label LUT
# ---------------------------

def load_scalar_tokenizer_config(config_path: str):
    """
    Load the scalar tokenizer configuration NPZ.

    Reads two entries from the archive:
      - "bin_edges": a pickled dict (stored as a 0-d object array) mapping
        column name -> bin edges
      - "N_BINS": the number of bins

    Returns:
      - bin_edges: dict[str, np.ndarray] with float64 edges per column; keys
        are passed through norm_splus_id so bytes keys become str
      - n_bins: int
    """
    # NOTE(security): allow_pickle=True is needed to read the pickled dict,
    # which means loading can execute arbitrary code -- only open trusted files.
    # The context manager closes the archive's file handle once both entries
    # have been read.
    with np.load(config_path, allow_pickle=True) as data:
        bin_edges_raw = data["bin_edges"].item()
        n_bins = int(data["N_BINS"])

    # keys might be bytes -> normalize
    bin_edges = {
        norm_splus_id(k): np.asarray(v, dtype=np.float64)
        for k, v in bin_edges_raw.items()
    }
    return bin_edges, n_bins

def build_edges_lut_for_scalar_cols(bin_edges: dict, scalar_cols: np.ndarray, n_bins: int):
    """
    Look up the bin edges of every scalar column, in column order.

    The result has one entry per element of scalar_cols: the edges array
    found in bin_edges under the normalized column name, or None when the
    column has no entry or its edges do not have exactly n_bins + 1 values.
    """
    expected_len = n_bins + 1
    lut = []
    for raw_name in scalar_cols:
        edges = bin_edges.get(norm_splus_id(raw_name))
        usable = edges is not None and len(edges) == expected_len
        lut.append(edges if usable else None)
    return lut

def decode_scalar_row_to_labels(tokens_row: np.ndarray, scalar_cols: np.ndarray, edges_list: list):
    """
    Turn one row of scalar tokens into per-column bin labels.

    tokens_row, scalar_cols and edges_list are read position by position.
    Returns one dict per token: {"col": name, "tok": token, "lo": lower edge,
    "hi": upper edge}. When a column has no edges (None in edges_list) the
    token is reported unchanged and lo/hi are None; otherwise the token is
    clamped into the valid bin range before the edges are read, and the
    clamped value is what gets reported.
    """
    labels = []
    for pos, raw_tok in enumerate(tokens_row):
        edges = edges_list[pos]
        name = norm_splus_id(scalar_cols[pos])
        token = int(raw_tok)

        lower = upper = None
        if edges is not None:
            # clamp to [0, last bin]: lower bound first, then upper bound
            token = min(max(token, 0), len(edges) - 2)
            lower = float(edges[token])
            upper = float(edges[token + 1])

        labels.append({"col": name, "tok": token, "lo": lower, "hi": upper})
    return labels

# ---------------------------
# NPZ helpers
# ---------------------------

def npz_row_map_by_id(npz, token_key: str, id_normalizer):
    """
    Map each normalized id in npz["ids"] to the row of npz[token_key] at the
    same position. Returns an empty dict when either entry is absent.
    """
    available = npz.files
    if not ("ids" in available and token_key in available):
        return {}
    keys = [id_normalizer(raw) for raw in np.asarray(npz["ids"], dtype=object)]
    rows = npz[token_key]
    return {key: rows[pos] for pos, key in enumerate(keys)}

def build_ragged_map_from_npz(npz, ids_key: str, flat_key: str, indptr_key: str, id_normalizer):
    """
    Map each normalized id to its variable-length token sequence.

    Sequence i is the slice flat[indptr[i]:indptr[i + 1]] of the flat token
    array. Returns an empty dict when any of the three entries is absent.
    """
    available = npz.files
    if any(name not in available for name in (ids_key, flat_key, indptr_key)):
        return {}
    keys = [id_normalizer(raw) for raw in np.asarray(npz[ids_key], dtype=object)]
    values = npz[flat_key]
    offsets = npz[indptr_key]
    return {
        key: values[int(offsets[pos]):int(offsets[pos + 1])]
        for pos, key in enumerate(keys)
    }

# ---------------------------
# MAIN STORE BUILDER
# ---------------------------

def build_object_store_for_field(
    field: str,
    datacube_tmpl: str,
    image_npz_tmpl: str,
    scalar_npz_tmpl: str,
    spectrum_npz_tmpl: str,
    scalar_tokenizer_config_path: str,
    image_token_key: str = "tokens_flat",
    decode_scalar_labels: bool = True,
):
    """
    Build an in-memory store for one field, keyed by S-PLUS id.

    Parameters:
      - field: field name, substituted into every ``*_tmpl`` argument with
        ``str.format(field=field)``
      - datacube_tmpl: parquet path template of the field catalogue
      - image_npz_tmpl / scalar_npz_tmpl / spectrum_npz_tmpl: NPZ path
        templates of the image, scalar and spectrum token files
      - scalar_tokenizer_config_path: NPZ holding the scalar bin edges
      - image_token_key: name of the image token array inside the image NPZ
      - decode_scalar_labels: when True, also decode scalar tokens to (lo, hi)

    Returns:
      dict[id] -> {
        "meta": field, id, ra, dec, gaia_source_id (int or None), r magnitude
                and its error,
        "image_tokens": row looked up by Gaia id, or None,
        "scalar_tokens": row looked up by S-PLUS id, or None,
        "scalar_cols": column names of the scalar tokens,
        "scalar_labels": decoded labels, or None,
        "spectrum_tokens": {"gaiaxp_bp": ..., "gaiaxp_rp": ...},
      }

    Raises:
      RuntimeError if the scalar NPZ lacks ids, scalar_tokens or scalar_cols.

    Prints the number of selected sources, the keys of each NPZ and a
    per-modality coverage summary.
    """
    # 1) read metadata and filter
    df = pd.read_parquet(
        datacube_tmpl.format(field=field),
        columns=["id", "ra", "dec", "gaia_source_id", "mag_pstotal_r", "err_mag_pstotal_r"],
    )

    # keep sources with 14 < r magnitude < 22 and magnitude error below 2
    mask = (
        (df["mag_pstotal_r"] < 22.0)
        & (df["mag_pstotal_r"] > 14.0)
        & (df["err_mag_pstotal_r"] < 2.0)
    )
    df = df.loc[mask].copy()
    df["id"] = df["id"].map(norm_splus_id)
    # gaia_source_id is normalized per row in step 5 rather than with
    # Series.map: pandas re-infers the dtype of a mixed int/None result and
    # can hand back float64, which cannot hold 64-bit ids exactly and turns
    # the "missing" marker into NaN instead of None.

    print(f"[{field}] selected sources: {len(df)}")

    # 2) load npz files; the context managers close all three archives as soon
    #    as the arrays needed below have been read into memory.
    # NOTE(security): allow_pickle=True can execute code from the file --
    # only use with trusted token files.
    with np.load(image_npz_tmpl.format(field=field), allow_pickle=True) as img, \
            np.load(scalar_npz_tmpl.format(field=field), allow_pickle=True) as sca, \
            np.load(spectrum_npz_tmpl.format(field=field), allow_pickle=True) as spe:

        print(f"[{field}] image npz keys   = {list(img.files)}")
        print(f"[{field}] scalar npz keys  = {list(sca.files)}")
        print(f"[{field}] spectrum npz keys= {list(spe.files)}")

        # 3) maps
        # image keyed by Gaia id
        img_map = npz_row_map_by_id(img, token_key=image_token_key, id_normalizer=norm_gaia_id)

        # scalar keyed by S-PLUS id, tokens are (N, n_scalar_cols)
        required = ("ids", "scalar_tokens", "scalar_cols")
        if any(name not in sca.files for name in required):
            raise RuntimeError("Scalar NPZ must contain ids, scalar_tokens, scalar_cols.")

        sca_ids = [norm_splus_id(x) for x in np.asarray(sca["ids"], dtype=object)]
        sca_tokens = sca["scalar_tokens"]
        sca_cols = np.asarray(sca["scalar_cols"], dtype=object)

        sca_map = {sid: sca_tokens[i] for i, sid in enumerate(sca_ids)}

        # spectrum keyed by S-PLUS id (ragged)
        bp_map = build_ragged_map_from_npz(
            spe,
            ids_key="ids",
            flat_key="tokens_gaiaxp_bp_flat",
            indptr_key="tokens_gaiaxp_bp_indptr",
            id_normalizer=norm_splus_id,
        )
        rp_map = build_ragged_map_from_npz(
            spe,
            ids_key="ids",
            flat_key="tokens_gaiaxp_rp_flat",
            indptr_key="tokens_gaiaxp_rp_indptr",
            id_normalizer=norm_splus_id,
        )

    # 4) tokenizer LUT (for decoding scalar tokens -> (lo,hi))
    bin_edges, n_bins = load_scalar_tokenizer_config(scalar_tokenizer_config_path)
    edges_list = build_edges_lut_for_scalar_cols(bin_edges, sca_cols, n_bins)

    # 5) merge store (keyed by S-PLUS id)
    store = {}
    for r in df.itertuples(index=False):
        sid = r.id
        gid = norm_gaia_id(r.gaia_source_id)  # exact Python int, or None

        scalar_row = sca_map.get(sid)
        if decode_scalar_labels and scalar_row is not None:
            scalar_labels = decode_scalar_row_to_labels(scalar_row, sca_cols, edges_list)
        else:
            scalar_labels = None

        store[sid] = {
            "meta": {
                "field": field,
                "id": sid,  # S-PLUS id (string)
                "ra": float(r.ra),
                "dec": float(r.dec),
                "gaia_source_id": gid,  # int or None
                "mag_pstotal_r": float(r.mag_pstotal_r),
                "err_mag_pstotal_r": float(r.err_mag_pstotal_r),
            },
            "image_tokens": (img_map.get(gid) if gid is not None else None),
            "scalar_tokens": scalar_row,
            "scalar_cols": sca_cols,  # keep col order for later use
            "scalar_labels": scalar_labels,
            "spectrum_tokens": {
                "gaiaxp_bp": bp_map.get(sid),
                "gaiaxp_rp": rp_map.get(sid),
            },
        }

    # coverage report: a spectrum only counts when its sequence is non-empty
    def _has_tokens(seq):
        return seq is not None and len(seq) > 0

    n = len(store)
    n_img = sum(v["image_tokens"] is not None for v in store.values())
    n_sca = sum(v["scalar_tokens"] is not None for v in store.values())
    n_bp = sum(_has_tokens(v["spectrum_tokens"]["gaiaxp_bp"]) for v in store.values())
    n_rp = sum(_has_tokens(v["spectrum_tokens"]["gaiaxp_rp"]) for v in store.values())

    print(f"[{field}] coverage: image={n_img}/{n} scalar={n_sca}/{n} bp={n_bp}/{n} rp={n_rp}/{n}")

    return store

In [None]:
# Driver: for every field, print a quick summary of the catalogue selection
# and of the token files, then build the per-object store.
# NOTE(review): `store` is rebound on every iteration, so only the last
# field's store survives the loop -- confirm that is intended.
for field in fields:
    field_df = pd.read_parquet(
        datacube.format(field=field),
        columns=['id', 'ra', 'dec', 'gaia_source_id', 'mag_pstotal_r', 'err_mag_pstotal_r'],
    )
    # same selection as inside build_object_store_for_field
    mask = (
        (field_df['mag_pstotal_r'] < 22.0)
        & (field_df['mag_pstotal_r'] > 14.0)
        & (field_df['err_mag_pstotal_r'] < 2)
    )

    field_df = field_df[mask]
    print(f"Field: {field}, Selected sources: {len(field_df)}")

    image_token = np.load(image_codecs.format(field=field), allow_pickle=True)
    scalar_token = np.load(scalar_codecs.format(field=field), allow_pickle=True)
    spectrum_token = np.load(spectrum_tokens.format(field=field), allow_pickle=True)

    print(f"Field: {field}")
    print(f"Image tokens shape: {image_token['ids'].shape}")
    print(f"Scalar tokens shape: {scalar_token['ids'].shape}")
    print(f"Spectrum tokens shape: {spectrum_token['ids'].shape}")

    # Pass the templates themselves: the builder substitutes `field` on its
    # own, so pre-formatting here would run str.format twice on each path.
    store = build_object_store_for_field(
        field=field,
        datacube_tmpl=datacube,
        image_npz_tmpl=image_codecs,
        scalar_npz_tmpl=scalar_codecs,
        spectrum_npz_tmpl=spectrum_tokens,
        scalar_tokenizer_config_path="scalar_tokenizers/scalar_tokenizer_config.npz",
    )

Field: HYDRA-0011, Selected sources: 36491
Field: HYDRA-0011
Image tokens shape: (19331,)
Scalar tokens shape: (36496,)
Spectrum tokens shape: (77127,)
[HYDRA-0011] selected sources: 36491
[HYDRA-0011] image npz keys   = ['tokens_2d', 'tokens_flat', 'ids', 'id_col']
[HYDRA-0011] scalar npz keys  = ['scalar_tokens', 'scalar_cols', 'ids', 'id_col', 'N_BINS']
[HYDRA-0011] spectrum npz keys= ['ids', 'id_col', 'tokens_gaiaxp_bp_flat', 'tokens_gaiaxp_bp_indptr', 'tokens_gaiaxp_rp_flat', 'tokens_gaiaxp_rp_indptr']
[HYDRA-0011] coverage: image=18768/36491 scalar=36491/36491 bp=608/36491 rp=608/36491
Field: HYDRA-0012, Selected sources: 34632
Field: HYDRA-0012
Image tokens shape: (18431,)
Scalar tokens shape: (34656,)
Spectrum tokens shape: (80758,)
[HYDRA-0012] selected sources: 34632
[HYDRA-0012] image npz keys   = ['tokens_2d', 'tokens_flat', 'ids', 'id_col']
[HYDRA-0012] scalar npz keys  = ['scalar_tokens', 'scalar_cols', 'ids', 'id_col', 'N_BINS']
[HYDRA-0012] spectrum npz keys= ['ids', 'i