# NeurIPS 2024 Ariel Data Challenge — HuggingFace Dataset Upload

**Purpose**: Preprocess every planet in the Ariel competition dataset and push  
the results to `alexy-louis/ariel-exoplanet-2024` on the HuggingFace Hub.

**Outputs**:
- `data/preprocessed/{train,test}/{planet_id}.npz` — one compressed NumPy archive per planet
- HuggingFace dataset repository with `ariel_dataset.py` loading script

> **Note**: This notebook is written to run on Kaggle and requires the `ariel-data-challenge-2024` competition dataset to be attached as an input.

## 1. Setup

In [None]:
# Make sure every third-party package used below is importable; pip-install
# any that are missing into the interpreter running this notebook.
import subprocess
import sys

REQUIRED_PACKAGES = ["h5py", "tqdm", "huggingface_hub", "datasets"]

for pkg in REQUIRED_PACKAGES:
    # Distribution names may contain '-', importable module names use '_'.
    module_name = pkg.replace("-", "_")
    try:
        __import__(module_name)
    except ImportError:
        print(f"{pkg}: not found — installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg, "-q"])
        print(f"{pkg}: installed")
    else:
        print(f"{pkg}: already installed")

print("[Done] Package check complete.")

In [None]:
import os
import sys
import warnings
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Path configuration
# ---------------------------------------------------------------------------

# Put the project checkout at the front of sys.path so that src.preprocessing
# can be imported later in the notebook.
# On Kaggle, clone the repo first (uncomment the subprocess call below).
REPO_DIR = "/kaggle/working/ariel-exoplanet-ml"
sys.path.insert(0, REPO_DIR)

# Uncomment to clone the repository on Kaggle:
# subprocess.run(
#     ["git", "clone",
#      "https://github.com/Smooth-Cactus0/ariel-exoplanet-ml.git",
#      REPO_DIR],
#     check=True,
# )

# ---------------------------------------------------------------------------
# HuggingFace token — paste your token below or set the HF_TOKEN env variable
# ---------------------------------------------------------------------------
# A pasted token takes precedence. When nothing is pasted, a token that is
# already present in the environment is kept instead of being overwritten
# with an empty string; if neither exists HF_TOKEN ends up empty and the
# upload section refuses to run.
_PASTED_HF_TOKEN = ""  # Set your HuggingFace token here
os.environ["HF_TOKEN"] = _PASTED_HF_TOKEN or os.environ.get("HF_TOKEN", "")

# ---------------------------------------------------------------------------
# Input / output locations
# ---------------------------------------------------------------------------
# DATA_ROOT is read from; OUT_DIR receives one sub-directory per split.
DATA_ROOT = Path("/kaggle/input/ariel-data-challenge-2024")
OUT_DIR = Path("/kaggle/working/preprocessed")
OUT_DIR.mkdir(parents=True, exist_ok=True)
(OUT_DIR / "train").mkdir(exist_ok=True)
(OUT_DIR / "test").mkdir(exist_ok=True)

# ---------------------------------------------------------------------------
# Names of the datasets read from each planet's HDF5 group
# TODO: verify key names against the actual files (an earlier note says
#       dataset.py uses 'AIRS-CH0' and 'FGS1').
# ---------------------------------------------------------------------------
AIRS_KEY = "AIRS-CH0"
FGS1_KEY = "FGS1"

# ---------------------------------------------------------------------------
# Values forwarded to preprocess_planet (must match training config)
# ---------------------------------------------------------------------------
INGRESS = 0.20
EGRESS = 0.80
BIN_SIZE = 5

# Echo the effective configuration, one setting per line.
print(
    f"DATA_ROOT : {DATA_ROOT}  (exists={DATA_ROOT.exists()})",
    f"OUT_DIR   : {OUT_DIR}",
    f"AIRS key  : {AIRS_KEY}",
    f"FGS1 key  : {FGS1_KEY}",
    f"Ingress / Egress / BinSize : {INGRESS} / {EGRESS} / {BIN_SIZE}",
    sep="\n",
)

print("[Done] Setup complete.")

## 2. Preprocess All Training Planets

In [None]:
# Read the per-planet auxiliary table and, when the file is present, the
# quartiles table that provides the labels.
aux_path = DATA_ROOT / "AuxillaryTable.csv"
labels_path = DATA_ROOT / "QuartilesTable.csv"

df_aux = pd.read_csv(aux_path, index_col=0)
n_aux_planets, n_aux_columns = df_aux.shape
print(f"AuxillaryTable  : {n_aux_planets} planets, {n_aux_columns} features")

# QuartilesTable has columns like '0_q1', '0_q2', '0_q3', '1_q1', ...
if not labels_path.exists():
    # No labels available: downstream code checks for df_labels being None.
    df_labels, labelled_ids = None, set()
    print("WARNING: QuartilesTable.csv not found — no label extraction possible.")
else:
    df_labels = pd.read_csv(labels_path, index_col=0)
    # Planet IDs are compared as strings throughout the notebook.
    labelled_ids = set(df_labels.index.astype(str))
    print(f"QuartilesTable  : {len(labelled_ids)} labelled planets")

print(f"[Done] Tables loaded. {len(labelled_ids)} of {n_aux_planets} planets are labelled.")

In [None]:
def parse_quartile_row(df_labels: pd.DataFrame, planet_id: str):
    """
    Extract (target_mean, target_std) from QuartilesTable for a single planet.

    Column naming convention: '{wavelength_idx}_q1', '{wavelength_idx}_q2', '{wavelength_idx}_q3'
    Target mean = q2 (median)
    Target std  = (q3 - q1) / 2  (half inter-quartile range: a robust spread
                  estimate; for a Gaussian it equals about 0.674 sigma, so it
                  is NOT a 1-sigma value)

    Parameters
    ----------
    df_labels : DataFrame indexed by planet ID, or None when no labels exist.
    planet_id : planet identifier; matched against the index either as given
                or as the integer it spells (e.g. "12" matches 12).

    Returns
    -------
    (target_mean, target_std) as float32 arrays, or (None, None) if the planet
    is not in the table, the table has no quartile columns, or the row cannot
    be parsed (a warning is emitted in the last two cases).
    """
    if df_labels is None:
        return None, None

    # Resolve the actual index label once instead of stringifying the whole
    # index on every call; this also works when the index holds strings.
    index = df_labels.index
    if planet_id in index:
        key = planet_id
    else:
        try:
            key = int(planet_id)
        except (TypeError, ValueError):
            return None, None
        # Require an exact round trip so that e.g. "012" does not match 12.
        if str(key) != str(planet_id) or key not in index:
            return None, None

    try:
        row = df_labels.loc[key]
        q1 = row.filter(regex=r"^\d+_q1$").values.astype(np.float32)
        q2 = row.filter(regex=r"^\d+_q2$").values.astype(np.float32)
        q3 = row.filter(regex=r"^\d+_q3$").values.astype(np.float32)
        if q2.size == 0:
            # Empty arrays would otherwise be stored as if they were labels.
            raise ValueError("no '<idx>_q2' columns found")
        target_mean = q2
        target_std = (q3 - q1) / 2.0
        return target_mean, target_std
    except (KeyError, ValueError) as exc:
        warnings.warn(f"Could not parse labels for planet {planet_id}: {exc}")
        return None, None

print("[Done] parse_quartile_row helper defined.")

In [None]:
# Import the preprocessing function from src/preprocessing.py
try:
    from src.preprocessing import preprocess_planet
    print("Imported preprocess_planet from src.preprocessing")
except ImportError as exc:
    print(f"WARNING: Could not import preprocess_planet ({exc}).")
    print("Ensure REPO_DIR is set correctly and the repo has been cloned.")
    raise

# ---------------------------------------------------------------------------
# Preprocess training planets
# ---------------------------------------------------------------------------
# For every planet listed in the auxiliary table: read the raw AIRS / FGS1
# arrays from the HDF5 file, run preprocess_planet, attach auxiliary features
# and (when available) labels, and write one compressed .npz per planet.
TRAIN_HDF5 = DATA_ROOT / "train.hdf5"  # TODO: verify key names inside this file
train_out_dir = OUT_DIR / "train"

all_train_ids = list(df_aux.index.astype(str))
n_total   = len(all_train_ids)
n_success = 0
n_labelled_saved = 0
n_failed  = 0
total_bytes = 0
# Width of the zero-filled fallback, taken from the table rather than hard-coded.
aux_width = df_aux.shape[1]

print(f"Opening {TRAIN_HDF5} ...")

try:
    h5_train = h5py.File(TRAIN_HDF5, "r")
except OSError as exc:
    print(f"ERROR: Cannot open {TRAIN_HDF5}: {exc}")
    print("Ensure the Kaggle dataset is attached to this notebook.")
    raise

# The HDF5 handle is closed when the `with` block exits, even on error.
with h5_train, tqdm(total=n_total, desc="Train planets", unit="planet") as pbar:
    for row_pos, pid in enumerate(all_train_ids):
        out_path = train_out_dir / f"{pid}.npz"
        try:
            # Skip if already done (allows resuming after interruption).
            # Cached files still count towards the size statistics so the
            # summary stays meaningful on a resumed run.
            if out_path.exists():
                n_success += 1
                total_bytes += out_path.stat().st_size
                # The cached file is not re-opened; it is assumed to carry
                # labels whenever the planet appears in the labels table.
                if pid in labelled_ids:
                    n_labelled_saved += 1
                pbar.set_postfix(skipped="(cached)")
                continue

            try:
                # --- Load raw HDF5 data -----------------------------------
                # TODO: verify key names — expected: planet group > AIRS-CH0, FGS1
                planet_grp = h5_train[pid]
                airs_raw = planet_grp[AIRS_KEY][()].astype(np.float32)  # (time, 356)
                fgs1_raw = planet_grp[FGS1_KEY][()].astype(np.float32)  # (time,)
            except KeyError as exc:
                warnings.warn(f"KeyError for planet {pid}: {exc} — skipping.")
                n_failed += 1
                continue
            except Exception as exc:
                warnings.warn(f"Unexpected error loading planet {pid}: {exc} — skipping.")
                n_failed += 1
                continue

            # --- Auxiliary features ---------------------------------------
            # all_train_ids follows df_aux's row order, so a positional lookup
            # works for any index dtype (no int(pid) round trip needed).
            try:
                aux_row = df_aux.iloc[row_pos].values.astype(np.float32)  # (9,)
            except (TypeError, ValueError) as exc:
                aux_row = np.zeros(aux_width, dtype=np.float32)
                warnings.warn(
                    f"Auxiliary features unusable for planet {pid}: {exc} — using zeros."
                )

            # --- Preprocessing pipeline -----------------------------------
            try:
                result = preprocess_planet(
                    airs_raw, fgs1_raw,
                    ingress=INGRESS, egress=EGRESS, bin_size=BIN_SIZE,
                )
            except Exception as exc:
                warnings.warn(f"Preprocessing failed for planet {pid}: {exc} — skipping.")
                n_failed += 1
                continue

            # --- Label extraction (optional) ------------------------------
            target_mean, target_std = parse_quartile_row(df_labels, pid)

            # --- Save to .npz ---------------------------------------------
            save_kwargs = dict(
                airs_norm        = result["airs_norm"],          # (time_binned, 356)
                fgs1_norm        = result["fgs1_norm"],          # (time_binned,)
                aux              = aux_row,                       # (9,)
                transit_depth    = result["transit_depth"],      # (356,)
                transit_depth_err= result["transit_depth_err"],  # (356,)
                mask_oot         = result["mask_oot"],           # (time_binned,)
            )
            if target_mean is not None:
                save_kwargs["target_mean"] = target_mean  # (283,)
                save_kwargs["target_std"]  = target_std   # (283,)

            # Write to a temporary file and rename it into place, so an
            # interrupted run never leaves a truncated .npz that the resume
            # check above would mistake for a finished planet.
            tmp_path = out_path.with_name(out_path.name + ".tmp")
            try:
                with open(tmp_path, "wb") as fh:
                    np.savez_compressed(fh, **save_kwargs)
                tmp_path.replace(out_path)
            finally:
                # Only still present if the write or rename failed.
                if tmp_path.exists():
                    tmp_path.unlink()

            if target_mean is not None:
                n_labelled_saved += 1
            total_bytes += out_path.stat().st_size
            n_success += 1
            pbar.set_postfix(
                success=n_success, labelled=n_labelled_saved, failed=n_failed
            )
        finally:
            # Exactly one progress tick per planet, whatever the outcome.
            pbar.update(1)

# Summary
avg_kb = (total_bytes / max(n_success, 1)) / 1024
print("\n" + "=" * 60)
print("Train preprocessing summary")
print("=" * 60)
print(f"  Total planets     : {n_total}")
print(f"  Successfully saved: {n_success}")
print(f"  Labelled planets  : {n_labelled_saved}")
print(f"  Failed / skipped  : {n_failed}")
print(f"  Total size on disk: {total_bytes / 1_048_576:.1f} MB")
print(f"  Avg file size     : {avg_kb:.1f} KB")
print("[Done] Train preprocessing complete.")

## 3. Preprocess All Test Planets

In [None]:
# Load test auxiliary table (planet IDs to iterate over)
test_aux_path = DATA_ROOT / "AuxillaryTable_test.csv"
if test_aux_path.exists():
    df_aux_test = pd.read_csv(test_aux_path, index_col=0)
else:
    # Fallback: some competitions use a single AuxillaryTable for both splits
    print("WARNING: AuxillaryTable_test.csv not found — attempting to use train table.")
    df_aux_test = df_aux.copy()

# For every planet in the test table: read the raw arrays from the HDF5 file,
# run preprocess_planet and write one compressed .npz (no label arrays).
TEST_HDF5 = DATA_ROOT / "test.hdf5"  # TODO: verify key names inside this file
test_out_dir = OUT_DIR / "test"

all_test_ids   = list(df_aux_test.index.astype(str))
n_total_test   = len(all_test_ids)
n_success_test = 0
n_failed_test  = 0
total_bytes_test = 0
# Width of the zero-filled fallback, taken from the table rather than hard-coded.
aux_width_test = df_aux_test.shape[1]

print(f"Test planets to process : {n_total_test}")
print(f"Opening {TEST_HDF5} ...")

# A missing test file is tolerated: the split is skipped and the summary
# below reports zero processed planets.
try:
    h5_test = h5py.File(TEST_HDF5, "r")
except OSError as exc:
    print(f"ERROR: Cannot open {TEST_HDF5}: {exc}")
    print("Ensure the Kaggle dataset is attached. Skipping test split.")
    h5_test = None

if h5_test is not None:
    # The HDF5 handle is closed when the `with` block exits, even on error.
    with h5_test, tqdm(total=n_total_test, desc="Test planets", unit="planet") as pbar:
        for row_pos, pid in enumerate(all_test_ids):
            out_path = test_out_dir / f"{pid}.npz"
            try:
                # Resume support: cached files are skipped but still counted
                # in the size statistics so the summary stays meaningful.
                if out_path.exists():
                    n_success_test += 1
                    total_bytes_test += out_path.stat().st_size
                    pbar.set_postfix(skipped="(cached)")
                    continue

                try:
                    # TODO: verify key names — expected: planet group > AIRS-CH0, FGS1
                    planet_grp = h5_test[pid]
                    airs_raw = planet_grp[AIRS_KEY][()].astype(np.float32)
                    fgs1_raw = planet_grp[FGS1_KEY][()].astype(np.float32)
                except KeyError as exc:
                    warnings.warn(f"KeyError for test planet {pid}: {exc} — skipping.")
                    n_failed_test += 1
                    continue
                except Exception as exc:
                    warnings.warn(f"Error loading test planet {pid}: {exc} — skipping.")
                    n_failed_test += 1
                    continue

                # all_test_ids follows df_aux_test's row order, so a positional
                # lookup works for any index dtype (no int(pid) round trip).
                try:
                    aux_row = df_aux_test.iloc[row_pos].values.astype(np.float32)
                except (TypeError, ValueError) as exc:
                    aux_row = np.zeros(aux_width_test, dtype=np.float32)
                    warnings.warn(
                        f"Aux features unusable for test planet {pid}: {exc} — using zeros."
                    )

                try:
                    result = preprocess_planet(
                        airs_raw, fgs1_raw,
                        ingress=INGRESS, egress=EGRESS, bin_size=BIN_SIZE,
                    )
                except Exception as exc:
                    warnings.warn(f"Preprocessing failed for test planet {pid}: {exc}.")
                    n_failed_test += 1
                    continue

                # Write to a temporary file and rename it into place, so an
                # interrupted run never leaves a truncated .npz that the
                # resume check above would mistake for a finished planet.
                tmp_path = out_path.with_name(out_path.name + ".tmp")
                try:
                    with open(tmp_path, "wb") as fh:
                        np.savez_compressed(
                            fh,
                            airs_norm        = result["airs_norm"],
                            fgs1_norm        = result["fgs1_norm"],
                            aux              = aux_row,
                            transit_depth    = result["transit_depth"],
                            transit_depth_err= result["transit_depth_err"],
                            mask_oot         = result["mask_oot"],
                        )
                    tmp_path.replace(out_path)
                finally:
                    # Only still present if the write or rename failed.
                    if tmp_path.exists():
                        tmp_path.unlink()

                total_bytes_test += out_path.stat().st_size
                n_success_test += 1
                pbar.set_postfix(success=n_success_test, failed=n_failed_test)
            finally:
                # Exactly one progress tick per planet, whatever the outcome.
                pbar.update(1)

avg_kb_test = (total_bytes_test / max(n_success_test, 1)) / 1024
print("\n" + "=" * 60)
print("Test preprocessing summary")
print("=" * 60)
print(f"  Total planets     : {n_total_test}")
print(f"  Successfully saved: {n_success_test}")
print(f"  Failed / skipped  : {n_failed_test}")
print(f"  Total size on disk: {total_bytes_test / 1_048_576:.1f} MB")
print(f"  Avg file size     : {avg_kb_test:.1f} KB")
print("[Done] Test preprocessing complete.")

## 4. Validate One Sample

In [None]:
# Load back one .npz from the train split to sanity-check contents and shapes:
# print a per-array table (shape, dtype, min, max) and plot the transit depth
# spectrum, overlaying the stored targets when they are present.
train_npz_files = sorted((OUT_DIR / "train").glob("*.npz"))

if not train_npz_files:
    print("WARNING: No .npz files found in train output dir — cannot validate.")
else:
    sample_path = train_npz_files[0]
    planet_id = sample_path.stem
    print(f"Sample planet ID  : {planet_id}")
    print(f"NPZ file path     : {sample_path}")
    print(f"File size         : {sample_path.stat().st_size / 1024:.1f} KB")
    print()
    print(f"{'Key':<22} {'Shape':<25} {'dtype':<10} {'min':>10}  {'max':>10}")
    print("-" * 80)

    # np.load returns an NpzFile that keeps the archive open; the `with`
    # block closes it once every array needed below has been read.
    with np.load(sample_path, allow_pickle=False) as sample:
        for key in sorted(sample.files):
            arr = sample[key]
            # min/max are undefined for empty arrays — report NaN instead.
            vmin = float(arr.min()) if arr.size > 0 else float('nan')
            vmax = float(arr.max()) if arr.size > 0 else float('nan')
            print(f"  {key:<20} {str(arr.shape):<25} {str(arr.dtype):<10} {vmin:>10.4f}  {vmax:>10.4f}")

        transit_depth = sample["transit_depth"]
        transit_depth_err = sample["transit_depth_err"]

        # Targets are only overlaid when stored with the expected 283 entries.
        has_targets = (
            "target_mean" in sample.files and len(sample["target_mean"]) == 283
        )
        if has_targets:
            target_mean = sample["target_mean"]
            target_std  = sample["target_std"]

    # --- Plot transit depth spectrum ----------------------------------------
    n_channels = len(transit_depth)
    wl_idx = np.arange(n_channels)

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(wl_idx, transit_depth, lw=0.9, color="steelblue", label="Transit depth")
    ax.fill_between(
        wl_idx,
        transit_depth - transit_depth_err,
        transit_depth + transit_depth_err,
        alpha=0.25, color="steelblue", label="±1σ",
    )

    # Overlay target_mean if available
    if has_targets:
        # The 283 target values are spread evenly over the plotted channel
        # range purely for display; this is not a wavelength calibration.
        target_wl   = np.linspace(0, n_channels - 1, 283)
        ax.plot(target_wl, target_mean, lw=1.2, color="darkorange",
                linestyle="--", label="Target mean (q2, 283 channels)")
        ax.fill_between(
            target_wl,
            target_mean - target_std,
            target_mean + target_std,
            alpha=0.20, color="darkorange",
        )

    ax.set_xlabel("AIRS-CH0 channel index")
    ax.set_ylabel("Transit depth (fractional)")
    ax.set_title(f"Planet {planet_id} — extracted transit depth spectrum ({n_channels} channels)")
    ax.legend()
    plt.tight_layout()
    plt.show()

    print(f"[Done] Validation complete for planet {planet_id}.")

## 5. Push to HuggingFace Hub

In [None]:
from huggingface_hub import HfApi, login

# Read the token configured in the Setup cell (or exported in the
# environment). Surrounding whitespace from copy/paste is removed because it
# would make an otherwise valid token fail authentication.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not HF_TOKEN:
    raise ValueError(
        "HF_TOKEN is empty. Set os.environ['HF_TOKEN'] in the Setup cell "
        "before running this section."
    )

login(token=HF_TOKEN, add_to_git_credential=False)
api = HfApi()

REPO_ID = "alexy-louis/ariel-exoplanet-2024"

# Report the account the token actually belongs to instead of a hard-coded
# name, so a token for a different account is noticed before uploading.
hf_username = api.whoami()["name"]
print(f"[Done] Logged in to HuggingFace Hub as {hf_username}.")

In [None]:
# Ensure the public dataset repository exists on the Hub; with exist_ok=True
# an already-existing repository is accepted instead of raising.
api.create_repo(repo_id=REPO_ID, repo_type="dataset", private=False, exist_ok=True)

repo_url = f"https://huggingface.co/datasets/{REPO_ID}"
print(f"Repository ready: {repo_url}")
print("[Done] Repository created (or already exists).")

In [None]:
# Push everything under OUT_DIR (train/ and test/ sub-directories) to the
# dataset repository, placing it below data/preprocessed/.
print(f"Uploading {OUT_DIR} → {REPO_ID}:data/preprocessed/ ...")
print("(This may take several minutes depending on dataset size.)")

api.upload_folder(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path=str(OUT_DIR),
    path_in_repo="data/preprocessed",
    commit_message="Upload preprocessed .npz files (train + test)",
)

print("[Done] Preprocessed data uploaded.")

In [None]:
# Publish the loading script from the cloned repository to the root of the
# dataset repo. A missing script only produces a warning; nothing is uploaded.
LOADING_SCRIPT = os.path.join(REPO_DIR, "hf_dataset", "ariel_dataset.py")

if os.path.exists(LOADING_SCRIPT):
    api.upload_file(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_or_fileobj=LOADING_SCRIPT,
        path_in_repo="ariel_dataset.py",
        commit_message="Add HuggingFace Datasets loading script",
    )
    print(f"Uploaded loading script: {LOADING_SCRIPT}")
else:
    print(f"WARNING: Loading script not found at {LOADING_SCRIPT}.")
    print("Ensure the repository is cloned at REPO_DIR.")

print("[Done] Upload complete!")
print(f"Dataset URL: https://huggingface.co/datasets/{REPO_ID}")

## 6. Verify Load from Hub

In [None]:
from datasets import load_dataset

# Round-trip check: pull the train split back from the Hub and describe the
# first example field by field.
print(f"Loading dataset from Hub: {REPO_ID}")
print("(First load will download and cache the data — may take a while.)")

ds = load_dataset(REPO_ID, split="train")

print("\nDataset info:")
print(ds)

print("\nFirst example keys:")
sample_hub = ds[0]
print(list(sample_hub.keys()))

print("\nFirst example shapes / lengths:")
for key, val in sample_hub.items():
    is_sized = hasattr(val, '__len__')
    if is_sized and hasattr(val, 'shape'):
        # Array-like value: report its full shape.
        print(f"  {key:<22}: shape={val.shape}")
    elif is_sized:
        # Plain sized container (e.g. list or str): report its length.
        print(f"  {key:<22}: len={len(val)}")
    else:
        # Unsized value: print it as-is.
        print(f"  {key:<22}: {val}")

print("[Done] Dataset verified from Hub.")

## 7. Summary

### What was uploaded

- **Train split**: one `.npz` file per planet under `data/preprocessed/train/`  
  Each file contains: `airs_norm`, `fgs1_norm`, `aux`, `transit_depth`, `transit_depth_err`, `mask_oot`.  
  Labelled planets additionally contain: `target_mean`, `target_std`.

- **Test split**: one `.npz` file per planet under `data/preprocessed/test/`  
  Same structure, without label arrays.

- **Loading script**: `ariel_dataset.py` — a `datasets.GeneratorBasedBuilder` enabling  
  one-line loading via `load_dataset()`.

### How to use the dataset

```python
from datasets import load_dataset

# Load full train split
ds_train = load_dataset("alexy-louis/ariel-exoplanet-2024", split="train")

# Load full test split
ds_test = load_dataset("alexy-louis/ariel-exoplanet-2024", split="test")

# Access a single planet
planet = ds_train[0]
print(planet.keys())
# dict_keys: planet_id, airs_norm, fgs1_norm, aux,
#            transit_depth, transit_depth_err,
#            target_mean (labelled only), target_std (labelled only)

import numpy as np
airs = np.array(planet["airs_norm"])   # (time_binned, 356)
fgs1 = np.array(planet["fgs1_norm"])   # (time_binned,)
td   = np.array(planet["transit_depth"])  # (356,)
```

### Links

- HuggingFace Dataset: https://huggingface.co/datasets/alexy-louis/ariel-exoplanet-2024
- Kaggle Competition: https://www.kaggle.com/competitions/ariel-data-challenge-2024
- GitHub Repository: https://github.com/Smooth-Cactus0/ariel-exoplanet-ml