# NeurIPS 2024 Ariel Data Challenge — Baseline Models

**Goal**: Establish quantitative lower bounds on predictive performance using simple, interpretable baselines before training neural or tree-based models.

**Baselines implemented**:
1. **Constant predictor** — predict the training-set median for every planet and wavelength.
2. **Per-wavelength Ridge regression** — fit 283 independent Ridge regressors on the 5 ADC calibration features.
3. **Sigma sensitivity analysis** — demonstrate how miscalibrated uncertainty degrades GLL even when the mean prediction is held fixed (at the Baseline 1 median).

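**Metric**: Gaussian Log-Likelihood (GLL).  Higher is better.  The score is not capped at 0: with a calibrated sigma it is roughly $-\tfrac{1}{2}\left(\log(2\pi\sigma^2) + 1\right)$, which is positive once sigma falls below about 0.24.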

$$\text{GLL}(y, \mu, \sigma) = -\frac{1}{2} \mathbb{E}\left[\log(2\pi\sigma^2) + \left(\frac{y - \mu}{\sigma}\right)^2\right]$$

> **Note**: This notebook is Kaggle-ready. Run it as a Kaggle kernel with the `ariel-data-challenge-2024` dataset attached.

## 1. Setup

In [None]:
import subprocess, sys
from pathlib import Path

# ── Kaggle: make the project checkout importable ───────────────────────────
# The repository is cloned into the kernel's working directory only when no
# checkout is present; an existing checkout is reused as-is.
repo_dir = "/kaggle/working/ariel-exoplanet-ml"
project_dir = f"{repo_dir}/Kaggle competition/ARIEL neurIPS"

if Path(repo_dir).exists():
    print(f"Repo already exists at {repo_dir}")
else:
    clone_cmd = [
        "git",
        "clone",
        "https://github.com/Smooth-Cactus0/ariel-exoplanet-ml.git",
        repo_dir,
    ]
    # check=True: a failed clone aborts the cell instead of continuing
    # without the project sources.
    subprocess.run(clone_cmd, check=True)
    print(f"Cloned repo to {repo_dir}")

# Index 0 puts the project directory ahead of every other import location.
sys.path.insert(0, project_dir)

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline

# ── Data root ──────────────────────────────────────────────────────────────
DATA_ROOT = Path("/kaggle/input/ariel-data-challenge-2024")

# ── Plot style ─────────────────────────────────────────────────────────────
# Applied globally so every figure in the notebook shares the same look.
_PLOT_STYLE = {
    "figure.dpi": 110,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.dpi": 150,
    "savefig.facecolor": "white",
}
plt.rcParams.update(_PLOT_STYLE)

# ── Output directories ────────────────────────────────────────────────────
OUT_DIR = Path("/kaggle/working/baseline_results")
FIG_DIR = OUT_DIR / "figures"
for _directory in (OUT_DIR, FIG_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# ── Constants ──────────────────────────────────────────────────────────────
N_WAVELENGTHS = 283   # competition output dimension
RANDOM_STATE  = 42

# ── Environment report ────────────────────────────────────────────────────
_setup_report = [
    f"Python      : {sys.version.split()[0]}",
    f"NumPy       : {np.__version__}",
    f"Pandas      : {pd.__version__}",
    f"Matplotlib  : {matplotlib.__version__}",
    f"DATA_ROOT   : {DATA_ROOT}",
    f"Exists      : {DATA_ROOT.exists()}",
    f"OUT_DIR     : {OUT_DIR}",
    "[Done] Setup complete.",
]
print("\n".join(_setup_report))

## 2. Load Labels and Auxiliary Features

- `train_adc_info.csv` — 5 ADC calibration features per planet: `FGS1_adc_offset`, `FGS1_adc_gain`, `AIRS-CH0_adc_offset`, `AIRS-CH0_adc_gain`, `star`.
- `train_labels.csv` — mean transmission spectrum `wl_1` ... `wl_283` for labelled planets.

We merge the two tables on `planet_id` and retain only the labelled planets for training.  Labels provide only the **mean** spectrum (no quartiles, no sigma).

In [None]:
# ── Locate the competition CSVs; build a synthetic stand-in if absent ──────
adc_path   = DATA_ROOT / "train_adc_info.csv"
label_path = DATA_ROOT / "train_labels.csv"

# ADC feature columns (planet_id is the join key, not a feature)
AUX_FEATURE_COLS = [
    "FGS1_adc_offset",
    "FGS1_adc_gain",
    "AIRS-CH0_adc_offset",
    "AIRS-CH0_adc_gain",
    "star",
]
N_AUX_FEATURES = len(AUX_FEATURE_COLS)  # 5

# Wavelength label columns: wl_1 ... wl_<N_WAVELENGTHS>
WL_COLS = [f"wl_{channel}" for channel in range(1, N_WAVELENGTHS + 1)]

# Synthetic data is used only when at least one of the two CSVs is missing.
_USING_SYNTHETIC = not (adc_path.exists() and label_path.exists())

if _USING_SYNTHETIC:
    print("WARNING: CSV files not found. Generating SYNTHETIC fallback data.")

    rng_synth = np.random.default_rng(RANDOM_STATE)
    N_PLANETS_ALL      = 400
    N_PLANETS_LABELLED = 100

    # Synthetic planet IDs
    all_planet_ids = np.arange(N_PLANETS_ALL)

    # Synthetic ADC features: one standard-normal column per feature
    aux_data = rng_synth.normal(0, 1, size=(N_PLANETS_ALL, N_AUX_FEATURES))
    df_adc = pd.DataFrame(aux_data, columns=AUX_FEATURE_COLS)
    df_adc.insert(0, "planet_id", all_planet_ids)

    # Synthetic labels (means only) for the first N_PLANETS_LABELLED planets
    labelled_ids = all_planet_ids[:N_PLANETS_LABELLED]
    wl_means = rng_synth.normal(0.01, 0.003, size=(N_PLANETS_LABELLED, N_WAVELENGTHS))

    # wl_means.T iterates over wavelength columns, pairing each with its name.
    label_dict = {"planet_id": labelled_ids, **dict(zip(WL_COLS, wl_means.T))}
    df_labels = pd.DataFrame(label_dict)

    print(f"  Synthetic train_adc_info : {df_adc.shape}")
    print(f"  Synthetic train_labels   : {df_labels.shape}")
else:
    print("Loading real CSV files...")
    df_adc    = pd.read_csv(adc_path)
    df_labels = pd.read_csv(label_path)
    print(f"  train_adc_info : {df_adc.shape}")
    print(f"  train_labels   : {df_labels.shape}")

# ── Normalise planet_id types for safe merging ─────────────────────────────
# Both keys become strings so the join cannot miss rows over a dtype mismatch.
for _frame in (df_adc, df_labels):
    _frame["planet_id"] = _frame["planet_id"].astype(str)

# ── Merge: keep only labelled planets (inner join on planet_id) ────────────
df_merged = df_labels.merge(df_adc, on="planet_id", how="inner")

print(f"\nMerged labelled set shape : {df_merged.shape}")
print(f"  Labelled planets  : {len(df_merged)}")
print(f"  Total columns     : {df_merged.shape[1]}")

# ── Extract numpy arrays ───────────────────────────────────────────────────
Y_mean = df_merged[WL_COLS].to_numpy(dtype=np.float64, copy=True)           # (n_planets, 283)
X_aux  = df_merged[AUX_FEATURE_COLS].to_numpy(dtype=np.float64, copy=True)  # (n_planets, 5)

print(f"\nY_mean shape : {Y_mean.shape}  (labelled planets x wavelengths)")
print(f"X_aux  shape : {X_aux.shape}  (labelled planets x aux features)")
print(f"Aux features ({N_AUX_FEATURES}): {AUX_FEATURE_COLS}")
print("[Done] Labels and auxiliary features loaded and merged.")

## 3. Gaussian Log-Likelihood (GLL) — Implementation

The competition metric is:

$$\text{GLL}(y, \mu, \sigma) = -\frac{1}{2} \operatorname{mean}\!\left[\log(2\pi\sigma^2) + \left(\frac{y-\mu}{\sigma}\right)^2\right]$$

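- **Higher is better**. For a calibrated predictor the score is roughly $-\tfrac{1}{2}\left(\log(2\pi\sigma^2) + 1\right)$, so it is not capped at 0 and turns positive once sigma drops below about 0.24.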
- Both **mean accuracy** (term 2) and **uncertainty calibration** (term 1) matter.
- Overconfident predictions (sigma too small) are penalised by the squared residual blowing up.
- Underconfident predictions (sigma too large) are penalised by the log(sigma^2) term.

Since `train_labels.csv` provides only the mean transmission spectrum (no quartiles or sigma), uncertainty is purely a model responsibility.

In [None]:
def gaussian_log_likelihood(
    y: np.ndarray,
    mu: np.ndarray,
    sigma: np.ndarray,
) -> float:
    """
    Compute the Gaussian Log-Likelihood score used throughout this notebook.

    GLL = -0.5 * mean( log(2*pi*sigma^2) + ((y - mu) / sigma)^2 )

    Higher is better.  The score is NOT bounded above by 0: with exact means
    it reduces to -0.5 * log(2*pi*sigma^2), which is positive for small sigma
    (about +1.38 at sigma = 0.1).

    Parameters
    ----------
    y     : (...,) ground truth values (array or array-like)
    mu    : (...,) predicted means, broadcastable against y
    sigma : (...,) predicted stds, broadcastable against y; values below
            1e-9 (including zero and negatives) are clipped to 1e-9

    Returns
    -------
    float  — GLL score averaged over all elements
    """
    # Coerce to float arrays so plain lists and scalars work, not just ndarrays.
    y = np.asarray(y, dtype=np.float64)
    mu = np.asarray(mu, dtype=np.float64)
    # The clip keeps the log and the division finite for degenerate sigmas.
    sigma = np.clip(np.asarray(sigma, dtype=np.float64), 1e-9, None)

    log_norm = np.log(2.0 * np.pi * sigma ** 2)   # penalises large sigma
    sq_z = ((y - mu) / sigma) ** 2                # penalises small sigma
    return float(-0.5 * np.mean(log_norm + sq_z))


# ── Quick sanity checks ────────────────────────────────────────────────────
# Exact means with a moderate sigma; the result is compared against scipy's
# reference Gaussian log-density.
y_test    = np.array([1.0, 2.0, 3.0])
mu_test   = np.array([1.0, 2.0, 3.0])
sig_test  = np.array([0.1, 0.1, 0.1])

gll_perfect_mean = gaussian_log_likelihood(y_test, mu_test, sig_test)
gll_scipy = float(np.mean(scipy.stats.norm.logpdf(y_test, loc=mu_test, scale=sig_test)))

print(f"GLL (perfect mean, sigma=0.1)  : {gll_perfect_mean:.6f}")
print(f"scipy reference                : {gll_scipy:.6f}")
print(f"Difference vs scipy            : {abs(gll_perfect_mean - gll_scipy):.2e}  (should be <1e-10)")

# Verify: small sigma → large POSITIVE GLL when the prediction is exact.
# With y == mu the score is -0.5 * log(2*pi*sigma^2), about +12.9 at 1e-6.
sigma_tiny = np.full_like(y_test, 1e-6)
gll_tiny = gaussian_log_likelihood(y_test, mu_test, sigma_tiny)
print(f"GLL (perfect mean, sigma=1e-6) : {gll_tiny:.4f}  (large positive expected)")

# Verify: large sigma → negative GLL, driven by the log(sigma^2) term
sigma_large = np.full_like(y_test, 100.0)
gll_large = gaussian_log_likelihood(y_test, mu_test, sigma_large)
print(f"GLL (perfect mean, sigma=100)  : {gll_large:.4f}  (strongly negative expected)")

print("[Done] GLL function implemented and verified.")

## 4. Baseline 1 — Constant Predictor (Training-Set Median)

The simplest possible baseline: ignore auxiliary features entirely and predict the training-set
median for every planet.

- **mu[lambda]** = median of wl_* values over all training planets at wavelength lambda
- **sigma[lambda]** = std of training residuals (Y_train - mu) at each wavelength

Since labels provide only the mean spectrum (no quartiles/IQR), we estimate sigma from the
standard deviation of training residuals.

This establishes the floor that *any* model must beat: if a model cannot outperform a global
constant, it has learned nothing from the data.

In [None]:
# ── Train / held-out 80/20 split ───────────────────────────────────────────
n_planets = Y_mean.shape[0]
X_train, X_val, Y_train, Y_val = train_test_split(
    X_aux,
    Y_mean,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print(f"Train split : {len(X_train)} planets")
print(f"Val   split : {len(X_val)} planets")

# ── Constant predictor fitted on the training rows only ────────────────────
# mu[lambda]: per-wavelength median across training planets
mu_const = np.median(Y_train, axis=0)                          # (283,)

# sigma[lambda]: per-wavelength std of the training residuals around that
# median, floored at 1e-9 so a constant column cannot produce a zero sigma.
resid_const_train = Y_train - mu_const                         # (n_train, 283)
sigma_const = np.maximum(np.std(resid_const_train, axis=0), 1e-9)   # (283,)

print(f"\nConstant predictor:")
print(f"  mu    : mean={mu_const.mean():.6f},  std={mu_const.std():.6f}")
print(f"  sigma : mean={sigma_const.mean():.6f}, std={sigma_const.std():.6f}")

# ── Score on the held-out rows ─────────────────────────────────────────────
# Every validation planet receives the same (mu, sigma) row.
n_val = Y_val.shape[0]
mu_val_const    = np.repeat(mu_const[np.newaxis, :],    n_val, axis=0)   # (n_val, 283)
sigma_val_const = np.repeat(sigma_const[np.newaxis, :], n_val, axis=0)   # (n_val, 283)

gll_const = gaussian_log_likelihood(Y_val, mu_val_const, sigma_val_const)

print(f"\nBaseline (constant median) GLL = {gll_const:.4f}")
print("[Done] Baseline 1 (constant predictor) evaluated.")

## 5. Baseline 2 — Per-Wavelength Ridge Regression on ADC Features

Fit 283 independent Ridge regressors (one per wavelength channel), each using the 5 standardised
ADC calibration features to predict the mean transmission depth at that wavelength.

- **Features** (`X`): 5 ADC parameters (`FGS1_adc_offset`, `FGS1_adc_gain`, `AIRS-CH0_adc_offset`, `AIRS-CH0_adc_gain`, `star`), standardised via `StandardScaler`.
- **Target** (`y`): mean transmission depth at each wavelength (from `train_labels.csv`).
- **Sigma**: estimated from the standard deviation of in-fold training residuals.
- **Evaluation**: 5-fold cross-validation, mean GLL across all folds and wavelengths.

This tests whether the ADC calibration features carry *any* predictive signal about the spectrum shape.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

ALPHA   = 1.0    # Ridge regularisation strength
N_FOLDS = 5

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)


def _ridge_fold_gll(train_idx, val_idx):
    """Fit scaler + Ridge on one fold's training rows; return (GLL, n_val)."""
    Y_fit, Y_held = Y_mean[train_idx], Y_mean[val_idx]

    # The scaler sees training rows only, so no validation statistics leak in.
    scaler = StandardScaler().fit(X_aux[train_idx])
    X_fit = scaler.transform(X_aux[train_idx])
    X_held = scaler.transform(X_aux[val_idx])

    # A 2-D target makes Ridge fit all wavelength outputs in a single call.
    model = Ridge(alpha=ALPHA, fit_intercept=True).fit(X_fit, Y_fit)

    # One sigma per wavelength: std of the in-fold TRAINING residuals.
    train_resid = Y_fit - model.predict(X_fit)                 # (n_train, 283)
    sigma_wl = np.clip(train_resid.std(axis=0), 1e-9, None)    # (283,)

    n_held = len(Y_held)
    mu_held = model.predict(X_held)                            # (n_val, 283)
    sigma_held = np.tile(sigma_wl, (n_held, 1))                # (n_val, 283)
    return gaussian_log_likelihood(Y_held, mu_held, sigma_held), n_held


# One GLL score per fold
fold_gll_scores = []
for fold_number, (train_idx, val_idx) in enumerate(kf.split(X_aux), start=1):
    fold_gll, n_held = _ridge_fold_gll(train_idx, val_idx)
    fold_gll_scores.append(fold_gll)
    print(f"  Fold {fold_number}/{N_FOLDS}:  val size={n_held:3d}  GLL={fold_gll:.4f}")

gll_ridge_cv  = float(np.mean(fold_gll_scores))
gll_ridge_std = float(np.std(fold_gll_scores))

print(f"\nBaseline (Ridge regression) GLL (5-fold CV) = {gll_ridge_cv:.4f}  "
      f"+/- {gll_ridge_std:.4f} (std across folds)")
print("[Done] Baseline 2 (Ridge regression) evaluated.")

## 6. Sigma Sensitivity Analysis

This experiment holds the **mean prediction fixed** at the training median (Baseline 1) and
varies only sigma across a range of multipliers applied to the residual standard deviation.

Key insight: **GLL is maximised at a well-calibrated sigma**. Too small (overconfident) or too large
(underconfident) both hurt.  For a fixed mean, the optimal sigma is the root-mean-square residual
`sqrt(mean((y - mu)^2))`, which equals `std(y - mu)` when the residuals have zero mean.

In [None]:
# ── Sigma multipliers to test ──────────────────────────────────────────────
sigma_multipliers = [0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 3.0, 5.0, 10.0]

# The mean stays fixed at the Baseline 1 prediction (mu_val_const) on the same
# held-out rows (Y_val); only sigma_const is rescaled by each multiplier.
gll_by_mult = np.array([
    gaussian_log_likelihood(
        Y_val,
        mu_val_const,
        np.tile(np.clip(mult * sigma_const, 1e-9, None), (n_val, 1)),   # (n_val, 283)
    )
    for mult in sigma_multipliers
])

best_idx  = int(gll_by_mult.argmax())
best_mult = sigma_multipliers[best_idx]
best_gll  = gll_by_mult[best_idx]

# ── Results table ──────────────────────────────────────────────────────────
print(f"{'Multiplier':>12}  {'Sigma (mean)':>14}  {'GLL':>10}")
print("-" * 42)
for row_idx, (mult, gll_s) in enumerate(zip(sigma_multipliers, gll_by_mult)):
    mark = " <<< BEST" if row_idx == best_idx else ""
    scaled_sigma = mult * sigma_const
    print(f"{mult:>12.2f}  {scaled_sigma.mean():>14.6f}  {gll_s:>10.4f}{mark}")

# ── Plot GLL vs sigma multiplier ───────────────────────────────────────────
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(
    sigma_multipliers, gll_by_mult,
    marker="o", markersize=7, lw=2, color="steelblue",
    label="GLL (constant mean)",
)

# Highlight optimal point
ax.plot(
    best_mult, best_gll,
    marker="*", markersize=16, color="crimson", zorder=5,
    label=f"Optimal: mult={best_mult:.2f}, GLL={best_gll:.4f}",
)

# Offset the note from the marker: to the right for small multipliers, to the
# left for large ones, and a quarter of the GLL range below the optimum.
gll_span = gll_by_mult.max() - gll_by_mult.min()
if best_mult < 5:
    note_x = best_mult * 1.3
else:
    note_x = best_mult * 0.4
note_y = best_gll - gll_span * 0.25

ax.annotate(
    f"Optimal sigma multiplier = {best_mult:.2f}\nGLL = {best_gll:.4f}",
    xy=(best_mult, best_gll),
    xytext=(note_x, note_y),
    arrowprops=dict(arrowstyle="->", color="gray", lw=1.5),
    fontsize=10, color="crimson",
    bbox=dict(boxstyle="round,pad=0.4", facecolor="lightyellow",
              edgecolor="gray", alpha=0.9),
)

# Shade the two regimes on either side of the best multiplier
lowest_mult, highest_mult = sigma_multipliers[0], sigma_multipliers[-1]
ax.axvspan(lowest_mult, best_mult, alpha=0.06, color="tomato",
           label="Overconfident (sigma too small)")
ax.axvspan(best_mult, highest_mult, alpha=0.06, color="royalblue",
           label="Underconfident (sigma too large)")

ax.set_xscale("log")
ax.set_xlabel("sigma multiplier  (sigma = mult * residual_std)", fontsize=11)
ax.set_ylabel("GLL score  (higher = better)", fontsize=11)
ax.set_title(
    "Uncertainty Sensitivity Analysis\n"
    "GLL vs sigma multiplier with fixed constant mean (training median)",
    fontsize=12,
)
ax.legend(fontsize=9, loc="lower right")
# Tick at every tested multiplier, printed as plain numbers.
ax.set_xticks(sigma_multipliers)
ax.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())

plt.tight_layout()
fig.savefig(FIG_DIR / "sigma_sensitivity.png", bbox_inches="tight")
plt.show()

print(f"\nOptimal sigma multiplier : {best_mult:.2f}  (out of {sigma_multipliers})")
print(f"Optimal GLL              : {best_gll:.4f}")
print("[Done] Sigma sensitivity analysis complete.")

## 7. Plot Baseline Predictions

For 3 representative held-out planets, compare the two baselines:
- **Baseline 1** (constant median): the same per-wavelength median spectrum for every planet, uncertainty from residual std.
- **Baseline 2** (Ridge regression): per-wavelength prediction conditioned on ADC features.

Ground truth is shown as the mean transmission spectrum (points).  Predicted uncertainty is shown as a shaded mu +/- sigma band.

In [None]:
# ── Refit Ridge on the Section 4 training split for illustration ───────────
scaler_final = StandardScaler()
X_train_s    = scaler_final.fit_transform(X_train)
X_val_s      = scaler_final.transform(X_val)

ridge_final = Ridge(alpha=ALPHA, fit_intercept=True)
ridge_final.fit(X_train_s, Y_train)

mu_ridge_val   = ridge_final.predict(X_val_s)                      # (n_val, 283)
resid_train    = Y_train - ridge_final.predict(X_train_s)
sigma_ridge_wl = np.clip(resid_train.std(axis=0), 1e-9, None)      # (283,)

# ── Pick evenly spaced rows of the validation split as examples ────────────
N_EXAMPLE   = 3
example_idx = np.linspace(0, len(Y_val) - 1, N_EXAMPLE, dtype=int)

wl_axis = np.arange(N_WAVELENGTHS)   # wavelength channel index (0..282)


def _draw_baseline_panel(ax, channels, truth, mu, sigma, color, mu_label,
                         title, show_legend):
    """Draw ground-truth points plus one baseline's mu +/- sigma band on ax."""
    # Ground truth as scatter points (mean spectrum only — no quartile bands)
    ax.scatter(channels, truth, s=4, color="steelblue", alpha=0.7,
               label="GT mean spectrum", zorder=3)

    # Predicted mean and its one-sigma band
    ax.plot(channels, mu, lw=1.2, color=color, linestyle="--", label=mu_label)
    ax.fill_between(channels, mu - sigma, mu + sigma,
                    alpha=0.18, color=color, label="Pred mu +/- sigma")

    ax.set_title(title, fontsize=10)
    ax.set_xlabel("Wavelength channel", fontsize=9)
    ax.set_ylabel("Transit depth", fontsize=9)
    if show_legend:
        ax.legend(fontsize=8, loc="upper right")
    ax.tick_params(labelsize=8)


fig, axes = plt.subplots(N_EXAMPLE, 2, figsize=(16, 4 * N_EXAMPLE))
fig.suptitle(
    "Baseline Predictions vs Ground Truth — 3 Example Validation Planets",
    fontsize=13, fontweight="bold", y=1.01,
)

for row, pid in enumerate(example_idx):
    first_row = row == 0   # the legend is drawn on the top row only

    # Left column: Baseline 1 (same mu/sigma for every planet)
    _draw_baseline_panel(
        axes[row, 0], wl_axis, Y_val[pid],
        mu_const, sigma_const,
        color="darkorange",
        mu_label="Pred mu (const median)",
        title=f"Planet val[{pid}] — Baseline 1 (Constant Median)",
        show_legend=first_row,
    )

    # Right column: Baseline 2 (planet-specific mu, shared per-wavelength sigma)
    _draw_baseline_panel(
        axes[row, 1], wl_axis, Y_val[pid],
        mu_ridge_val[pid], sigma_ridge_wl,
        color="crimson",
        mu_label="Pred mu (Ridge)",
        title=f"Planet val[{pid}] — Baseline 2 (Ridge Regression)",
        show_legend=first_row,
    )

plt.tight_layout()
fig.savefig(FIG_DIR / "baseline_predictions.png", bbox_inches="tight")
plt.show()

print(f"[Done] Prediction plots for {N_EXAMPLE} example planets.")

## 8. Summary

### Results Table

| Method | GLL | Notes |
|---|---|---|
| Baseline 1: Constant median | *see cell below* | Global median mu, residual-std sigma. No ADC features used. |
| Baseline 2: Ridge regression (5-fold CV) | *see cell below* | 283 independent Ridge models on 5 ADC features. Sigma from training residuals. |

### Key Takeaways

1. **GLL is sensitive to both mean accuracy and uncertainty calibration.**  
   The sigma sensitivity analysis (Section 6) shows that even with the mean prediction held fixed,
   mis-calibrated uncertainty substantially lowers the GLL score.  A model that outputs a good mu
   but a bad sigma will score poorly.

2. **The constant predictor is a meaningful floor.**  
   Any model that fails to beat Baseline 1 has not learned anything useful from the data.
   The training-set median is computed on the labelled subset.  The model must generalise
   beyond this.

3. **ADC calibration features may carry some signal (Baseline 2 vs 1).**  
   If Ridge regression outperforms the constant predictor, the ADC offset/gain parameters
   and star type are correlated with the atmospheric spectrum.  The ADC features describe
   the detector calibration (`FGS1_adc_offset`, `FGS1_adc_gain`, `AIRS-CH0_adc_offset`,
   `AIRS-CH0_adc_gain`) and whether the star is in the training set (`star`).

4. **Labels provide only the mean transmission spectrum.**  
   Unlike a quartile-based setup, there are no q1/q3 bands to estimate uncertainty from
   the labels.  Sigma is purely a model responsibility.  This makes uncertainty calibration
   a key differentiator between models.

5. **What the full model needs to beat.**  
   The competition-winning approaches use direct photometric extraction from AIRS-CH0 raw
   light curves.  The per-planet parquet data provides per-channel flux time series that,
   once preprocessed (see `02_preprocessing.ipynb`), yield a transit depth spectrum per planet.
   This spectrum, combined with ADC features, is expected to drive GLL well above the baselines here.

6. **GLL = 0 is not a ceiling for this metric.**  
   With exact means the raw GLL defined above reduces to `-0.5 * log(2*pi*sigma^2)`, which is positive for small sigma (about +1.38 at sigma = 0.1), so scores above 0 are possible.
   Competition leaderboard scores are typically in the range -5 to -0.5; see the Kaggle
   discussion forum for context.

In [None]:
import json

# ── Print final summary table ───────────────────────────────────────────────
# NOTE: the two scores come from different evaluation protocols (a single
# 20% hold-out vs. the mean over CV folds), so their difference is indicative
# rather than a strict like-for-like comparison.
summary_rows = [
    ("Baseline 1: Constant median (20% val split)", gll_const),
    ("Baseline 2: Ridge regression (5-fold CV)", gll_ridge_cv),
]
# Size the label column from the longest label so the GLL column lines up;
# a fixed width shorter than a label pushes that row out of alignment.
label_width = max(len(label) for label, _ in summary_rows)

print("=" * 65)
print("BASELINE RESULTS SUMMARY")
print("=" * 65)
print(f"{'Method':<{label_width}}  {'GLL':>8}")
print("-" * 65)
for label, score in summary_rows:
    print(f"{label:<{label_width}}  {score:>8.4f}")
print("-" * 65)

improvement = gll_ridge_cv - gll_const
sign = "+" if improvement >= 0 else ""   # negative values carry their own sign
print(f"Ridge vs Constant improvement : {sign}{improvement:.4f} GLL points")
print(f"Using synthetic data           : {_USING_SYNTHETIC}")
print("=" * 65)

print("\nBaseline (constant median) GLL =", round(gll_const, 4))
print("Baseline (Ridge regression) GLL (5-fold CV) =", round(gll_ridge_cv, 4))

# ── Save results as JSON ──────────────────────────────────────────────────
# Values are cast to built-in float/int so the payload never depends on how
# the json module treats NumPy scalar types.
baseline_results = {
    "baseline_constant_median_gll": round(float(gll_const), 4),
    "baseline_ridge_regression_gll_cv": round(float(gll_ridge_cv), 4),
    "baseline_ridge_regression_gll_std": round(float(gll_ridge_std), 4),
    "ridge_vs_constant_improvement": round(float(improvement), 4),
    "optimal_sigma_multiplier": round(float(best_mult), 2),
    "optimal_sigma_gll": round(float(best_gll), 4),
    "using_synthetic_data": _USING_SYNTHETIC,
    "n_labelled_planets": int(len(Y_mean)),
    "n_wavelengths": N_WAVELENGTHS,
}
results_path = OUT_DIR / "baseline_results.json"
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(baseline_results, f, indent=2)
print(f"\nResults saved to {results_path}")

print("\n[Done] Baseline notebook complete.")

## 9. Push Baseline Results to GitHub

Push GLL scores (JSON), sigma sensitivity plot, and prediction comparison plot to the repo.

In [None]:
import shutil
import subprocess
from pathlib import Path

# ── GitHub token (read from Kaggle Secrets — never hardcode!) ──────────────
# Add your PAT as a Kaggle Secret named "GH_TOKEN":
#   Notebook sidebar → Add-ons → Secrets → + Add a new secret
import os
try:
    from kaggle_secrets import UserSecretsClient
    GH_TOKEN = UserSecretsClient().get_secret("GH_TOKEN")
    print("GH_TOKEN loaded from Kaggle Secrets.")
except Exception:
    # Deliberately broad best-effort: any failure of the import or of the
    # secret lookup falls back to the environment variable.
    GH_TOKEN = os.environ.get("GH_TOKEN", "")
    if GH_TOKEN:
        print("GH_TOKEN loaded from environment variable.")
    else:
        print("WARNING: GH_TOKEN not found — push to GitHub will be skipped.")

# ── Repo paths ────────────────────────────────────────────────────────────
repo_dir    = Path("/kaggle/working/ariel-exoplanet-ml")
project_dir = repo_dir / "Kaggle competition" / "ARIEL neurIPS"

# ── Ensure repo is up-to-date ─────────────────────────────────────────────
if not repo_dir.exists():
    subprocess.run(
        ["git", "clone", "https://github.com/Smooth-Cactus0/ariel-exoplanet-ml.git",
         str(repo_dir)],
        check=True,
    )
else:
    # Best-effort refresh: a failed fast-forward must not abort the cell.
    subprocess.run(["git", "-C", str(repo_dir), "pull", "--ff-only"], check=False)

# ── Configure git identity (required on Kaggle kernels) ───────────────────
subprocess.run(["git", "-C", str(repo_dir), "config", "user.email", "alexy.louis@kaggle-notebook.local"], check=True)
subprocess.run(["git", "-C", str(repo_dir), "config", "user.name", "Alexy Louis (Kaggle)"], check=True)

# ── Copy results and figures to repo ───────────────────────────────────────
repo_results_dir = project_dir / "results"
repo_results_dir.mkdir(parents=True, exist_ok=True)
repo_fig_dir = project_dir / "figures"
repo_fig_dir.mkdir(parents=True, exist_ok=True)

# Copy baseline_results.json → results/
results_json = OUT_DIR / "baseline_results.json"
if results_json.exists():
    shutil.copy2(results_json, repo_results_dir / "baseline_results.json")
    print(f"  baseline_results.json -> results/baseline_results.json")

# Copy figures → figures/
for fig_path in sorted(FIG_DIR.glob("*.png")):
    dest = repo_fig_dir / fig_path.name
    shutil.copy2(fig_path, dest)
    print(f"  {fig_path.name} -> figures/{fig_path.name}")

# ── Git add, commit, push ─────────────────────────────────────────────────
subprocess.run(
    ["git", "-C", str(repo_dir), "add",
     "Kaggle competition/ARIEL neurIPS/results/",
     "Kaggle competition/ARIEL neurIPS/figures/"],
    check=True,
)

# `git diff --cached --quiet` exits non-zero when something is staged.
status = subprocess.run(
    ["git", "-C", str(repo_dir), "diff", "--cached", "--quiet"],
    capture_output=True,
)
if status.returncode != 0:
    subprocess.run(
        ["git", "-C", str(repo_dir), "commit", "-m",
         "data: update baseline results and figures from Kaggle notebook run"],
        check=True,
    )

# Decide whether to push from the commits master has that origin/master lacks,
# not from whether THIS run staged anything. A commit made by an earlier run
# without a token is then still pushed once a token is available. If the
# count cannot be determined, attempt the push anyway.
ahead = subprocess.run(
    ["git", "-C", str(repo_dir), "rev-list", "--count", "origin/master..master"],
    capture_output=True,
    text=True,
)
has_unpushed = ahead.returncode != 0 or ahead.stdout.strip() != "0"

if not has_unpushed:
    print("\n[Done] No changes to push (results already up-to-date).")
elif GH_TOKEN:
    # Push straight to the authenticated URL rather than rewriting `origin`,
    # so the token is never stored in .git/config.
    auth_url = f"https://{GH_TOKEN}@github.com/Smooth-Cactus0/ariel-exoplanet-ml.git"
    push = subprocess.run(
        ["git", "-C", str(repo_dir), "push", auth_url, "master"],
        capture_output=True,
        text=True,
    )
    if push.returncode != 0:
        # Scrub the token from anything that may end up in notebook output.
        scrubbed_stderr = (push.stderr or "").replace(GH_TOKEN, "***")
        print(scrubbed_stderr)
        raise subprocess.CalledProcessError(
            push.returncode,
            ["git", "-C", str(repo_dir), "push", "<redacted-url>", "master"],
            stderr=scrubbed_stderr,
        )
    print("\n[Done] Baseline results pushed to GitHub.")
else:
    print("\n[Done] Results committed locally but NOT pushed (no GH_TOKEN).")