In [None]:
import os, sys
import numpy as np
import torch
from astropy.table import Table
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

sys.path.append("../..")

from astroclip.env import format_with_env
from property_utils.models import few_shot, zero_shot
from property_utils.plotting import plot_scatter

# Resolve the project root from the environment and derive the data locations
ASTROCLIP_ROOT = format_with_env("{ASTROCLIP_ROOT}")
PROVABGS_ROOT = f"{ASTROCLIP_ROOT}/datasets/provabgs/"
SUPERVISED_ROOT = f"{ASTROCLIP_ROOT}/supervised/"

# Embedding models to evaluate; each name is the prefix of an
# "<name>_embeddings" column looked up in the tables loaded below
image_models = ["astroclip_image", "astrodino", "stein"]
spectrum_models = ["astroclip_spectrum", "specformer"]

# Load the paired train/test tables (embeddings + PROVABGS properties)
train_path = os.path.join(PROVABGS_ROOT, "provabgs_paired_train_embeddings.hdf5")
test_path = os.path.join(PROVABGS_ROOT, "provabgs_paired_test_embeddings.hdf5")
train_provabgs, test_provabgs = Table.read(train_path), Table.read(test_path)

# Regression targets: one stacked array per split, transposed so that the
# property index is the last axis
properties = ["Z_MW", "LOG_MSTAR", "TAGE_MW", "sSFR"]
y_train, y_test = (
    np.stack([table[prop].data.squeeze() for prop in properties]).T
    for table in (train_provabgs, test_provabgs)
)

# Standardize the training targets only. y_test stays in original units;
# the statistics are kept in `scaler` so predictions can be mapped back.
scaler = dict(mean=y_train.mean(axis=0), std=y_train.std(axis=0))
y_train = (y_train - scaler["mean"]) / scaler["std"]

print(
    "Size of training set:", len(train_provabgs),
    "\nSize of test set:", len(test_provabgs),
)

# Galaxy Property Prediction from Image Embeddings

In [None]:
# Collect the embeddings of every image model, standardized per feature.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics enter the preprocessing.
data = {}
for model in image_models:
    X_train = train_provabgs[model + "_embeddings"]
    X_test = test_provabgs[model + "_embeddings"]
    embedding_scaler = StandardScaler()
    embedding_scaler.fit(X_train)
    data[model] = {
        "train": embedding_scaler.transform(X_train),
        "test": embedding_scaler.transform(X_test),
    }

In [None]:
# Perform kNN (zero_shot) and MLP (few_shot) regression on each model's embeddings
preds_knn, preds_mlp = {}, {}
for key in data.keys():
    print(f"Evaluating {key} model...")
    raw_preds_knn = zero_shot(data[key]["train"], y_train, data[key]["test"])
    # Pass this iteration's `key` as the model argument so the call only
    # depends on names defined in this cell and always matches the embeddings
    # being evaluated.
    # NOTE(review): few_shot's signature is not visible here; confirm that its
    # first parameter expects the model identifier.
    raw_preds_mlp = few_shot(
        key, data[key]["train"], y_train, data[key]["test"]
    ).squeeze()
    # The regressors were trained on standardized targets; map the
    # predictions back to the original units using the training statistics.
    preds_knn[key] = raw_preds_knn * scaler["std"] + scaler["mean"]
    preds_mlp[key] = raw_preds_mlp * scaler["std"] + scaler["mean"]

In [None]:
# Build the R^2 score tables: for every model, one score per property
# (in the order of `properties`), comparing unscaled test targets with
# the kNN and MLP predictions respectively.
knn_r2, mlp_r2 = {}, {}
for key in preds_knn:
    knn_r2[key] = [
        r2_score(y_test[:, i], preds_knn[key][:, i]) for i in range(len(properties))
    ]
    mlp_r2[key] = [
        r2_score(y_test[:, i], preds_mlp[key][:, i]) for i in range(len(properties))
    ]

# Extra entry holding the property names that the scores line up with
knn_r2["properties"] = properties
mlp_r2["properties"] = properties

In [None]:
# Display the kNN R^2 scores as a table built from the knn_r2 dict
# (one entry per model plus the "properties" entry naming each row)
Table(knn_r2)

In [None]:
# Display the MLP R^2 scores as a table built from the mlp_r2 dict
# (one entry per model plus the "properties" entry naming each row)
Table(mlp_r2)

In [None]:
# Get predictions from supervised models.
# map_location="cpu" keeps any saved tensors in host memory, which the
# np.stack calls below require; without it torch.load restores tensors to the
# device they were saved from.
# NOTE(review): the contents of test_pred.pt are not visible here. The code
# below indexes them by property name, so they are assumed to be mappings
# from property name to an array-like of per-galaxy predictions.
# NOTE(review): torch.load unpickles the file; only load trusted outputs.
resnet_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "image/ResNet18/global_properties/test_pred.pt"),
    map_location="cpu",
)
photometry_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "photometry/MLP/global_properties/test_pred.pt"),
    map_location="cpu",
)

# Add predictions to dictionary: stack the per-property predictions of each
# baseline in the order of `properties`, then transpose, mirroring how y_test
# was assembled
preds_supervised = {
    "resnet18": np.stack([resnet_preds[prop].squeeze() for prop in properties]).T,
    "photometry": np.stack([photometry_preds[prop].squeeze() for prop in properties]).T,
}

# R^2 of every supervised baseline against the test targets, one score per property
supervised_r2 = {key: [] for key in preds_supervised.keys()}
for key in preds_supervised.keys():
    for i, prop in enumerate(properties):
        supervised_r2[key].append(r2_score(y_test[:, i], preds_supervised[key][:, i]))

# Entry naming the property each score corresponds to, then display the table
supervised_r2["properties"] = properties
Table(supervised_r2)

# Galaxy Property Prediction from Spectrum Embeddings

In [None]:
# Collect the embeddings of every spectrum model, standardized per feature.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics enter the preprocessing.
data = {}
for model in spectrum_models:
    X_train = train_provabgs[model + "_embeddings"]
    X_test = test_provabgs[model + "_embeddings"]
    embedding_scaler = StandardScaler()
    embedding_scaler.fit(X_train)
    data[model] = {
        "train": embedding_scaler.transform(X_train),
        "test": embedding_scaler.transform(X_test),
    }

In [None]:
# Perform kNN (zero_shot) and MLP (few_shot) regression on each model's embeddings
preds_knn, preds_mlp = {}, {}
for key in data.keys():
    print(f"Evaluating {key} model...")
    raw_preds_knn = zero_shot(data[key]["train"], y_train, data[key]["test"])
    # Pass this iteration's `key` as the model argument so the call only
    # depends on names defined in this cell and always matches the embeddings
    # being evaluated.
    # NOTE(review): few_shot's signature is not visible here; confirm that its
    # first parameter expects the model identifier.
    raw_preds_mlp = few_shot(
        key, data[key]["train"], y_train, data[key]["test"]
    ).squeeze()
    # The regressors were trained on standardized targets; map the
    # predictions back to the original units using the training statistics.
    preds_knn[key] = raw_preds_knn * scaler["std"] + scaler["mean"]
    preds_mlp[key] = raw_preds_mlp * scaler["std"] + scaler["mean"]

In [None]:
# Build the R^2 score tables: for every model, one score per property
# (in the order of `properties`), comparing unscaled test targets with
# the kNN and MLP predictions respectively.
knn_r2, mlp_r2 = {}, {}
for key in preds_knn:
    knn_r2[key] = [
        r2_score(y_test[:, i], preds_knn[key][:, i]) for i in range(len(properties))
    ]
    mlp_r2[key] = [
        r2_score(y_test[:, i], preds_mlp[key][:, i]) for i in range(len(properties))
    ]

# Extra entry holding the property names that the scores line up with
knn_r2["properties"] = properties
mlp_r2["properties"] = properties

In [None]:
# Display the kNN R^2 scores as a table built from the knn_r2 dict
# (one entry per model plus the "properties" entry naming each row)
Table(knn_r2)

In [None]:
# Display the MLP R^2 scores as a table built from the mlp_r2 dict
# (one entry per model plus the "properties" entry naming each row)
Table(mlp_r2)

In [None]:
# Get predictions from supervised models.
# map_location="cpu" keeps any saved tensors in host memory, which the
# np.stack call below requires; without it torch.load restores tensors to the
# device they were saved from.
# NOTE(review): the contents of test_pred.pt are not visible here. The code
# below indexes them by property name, so they are assumed to be a mapping
# from property name to an array-like of per-galaxy predictions.
# NOTE(review): torch.load unpickles the file; only load trusted outputs.
spectrum_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "spectrum/Conv+Att/global_properties/test_pred.pt"),
    map_location="cpu",
)

# Add predictions to dictionary: stack the per-property predictions in the
# order of `properties`, then transpose, mirroring how y_test was assembled
preds_supervised = {
    "conv+att": np.stack([spectrum_preds[prop].squeeze() for prop in properties]).T,
}

# R^2 of every supervised baseline against the test targets, one score per property
supervised_r2 = {key: [] for key in preds_supervised.keys()}
for key in preds_supervised.keys():
    for i, prop in enumerate(properties):
        supervised_r2[key].append(r2_score(y_test[:, i], preds_supervised[key][:, i]))

# Entry naming the property each score corresponds to, then display the table
supervised_r2["properties"] = properties
Table(supervised_r2)