In [None]:
import os, sys
import numpy as np
import torch
from astropy.table import Table
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

sys.path.append("../..")

from astroclip.env import format_with_env
from property_utils.models import few_shot, zero_shot
from property_utils.plotting import plot_scatter

In [None]:
# Resolve the project root through the environment helper; every dataset and
# baseline location below is derived from it.
ASTROCLIP_ROOT = format_with_env("{ASTROCLIP_ROOT}")
PROVABGS_ROOT, SUPERVISED_ROOT = (
    f"{ASTROCLIP_ROOT}/datasets/provabgs/",
    f"{ASTROCLIP_ROOT}/supervised/",
)

# Embedding families stored in the tables; each name is the prefix of a
# "<name>_embeddings" column.
image_models = ["astroclip_image", "astrodino", "stein"]
spectrum_models = ["astroclip_spectrum", "specformer"]

# Locations of the paired train/test embedding files
train_path, test_path = (
    os.path.join(PROVABGS_ROOT, filename)
    for filename in (
        "provabgs_paired_train_embeddings.hdf5",
        "provabgs_paired_test_embeddings.hdf5",
    )
)

# Read both splits into astropy tables (embeddings plus PROVABGS properties)
train_provabgs, test_provabgs = Table.read(train_path), Table.read(test_path)

In [3]:
# Target properties to regress, in the column order used throughout the notebook
properties = ["Z_MW", "LOG_MSTAR", "TAGE_MW", "sSFR"]


def _property_matrix(table):
    """Stack the `properties` columns of `table` and transpose the result.

    Assumes each column squeezes to a 1-D array, which yields one row per
    object and one column per property -- TODO confirm for new datasets.
    """
    columns = [table[name].data.squeeze() for name in properties]
    return np.stack(columns).T


y_train = _property_matrix(train_provabgs)
y_test = _property_matrix(test_provabgs)

# Standardise the training targets only; the statistics are kept so that
# predictions can be mapped back to physical units later. y_test stays unscaled.
scaler = {"mean": y_train.mean(axis=0), "std": y_train.std(axis=0)}
y_train = (y_train - scaler["mean"]) / scaler["std"]

n_train, n_test = len(train_provabgs), len(test_provabgs)
print("Size of training set:", n_train, "\nSize of test set:", n_test)

Size of training set: 2681 
Size of test set: 2844


# Galaxy Property Prediction from Image Embeddings

In [4]:
# Collect standardised train/test embeddings for every image model.
# NOTE: `model` stays bound to the last entry once the loop ends; avoid relying
# on that value in later cells.
data = {}
for model in image_models:
    column = model + "_embeddings"
    X_train = train_provabgs[column]
    X_test = test_provabgs[column]

    # Fit the scaler on the training split only, then apply it to both splits
    embedding_scaler = StandardScaler().fit(X_train)
    data[model] = {
        "train": embedding_scaler.transform(X_train),
        "test": embedding_scaler.transform(X_test),
    }

In [5]:
# Perform k-NN (zero_shot) and MLP (few_shot) regression on each embedding
preds_knn, preds_mlp = {}, {}
for key, splits in data.items():
    print(f"Evaluating {key} model...")
    raw_preds_knn = zero_shot(splits["train"], y_train, splits["test"])
    # Pass the model currently being evaluated. The previous code passed
    # `model`, a loop variable left over from the data-preparation cell, which
    # always held the last model name regardless of `key`.
    # NOTE(review): few_shot's first argument is presumably the model
    # identifier -- confirm against property_utils.models.
    raw_preds_mlp = few_shot(key, splits["train"], y_train, splits["test"]).squeeze()

    # Undo the target standardisation so predictions are comparable to y_test
    preds_knn[key] = raw_preds_knn * scaler["std"] + scaler["mean"]
    preds_mlp[key] = raw_preds_mlp * scaler["std"] + scaler["mean"]

Evaluating astroclip_image model...


[WinError 2] The system cannot find the file specified
  File "c:\Users\mi3se\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\mi3se\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\mi3se\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\mi3se\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Evaluating astrodino model...
Evaluating stein model...


In [6]:
# Build the R^2 score tables: for each model, one score per entry of
# `properties`, computed against the unscaled test targets.
n_props = len(properties)

knn_r2 = {
    name: [r2_score(y_test[:, col], preds_knn[name][:, col]) for col in range(n_props)]
    for name in preds_knn
}
mlp_r2 = {
    name: [r2_score(y_test[:, col], preds_mlp[name][:, col]) for col in range(n_props)]
    for name in preds_mlp
}

# Extra column labelling each row with its property name
knn_r2["properties"] = properties
mlp_r2["properties"] = properties

In [7]:
# Display the k-NN R^2 scores: one column per model plus the property names
Table(knn_r2)

astroclip_image,astrodino,stein,properties
float64,float64,float64,str9
0.3752610710926899,0.4323129721799389,0.0643307244044502,Z_MW
0.5787679086623845,0.5653011500672032,0.2009448578689787,LOG_MSTAR
0.0543263967931566,0.0172029697265012,-0.4291024729717663,TAGE_MW
0.3359011029840055,0.1992407113334663,0.0434007090447119,sSFR


In [8]:
# Display the MLP R^2 scores: one column per model plus the property names
Table(mlp_r2)

astroclip_image,astrodino,stein,properties
float64,float64,float64,str9
0.5089998641786604,0.528509397700264,0.3625773487159056,Z_MW
0.6834140206349506,0.6474121219931318,0.3748410892599463,LOG_MSTAR
-0.0917218929475209,-0.0268512374455496,-0.2902687793995258,TAGE_MW
0.3882318412195646,0.2869485493363974,0.1168237999941658,sSFR


In [None]:
# Get predictions from supervised models.
# map_location="cpu" keeps any loaded tensors on the CPU, so the files can be
# read on a machine without a GPU and stacked with NumPy below even if they
# were saved from a GPU run.
resnet_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "image/ResNet18/global_properties/test_pred.pt"),
    map_location="cpu",
)
photometry_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "photometry/MLP/global_properties/test_pred.pt"),
    map_location="cpu",
)

# Add predictions to dictionary, ordered like `properties`
# (assumes each per-property entry squeezes to 1-D -- TODO confirm)
preds_supervised = {
    "resnet18": np.stack([resnet_preds[prop].squeeze() for prop in properties]).T,
    "photometry": np.stack([photometry_preds[prop].squeeze() for prop in properties]).T,
}

# R^2 per baseline and property, scored against the unscaled test targets
supervised_r2 = {key: [] for key in preds_supervised.keys()}
for key in preds_supervised.keys():
    for i, prop in enumerate(properties):
        supervised_r2[key].append(r2_score(y_test[:, i], preds_supervised[key][:, i]))

supervised_r2["properties"] = properties
Table(supervised_r2)

# Galaxy Property Prediction from Spectrum Embeddings

In [10]:
# Collect standardised train/test embeddings for every spectrum model.
# NOTE: `model` stays bound to the last entry once the loop ends; avoid relying
# on that value in later cells.
data = {}
for model in spectrum_models:
    column = model + "_embeddings"
    X_train = train_provabgs[column]
    X_test = test_provabgs[column]

    # Fit the scaler on the training split only, then apply it to both splits
    embedding_scaler = StandardScaler().fit(X_train)
    data[model] = {
        "train": embedding_scaler.transform(X_train),
        "test": embedding_scaler.transform(X_test),
    }

In [11]:
# Perform k-NN (zero_shot) and MLP (few_shot) regression on each embedding
preds_knn, preds_mlp = {}, {}
for key, splits in data.items():
    print(f"Evaluating {key} model...")
    raw_preds_knn = zero_shot(splits["train"], y_train, splits["test"])
    # Pass the model currently being evaluated. The previous code passed
    # `model`, a loop variable left over from the data-preparation cell, which
    # always held the last model name regardless of `key`.
    # NOTE(review): few_shot's first argument is presumably the model
    # identifier -- confirm against property_utils.models.
    raw_preds_mlp = few_shot(key, splits["train"], y_train, splits["test"]).squeeze()

    # Undo the target standardisation so predictions are comparable to y_test
    preds_knn[key] = raw_preds_knn * scaler["std"] + scaler["mean"]
    preds_mlp[key] = raw_preds_mlp * scaler["std"] + scaler["mean"]

Evaluating astroclip_spectrum model...
Evaluating specformer model...


In [12]:
# Build the R^2 score tables: for each model, one score per entry of
# `properties`, computed against the unscaled test targets.
n_props = len(properties)

knn_r2 = {
    name: [r2_score(y_test[:, col], preds_knn[name][:, col]) for col in range(n_props)]
    for name in preds_knn
}
mlp_r2 = {
    name: [r2_score(y_test[:, col], preds_mlp[name][:, col]) for col in range(n_props)]
    for name in preds_mlp
}

# Extra column labelling each row with its property name
knn_r2["properties"] = properties
mlp_r2["properties"] = properties

In [13]:
# Display the k-NN R^2 scores: one column per model plus the property names
Table(knn_r2)

astroclip_spectrum,specformer,properties
float64,float64,str9
0.5322482871068985,0.5354494451710723,Z_MW
0.7542939659438246,0.6640968009283513,LOG_MSTAR
0.1150730508897691,0.0946366528893204,TAGE_MW
0.4484141179718265,0.4165222339685064,sSFR


In [14]:
# Display the MLP R^2 scores: one column per model plus the property names
Table(mlp_r2)

astroclip_spectrum,specformer,properties
float64,float64,str9
0.5387492629791656,0.5950981039949153,Z_MW
0.7518805289789331,0.7804331705869244,LOG_MSTAR
0.0818607807367767,-0.0294130885088446,TAGE_MW
0.4882757212292408,0.5451530083966152,sSFR


In [None]:
# Get predictions from supervised models.
# map_location="cpu" keeps any loaded tensors on the CPU, so the file can be
# read on a machine without a GPU and stacked with NumPy below even if it
# was saved from a GPU run.
spectrum_preds = torch.load(
    os.path.join(SUPERVISED_ROOT, "spectrum/Conv+Att/global_properties/test_pred.pt"),
    map_location="cpu",
)

# Add predictions to dictionary, ordered like `properties`
# (assumes each per-property entry squeezes to 1-D -- TODO confirm)
preds_supervised = {
    "conv+att": np.stack([spectrum_preds[prop].squeeze() for prop in properties]).T,
}

# R^2 per baseline and property, scored against the unscaled test targets
supervised_r2 = {key: [] for key in preds_supervised.keys()}
for key in preds_supervised.keys():
    for i, prop in enumerate(properties):
        supervised_r2[key].append(r2_score(y_test[:, i], preds_supervised[key][:, i]))

supervised_r2["properties"] = properties
Table(supervised_r2)