In [15]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from astropy.io import fits
import os
import time
from astropy.table import Table


In [16]:
# Load data and embeddings
def load_data_and_embeddings(folder_path, embedding_file, idx_file, catalog_path, txt_file):
    """Pair each selected embedding with the ``object_id`` of its catalog row.

    The names listed in ``txt_file`` are selected through the indices stored in
    ``idx_file``; the i-th selected name belongs to the i-th row of the
    embedding array. Every selected name that also appears in the catalog's
    ``name`` column contributes one (embedding, object_id) pair. Names that are
    absent from the catalog are skipped silently.

    Parameters
    ----------
    folder_path : str
        Directory containing ``embedding_file``, ``idx_file`` and ``txt_file``.
    embedding_file : str
        ``.npy`` file with the embeddings, one row per selected name.
    idx_file : str
        ``.npy`` file with indices into the lines of ``txt_file``.
    catalog_path : str
        FITS catalog; HDU 1 must provide ``name`` and ``object_id`` columns.
    txt_file : str
        Text file with one name per line.

    Returns
    -------
    tuple of numpy.ndarray
        ``(embeddings, object_id)`` in the order of the selected names, with
        one entry per name found in the catalog.
    """
    # Load catalog data
    with fits.open(catalog_path) as hdul:
        catalog_data = hdul[1].data

    # Load galaxy names from txt file (one name per line)
    with open(os.path.join(folder_path, txt_file), "r") as f:
        file_names = [line.strip() for line in f]

    # Load embeddings and indices
    zss = np.load(os.path.join(folder_path, embedding_file))
    idxs = np.load(os.path.join(folder_path, idx_file))
    file_names_selected = [file_names[idx] for idx in idxs]

    # Map catalog names to their row index in the catalog
    catalog_file_names = catalog_data['name']
    catalog_name_to_idx = {name: i for i, name in enumerate(catalog_file_names)}
    # NOTE(review): despite the label, this is the number of distinct catalog
    # names, not the number of matched embeddings (that one is printed below).
    print("Number of matches", len(catalog_name_to_idx))

    # Fetch the column once instead of on every matched row.
    catalog_object_ids = catalog_data['object_id']

    # Extract property values and valid embeddings.
    # The position in `file_names_selected` is the row in `zss`; taking it from
    # `enumerate` avoids a linear `list.index` search per name (quadratic
    # overall) and keeps repeated names paired with their own embedding row.
    embeddings, object_id = [], []
    for row, file_name in enumerate(file_names_selected):
        catalog_idx = catalog_name_to_idx.get(file_name)
        if catalog_idx is not None:
            embeddings.append(zss[row])
            object_id.append(catalog_object_ids[catalog_idx])

    # Validation prints
    print(f"Loaded {len(embeddings)} embeddings.")
    print(f"Loaded {len(object_id)}.")
    return np.array(embeddings), np.array(object_id)

In [17]:
# Location of the Q1 FITS catalog, relative to the notebook's working directory.
catalog_path = "../Q1_data/EuclidMorphPhysPropSpecZ.fits"

In [4]:
# Default file names, grouped by split: embeddings, selection indices and
# the text file with the object names.

# Training split
train_embedding_file = "zss_64t_mean.npy"
train_idx_file = "idxs_64t_mean.npy"
train_txt_file = "train.txt"

# Test split
test_embedding_file = "zss_64t_mean_test.npy"
test_idx_file = "idxs_64t_mean_test.npy"
test_txt_file = "test.txt"

In [5]:
# Data folder for the VIS-only run.
# `folder_path` is assigned in several cells; the last one executed wins.
folder_path = "VIS"

In [7]:
# Data folder for the VIS+NISP+SED run.
# `folder_path` is assigned in several cells; the last one executed wins.
folder_path = "VIS_NISP_SED"

In [18]:
# SED-only run: the files live in the VIS_NISP_SED folder but use the
# "sedonly" embedding and index files. Running this cell replaces any folder
# or file names set by earlier cells.
folder_path = "VIS_NISP_SED"

# Training split
train_embedding_file = "zss_sedonly.npy"
train_idx_file = "idxs_sedonly.npy"
train_txt_file = "train.txt"

# Test split
test_embedding_file = "zss_sedonly_test.npy"
test_idx_file = "idxs_sedonly_test.npy"
test_txt_file = "test.txt"

In [19]:
# Load the training embeddings together with their catalog object ids.
# All file names come from the configuration cells, including the name list
# (`train_txt_file`), so switching configuration only requires re-running
# those cells.
train_embeddings, train_object_id = load_data_and_embeddings(
    folder_path, train_embedding_file, train_idx_file, catalog_path, train_txt_file
)

print("Embeddings shape:", train_embeddings.shape)


Number of matches 331759
Loaded 264800 embeddings.
Loaded 264800.
Embeddings shape: (264800, 768)


In [20]:
# Project the training embeddings to 2-D with UMAP.
# `random_state` is fixed so the layout can be reproduced between runs.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
zss_vis = umap_model.fit_transform(train_embeddings)
print(f"UMAP completed in {time.perf_counter() - start_time:.2f} seconds")

  warn(


UMAP completed in 184.85 seconds


In [21]:
# Store the 2-D UMAP coordinates of the training set as a FITS binary table:
# one row per object with its catalog id and the two UMAP components.
data = Table(
    [train_object_id, zss_vis[:, 0], zss_vis[:, 1]],
    names=['id', 'UMAP1', 'UMAP2'],
)
hdu = fits.BinTableHDU(data)

# Alternative output name, presumably for the VIS+NISP+SED run:
#   "UMAP_VIS_NISP_SED_train.fits"
hdu.writeto("UMAP_SED_train.fits", overwrite=True)


In [5]:
# Load the test embeddings together with their catalog object ids.
# All file names come from the configuration cells, including the name list
# (`test_txt_file`), so switching configuration only requires re-running
# those cells.
test_embeddings, test_object_id = load_data_and_embeddings(
    folder_path, test_embedding_file, test_idx_file, catalog_path, test_txt_file
)

print("Embeddings shape:", test_embeddings.shape)


Number of matches 331759
Loaded 66960 embeddings.
Loaded 66960 smooth.
Embeddings shape: (66960, 768)


In [6]:
# Perform UMAP for dimensionality reduction on the test embeddings.
# NOTE(review): a new UMAP is fitted on the test set, so these coordinates are
# not in the same 2-D space as the training projection - confirm this is
# intended (the alternative would be calling `transform` on the model fitted
# on the training data).
# NOTE: `umap_model` and `zss_vis` are reused here and overwrite the values
# produced for the training set.
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
zss_vis = umap_model.fit_transform(test_embeddings)
print(f"UMAP completed in {time.perf_counter() - start_time:.2f} seconds")

  warn(


UMAP for VIS completed in 54.71 seconds


In [7]:
# Write the 2-D UMAP coordinates of the test set to a FITS binary table:
# one row per object with its catalog id and the two UMAP components.
data = Table({
    'id': test_object_id,
    'UMAP1': zss_vis[:, 0],
    'UMAP2': zss_vis[:, 1]
    })
hdu = fits.BinTableHDU(data)
# The output name follows the SED-only configuration, which is the last one
# set when the notebook runs top to bottom, and matches the training output
# ("UMAP_SED_train.fits"). Writing SED-only results to
# "UMAP_VIS_NISP_SED_test.fits" would silently overwrite the VIS+NISP+SED file.
# Use "UMAP_VIS_NISP_SED_test.fits" only when that configuration is loaded.
hdu.writeto("UMAP_SED_test.fits", overwrite=True)
