## Train a PAE on labeled galaxy spectra

In [1]:
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from spectra_pae.spectra_pae import *

Note: To fully understand what this code does under the hood, read the paper and look at the `Spectra_PAE` class and the `Autoencoder` class in the PytorchPAE package. All of these are well documented.

### set parameters

In [2]:
SEED               = 287505
# Apply the seed to the global NumPy and PyTorch RNGs (SEED was previously defined but never used),
# so that code drawing from these generators is repeatable across Restart & Run All.
# NOTE(review): confirm that Spectra_PAE / PytorchPAE do not reseed internally.
np.random.seed(SEED)
torch.manual_seed(SEED)

## dataset name (if you want to add a new dataset, it must be added to the PytorchPAE package in custom_datasets.py)
dataset_name       = 'SDSS_DR16'
# dataset directory
data_dir           = '/global/cscratch1/sd/vboehm/Datasets/sdss/by_model'
# directory for saving trained models
model_dir          = '/global/cscratch1/sd/vboehm/SDSSOutlier/fc'
# dimensionality of the input data
input_dim          = (1000,1)

### Instantiate the class

In [3]:
# all functionalities are described in the class documentation
# Pass the dataset_name configured in the parameter cell instead of repeating the literal,
# so that changing the parameter actually takes effect here.
SPAE = Spectra_PAE(data_dir, model_dir, dataset_name=dataset_name, input_dim=input_dim)

### Train the full model. This goes through all training steps described in the publication

In [4]:
# Train and save the complete model.
# ----------------------------------
# nepochs:   number of epochs to train for (you could add an early stopping criterion on the validation loss)
# retrain:   whether to retrain even if model files already exist
#            (better: change the name under which the models are saved by changing the prefixes)
# use_prior: whether to use a prior when evaluating the class probability
# niter:     number of training steps in the normalizing flow
#
# For testing, set nepochs to 10 and niter to 20.
# Target loss for AE stage 1 is ~1.1-1.2, stage 2 should be O(1e-4).
#
# The parameter notes above used to be a bare triple-quoted string at the end of the cell, which is
# evaluated as the cell's last expression and echoed as its output; they are plain comments now.
# The trailing ';' keeps the cell from echoing a value after training finishes.
SPAE.train_complete_model(nepochs=100, use_prior=True, retrain=True, niter=500);


training AE stage 1...
epoch: 1, training loss: 4.1023e+00, validation loss: 1.9990e+00, learning rate: 1.0000e-03
epoch: 2, training loss: 1.7914e+00, validation loss: 1.5753e+00, learning rate: 9.9000e-04
epoch: 3, training loss: 1.6515e+00, validation loss: 1.5127e+00, learning rate: 9.8010e-04
epoch: 4, training loss: 1.5878e+00, validation loss: 1.5267e+00, learning rate: 9.7030e-04
epoch: 5, training loss: 1.5550e+00, validation loss: 1.6459e+00, learning rate: 9.6060e-04
epoch: 6, training loss: 1.5387e+00, validation loss: 1.5077e+00, learning rate: 9.5099e-04
epoch: 7, training loss: 1.5130e+00, validation loss: 1.4945e+00, learning rate: 9.4148e-04
epoch: 8, training loss: 1.4949e+00, validation loss: 1.4704e+00, learning rate: 9.3207e-04
epoch: 9, training loss: 1.4819e+00, validation loss: 1.4210e+00, learning rate: 9.2274e-04
saved model to "/global/cscratch1/sd/vboehm/SDSSOutlier/fc/AE1.ckpt"
epoch: 10, training loss: 1.4641e+00, validation loss: 1.4329e+00, learning rate

### get log probability of all spectra in the combined validation set under the most likely label

In [7]:
## The original dataset was split into training, validation and test sets; for the publication,
## the validation and test sets were combined into a single dataset.
## The shared data is therefore: training set = training set, validation set = validation + test set, test set = test set.
## The original split between validation and test set can be recovered from this.
valid_spectra = SPAE.NF1_data['valid']
valid_labels  = SPAE.new_labels['valid']
logps = SPAE.evaluate_NF2(valid_spectra, valid_labels)

### evaluate the rank (in terms of percentile) of a single spectrum with respect to a reference sample.


In [8]:
# The training set is deliberately not used as the reference sample, to avoid biases from potential overfitting.
# During training, overfitting is not penalized as long as the validation loss keeps improving:
# early stopping is based on the validation loss not improving, not the training loss!
# NOTE(review): this cell passes SPAE.labels, whereas the log-probability cell passes SPAE.new_labels — confirm which is intended.
reference_spectra = SPAE.NF1_data['valid']
reference_labels  = SPAE.labels['valid']
# the spectrum being ranked is the first entry of the reference sample (sliced to keep the leading axis)
rank = SPAE.evaluate_logp_percentile(reference_spectra, reference_labels, reference_spectra[0:1], reference_labels[0:1])