# Self-supervised and multi-modal representation Learning: Opening bulk spectra

## Light curve encoding via masked self-supervised learning

In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

%load_ext autoreload
%autoreload 2

### Data loading and pre-processing

Let's start by loading the bulk lightcurve sample. Note that we only load one `.parquet` file here for illustration, but there are a huge number available at https://irsa.ipac.caltech.edu/Missions/ztf.html (under "Lightcurve Bulk Download"). 

Data release notes: https://irsa.ipac.caltech.edu/data/ZTF/docs/releases/ztf_release_notes_latest.

In [2]:
# Read one bulk light-curve parquet file into a pandas DataFrame
# (one row per object; `hmjd`, `mag`, `magerr`, ... hold per-observation arrays).
bulk_df = pq.read_table(
    "../data/lightcurves_bulk/ztf_001605_zg_c01_q1_dr20.parquet"
).to_pandas()

# Preview the first few rows
bulk_df.head()

Unnamed: 0,objectid,filterid,fieldid,rcid,objra,objdec,nepochs,hmjd,mag,magerr,clrcoeff,catflags
0,1605101100000000,1,1605,0,70.702309,20.470905,64,"[58362.51656, 58373.52451, 58380.52687, 58385....","[18.081408, 18.153248, 18.185818, 18.151423, 1...","[0.036423214, 0.037922166, 0.03863352, 0.03788...","[-0.004919746, -0.05696958, -0.026023958, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32768, 0, 0, 0,..."
2,1605101100000002,1,1605,0,71.31263,20.457535,64,"[58362.51651, 58373.52445, 58380.52682, 58385....","[17.337278, 17.400082, 17.389616, 17.356413, 1...","[0.02554485, 0.02619334, 0.026082417, 0.025738...","[-0.004919746, -0.05696958, -0.026023958, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32768, 0, 0,..."
3,1605101100000003,1,1605,0,70.713264,20.469236,14,"[58380.52687, 58439.42102, 58494.18065, 58726....","[20.451708, 21.258446, 21.050898, 21.428753, 2...","[0.14320381, 0.1871612, 0.17555499, 0.19668484...","[-0.026023958, -0.067739874, -0.049056668, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,1605101100000005,1,1605,0,70.988045,20.463734,64,"[58362.51654, 58373.52448, 58380.52685, 58385....","[17.669598, 17.741493, 17.695265, 17.697018, 1...","[0.029478874, 0.030508809, 0.029838556, 0.0298...","[-0.004919746, -0.05696958, -0.026023958, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32768, 0, 0,..."
6,1605101100000006,1,1605,0,70.810501,20.466816,62,"[58362.51655, 58373.5245, 58380.52686, 58384.3...","[18.900702, 18.937569, 18.927834, 18.747997, 1...","[0.060209155, 0.0616775, 0.06128603, 0.0545308...","[-0.004919746, -0.05696958, -0.026023958, -0.0...","[0, 0, 0, 65535, 0, 0, 0, 0, 0, 0, 0, 0, 32768..."


The light curves contain a variable number of observations. Let's exclude those with fewer than 5 observations, and for those with more than 100 observations pick 100 at random. Light curves with fewer than 100 observations are padded with zeros up to a length of 100. We'll also create a padding mask to be used in the encoder.

In [3]:
def preprocess_bulk_lightcurves(bulk_df, n_min_obs=5, n_max_obs=100, rng=None):
    """Filter, subsample and zero-pad bulk light curves to a fixed length.

    Entries with fewer than `n_min_obs` observations are dropped. For entries with
    more than `n_max_obs` observations, `n_max_obs` observations are drawn at random
    *without replacement*; the same indices are applied to `hmjd`, `mag` and `magerr`
    so the three arrays stay aligned, and the result is sorted by time. Entries with
    at most `n_max_obs` observations are right-padded with zeros to length `n_max_obs`.

    Only the `hmjd`, `mag` and `magerr` columns are rewritten; any other per-observation
    array columns (e.g. `clrcoeff`, `catflags`) are left untouched and keep their
    original lengths.

    Parameters
    ----------
    bulk_df : pandas.DataFrame
        Frame with array-valued `hmjd`, `mag` and `magerr` columns. It is not modified.
    n_min_obs : int
        Minimum number of observations (length of `mag`) required to keep an entry.
    n_max_obs : int
        Fixed output length of every light curve.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness for the subsampling. `None` (default) uses the global
        `np.random` state, so `np.random.seed(...)` keeps working; anything else is
        passed to `np.random.default_rng`.

    Returns
    -------
    filtered_df : pandas.DataFrame
        Copy of the retained rows with fixed-length `hmjd`, `mag` and `magerr` arrays.
    mask_ary : numpy.ndarray of bool, shape (n_entries, n_max_obs)
        `True` where a position holds a real observation, `False` where it is zero padding.
    """
    series_cols = ('hmjd', 'mag', 'magerr')

    # Fall back to the legacy global stream unless the caller asks for a specific generator
    sampler = np.random if rng is None else np.random.default_rng(rng)

    # Filter out entries with < n_min_obs observations. Take an explicit copy so that
    # writing the processed columns below never targets a slice of `bulk_df`.
    keep_rows = bulk_df['mag'].apply(lambda x: len(x) >= n_min_obs).astype(bool)
    filtered_df = bulk_df[keep_rows].copy()
    n_entries = len(filtered_df)

    # Object arrays hold one fixed-length ndarray per entry; filling them element by
    # element avoids NumPy/pandas trying to interpret the equal-length arrays as 2D data.
    new_cols = {col: np.empty(n_entries, dtype=object) for col in series_cols}
    mask_ary = np.zeros((n_entries, n_max_obs), dtype=bool)

    rows = zip(*(filtered_df[col] for col in series_cols))
    for k, arrays in enumerate(rows):
        curve = {col: np.asarray(values) for col, values in zip(series_cols, arrays)}
        n_obs = len(curve['mag'])

        if n_obs > n_max_obs:
            # Randomly sample n_max_obs distinct observations ...
            keep = sampler.choice(n_obs, n_max_obs, replace=False)
            # ... order them by time, and apply the *same* indices to every array
            keep = keep[np.argsort(curve['hmjd'][keep])]
            for col in series_cols:
                new_cols[col][k] = curve[col][keep]
            mask_ary[k, :] = True
        else:
            # Pad the arrays with zeros up to n_max_obs and flag the real observations
            for col in series_cols:
                n_pad = n_max_obs - len(curve[col])
                new_cols[col][k] = np.pad(curve[col], (0, n_pad), 'constant')
            mask_ary[k, :n_obs] = True

    for col in series_cols:
        filtered_df[col] = new_cols[col]

    return filtered_df, mask_ary

In [4]:
# Keep light curves with >= 5 observations and bring each one to a fixed length of 100;
# `mask_ary` is the accompanying padding mask.
filtered_df, mask_ary = preprocess_bulk_lightcurves(
    bulk_df,
    n_min_obs=5,
    n_max_obs=100,
)

This is a very noisy sample -- are there any quality cuts that might be useful? For example, a cut based on a correlation function of the points as a function of time-spacing?