In [0]:
import os

# Root of the project checkout; it is prepended to sys.path further down so
# that the `pfs.ga.pfsspec` packages can be imported. The hard-coded value is
# kept as the default, but can be overridden through the environment so the
# notebook also runs on machines with a different directory layout.
PROJECT_PATH = os.environ.get('PFSSPEC_PROJECT_PATH', '/home/dobos/project/pfsspec-all')

# Directory that contains the `dataset.h5` file inspected by this notebook.
# Overridable the same way as PROJECT_PATH.
DATASET_PATH = os.environ.get('PFSSPEC_DATASET_PATH', '/datascope/subaru/user/zye20/train/test4')

In [0]:
%%javascript
// Replace the output area's `_should_scroll` hook with a function that always
// returns false, regardless of the `lines` argument it is given.
// NOTE(review): presumably this stops long outputs (the tall figure stacks
// below) from being collapsed into a scrollable box -- confirm. It relies on a
// global `IPython.OutputArea` object being provided by the notebook front-end.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [0]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
import h5py as h5

In [0]:
# Allow loading the project as a module by putting its checkout at the front
# of the import search path. The membership test keeps this cell idempotent:
# re-running it does not stack duplicate entries onto sys.path.
if PROJECT_PATH not in sys.path:
    sys.path.insert(0, PROJECT_PATH)

In [0]:
from pfs.ga.pfsspec.core import Spectrum
from pfs.ga.pfsspec.stellar.dataset import ModelDataset

# Load and test training set

In [0]:
# Path of the HDF5 dataset file inside DATASET_PATH; opened by the cells below.
fn = os.path.join(DATASET_PATH, 'dataset.h5')

In [0]:
# Walk the top level of the HDF5 file and print name, shape and chunk layout
# for every entry that is a dataset; any other kind of entry is skipped.
with h5.File(fn, 'r') as h5file:
    for name, node in h5file.items():
        if not isinstance(node, h5.Dataset):
            continue
        print(name, node.shape, node.chunks)

In [0]:
# Create an empty ModelDataset, load it from the HDF5 file, and display the
# shape of its parameter table as a first sanity check.
ds = ModelDataset()
ds.load(fn, format='h5')
ds.params.shape

In [0]:
# Display the dataset's `preload_arrays` attribute.
# NOTE(review): presumably a flag telling whether arrays are read into memory
# at load time -- confirm against ModelDataset.
ds.preload_arrays

In [0]:
# First ten rows of the parameter table.
ds.params.head(10)

In [0]:
# Last ten rows of the parameter table.
ds.params.tail(10)

In [0]:
# Parameters of the row whose index label is 99 (`.loc` is a label-based
# lookup, so this requires that label to exist in the table's index).
ds.params.loc[99]

In [0]:
# Names of the parameter columns that contain no missing values at all;
# the histogram section at the end of the notebook iterates over this list.
params = [col for col in ds.params.columns if ds.params[col].isnull().sum() == 0]
params

In [0]:
# Fetch the spectrum with id 0 and display its wavelength and flux vectors.
# NOTE: `id` shadows the Python builtin of the same name; later cells read
# this variable, so it is a notebook-wide piece of state.
id = 0
spec = ds.get_spectrum(id)
spec.wave, spec.flux

In [0]:
# Inspect a single spectrum (id 0) in two figures: first the raw flux with the
# y-range clipped to its 5%-95% quantiles, then the same flux rescaled so that
# its median between the 4000 and 8000 wavelength marks lands on 0.5.
id = 0
spec = ds.get_spectrum(id)

# Quantile-based limits keep a few extreme values from dominating the view.
mm = np.quantile(spec.flux, [0.05, 0.95])
print(mm)
plt.plot(spec.wave, spec.flux, lw=0.3)
plt.ylim(*mm)

# Locate the two wavelength marks on the spectrum's own wavelength grid and
# take the median flux of the samples in between.
idx = np.digitize((4000, 8000), spec.wave)
print(idx)
first, last = idx
med = np.median(spec.flux[first:last])
print(med)

# Second figure: identical curve and limits, both multiplied by 0.5 / med.
plt.figure()
plt.plot(spec.wave, spec.flux * 0.5 / med, lw=0.3)
plt.ylim(*(mm * 0.5 / med))

In [0]:
# Re-fetch the spectrum selected by the current value of `id` (assigned in an
# earlier cell) and print its exposure fields followed by its data vectors.
spec = ds.get_spectrum(id)
print(spec.exp_time, spec.exp_count)
print(spec.wave, spec.flux, spec.flux_err, spec.flux_sky)

## Examples as a function of wavelength

In [0]:
# Stack 16 consecutive spectra (ids 10..25), one panel per spectrum, plotted
# against wavelength and annotated with the parameters stored for that id.
# To start from a random id instead of a fixed one:
#     id = np.random.randint(ds.flux.shape[0] - 16)
f, axs = plt.subplots(16, 1, figsize=(16, 48))

id = 10
for i, ax in enumerate(axs.flatten()):
    sid = id + i
    spec = ds.get_spectrum(sid)

    # Grid positions of the 4000 and 8000 wavelength marks; computed as in the
    # single-spectrum cell but not used by the plot below.
    idx = np.digitize([4000, 8000], spec.wave)

    # Flux first, then the error vector on the same axes when one is available.
    ax.plot(spec.wave, spec.flux, lw=0.5)
    if spec.flux_err is not None:
        ax.plot(spec.wave, spec.flux_err, lw=0.5)
    ax.set_ylim(0, 1.5 * spec.flux.max())

    # Spectrum id in the lower-left corner, parameter summary in the upper right.
    ax.text(0.1, 0.1, str(sid), transform=ax.transAxes)
    label = 'mag = %.2f \n[M/H] = %.2f \nT_eff = %.0f \nlog_g = %.1f \nSNR = %.3f' % tuple(
        ds.params[name][sid] for name in ('mag', 'M_H', 'T_eff', 'log_g', 'snr'))
    ax.text(0.9, 0.9, label, transform=ax.transAxes, ha='right', va='top')
    ax.grid(True)

#plt.savefig(os.path.join(DATASET_PATH, 'examples.png'))

## Add some noise

In [0]:
# Overlay one random noise realization on each of 8 consecutive spectra
# (ids 64..71). The noise is drawn per sample from a zero-mean normal
# distribution whose standard deviation is the spectrum's flux_err vector.
f, axs = plt.subplots(8, 1, figsize=(16, 48))

# Dedicated, seeded generator: the figure is identical on every re-run and the
# global NumPy random state is left untouched.
rng = np.random.default_rng(0)

id = 64
for i, ax in enumerate(axs.flatten()):
    sid = id + i
    spec = ds.get_spectrum(sid)

    # Without an error vector there is nothing to draw the noise from, so only
    # the clean flux is shown. The other example cells guard flux_err against
    # None in the same way.
    if spec.flux_err is not None:
        ax.plot(spec.wave, spec.flux + rng.normal(0, spec.flux_err), lw=0.3)
    ax.plot(spec.wave, spec.flux, lw=0.5)

    ax.set_ylim(0, 1.5 * spec.flux.max())

    # Spectrum id in the lower-left corner, parameter summary in the upper right.
    ax.text(0.1, 0.1, str(sid), transform=ax.transAxes)
    label = 'mag = %.2f \n[M/H] = %.2f \nT_eff = %.0f \nlog_g = %.1f \nSNR = %.3f' % tuple(
        ds.params[name][sid] for name in ('mag', 'M_H', 'T_eff', 'log_g', 'snr'))
    ax.text(0.9, 0.9, label, transform=ax.transAxes, ha='right', va='top')
    ax.grid(True)

#plt.savefig(os.path.join(DATASET_PATH, 'examples.png'))

## Examples as a function of pixel

In [0]:
# Plot 16 consecutive spectra (ids 64..79) against pixel index rather than
# wavelength, zoomed in on pixels 2500-3000. The cell only does anything when
# ds.wave is not one-dimensional.
# NOTE(review): presumably a non-1D ds.wave means every spectrum carries its
# own wavelength grid -- confirm against ModelDataset.
if ds.wave.ndim != 1:
    f, axs = plt.subplots(16, 1, figsize=(16, 48))

    id = 64
    for i, ax in enumerate(axs.flatten()):
        sid = id + i
        spec = ds.get_spectrum(sid)

        # No x values are passed, so matplotlib uses the sample (pixel) index.
        ax.plot(spec.flux, lw=0.5)
        if spec.flux_err is not None:
            # The error curve must use the same pixel axis as the flux. It was
            # previously plotted against spec.wave, which put it on a
            # different x scale than the flux and the pixel window set below.
            ax.plot(spec.flux_err, lw=0.5)
        ax.set_ylim(0, 1.5 * spec.flux.max())

        # Spectrum id in the lower-left corner, parameter summary upper right.
        ax.text(0.1, 0.1, str(sid), transform=ax.transAxes)
        label = 'mag = %.2f \n[M/H] = %.2f \nT_eff = %.0f \nlog_g = %.1f \nSNR = %.3f' % tuple(
            ds.params[name][sid] for name in ('mag', 'M_H', 'T_eff', 'log_g', 'snr'))
        ax.text(0.9, 0.9, label, transform=ax.transAxes, ha='right', va='top')

        # Minor ticks and a minor grid make individual pixels easier to read.
        ax.xaxis.set_minor_locator(AutoMinorLocator(4))
        ax.grid(True)
        ax.grid(True, which='minor')
        ax.set_xlim(2500, 3000)

    #plt.savefig(os.path.join(DATASET_PATH, 'examples.png'))

# Look for invalid SNR

In [0]:
# Count the NaN, infinite and exactly-zero entries of the 'snr' column.
snr = ds.params['snr']
np.sum(np.isnan(snr)), np.sum(np.isinf(snr)), np.sum(snr == 0)

In [0]:
# Positions of the entries whose SNR is NaN (np.where returns a tuple with one
# index array per dimension; only the first is needed). Show the count and the
# positions themselves.
idx = np.where(np.isnan(snr))[0]
idx.shape, idx

In [0]:
# Plot up to four of the spectra flagged above as having a NaN SNR, each in its
# own figure, with a noisy realization drawn underneath the clean flux.

# Every other cell of this notebook reads the metallicity from 'M_H'; this one
# used 'Fe_H', which only surfaces as a KeyError when NaN SNRs actually exist.
# Prefer the notebook-wide column and fall back to 'Fe_H' for datasets that
# only carry that name; the annotation label follows the column actually used.
metal_key = 'M_H' if 'M_H' in ds.params.columns else 'Fe_H'
metal_label = '[M/H]' if metal_key == 'M_H' else '[Fe/H]'

# Seeded generator so the noise realization is the same on every re-run.
rng = np.random.default_rng(0)

for i in range(min(4, idx.shape[0])):
    f, axs = plt.subplots(1, 1, figsize=(16, 6), squeeze=False)
    ax = axs[0, 0]

    sid = idx[i]
    spec = ds.get_spectrum(sid)

    # Noise can only be drawn when an error vector is present.
    if spec.flux_err is not None:
        ax.plot(spec.wave, spec.flux + rng.normal(0, spec.flux_err), lw=0.3)
    ax.plot(spec.wave, spec.flux, lw=0.5)

    # These spectra are the suspicious ones, so tolerate NaNs in the flux:
    # matplotlib rejects NaN/Inf axis limits, hence the limit is only applied
    # when a finite maximum exists.
    top = 1.5 * np.nanmax(spec.flux)
    if np.isfinite(top):
        ax.set_ylim(0, top)

    # Spectrum index in the lower-left corner, parameter summary upper right.
    ax.text(0.1, 0.1, str(sid), transform=ax.transAxes)
    ax.text(0.9, 0.9, 'mag = %.2f \n%s = %.2f \nT_eff = %.0f \nlog_g = %.1f \nSNR = %.3f'
            % (ds.params['mag'][sid], metal_label, ds.params[metal_key][sid],
               ds.params['T_eff'][sid], ds.params['log_g'][sid], ds.params['snr'][sid]),
            transform=ax.transAxes, ha='right', va='top')
    ax.grid(True)

In [0]:
# For the first (at most) four flagged spectra, print the spectrum index and
# the number of flux_err entries that are exactly zero.
for sid in idx[:4]:
    spec = ds.get_spectrum(sid)
    print(sid, np.sum(spec.flux_err == 0))

# Training set properties

## Distribution of stellar parameters

In [0]:
# Draw a normalized 100-bin histogram for every float64 column listed in
# `params` (the columns without missing values), one panel per column.

# Select the columns that will actually be plotted *before* creating the
# figure, so no panel is left blank for the non-float columns.
float_params = [k for k in params if ds.params.dtypes[k] == np.float64]
N = len(float_params)

# plt.subplots cannot create a figure with zero rows, so skip when there is
# nothing to plot.
if N > 0:
    # squeeze=False always returns a 2-D array of axes, even for N == 1;
    # taking the single column gives a 1-D array that can be indexed as ax[i].
    f, ax = plt.subplots(N, 1, figsize=(10, 6 * N), squeeze=False)
    ax = ax[:, 0]

    for i, k in enumerate(float_params):
        values = np.asarray(ds.params[k])
        # np.histogram raises on a non-finite range, so infinities (which the
        # SNR check above explicitly looks for) are left out of the histogram.
        values = values[np.isfinite(values)]

        ax[i].set_xlabel(k)
        ax[i].grid(True)
        if values.size == 0:
            continue

        hist_range = (np.min(values), np.max(values))
        hist1, bins = np.histogram(values, range=hist_range, bins=100, density=True)
        # Plot against the bin centers.
        ax[i].step(0.5 * (bins[:-1] + bins[1:]), hist1, '-', label='train')
        ax[i].set_ylim(0, None)
        ax[i].legend()

#plt.savefig(os.path.join(DATASET_PATH, 'params.png'))