In [19]:
import numpy as np
import astropy.io.fits as pyfits
import matplotlib.pyplot as plt
import glob
# making the plots look good 
import seaborn as sns 
plt.rcParams["figure.figsize"] = [16,9]
sns.set_style('whitegrid')

# loading spectres for a resampling test https://spectres.readthedocs.io/en/latest/ 
from spectres import spectres
from matplotlib import gridspec

In [20]:
import pandas as pd
df = pd.read_csv("GALAH_DR3_sobject_ids_existing.csv")
df.columns = ["sobject_id"]
sobject_ids = df["sobject_id"].to_numpy().tolist()

In [21]:
len(sobject_ids)

577627

In [25]:
def read_spectra(sobject_id):
    fits_files = [[],[],[],[]]
    for each_ccd in [3]:
        fits_files[each_ccd-1] = glob.glob("/data/praveen/galah-total/galah/dr3/spectra/hermes/"+str(sobject_id)+str(each_ccd)+".fits") #this is reading fits files from file and not downloading directly 

    spectrum = dict()
    for each_ccd in [3]: #GALAH uses indexing from 1 - 4
        if fits_files[each_ccd-1]!=[]: #just using zero indexing here 
            fits = pyfits.open(fits_files[each_ccd-1][0]) 
            
            # Extension 0: Reduced spectrum
            # Extension 1: Relative error spectrum
            # Extension 4: Normalised spectrum, NB: cut for CCD4

            # Extract wavelength grid for the reduced spectrum
            start_wavelength = fits[0].header["CRVAL1"]
            dispersion       = fits[0].header["CDELT1"]
            nr_pixels        = fits[0].header["NAXIS1"]
            reference_pixel  = fits[0].header["CRPIX1"]

            if reference_pixel == 0:
                reference_pixel = 1
            spectrum['wave_red_'+str(each_ccd)] = ((np.arange(0,nr_pixels)--reference_pixel+1)*dispersion+start_wavelength) #this is the reduced spectrum 

            # Extract wavelength grid for the normalised spectrum
            start_wavelength = fits[4].header["CRVAL1"]
            dispersion       = fits[4].header["CDELT1"]
            nr_pixels        = fits[4].header["NAXIS1"]
            reference_pixel  = fits[4].header["CRPIX1"]

            if reference_pixel == 0:
                reference_pixel=1
            spectrum['wave_norm_'+str(each_ccd)] = ((np.arange(0,nr_pixels)--reference_pixel+1)*dispersion+start_wavelength) #this is the normalised spectrum 

            # need to confirm how this is calculated 
            spectrum['sob_red_'+str(each_ccd)]  = np.array(fits[0].data)
            spectrum['uob_red_'+str(each_ccd)]  = np.array(fits[0].data * fits[1].data)

            # need to confirm how this is calculated 
            spectrum['sob_norm_'+str(each_ccd)] = np.array(fits[4].data)
            if each_ccd != 4:
                spectrum['uob_norm_'+str(each_ccd)] = np.array(fits[4].data * fits[1].data)
            else:
                # for normalised error of CCD4, only used appropriate parts of error spectrum
                spectrum['uob_norm_4'] = np.array(fits[4].data * (fits[1].data)[-len(spectrum['sob_norm_4']):])

            fits.close()
        else:
            spectrum['wave_red_'+str(each_ccd)] = []
            spectrum['wave_norm_'+str(each_ccd)] = []
            spectrum['sob_red_'+str(each_ccd)] = []
            spectrum['sob_norm_'+str(each_ccd)] = []
            spectrum['uob_red_'+str(each_ccd)] = []
            spectrum['uob_norm_'+str(each_ccd)] = []
    
    spectrum['wave_red'] = np.concatenate(([spectrum['wave_red_'+str(each_ccd)] for each_ccd in [3]]))
    spectrum['wave_norm'] = np.concatenate(([spectrum['wave_norm_'+str(each_ccd)] for each_ccd in [3]]))
    spectrum['sob_red'] = np.concatenate(([spectrum['sob_red_'+str(each_ccd)] for each_ccd in [3]]))
    spectrum['sob_norm'] = np.concatenate(([spectrum['sob_norm_'+str(each_ccd)] for each_ccd in [3]]))
    spectrum['uob_red'] = np.concatenate(([spectrum['uob_red_'+str(each_ccd)] for each_ccd in [3]]))
    spectrum['uob_norm'] = np.concatenate(([spectrum['uob_norm_'+str(each_ccd)] for each_ccd in [3]]))

    return spectrum #returns a 30 "row" dict of numpy array per row 'wave_red_x' is the key for the key value pair # need to look at camera 3 for lithium 

In [26]:
GRID_SIZE = 0.06
LOWER_LAMBDA = 6472.5
UPPER_LAMBDA = 6740

regrid = np.arange(LOWER_LAMBDA, UPPER_LAMBDA, GRID_SIZE) 

def resample_spectra(spectrum, camera, verbose):

    spec_resample, spec_errs_resample = spectres(regrid,  spectrum['wave_norm_'+str(camera)], spectrum['sob_norm_'+str(camera)], spec_errs= spectrum['uob_norm_'+str(camera)],verbose=verbose) 

    return spec_resample, spec_errs_resample

In [27]:
null_sobject_ids = {}
null_sobject_ids['null_sobject_ids'] = []

resampled_spectra_collection = {}
resampled_spectra_collection['spec_resample'] = []

resampled_error_collection = {}
resampled_error_collection['error_resample'] = []

In [28]:
for sobject_id in sobject_ids:
    temp_spectrum=read_spectra(sobject_id)['sob_norm_'+str(3)]
    if np.asarray(temp_spectrum).size==0:
        null_sobject_ids['null_sobject_ids'].append(sobject_id)
   

In [30]:
print(null_sobject_ids['null_sobject_ids'])

[13111600050100, 13111600050101, 13111600050102, 1311160005010, 13111600050104, 13111600050105, 13111600050106, 13111600050107, 13111600050109, 13111600050110, 13111600050111, 13111600050112, 1311160005011, 13111600050114, 13111600050115, 13111600050116, 13111600050117, 13111600050119, 13111600050120, 13111600050121, 13111600050122, 1311160005012, 13111600050124, 13111600050125, 13111600050127, 13111600050128, 13111600050129, 13111600050130, 13111600050131, 13111600050132, 13111600050134, 13111600050135, 13111600050136, 13111600050137, 13111600050138, 13111600050139, 13111600100101, 13111600100104, 13111600100110, 13111600100117, 13111600100118, 13111600100119, 13111600100120, 13111600100121, 13111600100125, 13111600100130, 13111600100135, 13111600100136, 13111600100138, 13111800240101, 13111800240102, 13111800240105, 13111800240106, 13111800240108, 13111800240109, 13111800240121, 13111800240124, 13111800240132, 13111800240138, 1311180024022, 13111800240229, 13111800290100, 13111800290

In [31]:
type(null_sobject_ids['null_sobject_ids'])

list

In [32]:
df_nulls = pd.DataFrame(null_sobject_ids['null_sobject_ids'], columns=['sobject_id'])

In [10]:
type(temp_spectrum)

list

In [11]:
print(temp_spectrum)

[]


In [33]:
df_nulls.to_csv("GALAH_DR3_sobject_ids_nulls.csv", header=False, index=False)

In [1]:
import pandas as pd

In [3]:
df1 = pd.read_csv("GALAH_DR3_sobject_ids_problematic_read.csv",header=None)

In [4]:
df1.columns = ["sobject_id"]

In [5]:
df2 = pd.read_csv("GALAH_DR3_sobject_ids_nulls.csv",header=None)

In [6]:
df2.columns = ["sobject_id"]

In [7]:
df_temp = pd.merge(df1,df2, how='outer', indicator=True)

In [8]:
df_temp["_merge"].value_counts()

left_only     527694
both           60650
right_only         0
Name: _merge, dtype: int64

In [9]:
df_final_cands = df_temp.loc[df_temp["_merge"]=="left_only"]

In [43]:
nparray = df_final_cands["sobject_id"].to_numpy()
df = pd.DataFrame(nparray)
df.to_csv("non_null_candidates.csv", header=False, index=False)

In [28]:
for sobject_id in sobject_ids:
    #this is the normalised resampled spectra 
    temp_spectrum = resample_spectra(read_spectra(sobject_id), 3, False)
    temp_spectrum[0][np.isnan(temp_spectrum[0])] = 1 #padding 
    resampled_spectra_collection['spec_resample'].append(temp_spectrum[0])

    #these are the error spectra
    #calculate mean error for padding 

    non_na_values = temp_spectrum[1][~np.isnan(temp_spectrum[1])]
    mean_error = np.mean(non_na_values)

    temp_spectrum[1][np.isnan(temp_spectrum[1])] = mean_error
    resampled_error_collection['error_resample'].append(temp_spectrum[1])

AttributeError: 'list' object has no attribute 'shape'

In [None]:
import h5py

#save the resampled spectra to be used as inputs to the training set
hf_spec = h5py.File("/data/praveen/resampled_emission_spectra.h5", "w")
hf_spec.create_dataset('spectra', data=resampled_spectra_collection['spec_resample'])
hf_spec.close()

#save the wavelength grid 
hf_grid = h5py.File("/data/praveen/wl_grid.h5", "w")
hf_grid.create_dataset('wl_grid', data=regrid)
hf_grid.close()

#save the error spectra
hf_error = h5py.File("/data/praveen/resampled_test_errors.h5", "w")
hf_error.create_dataset('errors', data=resampled_error_collection['error_resample'])
hf_error.close()