# SDSS eBOSS Data 
## Script on reading and pre-processing data, and generation of a catalogue of desirable galaxy types

This script extracts useful data from the spPlate and spAll_redrock fits files, and generates the required training data set.

1. **Defining input parameters**
2. **Reading and pre-processing the data**
3. **Applying selection cuts**
4. **Generating the training data set**

**Date**: 14th Oct, 2019. <br>
**Author**: Soumya Shreeram <br>
**Supervised by**: Anand Raichoor <br>
**Script adapted from**: S. Ben Nejma


In [None]:
import astropy.io.fits as fits
import matplotlib.pyplot as plt
import numpy as np
import os
import subprocess
from astropy.convolution import convolve, Box1DKernel

## 1. Defining input parameters

In [None]:
# data directory on lesta with the spAll_redrock files
# spPlate_dir is handed to selectWavelengths/readSpPlate to locate the spPlate files
spPlate_dir = r'/hpcstorage/raichoor/spplatelist_v5_13_0/spPlate'
# spAll_redrock catalogue that is opened with readFile further down
spAll_redrock_file = r'/hpcstorage/raichoor/spplatelist_v5_13_0/' \
            'spall_redrock_v5_13_0.valid.fits'

## 2. Reading the data

In [17]:
def setName(data_dir, plate_mjd):
    """
    Function builds the path of the fits file belonging to a plate-MJD
    @param data_dir :: directory placed in front of the file name
    @param plate_mjd :: string inserted between '-' and '.fits'

    @returns data_file :: data_dir joined with '-<plate_mjd>.fits'
    """
    pieces = ('-', plate_mjd, '.fits')
    return os.path.join(data_dir, ''.join(pieces))

def readFile(filename):
    """
    Function opens the fits file and returns the data of its first extension
    @input filename :: name of the file

    @returns data :: contents of hdu[1].data
    """
    # the context manager closes the file even when accessing HDU 1 raises
    with fits.open(filename) as hdu:
        data = hdu[1].data
    return data

def plateMJD(data):
    """
    Function builds the 'PLATE-MJD' label of every entry in the data
    @param data :: table-like object with 'PLATE' and 'MJD' columns

    @returns pms :: numpy array of '<plate>-<mjd>' strings, one per entry
    """
    plates = data['PLATE']
    mjds = data['MJD']
    labels = ['-'.join((str(plate), str(mjd))) for plate, mjd in zip(plates, mjds)]
    return np.array(labels)

def uniquePmsProgramme(pms, data):
    """
    Function reduces the plate-MJD labels to their unique values and looks up
    the programme at the first occurrence of each of them
    @param pms :: array of plate-MJD labels, aligned with the rows of data
    @param data :: table-like object with a 'programname' column

    @returns pms_unique, prog_unique :: sorted unique labels & their programmes
    """
    unique_labels, first_seen = np.unique(pms, return_index=True)
    programmes = data['programname']
    return unique_labels, programmes[first_seen]

def readSpPlate(data_dir, plate_mjd):
    """
    Function to read the useful headers and data from spPlate fits file
    @param data_dir :: directory passed on to setName to build the file path
    @param plate_mjd :: plate-MJD string passed on to setName
    
    @returns wavelength, bunit, flux, ivar (refer comments for individual meanings)
    """
    # the context manager closes the file even if a header key or HDU is missing
    with fits.open(setName(data_dir, plate_mjd)) as hdu:
        header  = hdu[0].header
        c0      = header['coeff0']   # Central wavelength (log10) of first pixel
        c1      = header['coeff1']   # Log10 dispersion per pixel
        npix    = header['naxis1']   # Number of pixels along the wavelength axis
        bunit   = header['bunit']    # Units of flux

        flux    = hdu[0].data        # Flux in units of 10^-17^ erg/s/cm^2^/Ang
        ivar    = hdu[1].data        # Inverse variance (1/sigma^2^) for HDU 0

    # wavelength of every pixel, rebuilt from the log10-linear header solution
    wavelength = 10.**(c0 + c1 * np.arange(npix))
    return wavelength, bunit, flux, ivar

In [18]:
# reads the file spAll_redrock and builds the plate-MJD label of every row
# (the unique plate-MJD and programmes are derived later, after the selection cuts)
data = readFile(spAll_redrock_file)
pms = plateMJD(data)

## 3. Applying selection cuts

The functions below implement various selection cuts to obtain the desired data. They are summarized below:
* Select plates that observe **E**mission-**L**ine type **G**alaxies (ELGs), LRGs, and QSOs
* Select wavelengths that are common to all plates
* Remove sky spectra and certain configurations
* Select redshift range (Zspec fibres)

In [13]:
def galaxyType(pms_unique, prog_unique, names, gal_type, num_p):
    """
    Function draws plate names at random from the plates of a desired galaxy type
    @params pms_unique, prog_unique :: unique array of plate nos.-MJD & programmes
    @param names :: array of programme names to select/de-select
    @param gal_type :: 'ELG', 'LRG+QSO', or anything else for the boss plates
    @param num_p :: number of plates to draw (np.random.choice default settings)
    
    @returns sub_plates :: list with the names of the drawn plates
    """
    # every category starts from the plates of the first programme name
    matches_first = prog_unique == names[0]

    if gal_type == 'ELG':
        # ELG plates: either of the two given programmes
        keep = matches_first | (prog_unique == names[1])
    elif gal_type == 'LRG+QSO':
        # LRG+QSO plates: first programme, excluding the two others
        keep = matches_first & (prog_unique != names[1]) & (prog_unique != names[2])
    else:
        # boss plates: first programme only
        keep = matches_first

    candidates = pms_unique[keep]
    drawn = np.random.choice(candidates, size=num_p)
    return drawn.tolist()
 
def selectPlates(pms_unique, prog_unique, num_pl):
    """
    Function that selects plates containing ELGs, LRG+QSOs, and some boss plates.
    @param pms_unique :: array of plate nos. and MJDs
    @param prog_unique :: programme of every entry in pms_unique
    @param num_pl :: number of plates of each category to select
    
    @returns selected_plates :: list of the plate names drawn for all categories
    """
    # (programme names, galaxy-type label) per category, in the order they are drawn
    categories = (
        (['ELG_NGC', 'ELG_SGC'], 'ELG'),                  # eboss ELG plates
        (['eboss', 'ELG_NGC', 'ELG_SGC'], 'LRG+QSO'),     # eboss LRG+QSO plates
        (['boss'], 'boss plates'),                        # random boss plates
    )

    selected_plates = []
    for names, gal_type in categories:
        drawn = galaxyType(pms_unique, prog_unique, names, gal_type, num_pl)
        selected_plates.extend(drawn)
    return selected_plates

def writeToFile(pms, outfilename, selected_plates, infilename=None):
    """
    Function extracts the info from desired files and writes to a new file
    @param pms :: complete array of plate nos. and MJD
    @param outfilename :: output file name
    @param selected_plates :: array of all the selected plates
    @param infilename :: (optional) fits file whose HDU 1 rows are filtered;
                         defaults to the module-level spAll_redrock_file
    """
    # the original body used an undefined `hdu`; the source file is opened here
    if infilename is None:
        infilename = spAll_redrock_file

    # boolean mask of the rows that belong to one of the selected plate-mjd
    extract_files = np.isin(pms, selected_plates)

    # NOTE(review): the mask only lines up with HDU 1 if pms was built from
    # the same, unfiltered file -- confirm against the caller
    with fits.open(infilename) as hdu:
        # write info to new fits file
        hdu[1].data = hdu[1].data[extract_files]
        hdu.writeto(outfilename, overwrite=True)

def discardSkySpectra(data):
    """
    Function defines the conditions for discarding the sky spectra
    and some other entries, and returns only the rows passing all of them
    @param data :: table-like object with the columns ZWARN, OBJTYPE, CHI2,
                   NPIXELS and DELTACHI2

    @returns data :: rows with ZWARN == 0, OBJTYPE != 'SKY',
                     CHI2/NPIXELS > 0.4 and DELTACHI2/NPIXELS > 0.0025
    """
    n_pixels = data['NPIXELS'].astype(float)

    no_warning = data['ZWARN'] == 0
    not_sky = data['OBJTYPE'] != 'SKY'
    chi2_ok = data['CHI2'] / n_pixels > 0.4
    delta_chi2_ok = data['DELTACHI2'] / n_pixels > 0.0025

    keep = no_warning & not_sky & chi2_ok & delta_chi2_ok
    return data[keep]

def selectWavelengths(pms_unique, spPlate_dir, idx, pm,
                      waves_initial=None, wavelength_idx=None):
    """
    Function selects the common wavelengths in all files wrt the first file
    @param pms_unique :: unique array of plate nos. and MJD
    @param spPlate_dir :: directory of sp plate files
    @param idx, pm :: iterating variables (position in pms_unique & plate-MJD)
    @param waves_initial :: (optional) common wavelengths found so far, i.e.
                            the `wavelength` returned by the previous call
    @param wavelength_idx :: (optional) list of index arrays returned by the
                             previous call, one array per file handled so far
    
    @returns wavelength, wavelength_idx :: the intersected waves and indicies

    To intersect over all files, feed both return values back in as
    waves_initial and wavelength_idx on the next call. When they are left
    out for idx > 0, the file of pms_unique[0] alone is used as reference.
    """
    wavelength = readSpPlate(spPlate_dir, pm)[0]

    if idx == 0: # for the first spPlate file every pixel is still common
        return wavelength, [np.arange(len(wavelength))]

    # without carried-over state, fall back on the first file as reference
    if waves_initial is None:
        waves_initial = readSpPlate(spPlate_dir, pms_unique[0])[0]
    if wavelength_idx is None:
        wavelength_idx = [np.arange(len(waves_initial))]

    # intersect the wavelengths common to the reference array
    # NOTE(review): this matches floats on exact equality -- confirm that the
    # wavelength grids of different plates coincide bit-for-bit
    _, idx0, idx1 = np.intersect1d(waves_initial, wavelength,
                                   return_indices=True)

    # redefine+add the common indicies for every file wrt the reference array
    wavelength_idx = [w[idx0] for w in wavelength_idx]
    wavelength_idx.append(idx1)

    # new wave array updated wrt common elements of the reference array
    return wavelength[idx1], wavelength_idx

In [None]:
# sky spectra and other factors taken out
data_i = discardSkySpectra(data)
# plate-MJD labels of the rows that survived the cuts
pms_i = plateMJD(data_i)

# find unique pms, programmes
pms_unique, prog_unique = uniquePmsProgramme(pms_i, data_i)

# select plates containing ELGs, LRGs, QSOs, and some boss plates
# NOTE(review): num_pl is not defined anywhere in the visible cells -- confirm
# that it is set before this cell runs
selected_plates = selectPlates(pms_unique, prog_unique, num_pl)

# write the info to a new file
# pms here is the label array of the full, uncut data (not pms_i)
# NOTE(review): outfilename is not defined anywhere in the visible cells -- confirm
writeToFile(pms, outfilename, selected_plates)

# wavelength file

In [None]:
# runs the wavelength selection once per unique plate-MJD
# NOTE(review): the returned (wavelength, wavelength_idx) is discarded here --
# confirm whether the result was meant to be kept between iterations
for idx, pm in enumerate(pms_unique):
    selectWavelengths(pms_unique, spPlate_dir, idx, pm)