In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/colab

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/colab


In [2]:
import sys
PACKAGE_PATH = '/content/drive/MyDrive/colab/packages'

if PACKAGE_PATH not in sys.path:
    sys.path.append(PACKAGE_PATH)
sys.path.append('/content/drive/MyDrive/colab')

In [3]:
import os
import numpy as np
import fitsio
import pandas as pd

In [4]:
file_path = '/content/drive/MyDrive/colab/data/zall-pix-iron.fits'
if not os.path.exists(file_path):
  !wget -c -O {file_path} https://data.desi.lbl.gov/public/dr1/spectro/redux/iron/zcatalog/v1/zall-pix-iron.fits
else:
  print(f"{file_path} already exists.")

/content/drive/MyDrive/colab/data/zall-pix-iron.fits already exists.


In [5]:
with fitsio.FITS(file_path) as f:
  all_columns=f[1].get_colnames()
  print("Columns available:\n",all_columns)
  n_rows=f[1].get_nrows()
  print("Total number of rows:",n_rows)


Columns available:
 ['TARGETID', 'SURVEY', 'PROGRAM', 'HEALPIX', 'SPGRPVAL', 'Z', 'ZERR', 'ZWARN', 'CHI2', 'COEFF', 'NPIXELS', 'SPECTYPE', 'SUBTYPE', 'NCOEFF', 'DELTACHI2', 'COADD_FIBERSTATUS', 'TARGET_RA', 'TARGET_DEC', 'PMRA', 'PMDEC', 'REF_EPOCH', 'FA_TARGET', 'FA_TYPE', 'OBJTYPE', 'SUBPRIORITY', 'OBSCONDITIONS', 'RELEASE', 'BRICKNAME', 'BRICKID', 'BRICK_OBJID', 'MORPHTYPE', 'EBV', 'FLUX_G', 'FLUX_R', 'FLUX_Z', 'FLUX_W1', 'FLUX_W2', 'FLUX_IVAR_G', 'FLUX_IVAR_R', 'FLUX_IVAR_Z', 'FLUX_IVAR_W1', 'FLUX_IVAR_W2', 'FIBERFLUX_G', 'FIBERFLUX_R', 'FIBERFLUX_Z', 'FIBERTOTFLUX_G', 'FIBERTOTFLUX_R', 'FIBERTOTFLUX_Z', 'MASKBITS', 'SERSIC', 'SHAPE_R', 'SHAPE_E1', 'SHAPE_E2', 'REF_ID', 'REF_CAT', 'GAIA_PHOT_G_MEAN_MAG', 'GAIA_PHOT_BP_MEAN_MAG', 'GAIA_PHOT_RP_MEAN_MAG', 'PARALLAX', 'PHOTSYS', 'PRIORITY_INIT', 'NUMOBS_INIT', 'CMX_TARGET', 'DESI_TARGET', 'BGS_TARGET', 'MWS_TARGET', 'SCND_TARGET', 'SV1_DESI_TARGET', 'SV1_BGS_TARGET', 'SV1_MWS_TARGET', 'SV1_SCND_TARGET', 'SV2_DESI_TARGET', 'SV2_BGS_TAR

## Sample a subset

In [6]:
sample_size = 10000
indices=np.arange(sample_size)
with fitsio.FITS(file_path) as f:
  sample_data = f[1].read(columns=all_columns, rows=indices)

In [7]:
good_columns = []
for col in all_columns:
    try:
        _ = pd.Series(sample_data[col])
        good_columns.append(col)
    except Exception as e:
        print(f"Skipping column {col} due to error: {e}")
print(f"Good columns: {good_columns}")

Skipping column COEFF due to error: Data must be 1-dimensional, got ndarray of shape (10000, 10) instead
Good columns: ['TARGETID', 'SURVEY', 'PROGRAM', 'HEALPIX', 'SPGRPVAL', 'Z', 'ZERR', 'ZWARN', 'CHI2', 'NPIXELS', 'SPECTYPE', 'SUBTYPE', 'NCOEFF', 'DELTACHI2', 'COADD_FIBERSTATUS', 'TARGET_RA', 'TARGET_DEC', 'PMRA', 'PMDEC', 'REF_EPOCH', 'FA_TARGET', 'FA_TYPE', 'OBJTYPE', 'SUBPRIORITY', 'OBSCONDITIONS', 'RELEASE', 'BRICKNAME', 'BRICKID', 'BRICK_OBJID', 'MORPHTYPE', 'EBV', 'FLUX_G', 'FLUX_R', 'FLUX_Z', 'FLUX_W1', 'FLUX_W2', 'FLUX_IVAR_G', 'FLUX_IVAR_R', 'FLUX_IVAR_Z', 'FLUX_IVAR_W1', 'FLUX_IVAR_W2', 'FIBERFLUX_G', 'FIBERFLUX_R', 'FIBERFLUX_Z', 'FIBERTOTFLUX_G', 'FIBERTOTFLUX_R', 'FIBERTOTFLUX_Z', 'MASKBITS', 'SERSIC', 'SHAPE_R', 'SHAPE_E1', 'SHAPE_E2', 'REF_ID', 'REF_CAT', 'GAIA_PHOT_G_MEAN_MAG', 'GAIA_PHOT_BP_MEAN_MAG', 'GAIA_PHOT_RP_MEAN_MAG', 'PARALLAX', 'PHOTSYS', 'PRIORITY_INIT', 'NUMOBS_INIT', 'CMX_TARGET', 'DESI_TARGET', 'BGS_TARGET', 'MWS_TARGET', 'SCND_TARGET', 'SV1_DESI_TARGE

In [8]:
df_sample = pd.DataFrame({col: sample_data[col] for col in good_columns})
df_sample.columns=df_sample.columns.str.lower()
df_sample=df_sample.set_index("targetid")
df_sample.head()

Unnamed: 0_level_0,survey,program,healpix,spgrpval,z,zerr,zwarn,chi2,npixels,spectype,...,tsnr2_gpbbackup,tsnr2_qso,tsnr2_lrg,main_nspec,main_primary,sv_nspec,sv_primary,zcat_nspec,zcat_primary,desiname
targetid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39628473198710603,cmx,other,2152,2152,0.804206,9.589004e-06,0,10003.72,7928,GALAXY,...,37057.066406,82.756981,235.852158,0,False,0,False,1,True,DESI J023.7648+29.8323
39628473198711006,cmx,other,2152,2152,-6.9e-05,8.066745e-07,0,57903.55,7923,STAR,...,36845.910156,81.696724,231.869568,0,False,0,False,1,True,DESI J023.7864+29.8667
39628473198711342,cmx,other,2152,2152,0.932752,2.232146e-46,518,9e+99,0,GALAXY,...,34692.910156,76.127892,202.061813,0,False,0,False,1,True,DESI J023.8022+29.8321
39628473198711925,cmx,other,2152,2152,0.390723,2.031381e-05,0,10482.61,7928,GALAXY,...,35960.445312,80.330345,227.827698,0,False,0,False,1,True,DESI J023.8321+29.8581
39628473202901028,cmx,other,2152,2152,0.665803,7.591307e-05,0,8802.139,7927,GALAXY,...,36465.910156,80.075523,228.336853,0,False,0,False,1,True,DESI J023.8666+29.8163


In [9]:
if not os.path.exists('/content/drive/MyDrive/colab/data/sample.csv'):
  df_sample.to_csv('/content/drive/MyDrive/colab/data/sample.csv')

In [10]:
keywords =['COADD', 'RA', 'DEC', 'SHAPE', 'DELTA', 'MJD', 'INIT', 'TSNR',
       'NSPEC', 'PRIMARY', 'BRICK', 'PRIORITY', 'COEFF', 'PIX','_TARGET','REF',]
cols_to_drop=set()
for kw in keywords:
  for col in all_columns:
    if kw in col:
      cols_to_drop.add(col)
extra_drop = ['SURVEY', 'OBJTYPE', 'SERSIC', 'MEAN_PSF_TO_FIBER_SPECFLUX',
       'FA_TYPE', 'OBSCONDITIONS', 'RELEASE', 'PHOTSYS', 'SPGRPVAL',
       'DESINAME', 'ZWARN', 'MASKBITS']
cols_to_drop.update(extra_drop)
useful_cols=[col for col in all_columns if col not in cols_to_drop]
print(useful_cols)

['TARGETID', 'Z', 'ZERR', 'CHI2', 'SPECTYPE', 'SUBTYPE', 'MORPHTYPE', 'EBV', 'FLUX_G', 'FLUX_R', 'FLUX_Z', 'FLUX_W1', 'FLUX_W2', 'FLUX_IVAR_G', 'FLUX_IVAR_R', 'FLUX_IVAR_Z', 'FLUX_IVAR_W1', 'FLUX_IVAR_W2', 'FIBERFLUX_G', 'FIBERFLUX_R', 'FIBERFLUX_Z', 'FIBERTOTFLUX_G', 'FIBERTOTFLUX_R', 'FIBERTOTFLUX_Z', 'GAIA_PHOT_G_MEAN_MAG', 'GAIA_PHOT_BP_MEAN_MAG', 'GAIA_PHOT_RP_MEAN_MAG']


In [11]:
index_cols = ["SPECTYPE", "ZWARN"]
if not os.path.exists("data/index.csv"):
  with fitsio.FITS(file_path) as f:
    data=f[1].read(columns=index_cols)
    df=pd.DataFrame(data)
    df.to_csv("index.csv",index=False)
    del data
df=pd.read_csv("data/index.csv")
df

Unnamed: 0,ZWARN,SPECTYPE
0,0,GALAXY
1,0,STAR
2,518,GALAXY
3,0,GALAXY
4,0,GALAXY
...,...,...
28425958,2053,GALAXY
28425959,2053,QSO
28425960,5,GALAXY
28425961,5,GALAXY


In [12]:
good_indices = df.index[df["ZWARN"] == 0]
star_indices=df.index[(df["SPECTYPE"] =="STAR") &(df["ZWARN"]==0)]
del df

In [16]:
if not os.path.exists("data/desi_stars.csv"):
  with fitsio.FITS(file_path) as f:
    star_cols=[col for col in useful_cols if col not in index_cols]
    data=f[1].read(rows=star_indices,columns=star_cols)
    star_df=pd.DataFrame(data)
    star_df.columns=star_df.columns.str.lower()
    star_df=star_df.set_index("targetid")
    del data
    coeff=f[1].read(rows=star_indices,columns=["COEFF","TARGETID"])
    df_coeff=pd.DataFrame(coeff["COEFF"],columns=[f"coeff_{i}" for i in range(10)],index=coeff["TARGETID"])
    del coeff
    df_coeff = df_coeff.astype("float64")
    df_coeff = df_coeff.drop(columns=[f"coeff_{i}" for i in range(5, 10)])
    df_coeff.columns=df_coeff.columns.str.lower()
    star_dataset=pd.concat([star_df,df_coeff],axis=1)
    del star_df,df_coeff
    star_dataset.to_csv("data/desi_stars.csv")
    star_dataset=pd.read_csv("data/desi_stars.csv",index_col=0)
    print(len(star_dataset))
    print(star_dataset.head())

4750477
                            z          zerr          chi2 subtype morphtype  \
39628473198711006   -0.000069  8.066745e-07  57903.546964       G       PSF   
1152921504619435527 -0.000038  4.481989e-06  14892.817330       M       NaN   
1152921504619435582  0.000098  4.753192e-06  11548.243202       M       NaN   
39628473207097483   -0.000155  1.370832e-06  30294.328937       G       PSF   
39628473211289672   -0.000032  1.148578e-05   9428.524438       M       PSF   

                          ebv      flux_g      flux_r      flux_z     flux_w1  \
39628473198711006    0.053758  356.878500  511.257870  560.903600  139.894710   
1152921504619435527  0.051629    0.000000    0.000000    0.000000    0.000000   
1152921504619435582  0.051625    0.000000    0.000000    0.000000    0.000000   
39628473207097483    0.050515  103.943115  149.515100  163.248120   40.065598   
39628473211289672    0.048970    0.109821    0.411892    6.435524    8.218025   

                     ...  fibe

In [14]:
seed = 42
rng = np.random.default_rng(seed)
sample_size = 10000
indices = np.unique(rng.integers(0,n_rows,size=1000+sample_size))
indices=np.intersect1d(indices,good_indices)
needed=sample_size-len(indices)
while needed>0:
  extra=rng.integers(0,n_rows,size=2*needed)
  indices = np.unique(np.concatenate([indices, extra]))
  indices=np.intersect1d(indices,good_indices)
  needed=sample_size-len(indices)
indices=indices[:sample_size]

In [15]:
if not os.path.exists("data/desi_sample_clean.csv"):
  with fitsio.FITS(file_path) as f:
    spectype_cols=[col for col in useful_cols if col not in ["SUBTYPE"]]
    spectype_data=f[1].read(rows=indices,columns=spectype_cols)
    spectype_df=pd.DataFrame(spectype_data,columns=spectype_cols)
    spectype_df.columns=spectype_df.columns.str.lower()
    spectype_df=spectype_df.set_index("targetid")
    del spectype_data
    spectype_df.to_csv("data/desi_sample_clean.csv")
    spectype_df=pd.read_csv("data/desi_sample_clean.csv",index_col=0)
    print(spectype_df.head())
    print(len(spectype_df))

                            z          zerr           chi2 spectype morphtype  \
targetid                                                                        
1152921504619435527 -0.000038  4.481989e-06   14892.817330     STAR       NaN   
2305843012942433355 -0.000070  3.822802e-06    8263.021712     STAR      GPSF   
2305843012950841090  0.000044  6.850911e-07  105931.548296     STAR      GPSF   
2305843012950843292 -0.000059  1.086602e-06   46184.958727     STAR      GPSF   
2305843012963417838 -0.000272  4.639735e-07  131375.831897     STAR      GPSF   

                          ebv  flux_g  flux_r  flux_z  flux_w1  ...  \
targetid                                                        ...   
1152921504619435527  0.051629     0.0     0.0     0.0      0.0  ...   
2305843012942433355  0.177142   -99.0   -99.0   -99.0      0.0  ...   
2305843012950841090  0.136814   -99.0   -99.0   -99.0      0.0  ...   
2305843012950843292  0.168506   -99.0   -99.0   -99.0      0.0  ...   
230584