In [2]:
import sys
sys.path.append('../')

In [3]:
from scripts.utils import data_path
from scripts.cross_match_scripts import pandas_to_fits
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import seaborn as sns
%matplotlib inline









# Counterparts and field sources catalogs construction

## Load DESI-CSC 50 arcsec cone search match

### preprocessing: load DESI-CSC 50 arcsec cone search match and join it with CSC data frame to get fluxes/errors.

In [4]:
# Read the gzip-compressed pickle holding the DESI x CSC 50'' cone-search match.
# The name_csc strings carry two spurious characters on each side (bug in the
# stored table), so they are cast to str and sliced with [2:-2].
desi_csc_orig = (
    pd.read_pickle(data_path + '/csc_desi_r50_gaia.gz_pkl', compression='gzip')
    .assign(name_csc=lambda match: match['name_csc'].astype(str).str[2:-2])
)

In [5]:
# Read the raw CSC table: it supplies the positional errors and X-ray fluxes
# that the cross-match table lacks.
# TODO: to be fixed upstream by MB, after which this extra load can go away.
# The 'name' column is renamed to 'name_csc' so it can serve as the join key. #TODO
csc_orig = (
    pd.read_pickle(data_path + '/csc_init_df.gz_pkl', compression='gzip')
    .rename(columns={'name': 'name_csc'})
)

In [6]:
# Attach the Chandra localisation error (r_98_csc) and flux (flux_csc_05_2)
# to every matched row; default inner join on the CSC source name.
csc_xray_columns = ['name_csc', 'flux_csc_05_2', 'r_98_csc']
desi_csc_orig = pd.merge(desi_csc_orig, csc_orig[csc_xray_columns], on='name_csc')


# Add magnitudes computed from the fluxes (NOT dereddened). #TODO: deredden
# TODO: later apply a cut on the flux SNR
def flux2mag(flux):
    """Convert flux to magnitude, element-wise, as ``22.5 - 2.5 * log10(flux)``.

    Non-positive and NaN fluxes have no defined magnitude and map to NaN.

    Parameters
    ----------
    flux : scalar or array-like
        Flux value(s); converted to a float64 array before the computation.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``flux`` (0-d for a scalar).

    Notes
    -----
    Implemented with whole-array numpy operations instead of ``np.vectorize``:
    this avoids a Python-level call per element and also works for empty
    inputs, on which ``np.vectorize`` without ``otypes`` raises ValueError.
    """
    flux = np.asarray(flux, dtype=float)
    # log10 of zero/negative values would emit RuntimeWarnings; those entries
    # are overwritten with NaN below, so the warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        mag = 22.5 - 2.5 * np.log10(flux)
        # NaN > 0 evaluates to False, so NaN inputs land in the NaN branch too.
        return np.where(flux > 0, mag, np.nan)


# One magnitude column per photometric band: flux_<band> -> mag_<band>.
for band in ('g', 'r', 'z', 'w1', 'w2', 'w3', 'w4'):
    desi_csc_orig[f'mag_{band}'] = flux2mag(desi_csc_orig[f'flux_{band}'])


### Primary filters and DESI duplicates (ra,dec) removal

In [7]:
# Build the cleaned match table as a single chain. Every step returns a new
# frame, so no column is ever assigned onto a filtered slice (which triggers
# SettingWithCopyWarning) and re-running the cell from desi_csc_orig is safe.
desi_csc = (
    desi_csc_orig
    # step 1: keep only rows flagged brick_primary
    .query('brick_primary == True')
    # order by CSC source and CSC position, brightest flux_g first, so that
    # keep='first' below retains the duplicate with the highest flux_g
    .sort_values(by=['name_csc', 'ra_csc', 'dec_csc', 'flux_g'],
                 ascending=[True, True, True, False])
    # step 2: drop rows repeating the same (CSC position, DESI position) pair
    .drop_duplicates(subset=['ra_csc', 'dec_csc', 'ra', 'dec'], keep='first')
    # assign desi_id as "<release>_<brickid>_<objid>"
    .assign(desi_id=lambda match: match['release'].astype(str)
            + '_' + match['brickid'].astype(str)
            + '_' + match['objid'].astype(str))
    # sort by sep_csc so that, per CSC source, the closest objects come first
    .sort_values(by=['name_csc', 'sep_csc'])
)

## Calculating source densities and false association radius (r_false) for each CSC source 

In [8]:
def annuli_area_deg2(r_in_arcsec, r_out_arcsec):
    """Return the area, in square degrees, of a flat annulus.

    Both radii are given in arcseconds; they are converted to degrees
    (divided by 3600) and the area is pi * (r_out**2 - r_in**2).
    """
    inner_deg, outer_deg = (radius / 3600 for radius in (r_in_arcsec, r_out_arcsec))
    return np.pi * (outer_deg**2 - inner_deg**2)

def r_false(desi_rho_deg2, thresh = 0.03):
    """False-association radius, following Belvedersky+ 2022.

    Converts the surface density from deg^-2 to arcsec^-2 and returns
    sqrt(-ln(1 - thresh) / (pi * rho)), i.e. a radius in arcsec.

    desi_rho_deg2 -- source surface density per square degree (scalar or array)
    thresh        -- threshold entering the formula as ln(1 - thresh); default 0.03
    """
    rho_per_arcsec2 = desi_rho_deg2 / (3600**2)
    minus_log_term = -np.log(1 - thresh)
    return np.sqrt(minus_log_term / (np.pi * rho_per_arcsec2))


# One row per CSC source (indexed by name_csc) with its mean CSC position.
# String aggregator names are used instead of np.mean callables, which newer
# pandas versions deprecate inside agg().
csc = desi_csc.groupby('name_csc').agg({'ra_csc': 'mean', 'dec_csc': 'mean'})

# Number of DESI sources in the (10, 50] arcsec annulus around each CSC source.
# The mask reproduces pd.cut(sep_csc, [10, 50]) (left-open, right-closed).
# Counting explicitly and reindexing with fill_value=0 guarantees that CSC
# sources with an empty annulus get a count of 0, independent of the groupby
# `observed` default for categorical groupers.
in_annulus = (desi_csc['sep_csc'] > 10) & (desi_csc['sep_csc'] <= 50)
csc['desi_10_50'] = (
    desi_csc.loc[in_annulus]
    .groupby('name_csc')
    .size()
    .reindex(csc.index, fill_value=0)
)

# Source density (per deg^2) in the 10-50'' annulus.
csc['src_dens_deg2_10_50'] = csc['desi_10_50'] / annuli_area_deg2(10, 50)

# Drop sources with zero density in the annulus (r_false would divide by zero).
csc = csc.loc[csc['src_dens_deg2_10_50'] != 0].copy()

# r_false for the 10-50'' annulus density; the helper columns are then removed
# and name_csc is turned back into a regular column.
csc['r_false_003_dens_10_50'] = r_false(csc['src_dens_deg2_10_50'])
csc = csc.drop(columns=['desi_10_50', 'src_dens_deg2_10_50']).reset_index()

# Propagate r_false to every matched row. The default inner join also removes
# the rows of the CSC sources dropped above.
desi_csc = desi_csc.merge(csc[['name_csc', 'r_false_003_dens_10_50']], on='name_csc')



In [9]:
# For every CSC source, count the DESI sources lying within its r_false.
n_within_r_false = (
    desi_csc
    .query('sep_csc <= r_false_003_dens_10_50')
    .groupby('name_csc')['sep_csc']
    .count()
)
# Distribution of that multiplicity over CSC sources.
print(n_within_r_false.value_counts())

# Attach the count to every matched row as sep_less_r_false. This is an inner
# merge, so rows of CSC sources with no DESI source inside r_false are dropped.
desi_csc = desi_csc.merge(n_within_r_false.to_frame('sep_less_r_false'), on='name_csc')

1    77254
2     5714
3       81
4       10
5        1
Name: sep_csc, dtype: int64


## Counterpart  assignment

In [10]:
# A row is a counterpart candidate when the DESI source
#   * lies within the false-association radius of the CSC source,
#   * is the only DESI source within that radius, and
#   * lies strictly inside the CSC r_98 positional error.
within_r_false = desi_csc['sep_csc'] <= desi_csc['r_false_003_dens_10_50']
single_within_r_false = desi_csc['sep_less_r_false'] == 1
within_r_98 = desi_csc['sep_csc'] < desi_csc['r_98_csc']
desi_csc_ctps = desi_csc[within_r_false & single_within_r_false & within_r_98]

In [12]:
# A DESI object can be selected as the counterpart of more than one CSC source;
# keep a single row per desi_id.
# NOTE(review): drop_duplicates keeps the first row in the current row order,
# which is not necessarily the closest match — confirm this is intended.
n_duplicated_desi_id = desi_csc_ctps.duplicated(subset='desi_id').sum()
print('delete ', n_duplicated_desi_id, ' duplicated desi_id')

# assign() returns a new frame, so the label column is not written onto a
# filtered slice (which raises SettingWithCopyWarning).
desi_csc_ctps = desi_csc_ctps.drop_duplicates(subset='desi_id').assign(is_counterpart=True)
print('Final number of counterparts:', len(desi_csc_ctps))

# Duplicates would show up here if left untreated.
assert desi_csc_ctps['desi_id'].is_unique

delete  22  duplicated desi_id
Final number of counterparts: 77111


## Field source assignment

In [13]:
# Candidate field sources: DESI objects between 10 and 50 arcsec (both bounds
# inclusive) from a CSC source.
in_field_annulus = desi_csc['sep_csc'].between(10, 50)
desi_csc_field = desi_csc[in_field_annulus]
print('number of possible field sources: ', len(desi_csc_field))

number of possible field sources:  4945706


In [14]:
# Keep only DESI objects that appear in the field annulus of exactly one CSC
# source.
# NOTE(review): uniqueness is counted among the rows of desi_csc_field only, so
# an object that is also a counterpart of another CSC source is not excluded
# here — confirm whether counterpart desi_ids should be removed as well.
n_csc_per_desi_id = desi_csc_field.groupby('desi_id')['name_csc'].count()
clear_field_ids = n_csc_per_desi_id.index[n_csc_per_desi_id == 1]

fraction_to_retain = 0.1
# Fixed seed: without it every run draws a different sub-sample and the saved
# training catalog cannot be reproduced.
field_sample_seed = 42

desi_csc_field = (
    desi_csc_field[desi_csc_field['desi_id'].isin(clear_field_ids)]
    .sample(frac=fraction_to_retain, random_state=field_sample_seed)
    # Label as non-counterparts and blank the X-ray columns for field sources.
    .assign(is_counterpart=False, flux_csc_05_2=np.nan, r_98_csc=np.nan)
)

print(f'Final number of secure field sources  ({fraction_to_retain} of it): ', len(desi_csc_field))

Final number of secure field sources  (0.1 of it):  326948


## Saving the combined catalogs

In [1]:
# Stack counterparts (is_counterpart=True) and field sources (is_counterpart=False)
# into the training catalog.
desi_csc_training = pd.concat([desi_csc_ctps, desi_csc_field])
# Save the catalog as a gzip-compressed pickle. The '/' separator matches how
# the input paths are built from data_path; without it the file name is glued
# onto the directory name. The file name (including its spelling) is kept
# unchanged so that existing readers still find it.
desi_csc_training.to_pickle(data_path + '/csc_desi_r50_gaia_traning.gz_pkl', compression='gzip')

NameError: name 'pd' is not defined