In [1]:
import pickle
import os
from copy import copy

import pandas as pd
import numpy as np
from astropy.table import Table
from tqdm import tqdm
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn
from sklearn.neighbors import KDTree

In [2]:
def angular_dist(x, y):
    ra_01, dec_01 = x[0], x[1]
    ra_02, dec_02 = y[0], y[1]
    ra_01 = ra_01 * np.pi / 180
    ra_02 = ra_02 * np.pi / 180
    dec_01 = dec_01 * np.pi / 180
    dec_02 = dec_02 * np.pi / 180
    return np.arccos(np.minimum(
        np.sin(dec_01) * np.sin(dec_02) +
        np.cos(dec_01) * np.cos(dec_02) * np.cos(ra_01 - ra_02),
        1
    )) * 180 / (np.pi)

def correlate(d1, d2):
    kd_tree = KDTree(d1)
    d, idx = kd_tree.query(d2, k=20)

    a = []
    for i in tqdm(range(idx.shape[0])):
        min_dist = 10000
        min_id = None
        for id_ in idx[i]:
            dist = angular_dist(d2[i], d1[id_])
            if dist < min_dist:
                min_dist = dist
                min_id = id_
        if min_dist < 2/3600:
            a.append([min_id, min_dist, i])
        else:
            a.append([min_id, min_dist, i])
    return np.array(a)

In [3]:
data_dir = 'data'
with open(f'{data_dir}/features_sdssdr16+psdr2+all_deacls8tr_QSO+GALAXY_20201212133711.pkl', 'rb') as f:
    features_list = pickle.load(f)
models_cols = ['max19_z', 'conf19_z', 'max21_z', 'conf21_z', 'max22_z', 'conf22_z', 'max35_z', 'conf35_z']

In [4]:
t = Table.read(f'{data_dir}/dr16q_prop_Sep08_2022.fits')
print(t.colnames)
dr16q_prop = t['OBJID', 'RA', 'DEC', 'Z_DR16Q', 'Z_SYS', 'Z_SYS_ERR'].to_pandas()
dr16q_prop.rename(columns={'Z_DR16Q': 'Z'}, inplace=True)
dr16q_prop['dr16q'] = [True] * dr16q_prop.shape[0]
dr16q_prop['train20'] = [False] * dr16q_prop.shape[0]
dr16q_prop['superset'] = [False] * dr16q_prop.shape[0]
for fea in features_list:
    dr16q_prop[fea] = [None] * dr16q_prop.shape[0]
for i in [19, 21, 22, 35]:
    dr16q_prop[f'max{i}_z'] = [None] * dr16q_prop.shape[0]
    dr16q_prop[f'conf{i}_z'] = [None] * dr16q_prop.shape[0]
dr16q_prop.sort_values(by=['RA'], inplace=True)
dr16q_prop.drop_duplicates(subset=['RA', 'DEC'], inplace=True, keep='last')
dr16q_prop.reset_index(drop=True, inplace=True)
dr16q_prop.insert(6, 'Z_SYS_DIFF', dr16q_prop['Z_SYS'] - dr16q_prop['Z'])
dr16q_prop.insert(7, 'Z_SYS_DIFF_ABS', np.abs(dr16q_prop['Z_SYS'] - dr16q_prop['Z']))
dr16q_prop.insert(9, 'dr16q_prop', (dr16q_prop['Z_SYS_DIFF_ABS'] >= 0.1))
dr16q_prop



['SDSS_NAME', 'PLATE', 'MJD', 'FIBERID', 'RA', 'DEC', 'OBJID', 'IF_BOSS_SDSS', 'Z_DR16Q', 'SOURCE_Z_DR16Q', 'Z_FIT', 'Z_SYS', 'Z_SYS_ERR', 'EBV', 'SN_MEDIAN_ALL', 'CONTI_FIT', 'CONTI_FIT_ERR', 'CONTI_STAT', 'FEII_UV', 'FEII_UV_ERR', 'FEII_UV_EW', 'FEII_UV_EW_ERR', 'FEII_OPT', 'FEII_OPT_ERR', 'FEII_OPT_EW', 'FEII_OPT_EW_ERR', 'LOGL1350', 'LOGL1350_ERR', 'LOGL1700', 'LOGL1700_ERR', 'LOGL3000', 'LOGL3000_ERR', 'LOGL5100', 'LOGL5100_ERR', 'HALPHA', 'HALPHA_BR', 'NII6585', 'SII6718', 'HBETA', 'HBETA_BR', 'HEII4687', 'HEII4687_BR', 'OIII5007', 'OIII5007C', 'CAII3934', 'OII3728', 'NEV3426', 'MGII', 'MGII_BR', 'CIII_ALL', 'CIII_BR', 'SIIII1892', 'ALIII1857', 'NIII1750', 'CIV', 'HEII1640', 'HEII1640_BR', 'SIIV_OIV', 'OI1304', 'LYA', 'NV1240', 'HALPHA_ERR', 'HALPHA_BR_ERR', 'NII6585_ERR', 'SII6718_ERR', 'HBETA_ERR', 'HBETA_BR_ERR', 'HEII4687_ERR', 'HEII4687_BR_ERR', 'OIII5007_ERR', 'OIII5007C_ERR', 'CAII3934_ERR', 'OII3728_ERR', 'NEV3426_ERR', 'MGII_ERR', 'MGII_BR_ERR', 'CIII_ALL_ERR', 'CIII_BR_

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,b'7750-58402-0802',0.000629,35.517841,0.845435,0.846867,0.000704,0.001432,0.001432,True,False,...,,,,,,,,,,
1,b'7749-58073-0660',0.001415,31.057048,2.035491,2.040685,0.002466,0.005194,0.005194,True,False,...,,,,,,,,,,
2,b'7695-57654-0565',0.001526,27.732283,1.770552,1.774080,0.002622,0.003528,0.003528,True,False,...,,,,,,,,,,
3,b'11279-58449-0978',0.001535,7.064129,1.574227,1.575559,0.002725,0.001331,0.001331,True,False,...,,,,,,,,,,
4,b'7596-56945-0162',0.001898,17.773739,2.309000,2.310261,0.003747,0.001261,0.001261,True,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750409,b'7134-56566-0408',359.999118,28.954734,2.452000,2.451400,0.005250,-0.000600,0.000600,True,False,...,,,,,,,,,,
750410,b'7145-56567-0240',359.999303,34.720842,3.109000,3.112038,0.006417,0.003038,0.003038,True,False,...,,,,,,,,,,
750411,b'8741-57390-0060',359.999615,3.268586,1.232962,1.233783,0.000758,0.000821,0.000821,True,False,...,,,,,,,,,,
750412,b'7595-56957-0259',359.999759,20.721079,2.009865,2.013729,0.002790,0.003864,0.003864,True,False,...,,,,,,,,,,


In [30]:
d_z = dr16q_prop['Z_SYS_DIFF_ABS']
d_v = 300000 * d_z / (1 + dr16q_prop['Z_SYS'])

In [34]:
((d_z > 0.1) & (d_v > 10000)).sum()

1933

In [36]:
(d_v > 10000).sum()

1943

In [38]:
(d_z > 0.1).sum()

1941

In [5]:
dfs = []
for i in range(32):
    num = str(i).zfill(5)
    feas = pd.read_pickle(
        f'{data_dir}/22_DR16Q_v4-wo_20_train/part-{num}.features.gz_pkl',
        compression='gzip'
    )[['RA', 'DEC', 'Z'] + features_list]
    preds = pd.read_pickle(
        f'{data_dir}/22_DR16Q_v4-wo_20_train/part-{num}.predictions.x1a.gz_pkl',
        compression='gzip'
    )[['zoo_x1a19_z_max', 'zoo_x1a19_z_maxConf', 
       'zoo_x1a21_z_max', 'zoo_x1a21_z_maxConf', 
       'zoo_x1a22_z_max', 'zoo_x1a22_z_maxConf', 
       'zoo_x1a35_z_max', 'zoo_x1a35_z_maxConf']]
    df = feas.merge(preds, how='right', left_index=True, right_index=True)
    dfs.append(df)
dr16q = pd.concat(dfs, axis=0)
dr16q.rename(
    columns={
        'zoo_x1a19_z_max': 'max19_z', 'zoo_x1a19_z_maxConf': 'conf19_z',
        'zoo_x1a21_z_max': 'max21_z', 'zoo_x1a21_z_maxConf': 'conf21_z',
        'zoo_x1a22_z_max': 'max22_z', 'zoo_x1a22_z_maxConf': 'conf22_z',
        'zoo_x1a35_z_max': 'max35_z', 'zoo_x1a35_z_maxConf': 'conf35_z',
    }, inplace=True)
dr16q.sort_values(by=['RA'], inplace=True)
dr16q.drop_duplicates(subset=['RA', 'DEC'], inplace=True, keep='last')
dr16q.reset_index(drop=True, inplace=True)
dr16q.replace(np.nan, None, inplace=True)
dr16q

Unnamed: 0,RA,DEC,Z,sdssdr16_u_psf,sdssdr16_g_psf,sdssdr16_r_psf,sdssdr16_i_psf,sdssdr16_z_psf,sdssdr16_u_cmodel,sdssdr16_i_cmodel,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,0.001415,31.057048,2.035491,22.07464,21.787479,21.562507,21.360414,20.933499,21.817585,21.363539,...,-0.116084,-0.320602,1.735292,0.307693,1.934331,0.433418,1.855865,0.346749,2.028873,0.274436
1,0.001526,27.732283,1.770552,22.330387,21.903107,21.751831,21.664443,22.048195,21.924103,21.556177,...,-0.524986,-2.353988,1.709753,0.466287,1.730614,0.691206,1.752056,0.689963,1.682694,0.613794
2,0.001535,7.064129,1.574227,22.498194,22.103252,21.854685,21.638759,21.964469,21.761538,21.574824,...,-0.070024,-0.605396,1.445,0.45677,1.701,0.37172,1.627,0.446391,1.613,0.561226
3,0.001914,9.385637,2.024146,18.76583,18.661867,18.499508,18.335921,18.15841,18.774551,18.369465,...,0.08355,0.047153,,,,,2.121,0.638553,,
4,0.001978,-0.451088,0.250000,21.672738,21.193839,20.508787,20.15317,19.87858,21.432634,19.721854,...,0.009057,-0.250328,0.317354,0.441683,0.306038,0.479988,0.32021,0.185194,0.3508,0.702891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306735,359.997573,-9.826069,2.439000,22.488502,21.00742,21.059175,21.065099,20.417605,19.705819,21.006593,...,-0.469388,-0.363081,2.468,0.704687,2.459247,0.745153,2.232917,0.616561,2.522,0.733557
306736,359.997675,0.144870,1.356595,20.45271,20.535363,20.30579,20.287443,20.298002,20.508662,20.289982,...,-0.191817,-0.268319,1.110535,0.417104,1.178801,0.686697,1.102664,0.727626,1.175745,0.7415
306737,359.997704,10.564075,1.284287,20.054219,19.80636,19.387776,19.168335,19.006462,19.925669,19.138338,...,0.162244,0.082854,1.4569,0.784296,1.5509,0.877846,1.457419,0.719128,1.451017,0.844344
306738,359.998520,-0.655884,1.355954,22.055338,22.506995,21.409914,21.74893,21.66175,21.947964,21.504402,...,0.314801,-0.326978,0.854488,0.214549,1.107337,0.271965,1.096616,0.451444,1.078204,0.492389


In [6]:
c = correlate(dr16q_prop[['RA', 'DEC']].values, dr16q[['RA', 'DEC']].values)
np.unique(c[:,0].astype(int)).shape

100%|██████████| 306740/306740 [01:02<00:00, 4922.06it/s]


(306740,)

In [7]:
print(c[:,0].astype(int))
print(c[:, 1].max())
np.unique(c[:,0].astype(int)).shape

[     1      2      3 ... 750402 750403 750404]
2.0913097891518726e-06


(306740,)

In [8]:
full_01 = dr16q_prop.copy()
full_01.loc[c[:, 0].astype(int), features_list + models_cols] = dr16q[features_list + models_cols].values
full_01

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,b'7750-58402-0802',0.000629,35.517841,0.845435,0.846867,0.000704,0.001432,0.001432,True,False,...,,,,,,,,,,
1,b'7749-58073-0660',0.001415,31.057048,2.035491,2.040685,0.002466,0.005194,0.005194,True,False,...,-0.116084,-0.320602,1.735292,0.307693,1.934331,0.433418,1.855865,0.346749,2.028873,0.274436
2,b'7695-57654-0565',0.001526,27.732283,1.770552,1.774080,0.002622,0.003528,0.003528,True,False,...,-0.524986,-2.353988,1.709753,0.466287,1.730614,0.691206,1.752056,0.689963,1.682694,0.613794
3,b'11279-58449-0978',0.001535,7.064129,1.574227,1.575559,0.002725,0.001331,0.001331,True,False,...,-0.070024,-0.605396,1.445,0.45677,1.701,0.37172,1.627,0.446391,1.613,0.561226
4,b'7596-56945-0162',0.001898,17.773739,2.309000,2.310261,0.003747,0.001261,0.001261,True,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750409,b'7134-56566-0408',359.999118,28.954734,2.452000,2.451400,0.005250,-0.000600,0.000600,True,False,...,,,,,,,,,,
750410,b'7145-56567-0240',359.999303,34.720842,3.109000,3.112038,0.006417,0.003038,0.003038,True,False,...,,,,,,,,,,
750411,b'8741-57390-0060',359.999615,3.268586,1.232962,1.233783,0.000758,0.000821,0.000821,True,False,...,,,,,,,,,,
750412,b'7595-56957-0259',359.999759,20.721079,2.009865,2.013729,0.002790,0.003864,0.003864,True,False,...,,,,,,,,,,


In [9]:
(~full_01['max22_z'].isna()).sum()

304815

In [10]:
dfs = []

feas = pd.read_pickle(
    f'{data_dir}/20_2-fold-cv/cv2_0/part-00000.features.gz_pkl',
    compression='gzip'
)[['ra', 'dec', 'zspec'] + features_list]
preds = pd.read_pickle(
    f'{data_dir}/20_2-fold-cv/cv2_0/part-00000.predictions.x1cv2_0.gz_pkl',
    compression='gzip'
)[[
    'zoo_x1cv2_019_z_max', 'zoo_x1cv2_019_z_maxConf',
    'zoo_x1cv2_021_z_max', 'zoo_x1cv2_021_z_maxConf',
    'zoo_x1cv2_022_z_max', 'zoo_x1cv2_022_z_maxConf',
    'zoo_x1cv2_035_z_max', 'zoo_x1cv2_035_z_maxConf'
]]
df = feas.merge(preds, how='right', left_index=True, right_index=True)
df.columns = copy(dr16q.columns)
dfs.append(df)

feas = pd.read_pickle(
    f'{data_dir}/20_2-fold-cv/cv2_1/part-00000.features.gz_pkl',
    compression='gzip'
)[['ra', 'dec', 'zspec'] + features_list]
preds = pd.read_pickle(
    f'{data_dir}/20_2-fold-cv/cv2_1/part-00000.predictions.x1cv2_1.gz_pkl',
    compression='gzip'
)[[
    'zoo_x1cv2_119_z_max', 'zoo_x1cv2_119_z_maxConf',
    'zoo_x1cv2_121_z_max', 'zoo_x1cv2_121_z_maxConf',
    'zoo_x1cv2_122_z_max', 'zoo_x1cv2_122_z_maxConf',
    'zoo_x1cv2_135_z_max', 'zoo_x1cv2_135_z_maxConf'
]]
df = feas.merge(preds, how='right', left_index=True, right_index=True)
df.columns = copy(dr16q.columns)
dfs.append(df)

train_20 = pd.concat(dfs, axis=0)
train_20.sort_values(by=['RA'], inplace=True)
train_20.drop_duplicates(subset=['RA', 'DEC'], inplace=True, keep='last')
train_20.reset_index(drop=True, inplace=True)
train_20.replace(np.nan, None, inplace=True)
train_20

Unnamed: 0,RA,DEC,Z,sdssdr16_u_psf,sdssdr16_g_psf,sdssdr16_r_psf,sdssdr16_i_psf,sdssdr16_z_psf,sdssdr16_u_cmodel,sdssdr16_i_cmodel,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,0.000722,11.343983,0.442757,23.3769,22.425905,20.79993,20.031074,19.62268,20.226514,19.305746,...,0.264465,0.434388,0.474689,0.928341,0.458632,0.936633,0.419659,0.636504,0.447622,0.961682
1,0.001417,18.492306,0.629820,22.113601,22.968215,21.524378,20.363373,19.827644,22.126405,19.405817,...,0.2447,-0.151581,0.599155,0.926998,0.647224,0.999999,0.708078,0.910233,0.604697,0.955222
2,0.001885,17.773712,2.309000,22.265783,21.813904,21.999912,21.87019,21.266889,22.34359,21.853381,...,0.450456,-0.287098,2.322151,0.312809,2.300145,0.422922,2.368,0.576355,2.121082,0.503914
3,0.002416,5.941882,2.103120,22.059562,21.434419,21.211019,21.184635,20.688257,22.046778,21.198729,...,-0.265694,-0.54398,0.846,0.331083,0.819,0.382401,0.916635,0.653894,0.801,0.485579
4,0.002769,14.974691,2.497000,21.761608,21.095956,20.797531,20.715893,20.589152,21.633194,20.676625,...,0.161842,-0.086858,2.32,0.558577,2.4825,0.664471,2.625,0.634396,2.404,0.824371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580451,359.998949,10.116818,2.415000,23.056658,21.692741,21.373058,21.15195,21.260001,21.525019,21.153138,...,-0.220069,0.502369,2.343,0.44883,2.362,0.289139,2.62,0.275886,2.598,0.508071
580452,359.999026,24.413551,1.490065,20.921111,20.732991,20.529058,20.355686,20.119849,20.765828,20.362367,...,-0.267949,-0.534182,1.696801,0.276569,1.516,0.353289,1.471,0.272345,1.519121,0.532895
580453,359.999121,28.954727,2.452000,21.876842,21.168578,21.233727,21.235761,20.713764,21.639407,21.276328,...,0.037796,-0.513353,2.36,0.36883,2.349,0.583509,2.416101,0.459684,2.333,0.82793
580454,359.999634,3.268618,1.233161,18.791953,18.815835,18.557703,18.614296,18.673656,18.798606,18.611784,...,-0.470473,-0.625142,1.261,0.76583,1.2648,0.80364,1.163,0.854128,1.319833,0.751813


In [11]:
c = correlate(dr16q_prop[['RA', 'DEC']].values, train_20[['RA', 'DEC']].values)
np.unique(c[:,0].astype(int)).shape

100%|██████████| 580456/580456 [01:59<00:00, 4854.90it/s]


(469499,)

In [12]:
(c[:,1] <= 1/3600).sum(), np.unique(c[c[:,1] <= 1/3600,0].astype(int)).shape

(437833, (437473,))

In [13]:
full_02 = full_01.copy()
full_02.loc[c[c[:,1] <= 1/3600, 0].astype(int), features_list + models_cols] = train_20.loc[c[c[:,1] <= 1/3600, 2].astype(int), features_list + models_cols].values
full_02.loc[c[c[:,1] <= 1/3600, 0].astype(int), 'train20'] = True
full_02

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,b'7750-58402-0802',0.000629,35.517841,0.845435,0.846867,0.000704,0.001432,0.001432,True,False,...,,,,,,,,,,
1,b'7749-58073-0660',0.001415,31.057048,2.035491,2.040685,0.002466,0.005194,0.005194,True,False,...,-0.116084,-0.320602,1.735292,0.307693,1.934331,0.433418,1.855865,0.346749,2.028873,0.274436
2,b'7695-57654-0565',0.001526,27.732283,1.770552,1.774080,0.002622,0.003528,0.003528,True,False,...,-0.524986,-2.353988,1.709753,0.466287,1.730614,0.691206,1.752056,0.689963,1.682694,0.613794
3,b'11279-58449-0978',0.001535,7.064129,1.574227,1.575559,0.002725,0.001331,0.001331,True,False,...,-0.070024,-0.605396,1.445,0.45677,1.701,0.37172,1.627,0.446391,1.613,0.561226
4,b'7596-56945-0162',0.001898,17.773739,2.309000,2.310261,0.003747,0.001261,0.001261,True,False,...,0.450456,-0.287098,2.322151,0.312809,2.300145,0.422922,2.368,0.576355,2.121082,0.503914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750409,b'7134-56566-0408',359.999118,28.954734,2.452000,2.451400,0.005250,-0.000600,0.000600,True,False,...,0.037796,-0.513353,2.36,0.36883,2.349,0.583509,2.416101,0.459684,2.333,0.82793
750410,b'7145-56567-0240',359.999303,34.720842,3.109000,3.112038,0.006417,0.003038,0.003038,True,False,...,,,,,,,,,,
750411,b'8741-57390-0060',359.999615,3.268586,1.232962,1.233783,0.000758,0.000821,0.000821,True,False,...,-0.470473,-0.625142,1.261,0.76583,1.2648,0.80364,1.163,0.854128,1.319833,0.751813
750412,b'7595-56957-0259',359.999759,20.721079,2.009865,2.013729,0.002790,0.003864,0.003864,True,False,...,0.155895,-0.028731,2.187629,0.706492,2.203455,0.461592,2.204818,0.332097,1.956595,0.967376


In [14]:
train_20_out = train_20[c[:,1] > 1/3600]
train_20_out.reset_index(drop=True, inplace=True)
train_20_out.insert(0, 'OBJID', None)
train_20_out.insert(4, 'Z_SYS', None)
train_20_out.insert(5, 'Z_SYS_ERR', None)
train_20_out.insert(6, 'Z_SYS_DIFF', None)
train_20_out.insert(7, 'Z_SYS_DIFF_ABS', None)
train_20_out.insert(8, 'dr16q', False)
train_20_out.insert(9, 'dr16q_prop', False)
train_20_out.insert(10, 'train20', True)
train_20_out.insert(11, 'superset', False)
train_20_out

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,,0.000722,11.343983,0.442757,,,,,False,False,...,0.264465,0.434388,0.474689,0.928341,0.458632,0.936633,0.419659,0.636504,0.447622,0.961682
1,,0.001417,18.492306,0.629820,,,,,False,False,...,0.2447,-0.151581,0.599155,0.926998,0.647224,0.999999,0.708078,0.910233,0.604697,0.955222
2,,0.003054,22.654332,0.687130,,,,,False,False,...,-0.243831,0.157843,0.634992,0.9715,0.661076,0.993357,0.63845,0.882298,0.637401,0.97876
3,,0.004471,-2.661104,0.459787,,,,,False,False,...,0.288692,-0.03791,0.443924,0.983014,0.431312,0.959338,0.434016,0.884785,0.447185,0.959962
4,,0.007367,19.547288,0.090489,,,,,False,False,...,-0.006464,-0.041948,0.204093,0.955007,0.156124,0.886952,0.127543,0.96279,0.157698,0.860703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142618,,359.989649,17.861844,1.146000,,,,,False,False,...,-0.357822,-0.564539,1.147,0.272866,1.369694,0.33023,1.423635,0.318768,1.199739,0.469563
142619,,359.989956,5.066145,0.334862,,,,,False,False,...,1.135929,-0.26704,0.272929,0.23963,0.290426,0.349578,0.235084,0.294244,0.165135,0.131182
142620,,359.992068,18.476645,1.283000,,,,,False,False,...,0.079747,-0.068443,1.198504,0.335637,1.351848,0.590475,1.317722,0.490772,1.221652,0.570182
142621,,359.994125,3.739318,0.483395,,,,,False,False,...,-0.411984,0.08965,0.529025,0.913628,0.505969,0.947357,0.495814,0.878827,0.507,0.983596


In [15]:
full_03 = pd.concat((full_02, train_20_out), axis=0)
full_03.reset_index(drop=True, inplace=True)
full_03.replace(np.nan, None, inplace=True)
full_03

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,b'7750-58402-0802',0.000629,35.517841,0.845435,0.846867,0.000704,0.001432,0.001432,True,False,...,,,,,,,,,,
1,b'7749-58073-0660',0.001415,31.057048,2.035491,2.040685,0.002466,0.005194,0.005194,True,False,...,-0.116084,-0.320602,1.735292,0.307693,1.934331,0.433418,1.855865,0.346749,2.028873,0.274436
2,b'7695-57654-0565',0.001526,27.732283,1.770552,1.77408,0.002622,0.003528,0.003528,True,False,...,-0.524986,-2.353988,1.709753,0.466287,1.730614,0.691206,1.752056,0.689963,1.682694,0.613794
3,b'11279-58449-0978',0.001535,7.064129,1.574227,1.575559,0.002725,0.001331,0.001331,True,False,...,-0.070024,-0.605396,1.445,0.45677,1.701,0.37172,1.627,0.446391,1.613,0.561226
4,b'7596-56945-0162',0.001898,17.773739,2.309000,2.310261,0.003747,0.001261,0.001261,True,False,...,0.450456,-0.287098,2.322151,0.312809,2.300145,0.422922,2.368,0.576355,2.121082,0.503914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893032,,359.989649,17.861844,1.146000,,,,,False,False,...,-0.357822,-0.564539,1.147,0.272866,1.369694,0.33023,1.423635,0.318768,1.199739,0.469563
893033,,359.989956,5.066145,0.334862,,,,,False,False,...,1.135929,-0.26704,0.272929,0.23963,0.290426,0.349578,0.235084,0.294244,0.165135,0.131182
893034,,359.992068,18.476645,1.283000,,,,,False,False,...,0.079747,-0.068443,1.198504,0.335637,1.351848,0.590475,1.317722,0.490772,1.221652,0.570182
893035,,359.994125,3.739318,0.483395,,,,,False,False,...,-0.411984,0.08965,0.529025,0.913628,0.505969,0.947357,0.495814,0.878827,0.507,0.983596


In [16]:
superset = pd.read_csv(f'{data_dir}/proc_superset.csv')
superset.drop(columns='SOURCE', inplace=True)
superset.reset_index(drop=True, inplace=True)
superset.insert(0, 'OBJID', None)
superset.insert(4, 'Z_SYS', None)
superset.insert(5, 'Z_SYS_ERR', None)
superset.insert(6, 'Z_SYS_DIFF', None)
superset.insert(7, 'Z_SYS_DIFF_ABS', None)
superset.insert(8, 'dr16q', False)
superset.insert(9, 'dr16q_prop', False)
superset.insert(10, 'train20', False)
superset.insert(11, 'superset', True)
superset.insert(74, 'max19_z', None)
superset.insert(75, 'conf19_z', None)
superset.insert(76, 'max21_z', None)
superset.insert(77, 'conf21_z', None)
superset.insert(78, 'max22_z', None)
superset.insert(79, 'conf22_z', None)
superset.rename(
    columns={
        'zoo_x1a35_z_max': 'max35_z', 'zoo_x1a35_z_maxConf': 'conf35_z'
    }, inplace=True)
superset

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,conf19_z,max21_z,conf21_z,max22_z,conf22_z,sdssdr16_g_cmodel-decals8tr_g,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max35_z,conf35_z
0,,138.718517,56.280226,-1.000000,,,,,False,False,...,,,,,,0.312401,-2.771111,-0.349603,0.780652,0.423525
1,,119.299997,22.527369,-0.011447,,,,,False,False,...,,,,,,-0.089592,-0.129546,-0.363575,3.359000,0.637353
2,,262.134784,36.734522,-0.011332,,,,,False,False,...,,,,,,0.034192,0.138233,-0.130191,3.164000,0.500935
3,,327.439561,-0.195568,-0.011123,,,,,False,False,...,,,,,,0.176945,0.110534,-1.779119,3.139719,0.272778
4,,217.129032,15.540025,-0.010959,,,,,False,False,...,,,,,,-0.077542,0.006412,0.055268,3.412000,0.350389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365341,,201.934129,12.271040,7.021599,,,,,False,False,...,,,,,,-0.196749,0.066927,-0.193626,2.900859,0.345879
365342,,36.002095,-0.394057,7.027864,,,,,False,False,...,,,,,,-0.259824,0.056779,-0.548422,2.023643,0.290574
365343,,179.515381,45.946690,7.029899,,,,,False,False,...,,,,,,-0.826703,-0.221706,-0.515543,0.735022,0.589784
365344,,0.306355,6.436817,7.030458,,,,,False,False,...,,,,,,0.102827,0.176069,-0.434187,0.373313,0.168273


In [17]:
full_04 = pd.concat((full_03, superset), axis=0)
full_04.reset_index(drop=True, inplace=True)
full_04.replace(np.nan, None, inplace=True)
full_04

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
0,b'7750-58402-0802',0.000629,35.517841,0.845435,0.846867,0.000704,0.001432,0.001432,True,False,...,,,,,,,,,,
1,b'7749-58073-0660',0.001415,31.057048,2.035491,2.040685,0.002466,0.005194,0.005194,True,False,...,-0.116084,-0.320602,1.735292,0.307693,1.934331,0.433418,1.855865,0.346749,2.028873,0.274436
2,b'7695-57654-0565',0.001526,27.732283,1.770552,1.77408,0.002622,0.003528,0.003528,True,False,...,-0.524986,-2.353988,1.709753,0.466287,1.730614,0.691206,1.752056,0.689963,1.682694,0.613794
3,b'11279-58449-0978',0.001535,7.064129,1.574227,1.575559,0.002725,0.001331,0.001331,True,False,...,-0.070024,-0.605396,1.445,0.45677,1.701,0.37172,1.627,0.446391,1.613,0.561226
4,b'7596-56945-0162',0.001898,17.773739,2.309000,2.310261,0.003747,0.001261,0.001261,True,False,...,0.450456,-0.287098,2.322151,0.312809,2.300145,0.422922,2.368,0.576355,2.121082,0.503914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1258378,,201.934129,12.271040,7.021599,,,,,False,False,...,0.066927,-0.193626,,,,,,,2.900859,0.345879
1258379,,36.002095,-0.394057,7.027864,,,,,False,False,...,0.056779,-0.548422,,,,,,,2.023643,0.290574
1258380,,179.515381,45.946690,7.029899,,,,,False,False,...,-0.221706,-0.515543,,,,,,,0.735022,0.589784
1258381,,0.306355,6.436817,7.030458,,,,,False,False,...,0.176069,-0.434187,,,,,,,0.373313,0.168273


In [18]:
full_04.to_csv(f'{data_dir}/full_dr16q_train20_superset.csv', index=False)

In [19]:
full_04[(full_04['train20']) & (full_04['dr16q_prop'])]

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
714,b'7666-57339-0127',0.182396,24.708938,2.812042,2.055525,0.002691,-0.756517,0.756517,True,True,...,-0.156049,-0.017781,1.979535,0.597602,2.259843,0.435133,2.068,0.623723,2.120766,0.951724
5570,b'8740-57367-0008',1.489409,5.237260,4.804017,1.532229,0.001289,-3.271787,3.271787,True,True,...,0.003243,-0.166998,1.765,0.332651,1.815977,0.269669,2.625556,0.258568,1.710972,0.389368
12522,b'6279-56243-0538',3.435045,24.955056,2.405000,1.676462,0.003814,-0.728538,0.728538,True,True,...,-0.737713,-0.883388,2.205914,0.498672,2.207,0.471103,2.384836,0.402826,2.198,0.687996
12695,b'7169-56628-0800',3.488202,-8.559648,4.193000,1.265264,0.000869,-2.927736,2.927736,True,True,...,0.251942,-0.670408,1.231,0.653004,1.261381,0.550629,1.288588,0.243799,1.214,0.644671
13690,b'6184-56267-0488',3.794226,13.819518,3.215000,0.851655,0.001316,-2.363345,2.363345,True,True,...,0.244954,0.15973,0.703,0.565905,0.858,0.688582,0.703263,0.62564,0.869711,0.514507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729559,b'6137-56270-0272',353.604729,15.931345,3.314000,2.413811,0.002817,-0.900189,0.900189,True,True,...,-0.5407,-0.524184,2.191159,0.478985,2.491,0.449666,2.363,0.543903,2.549,0.668483
740156,b'7599-56955-0230',357.162398,17.801085,0.199574,0.573994,0.000317,0.37442,0.37442,True,True,...,0.714635,0.596878,0.461306,0.487011,0.583893,0.341383,1.438288,0.525003,0.809147,0.260305
741105,b'8742-57364-0601',357.461297,6.252755,6.949000,1.533203,0.002761,-5.415797,5.415797,True,True,...,-0.02049,-0.187273,2.372,0.441826,2.095132,0.363373,1.55,0.523302,2.055263,0.320846
741550,b'7851-56932-0656',357.587732,-1.735379,5.299385,1.761067,0.002688,-3.538318,3.538318,True,True,...,0.466838,0.227166,2.874886,0.31232,2.134,0.232328,2.694445,0.269811,1.8341,0.383696


In [20]:
full_04[full_04['Z'] <= -1]

Unnamed: 0,OBJID,RA,DEC,Z,Z_SYS,Z_SYS_ERR,Z_SYS_DIFF,Z_SYS_DIFF_ABS,dr16q,dr16q_prop,...,sdssdr16_r_cmodel-decals8tr_r,sdssdr16_z_cmodel-decals8tr_z,max19_z,conf19_z,max21_z,conf21_z,max22_z,conf22_z,max35_z,conf35_z
164256,b'4501-55590-0178',120.31258,13.611725,-999.0,0.0,-1.0,999.0,999.0,True,True,...,-0.105338,-0.291012,,,,,0.392884,0.483584,,
336175,b'5356-55979-0876',164.698568,11.187737,-999.0,0.4,-1.0,999.4,999.4,True,True,...,0.312959,0.063554,0.467022,0.117475,0.472587,0.178183,0.770286,0.424594,0.659,0.337362
469936,b'8201-58146-0270',201.119434,57.27803,-999.0,1.077868,0.010941,1000.077868,1000.077868,True,True,...,0.021335,0.030715,,,,,0.023326,0.52859,,
628132,b'4190-55686-0354',249.214456,26.449127,-999.0,0.8,-1.0,999.8,999.8,True,True,...,-0.277899,-0.274186,0.623922,0.268944,0.7715,0.282915,0.2606,0.529289,0.874,0.209615
893037,,138.718517,56.280226,-1.0,,,,,False,False,...,-2.771111,-0.349603,,,,,,,0.780652,0.423525
