In [1]:
import pandas as pd
import numpy as np
import os
import time
from astropy.cosmology import Planck13
from astropy.io import fits
from astropy.table import Table
from tqdm import tqdm
%matplotlib inline
import matplotlib.pyplot as plt

import disperse

from sklearn.neighbors import KDTree

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
cosmo = Planck13
H0 = cosmo.H0.value
Om = cosmo.Om0
Ol = 0.69288
Ok = 0.0
print(f'H0 = {cosmo.H0.value}')
print(f'Om = {cosmo.Om0}')
print(f'Ol = {0.69288}')

H0 = 67.77
Om = 0.30712
Ol = 0.69288


In [3]:
np.random.seed(0)

In [4]:
gal_RA_int = (140, 260)
gal_DEC_int = (-10, 30)
gal_Z_int = (0, 1.0)

In [5]:
galaxies = pd.read_csv("ACT_galaxies.csv")
galaxies

Unnamed: 0,RA,DEC,Z
0,146.71421,-1.041304,0.021275
1,146.71421,-1.041304,0.021222
2,146.91945,-0.990492,0.213925
3,146.74413,-0.652191,0.203783
4,146.85983,-0.808902,0.126554
...,...,...,...
931855,146.13969,26.500868,0.903536
931856,146.20911,26.557960,0.628224
931857,146.09398,26.524402,0.845268
931858,146.32252,26.549261,0.058894


In [6]:
clusters = pd.read_csv("ACT_clusters.csv")
clusters

Unnamed: 0,RA,DEC,Z,M,R,type,CX,CY,CZ,ID,...,d_12.2,d_12.4,d_12.6,d_12.8,d_13.0,d_13.2,d_13.4,d_13.6,d_13.8,d_14.0
0,230.761495,8.587807,0.0352,1.676978,0,1,-96.597031,-118.277283,23.062053,0,...,6.432487,101.423067,101.423067,101.423067,101.423067,101.423067,101.423067,101.423067,101.423067,101.423067
1,230.452957,7.709549,0.0442,2.457475,0,1,-122.101803,-147.873793,25.960786,1,...,8.378063,135.818132,135.818132,135.818132,135.818132,135.818132,135.818132,135.818132,135.818132,135.818132
2,227.733561,5.744353,0.0766,11.002140,0,1,-222.703294,-245.035914,33.308899,2,...,1.570683,267.106804,267.106804,267.106804,267.106804,267.106804,267.106804,267.106804,267.106804,267.106804
3,208.253128,5.139146,0.0788,1.222567,0,1,-300.190348,-161.319215,30.649445,3,...,1.138946,223.929466,223.929466,223.929466,223.929466,223.929466,223.929466,223.929466,223.929466,223.929466
4,173.207731,14.465334,0.0832,2.935207,0,1,-346.994931,41.329156,90.147741,4,...,19.328642,127.612739,127.612739,127.612739,127.612739,127.612739,127.612739,127.612739,127.612739,127.612739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3514,195.258283,9.471381,0.9750,0.000000,0,0,-3174.029164,-865.832664,548.868644,3514,...,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508,1365.518508
3515,228.743762,0.269355,0.9750,0.000000,0,0,-2199.479254,-2507.474677,15.680420,3515,...,49.407946,49.407946,49.407946,49.407946,49.407946,49.407946,49.407946,49.407946,49.407946,49.407946
3516,209.909774,18.112089,0.9750,0.000000,0,0,-2747.967683,-1580.775317,1036.921951,3516,...,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791,1186.018791
3517,211.766151,11.274241,0.9750,0.000000,0,0,-2781.108471,-1722.086506,652.102646,3517,...,894.356227,894.356227,894.356227,894.356227,894.356227,894.356227,894.356227,894.356227,894.356227,894.356227


In [7]:
clusters_ext = pd.read_csv("ACT_clusters_ext.csv", engine='c')
clusters_ext

Unnamed: 0,ID,RA,DEC,Z,CX,CY,CZ,Z_true,M,R,...,d_12.2,d_12.4,d_12.6,d_12.8,d_13.0,d_13.2,d_13.4,d_13.6,d_13.8,d_14.0
0,0,230.761495,8.587807,0.002,-5.531095,-6.772495,1.320521,0.0352,1.676978,0.0,...,5.886773,5.886773,5.886773,5.886773,5.886773,5.886773,5.886773,5.886773,5.886773,5.886773
1,0,230.761495,8.587807,0.004,-11.057085,-13.538739,2.639823,0.0352,1.676978,0.0,...,11.181167,11.181167,11.181167,11.181167,11.181167,11.181167,11.181167,11.181167,11.181167,11.181167
2,0,230.761495,8.587807,0.006,-16.577964,-20.298725,3.957905,0.0352,1.676978,0.0,...,18.221260,18.221260,18.221260,18.221260,18.221260,18.221260,18.221260,18.221260,18.221260,18.221260
3,0,230.761495,8.587807,0.008,-22.093726,-27.052446,5.274765,0.0352,1.676978,0.0,...,23.982981,23.982981,23.982981,23.982981,23.982981,23.982981,23.982981,23.982981,23.982981,23.982981
4,0,230.761495,8.587807,0.010,-27.604365,-33.799893,6.590403,0.0352,1.676978,0.0,...,30.638206,30.638206,30.638206,30.638206,30.638206,30.638206,30.638206,30.638206,30.638206,30.638206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755976,3518,189.207861,10.112176,0.990,-3278.055129,-531.390651,592.261022,0.9750,0.000000,0.0,...,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962,1358.151962
1755977,3518,189.207861,10.112176,0.992,-3282.925090,-532.180099,593.140900,0.9750,0.000000,0.0,...,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233,1363.115233
1755978,3518,189.207861,10.112176,0.994,-3287.789349,-532.968621,594.019748,0.9750,0.000000,0.0,...,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043,1368.073043
1755979,3518,189.207861,10.112176,0.996,-3292.647914,-533.756221,594.897567,0.9750,0.000000,0.0,...,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395,1373.025395


In [8]:
clusters_f = clusters.copy()
MIN_M = 0
MAX_Z = 1.1

true_clusters = clusters_f[clusters_f['type'] == 1].copy()
true_clusters = true_clusters[(true_clusters['M'] >= MIN_M) & (true_clusters['Z'] <= MAX_Z)]

false_clusters = clusters_f[clusters_f['type'] == 0].copy()
false_clusters = false_clusters[false_clusters['Z'].isin(true_clusters['Z'])]

clusters_f = pd.concat((true_clusters, false_clusters))

clusters_ext_f = clusters_ext.copy()
clusters_ext_f = clusters_ext_f[(clusters_ext_f['ID'].isin(clusters_f['ID'])) & (clusters_ext_f['Z'] <= MAX_Z)]

In [9]:
TEST_RATIO = 0.1
select_step = int(1.01 // TEST_RATIO)

mask = np.array([True] * clusters_f.shape[0])
mask[::select_step] = False

train_clusters = clusters_f.loc[mask].copy()
train_clusters.reset_index(drop=True, inplace=True)
test_clusters = clusters_f.loc[~mask].copy()
test_clusters.reset_index(drop=True, inplace=True)

train_ids = train_clusters['ID'].values
test_ids = test_clusters['ID'].values

In [10]:
train_clusters_ext = clusters_ext_f[clusters_ext_f['ID'].isin(train_ids)]
test_clusters_ext = clusters_ext_f[clusters_ext_f['ID'].isin(test_ids)]

In [11]:
FOLDS_NUM = 2

folds = {
    'norm': []
}
for i in range(FOLDS_NUM):
    test_index = list(range(i, train_clusters.shape[0], FOLDS_NUM))
    train_index = set(range(train_clusters.shape[0])) - set(test_index)
    train_index = sorted(train_index)
    folds['norm'].append((np.array(train_index), np.array(test_index)))

In [12]:
rads = list(range(1, 31))
sigmas = [round(i, 1) for i in np.arange(0.2, 14.2, 0.2)]

In [13]:
feas = []
for sigma in sigmas:
    feas.append(clusters[f'd_{sigma}'].values)
feas = np.vstack(feas).T
feas.shape

(3519, 70)

In [14]:
train_Y = clusters_f['type'].values

In [15]:
sigma_scores = np.zeros((clusters_f.shape[0], len(rads)))

for train_index, test_index in folds['norm']:
    t = []
    for i, rad in tqdm(enumerate(rads)):
        s = np.zeros(len(test_index))
        for j, sigma in enumerate(sigmas):
            s[feas[train_ids[test_index], j] <= rad] = sigma
        t.append(s)
    t = np.array(t).T
    sigma_scores[test_index] = t
    
sigma_rocaucs = []
for i in range(len(rads)):
    sigma_rocaucs.append(roc_auc_score(train_Y, sigma_scores[:, i]))

sigma_rocaucs

30it [00:00, 1223.10it/s]
30it [00:00, 1704.10it/s]


[0.5998924213230572,
 0.64897591522158,
 0.6929752729608222,
 0.7183904945407836,
 0.751995504174695,
 0.7822697495183044,
 0.8003275529865125,
 0.813641297366731,
 0.8281576750160566,
 0.8321865767501605,
 0.8381910725754658,
 0.8368298008991651,
 0.8425308285163776,
 0.8409868336544638,
 0.8434849068721901,
 0.8403673731535003,
 0.8363044315992292,
 0.8382707129094411,
 0.8366849710982658,
 0.8301544637122671,
 0.8233548490687219,
 0.8193805394990367,
 0.8157614001284521,
 0.8121429030186256,
 0.8082610789980733,
 0.8004036608863199,
 0.7947716763005781,
 0.7891101477199742,
 0.7813002569043032,
 0.776631021194605]

In [16]:
rfs = []
rf_scores = np.zeros(train_clusters.shape[0])
rf_rocaucs = []
for train_index, test_index in folds['norm']:
    rf = RandomForestClassifier(
        max_depth=10, 
        random_state=0,
        n_estimators=500,
        max_features=None
    ).fit(feas[train_ids[train_index]], train_Y[train_index])
    rfs.append(rf)
    preds = rf.predict_proba(feas[train_ids[test_index]])[:,1].reshape(-1)
    rf_scores[test_index] = preds
    rf_rocaucs.append(roc_auc_score(train_Y[test_index], preds))
    
rf_rocaucs

[0.8791802231001626, 0.8910462037350254]

In [17]:
feas_ext = []
for sigma in sigmas:
    feas_ext.append(clusters_ext[f'd_{sigma}'].values)
feas_ext = np.vstack(feas_ext).T
feas_ext.shape

(1755981, 70)

In [18]:
train_sigma_scores_ext = np.zeros((train_clusters_ext.shape[0], len(rads)))
train_rf_scores_ext = np.zeros(train_clusters_ext.shape[0])

for k, (train_index, test_index) in enumerate(folds['norm']):
    train_m = train_clusters_ext['ID'].isin(train_ids[test_index]).values
    
    full_m = clusters_ext['ID'].isin(train_ids[test_index]).values
    full_m *= (clusters_ext['Z'] <= MAX_Z).values
    
    t = []
    for i, rad in tqdm(enumerate(rads)):
        s = np.zeros(train_m.sum())
        for j, sigma in enumerate(sigmas):
            s[feas_ext[full_m, j] <= rad] = sigma
        t.append(s)
    t = np.array(t).T
    train_sigma_scores_ext[train_m] = t
    
    train_rf_scores_ext[train_m] = \
        rfs[k].predict_proba(feas_ext[full_m])[:,1].reshape(-1)

30it [00:13,  2.26it/s]
30it [00:13,  2.26it/s]


In [19]:
# for i, sigma in enumerate(rads):
#     train_clusters_ext[f'sigma_{sigma}_score'] = train_sigma_scores_ext[:, i]
# train_clusters_ext['rf_score'] = train_rf_scores_ext

full_m = clusters_ext['ID'].isin(train_ids).values
full_m *= (clusters_ext['Z'] <= MAX_Z).values

for i, sigma in enumerate(rads):
    t = np.zeros(clusters_ext.shape[0])
    t[full_m] = train_sigma_scores_ext[:, i]
    clusters_ext[f'sigma_{sigma}_score'] = t
    
t = np.zeros(clusters_ext.shape[0])
t[full_m] = train_rf_scores_ext
clusters_ext['rf_score'] = t

In [21]:
p, t = [], []
for id_ in train_ids:
    p.append(clusters_ext[(clusters_ext['ID'] == id_)]['rf_score'].max())
    t.append(clusters_ext[(clusters_ext['ID'] == id_)]['type'].iloc[0])
    
roc_auc_score(np.array(t).astype(int), np.array(p))

0.534031644063764