In [None]:
import pandas
import numpy as np
from pyBIA import ensemble_model

# Detection threshold used to select the training catalog (the optimum as per Figure 2).
sig = 0.31
catalog_path = '/Users/daniel/Desktop/Folders/Lyalpha/pyBIA_Paper_1/nsigs/BW_NSIG/BW_training_set_nsig_' + str(sig)
df = pandas.read_csv(catalog_path)

# Drop non-detections: keep only rows whose area is not the -999 placeholder
# and whose magnitude is a finite number. `mask` holds positional row indices into `df`.
has_area = df['area'] != -999
has_finite_mag = np.isfinite(df['mag'])
mask = np.flatnonzero((has_area & has_finite_mag).to_numpy())

# Balance the two classes: keep every flag == 0 row and only as many flag == 1 rows
# (the first ones in catalog order) as there are flag == 0 rows.
# Both index arrays are positions *within the masked subset*, hence the `mask[...]` lookup below.
flags_detected = df['flag'].iloc[mask]
blob_index = np.flatnonzero((flags_detected == 0).to_numpy())
other_index = np.flatnonzero((flags_detected == 1).to_numpy())
balanced_positions = np.concatenate((blob_index, other_index[:len(blob_index)]))
df_filtered = df.iloc[mask[balanced_positions]]

# Features fed to the classifier (the catalog itself contains more columns than these).
# The order matters: the plotting labels defined later in the notebook follow the same order.
columns = [
    # Magnitude
    'mag', 'mag_err',
    # Raw image moments
    'm00', 'm10', 'm01', 'm20', 'm11', 'm02', 'm30', 'm21', 'm12', 'm03',
    # Central moments
    'mu10', 'mu01', 'mu20', 'mu11', 'mu02', 'mu30', 'mu21', 'mu12', 'mu03',
    # Hu moments
    'hu1', 'hu2', 'hu3', 'hu4', 'hu5', 'hu6', 'hu7',
    # Legendre terms
    'legendre_2', 'legendre_3', 'legendre_4', 'legendre_5',
    'legendre_6', 'legendre_7', 'legendre_8', 'legendre_9',
    # Morphology
    'area',
    'covar_sigx2', 'covar_sigy2', 'covar_sigxy',
    'covariance_eigval1', 'covariance_eigval2',
    'cxx', 'cxy', 'cyy',
    'eccentricity', 'ellipticity', 'elongation', 'equivalent_radius',
    'fwhm', 'gini', 'orientation', 'perimeter',
    'semimajor_sigma', 'semiminor_sigma',
    'max_value', 'min_value',
]

# Training arrays: feature matrix (one column per entry of `columns`) and the class flags.
data_x = np.array(df_filtered[columns])
data_y = np.array(df_filtered['flag'])

# The triple-quoted block below is a disabled recipe, kept for reference only: it shows how
# the saved model was originally optimized, created and written to disk. It is not executed.
"""
# Create the model object with feature and hyperparameter optimization enabled (5000 trials each)
# Enabling 10-fold cross validation which increases the hyperparameter optimization time ten-fold
model = ensemble_model.Classifier(data_x, data_y, clf='xgb', impute=True, optimize=True, boruta_trials=5000, n_iter=5000, opt_cv=10, limit_search=False)

# This is how the model is created and saved afterwards
model.create()
model.save()
"""

# Instantiate the classifier on the training arrays, then restore the previously saved
# model from disk instead of re-running the optimization.
model = ensemble_model.Classifier(
    data_x,
    data_y,
    clf='xgb',
    impute=True,
)
model.load('/Users/daniel/Desktop/Folders/Lyalpha/pyBIA_Paper_1/models/ensemble_models/new_ensemble_5000_5000')

In [None]:
# Figure 3 Left Panel

# Human-readable class labels for the plot (the model itself accepts numeric or text labels):
# flag 0 -> 'DIFFUSE', anything else -> 'OTHER'. Same order as `data_y`.
y_labels = ['DIFFUSE' if flag == 0 else 'OTHER' for flag in data_y]

# Names of the confirmed blobs, which get their own label so they can be highlighted.
confirmed_names = np.loadtxt('/Users/daniel/Desktop/Folders/pyBIA/pyBIA/data/obj_name_5', dtype=str)

confirmed_label = r'Confirmed Ly$\alpha$'
for name in confirmed_names:
    # Positional index of the first row of df_filtered carrying this name; it lines up with
    # y_labels because data_y was built from df_filtered in the same row order.
    # An IndexError here means the object is absent from the filtered training set.
    index = np.flatnonzero((df_filtered['obj_name'] == name).to_numpy())[0]
    y_labels[index] = confirmed_label

# t-SNE projection using the custom labels, with the confirmed blobs singled out.
model.plot_tsne(data_y=y_labels, special_class=confirmed_label, savefig=True)

# Figure 3 Right Panel

# Display names for the features, in the same order as the feature list used for training.
# NOTE: this rebinds `columns`, replacing the catalog column names defined earlier.
columns = [
    # Magnitude
    r'$B_w$ Mag', r'$B_w$ MagErr',
    # Raw image moments
    r'$M_{00}$', r'$M_{10}$', r'$M_{01}$', r'$M_{20}$', r'$M_{11}$', r'$M_{02}$',
    r'$M_{30}$', r'$M_{21}$', r'$M_{12}$', r'$M_{03}$',
    # Central moments
    r'$\mu_{10}$', r'$\mu_{01}$', r'$\mu_{20}$', r'$\mu_{11}$', r'$\mu_{02}$',
    r'$\mu_{30}$', r'$\mu_{21}$', r'$\mu_{12}$', r'$\mu_{03}$',
    # Hu moments
    r'$h_1$', r'$h_2$', r'$h_3$', r'$h_4$', r'$h_5$', r'$h_6$', r'$h_7$',
    # Legendre terms
    r'$L_2$', r'$L_3$', r'$L_4$', r'$L_5$', r'$L_6$', r'$L_7$', r'$L_8$', r'$L_9$',
    # Morphology
    'Area',
    r'$\sigma^2(x)$', r'$\sigma^2(y)$', r'$\sigma^2(xy)$',
    r'$\lambda_1$', r'$\lambda_2$',
    r'$C_{xx}$', r'$C_{xy}$', r'$C_{yy}$',
    'Eccentricity', 'Ellipticity', 'Elongation', 'Equiv. Radius',
    'FWHM', 'Gini', 'Orientation', 'Perimeter',
    r'$\sigma_{\rm major}$', r'$\sigma_{\rm minor}$',
    'Max Val.', 'Min Val.',
]

# Feature-selection summary plot, limited to the top 20 features (top=20).
# NOTE(review): an earlier version of this comment said "top 25" - confirm which count the
# published figure uses and adjust `top` if needed.
model.plot_feature_opt(
    feat_names=columns,
    top=20,
    include_other=True,
    include_shadow=True,
    include_rejected=False,
    flip_axes=True,
    save_data=False,
    savefig=True,
)

In [None]:
# Figure 4 Left Panel

# Maximum baseline accuracy (as per Figure 2), passed to the plot as the `baseline` reference.
baseline = 0.92427745

# NOTE(review): xlim starts at 0 while xlog=True; zero cannot be shown on a logarithmic
# axis, so confirm how plot_hyper_opt handles the lower limit.
model.plot_hyper_opt(
    baseline=baseline,
    xlim=(0, 5000),
    ylim=(0.8, 0.95),
    xlog=True,
    ylog=False,
    savefig=True,
)

# Figure 4 Right Panel

model.plot_hyper_param_importance(
    plot_time=True,
    savefig=True,
)