In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from pyBIA import ensemble_model 

# The optimal sig threshold to apply as per Figure 2
sig = 0.31
# Load the training catalog extracted at this threshold. The import cell binds
# pandas to the alias `pd`; the bare name `pandas` is never defined, so it must
# not be used here.
df = pd.read_csv('/Users/daniel/Desktop/Folders/Lyalpha/pyBIA_Paper_1/nsigs/BW_NSIG/BW_training_set_nsig_'+str(sig))

# Omit any non-detections: rows whose 'area' equals -999 or whose 'mag' is not finite.
# `mask` holds the positional row indices of `df` that survive the cut.
mask = np.where((df['area'] != -999) & np.isfinite(df['mag']))[0]

# Balance both classes to be of same size.
# blob_index / other_index are positions *within* `mask` (flag == 0 and flag == 1),
# so they are mapped back to rows of `df` by indexing `mask` with them. Every
# flag == 0 row is kept, together with the first len(blob_index) flag == 1 rows.
blob_index = np.where(df['flag'].iloc[mask] == 0)[0]
other_index = np.where(df['flag'].iloc[mask] == 1)[0]
df_filtered = df.iloc[mask[np.concatenate((blob_index, other_index[:len(blob_index)]))]]

# These are the features to use, note that the catalog includes more than this!
columns = ['mag', 'mag_err', 'm00', 'm10', 'm01', 'm20', 'm11', 'm02', 'm30', 'm21', 'm12', 'm03', 'mu10', 
    'mu01', 'mu20', 'mu11', 'mu02', 'mu30', 'mu21', 'mu12', 'mu03', 'hu1', 'hu2', 'hu3', 'hu4', 'hu5', 'hu6', 'hu7', 
    'legendre_2', 'legendre_3', 'legendre_4', 'legendre_5', 'legendre_6', 'legendre_7', 'legendre_8', 'legendre_9', 
    'area', 'covar_sigx2', 'covar_sigy2', 'covar_sigxy', 'covariance_eigval1', 'covariance_eigval2', 'cxx', 'cxy', 'cyy', 
    'eccentricity', 'ellipticity', 'elongation', 'equivalent_radius', 'fwhm', 'gini', 'orientation', 'perimeter', 
    'semimajor_sigma', 'semiminor_sigma', 'max_value', 'min_value']

# Training data arrays: feature matrix (one column per entry of `columns`) and the 0/1 class flag
data_x, data_y = np.array(df_filtered[columns]), np.array(df_filtered['flag'])

# Base model: no hyperparameter optimization, trained on every feature in data_x
base_model = ensemble_model.Classifier(
    data_x,
    data_y,
    clf='xgb',
    impute=True,
)
base_model.create()

# Optimized model: built with the same constructor arguments, but restored from
# a previously saved run instead of being created here
optimized_model = ensemble_model.Classifier(
    data_x,
    data_y,
    clf='xgb',
    impute=True,
)
optimized_model.load('/Users/daniel/Desktop/Folders/Lyalpha/pyBIA_Paper_1/models/ensemble_models/new_ensemble_5000_5000')

# Load the catalog containing all 2 million other objects, extracted using sig=0.31
other_all = pd.read_csv('/Users/daniel/Desktop/Folders/Lyalpha/pyBIA_Paper_1/catalogs/catalog_325_all_nodups')

# Omit non-detections (same cut as the training set: area != -999 and finite mag)
mask = np.where((other_all['area'] != -999) & np.isfinite(other_all['mag']))[0]
other_all = other_all.iloc[mask]

# Create the data_x array from the same feature list the models were trained on.
# The list is named `columns`; `columns_to_use` is not defined anywhere in this notebook.
other_data_x = np.array(other_all[columns])

# Predict all samples. The rest of this cell reads column 0 of each output as
# the predicted flag and column 1 as the associated probability.
predictions_base_model = base_model.predict(other_data_x)
predictions_optimized_model = optimized_model.predict(other_data_x)

# Select DIFFUSE detections (flag = 0)
index_base = np.where(predictions_base_model[:,0] == 0)[0]
index_optimized = np.where(predictions_optimized_model[:,0] == 0)[0]

# Index the catalog to select only the positive detections.
# .copy() yields independent frames, so adding the 'proba' column below does not
# write into a slice of `other_all` (avoids pandas' SettingWithCopyWarning).
candidate_catalog_base = other_all.iloc[index_base].copy()
candidate_catalog_optimized = other_all.iloc[index_optimized].copy()

# Save the probability predictions as a new column and save to CSV.
# Each catalog must be paired with its own model's selection indices so the
# probabilities line up row-for-row with the selected objects.
candidate_catalog_base['proba'] = predictions_base_model[index_base][:,1]
candidate_catalog_optimized['proba'] = predictions_optimized_model[index_optimized][:,1]
#candidate_catalog_base.to_csv('/Users/daniel/Desktop/candidate_catalog_base.csv')
#candidate_catalog_optimized.to_csv('/Users/daniel/Desktop/candidate_catalog_optimized.csv')

In [None]:
# Generate the data for the histograms #

# Remove one OTHER object as the DIFFUSE will be cross-validated using LoO
# (each LoO split trains on len(diffuse_training)-1 DIFFUSE objects, so dropping
# one OTHER object keeps the two classes the same size).
other_training = df_filtered[df_filtered.flag == 1].iloc[1:]
diffuse_training =  df_filtered[df_filtered.flag == 0]

# The feature matrices do not change between iterations, so build them once
diffuse_features = np.array(diffuse_training[columns])
other_features = np.array(other_training[columns])

# The probas of the five confirmed blobs will be saved according to their published names.
# Each list ends up as [base model proba, optimized model proba].
LABd05, PRG1, PRG2, PRG3, PRG4 = [],[],[],[],[]
confirmed_blobs = {
    'NDWFS_J143410.9+331730': LABd05,
    'NDWFS_J143512.2+351108': PRG1,
    'NDWFS_J142623.0+351422': PRG2,
    'NDWFS_J143412.7+332939': PRG3,
    'NDWFS_J142653.1+343856': PRG4,
}

# To store the probas of all DIFFUSE objects as well as their catalog names
all_diffuse_base_probas, all_diffuse_optimized_probas, names = [],[],[]

# Visit EVERY DIFFUSE object; range(len(...)-1) would silently skip the last one
for i in range(len(diffuse_training)):

    # This will be the individual DIFFUSE sample to assess
    leave_one = diffuse_features[i]
    # Removing this validation sample from the overall DIFFUSE training bag
    remaining = np.delete(diffuse_features, i, axis=0)

    # Setting the new training data. Loop-local names are used so the full
    # training arrays data_x / data_y from the first cell are not overwritten.
    loo_x = np.r_[remaining, other_features]
    loo_y = np.r_[[0]*len(remaining), [1]*len(other_features)]

    # Training the new base model
    # NOTE(review): if `.model` follows the scikit-learn convention, fit() returns the
    # same estimator object, so base_model.model / optimized_model.model are themselves
    # left trained on the final LoO split after this loop -- confirm this is acceptable.
    new_base_model = base_model.model.fit(loo_x, loo_y)
    # Training the new optimized model
    # NOTE(review): this fits on all `columns`; verify that matches the feature set
    # the loaded optimized model expects.
    new_optimized_model = optimized_model.model.fit(loo_x, loo_y)

    # Assess the left-out DIFFUSE sample using both the base and optimized models
    proba_base = new_base_model.predict_proba(leave_one.reshape(1,-1))
    proba_optimized = new_optimized_model.predict_proba(leave_one.reshape(1,-1))

    # Save only the probability prediction that the object is DIFFUSE (class 0, column 0).
    # Index the single element explicitly: calling float() on a 1-element array slice
    # is deprecated in recent NumPy releases.
    diffuse_proba_base = float(proba_base[0, 0])
    diffuse_proba_optimized = float(proba_optimized[0, 0])

    obj_name = diffuse_training.obj_name.iloc[i]
    confirmed_store = confirmed_blobs.get(obj_name)
    if confirmed_store is not None:
        # One of the five confirmed blobs: base proba first, optimized second
        confirmed_store.append(diffuse_proba_base)
        confirmed_store.append(diffuse_proba_optimized)
    else:
        all_diffuse_base_probas.append(diffuse_proba_base)
        all_diffuse_optimized_probas.append(diffuse_proba_optimized)
        names.append(obj_name)

# The first value is the base model proba pred, the second the optimized model proba pred.
# Build plain 1-D arrays of length 5: np.c_ applied to scalars yields a (1, 5) row,
# which cannot be column-stacked with the 5-element names list below.
five_diffuse_base_probas = np.array([LABd05[0], PRG1[0], PRG2[0], PRG3[0], PRG4[0]])
five_diffuse_optimized_probas = np.array([LABd05[1], PRG1[1], PRG2[1], PRG3[1], PRG4[1]])
five_names = ['LABd05', 'PRG1', 'PRG2', 'PRG3', 'PRG4']

# Save the base and optimized probabilities.
# np.c_ places each 1-D input in its own column, giving one row per object:
# name, base-model proba, optimized-model proba. Files are written to the
# current working directory.
np.savetxt('LoO_Confirmed_DIFFUSE', np.c_[five_names, five_diffuse_base_probas, five_diffuse_optimized_probas], header="Names, Base_Model, Optimized_Model", fmt='%s')
np.savetxt('LoO_DIFFUSE', np.c_[names, all_diffuse_base_probas, all_diffuse_optimized_probas], header="Names, Base_Model, Optimized_Model", fmt='%s')

In [None]:
# Figure 5 Left Panel -- Base Model #

# Confusion Matrix Plot

# Create label_y array for plotting purposes: flag 0 -> 'DIFFUSE', anything else -> 'OTHER'
y_labels = ['DIFFUSE' if flag == 0 else 'OTHER' for flag in base_model.data_y]

# Assess the accuracies using 10-fold cross-validation and normalize the accuracies
base_model.plot_conf_matrix(data_y=y_labels, k_fold=10, normalize=True, title='Base Model', savefig=True)

# Histogram Plot

# Candidate probabilities are read back from a previously saved catalog
candidate_catalog_base = pd.read_csv('/Users/daniel/Desktop/Folders/final_run/candidate_catalog_base.csv')
probas_candidates = np.array(candidate_catalog_base.proba)

# Inspecting two thresholds, 0.8 and 0.9 (each name now matches its threshold)
index_90 = np.where(probas_candidates >= 0.9)[0]
index_80 = np.where(probas_candidates >= 0.8)[0]

# Number of leave-one-out DIFFUSE probabilities shown, for the legend
n_diffuse_training = len(all_diffuse_base_probas)

# Vertical offset applied to every annotation so they clear the histogram bars
y=0.12
plt.axvline(x=0.9, linestyle='--', linewidth=2, alpha=0.6, color='k')
plt.text(0.9, 0.83+y, s=r" n(P) $\geq$ 0.9", weight="bold")
plt.axhline(y=0.81+y, linestyle='-', linewidth=1.2, color='k', xmin=0.81, xmax=0.99)
plt.text(0.925, 0.76+y, s=str(len(index_90)), weight="bold")
plt.axvline(x=0.8, linestyle='--', linewidth=2, alpha=0.6, color='k')
plt.text(0.8, 0.55+y, s=r" n(P) $\geq$ 0.8", weight="bold")
plt.axhline(y=0.53+y, linestyle='-', linewidth=1.2, color='k', xmin=0.61, xmax=0.79)
plt.text(0.82, 0.48+y, s=str(len(index_80)), weight="bold")

# Weights of 1/N turn raw counts into fractions, so each histogram sums to 1
plt.hist(probas_candidates, bins=5, weights=np.ones(len(probas_candidates)) / len(probas_candidates), color='#377eb8', label='Candidates (n='+str(len(probas_candidates))+')')
plt.hist(all_diffuse_base_probas, bins=12, weights=np.ones(n_diffuse_training) / n_diffuse_training, color='#ff7f00', alpha=0.6, label='DIFFUSE Training (n='+str(n_diffuse_training)+')')
plt.scatter(five_diffuse_base_probas, [0.051]*5, marker='*', c='k', s=1000, alpha=0.72, label=r'Confirmed Ly$\alpha$ (n=5)')

plt.xlabel('Probability Prediction', size=16); plt.ylabel('Normalized Counts', size=16)
plt.title('XGBoost Classification Output', size=18)
# Ticks every 0.05 with only every other one labelled
plt.xticks(ticks=[0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.], labels=['0.4','','0.5','','0.6','','0.7','','0.8','','0.9','','1.0'], size=14)
plt.yticks(ticks=[0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0], size=14, labels=['0','','0.1','','0.2','','0.3','','0.4','','0.5','','0.6','','0.7','','0.8','','0.9','','1.0'])
plt.xlim((0.5,1.0))
plt.legend(prop={'size': 14}, loc='upper left')
# NOTE(review): the "PRG4" label sits at a fixed position; confirm it still marks the right star
plt.text(0.935, 0.12, s="PRG4", weight="bold")
plt.savefig('/Users/daniel/Desktop/Final_Histogram_base.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Figure 5 Right Panel -- Optimized Model #

# Confusion Matrix Plot

# y_labels is the 'DIFFUSE'/'OTHER' label list built in the left-panel cell
optimized_model.plot_conf_matrix(data_y=y_labels, k_fold=10, normalize=True, title='Optimized Model', savefig=True)

# Histogram Plot

# Candidate probabilities are read back from a previously saved catalog
candidate_catalog_optimized = pd.read_csv('/Users/daniel/Desktop/Folders/final_run/candidate_catalog_optimized.csv')
probas_candidates = np.array(candidate_catalog_optimized.proba)

# Inspecting two thresholds, 0.8 and 0.9 (each name now matches its threshold)
index_90 = np.where(probas_candidates >= 0.9)[0]
index_80 = np.where(probas_candidates >= 0.8)[0]

# Number of leave-one-out DIFFUSE probabilities shown, for the legend
n_diffuse_training = len(all_diffuse_optimized_probas)

# Vertical offset applied to every annotation so they clear the histogram bars
y=0.12
plt.axvline(x=0.9, linestyle='--', linewidth=2, alpha=0.6, color='k')
plt.text(0.9, 0.83+y, s=r" n(P) $\geq$ 0.9", weight="bold")
plt.axhline(y=0.81+y, linestyle='-', linewidth=1.2, color='k', xmin=0.81, xmax=0.99)
plt.text(0.925, 0.76+y, s=str(len(index_90)), weight="bold")
plt.axvline(x=0.8, linestyle='--', linewidth=2, alpha=0.6, color='k')
plt.text(0.8, 0.55+y, s=r" n(P) $\geq$ 0.8", weight="bold")
plt.axhline(y=0.53+y, linestyle='-', linewidth=1.2, color='k', xmin=0.61, xmax=0.79)
plt.text(0.82, 0.48+y, s=str(len(index_80)), weight="bold")

# Weights of 1/N turn raw counts into fractions, so each histogram sums to 1
plt.hist(probas_candidates, bins=5, weights=np.ones(len(probas_candidates)) / len(probas_candidates), color='#377eb8', label='Candidates (n='+str(len(probas_candidates))+')')
plt.hist(all_diffuse_optimized_probas, bins=12, weights=np.ones(n_diffuse_training) / n_diffuse_training, color='#ff7f00', alpha=0.6, label='DIFFUSE Training (n='+str(n_diffuse_training)+')')
# Stars must show the OPTIMIZED model's probabilities for the five confirmed blobs in this panel
plt.scatter(five_diffuse_optimized_probas, [0.051]*5, marker='*', c='k', s=1000, alpha=0.72, label=r'Confirmed Ly$\alpha$ (n=5)')

plt.xlabel('Probability Prediction', size=16); plt.ylabel('Normalized Counts', size=16)
plt.title('XGBoost Classification Output', size=18)
# Ticks every 0.05 with only every other one labelled
plt.xticks(ticks=[0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.], labels=['0.4','','0.5','','0.6','','0.7','','0.8','','0.9','','1.0'], size=14)
plt.yticks(ticks=[0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0], size=14, labels=['0','','0.1','','0.2','','0.3','','0.4','','0.5','','0.6','','0.7','','0.8','','0.9','','1.0'])
plt.xlim((0.5,1.0))
plt.legend(prop={'size': 14}, loc='upper left')
# NOTE(review): the "PRG4" label position was copied from the base-model panel;
# confirm it still marks the right star for the optimized probabilities
plt.text(0.935, 0.12, s="PRG4", weight="bold")
# Write to a distinct file so the base-model histogram is not overwritten
plt.savefig('/Users/daniel/Desktop/Final_Histogram_optimized.png', bbox_inches='tight', dpi=300)
plt.show()