In [1]:
import os
import collections

import itertools
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from kolmov import crossval_table, get_color_fader

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence


## Create a dictionary

Because the output of saphyra is organized like a nested dictionary, we need to navigate through it to retrieve all the information.

kolmov provides a class called `crossval_table` that lets us extract this information and transform it into a pandas DataFrame.

The first step is to define an `OrderedDict` that maps each quantity we want to its location inside the saphyra tuned file.

In [2]:
def create_op_dict(op):
    """Build the column-name -> tuned-file-path mapping for one operating point.

    Every path lives under ``reference/<op>_cutbased/``. For each dataset
    (``ref``, ``val``, ``op``) the mapping holds the ``pd`` and ``fa`` entries
    at slot ``#0``, the ``sp`` entry (no slot), and then the ``passed``
    (slot ``#1``) and ``total`` (slot ``#2``) counts of ``pd`` and ``fa``.

    Parameters
    ----------
    op : str
        Operating point name (e.g. 'tight'); used as prefix of every key.

    Returns
    -------
    dict
        21 entries, inserted in the same order as listed above.
    """
    base = "reference/" + op + "_cutbased/"
    datasets = ('ref', 'val', 'op')
    rates = ('pd', 'fa')
    table = {}

    # slot #0 of pd/fa plus the sp entry, dataset by dataset
    for dataset in datasets:
        for rate in rates:
            measure = rate + '_' + dataset
            table[op + '_' + measure] = base + measure + '#0'
        table[op + '_sp_' + dataset] = base + 'sp_' + dataset

    # Counts
    for dataset in datasets:
        for suffix, slot in (('passed', '#1'), ('total', '#2')):
            for rate in rates:
                measure = rate + '_' + dataset
                table[op + '_' + measure + '_' + suffix] = base + measure + slot

    return table

# Column name -> location inside the tuned file, kept in insertion order.
tuned_info = collections.OrderedDict()

# Summary entries: validation first, then operation
for dataset in ('val', 'op'):
    tuned_info['max_sp_' + dataset] = 'summary/max_sp_' + dataset
    for rate in ('pd', 'fa'):
        tuned_info['max_sp_%s_%s' % (rate, dataset)] = 'summary/max_sp_%s_%s#0' % (rate, dataset)

# Reference entries of every cut-based operating point
for working_point in ('tight', 'medium', 'loose', 'vloose'):
    tuned_info.update(create_op_dict(working_point))

In [6]:
# Grid handed to crossval_table (presumably the E_T and eta bin edges — confirm).
etbins  = [4, 7, 10, 15]
etabins = [0.0, 0.8, 1.37, 1.54, 2.37, 2.47]

# Locations of the tuned files and of the analysis outputs. The defaults are the
# original machine-specific paths; set TUNES_PATH / ANALYSIS_PATH in the
# environment to run the notebook elsewhere without editing this cell.
_default_tunings_dir = "/home/natmourajr/Workspace/CERN/CERN-ATLAS-Qualify/tunings"
tunes_path    = os.environ.get("TUNES_PATH", _default_tunings_dir)
analysis_path = os.environ.get("ANALYSIS_PATH", _default_tunings_dir)

## Initialize the crossval_table object

In this step we initialize the `crossval_table` object and fill it with the data from our training.

In [7]:
# Create the cross-validation table from the key -> path mapping in tuned_info
# and the (etbins, etabins) grid.
m_cv = crossval_table( tuned_info, etbins = etbins , etabins = etabins )
# The v1.r0 tuning is disabled: only the v1.r1 files are read below.
#m_cv.fill( os.path.join(tunes_path, 'v1/r0/*/*/*pic.gz'), 'v1.r0')
# Read every file matching the glob under tunes_path and tag the rows as 'v1.r1'.
# If the log reports "There are 0 files for this task", check tunes_path.
m_cv.fill( os.path.join(tunes_path, 'v1/r1/*/*/*.pic.gz'), 'v1.r1')

2021-09-26 23:38:57,332 | Py.crossval_table                       INFO Reading file for v1.r1 tag from /home/natmourajr/Workspace/CERN/CERN-ATLAS-Qualify/tunings/v1/r1/*/*/*.pic.gz
2021-09-26 23:38:57,332 | Py.crossval_table                       INFO There are 0 files for this task...
2021-09-26 23:38:57,332 | Py.crossval_table                       INFO Filling the table... 
2021-09-26 23:38:57,345 | Py.crossval_table                       INFO End of fill step, a pandas DataFrame was created...


In [8]:
# Filter the initializations using "max_sp_val" as the figure of merit.
# NOTE(review): presumably keeps the best initialization of every sort — confirm against kolmov.
best_inits = m_cv.filter_inits("max_sp_val")
print(len(best_inits))
best_inits.head()

0


Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,vloose_pd_ref_total,vloose_fa_ref_total,vloose_pd_val_passed,vloose_fa_val_passed,vloose_pd_val_total,vloose_fa_val_total,vloose_pd_op_passed,vloose_fa_op_passed,vloose_pd_op_total,vloose_fa_op_total


In [6]:
n_min, n_max = 2, 20
# model_idx k was tuned with (n_min + k) neurons: 0 -> '.mlp2', 1 -> '.mlp3', ...
model_add_tag = { idx : '.mlp%i' %(neuron) for idx, neuron in enumerate(range(n_min, n_max +1))}

# Add a suffix with the number of neurons to train_tag.
# Any '.mlpN' suffix left by a previous execution of this cell is stripped first,
# so running the cell twice does not produce tags such as 'v1.r1.mlp2.mlp2'.
base_train_tag = best_inits['train_tag'].astype(str).str.replace(r'\.mlp\d+$', '', regex=True)
best_inits['train_tag'] = base_train_tag + best_inits['model_idx'].replace(model_add_tag)

In [7]:
# Distinct model indices present in the table.
best_inits.model_idx.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18])

In [8]:
# Expected number of rows in best_inits: 10 * (number of models) * 15.
# NOTE(review): presumably 10 sorts and 15 = 3 E_T bins x 5 eta bins — confirm.
10*len(best_inits.model_idx.unique())*15

2850

In [9]:
# Preview the first rows; train_tag should now end with the '.mlpN' suffix.
best_inits.head()

Unnamed: 0,train_tag,et_bin,eta_bin,model_idx,sort,init,file_name,tuned_idx,max_sp_val,max_sp_pd_val,...,vloose_pd_ref_total,vloose_fa_ref_total,vloose_pd_val_passed,vloose_fa_val_passed,vloose_pd_val_total,vloose_fa_val_total,vloose_pd_op_passed,vloose_fa_op_passed,vloose_pd_op_total,vloose_fa_op_total
1901,v1.r1.mlp2,0,0,0,0,1,/home/micael/Documents/NeuralRinger/cern_data/...,0,0.942942,0.961311,...,59183,375287,5854,6011,5919,37528,58529,57603,59183,375287
3583,v1.r1.mlp2,0,0,0,1,7,/home/micael/Documents/NeuralRinger/cern_data/...,0,0.944827,0.965366,...,59183,375287,5854,5508,5919,37528,58529,57946,59183,375287
2162,v1.r1.mlp2,0,0,0,2,9,/home/micael/Documents/NeuralRinger/cern_data/...,0,0.942368,0.959284,...,59183,375287,5854,5887,5919,37528,58529,57267,59183,375287
2195,v1.r1.mlp2,0,0,0,3,2,/home/micael/Documents/NeuralRinger/cern_data/...,0,0.943174,0.961642,...,59183,375287,5853,6490,5918,37529,58529,56767,59183,375287
2492,v1.r1.mlp2,0,0,0,4,0,/home/micael/Documents/NeuralRinger/cern_data/...,0,0.942811,0.957249,...,59183,375287,5853,6094,5918,37529,58529,57267,59183,375287


In [10]:
# Opening the tuned files takes a long time, so cache the filtered table as a .csv.
# The folder <analysis_path>/v1/r1 must already exist (to_csv does not create it).
print(analysis_path)
best_inits.to_csv(os.path.join(analysis_path, 'v1/r1/best_inits.csv'))

/home/micael/Documents/NeuralRinger/cern_data/jpsiee_analysis


In [11]:
print(analysis_path)
# Folder, relative to analysis_path, passed to the plot helpers as tuning_folder.
r1_path = 'v1/r1'

/home/micael/Documents/NeuralRinger/cern_data/jpsiee_analysis


In [12]:
# Measures that can be plotted:
# table column -> (axis label, short name used in the output file name)
map_key_dict = dict(
    max_sp_val=(r'$SP_{max}$ (Test)', 'sp'),
    max_sp_pd_val=(r'$P_D$ (Test)', 'pd'),
    max_sp_fa_val=(r'$F_A$ (Test)', 'fa'),
    auc_val=(r'AUC (Test)', 'auc'),
)

from kolmov.utils.constants import str_etbins_jpsiee, str_etabins
# use simple helper functions to make it easier to plot all the needed measures
def create_cool_catplot(df, key, kind, mapped_key, output_name, tuning_flag, tuning_folder, list_of_neuros=None):
    """Draw a seaborn catplot of one measure versus the number of neurons and save it as a PNG.

    One panel is drawn per bin: rows are the E_T bins, columns are the eta bins.
    The figure is written to
    ``<analysis_path>/<tuning_folder>/plots/<kind>_plot_<output_name>_<tuning_flag>.png``
    (``analysis_path`` is the notebook-level variable); the ``plots`` folder is
    created when missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``model_idx``, ``et_bin``, ``eta_bin`` and ``key``.
    key : str
        Column plotted on the y axis.
    kind : str
        Plot kind forwarded to ``seaborn.catplot`` (e.g. 'box', 'violin', 'boxen').
    mapped_key : str
        Label that replaces ``key`` on the y axis.
    output_name, tuning_flag : str
        Pieces of the output file name.
    tuning_folder : str
        Folder, relative to ``analysis_path``, that holds the ``plots`` directory.
    list_of_neuros : sequence of int, optional
        Number of neurons of each model ordered by model index, i.e. entry ``i``
        labels ``model_idx == i``. Defaults to ``range(2, 21)``. Indices beyond
        the end of the sequence keep their original value.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    # An empty table used to fail later with a cryptic range(nan) TypeError.
    if df.empty:
        raise ValueError('create_cool_catplot: the table is empty, nothing to plot')

    if list_of_neuros is None:
        list_of_neuros = range(2, 20+1)

    # map the model idx into the real # neurons (the argument was previously ignored)
    neurons_by_idx = dict(zip(range(0, int(df.model_idx.max()) + 1), list_of_neuros))

    # rename the bins and the columns so the facets get readable labels
    plot_df = (df
               .replace({'model_idx' : neurons_by_idx,
                         'et_bin'    : {i : str_etbins_jpsiee[i] for i in range(3)},
                         'eta_bin'   : {i : str_etabins[i] for i in range(5)}})
               .rename({'model_idx'  : '# Neurons',
                        'et_bin'     : r'$E_T$',
                        'eta_bin'    : r'$\eta$',
                        key          : mapped_key},
                       axis=1))

    # create the plot: one panel per (E_T, eta) bin, independent y axes
    sns.catplot(data=plot_df, x='# Neurons', y=mapped_key,
                col=r'$\eta$', row=r'$E_T$', kind=kind, sharey=False)

    plt.tight_layout()
    plots_dir = os.path.join(analysis_path, '%s/plots' % tuning_folder)
    os.makedirs(plots_dir, exist_ok=True)  # savefig does not create missing folders
    plt.savefig(os.path.join(plots_dir, '%s_plot_%s_%s.png' % (kind, output_name, tuning_flag)),
                dpi=150, facecolor='white')
    plt.close()
    

def create_cool_scatterplot(df, key1, key2, mapped_key1, mapped_key2, output_name, tuning_flag, tuning_folder):
    """Scatter one measure against another for every (E_T, eta) bin and save the figure as a PNG.

    Points are coloured and styled by the number of neurons. The figure is written to
    ``<analysis_path>/<tuning_folder>/plots/scatter_plot_<output_name>_<tuning_flag>.png``
    (``analysis_path`` is the notebook-level variable); the ``plots`` folder is
    created when missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``model_idx``, ``et_bin``, ``eta_bin``,
        ``key1`` and ``key2``.
    key1, key2 : str
        Columns plotted on the x and y axes.
    mapped_key1, mapped_key2 : str
        Labels that replace ``key1`` and ``key2`` on the axes.
    output_name, tuning_flag : str
        Pieces of the output file name.
    tuning_folder : str
        Folder, relative to ``analysis_path``, that holds the ``plots`` directory.
    """
    # Work on the frame that was passed in; the previous version silently
    # plotted the notebook-level best_inits and ignored the df argument.
    # model_idx i corresponds to (i + 2) neurons, the same convention used by
    # create_cool_catplot (0 -> 2, 3 -> 5, 8 -> 10, 13 -> 15, 18 -> 20).
    neurons_by_idx = {int(idx): int(idx) + 2 for idx in df.model_idx.unique()}

    plot_df = (df
               .replace({'model_idx' : neurons_by_idx,
                         'et_bin'    : {i : str_etbins_jpsiee[i] for i in range(3)},
                         'eta_bin'   : {i : str_etabins[i] for i in range(5)}})
               .rename({'model_idx'  : '# Neurons',
                        'et_bin'     : r'$E_T$',
                        'eta_bin'    : r'$\eta$',
                        key1         : mapped_key1,
                        key2         : mapped_key2}, axis=1))

    # A list palette must provide one colour per hue level; keep the original
    # three colours when they fit and generate a matching palette otherwise.
    default_palette = ['red', 'orange', 'green']
    n_levels = plot_df['# Neurons'].nunique()
    if n_levels == len(default_palette):
        palette = default_palette
    else:
        palette = sns.color_palette('husl', n_levels)

    sns.relplot(data=plot_df,
                x=mapped_key1, y=mapped_key2,
                palette=palette, style='# Neurons',
                hue='# Neurons', row=r'$E_T$', col=r'$\eta$',
                facet_kws=dict(sharex=False, sharey=False))

    plt.tight_layout()
    plots_dir = os.path.join(analysis_path, '%s/plots' % tuning_folder)
    os.makedirs(plots_dir, exist_ok=True)  # savefig does not create missing folders
    plt.savefig(os.path.join(plots_dir, 'scatter_plot_%s_%s.png' % (output_name, tuning_flag)),
                dpi=150, facecolor='white')
    plt.close()

In [None]:
# Quick look at the table before plotting.
best_inits.head()

In [None]:
# Rows of the v1.r1 tuning; regex=False so the dots in the tag are matched
# literally instead of acting as regex wildcards.
best_inits[best_inits.train_tag.str.contains('v1.r1', regex=False)].head()

In [None]:
# (rows, columns) of the v1.r1 selection; regex=False so the dots in the tag
# are matched literally instead of acting as regex wildcards.
best_inits[best_inits.train_tag.str.contains('v1.r1', regex=False)].shape

In [None]:
# Expected number of rows: 15 * 10 * (number of models), to compare with the shape above.
# NOTE(review): presumably 15 (E_T, eta) bins and 10 sorts — confirm.
15*10*best_inits.model_idx.nunique()

In [None]:
# Plot max_sp_val against the number of neurons for every model of the v1.r1 tuning.
ikey         = 'max_sp_val'
map_k, o_name = map_key_dict[ikey]

# Select the rows once (the filter does not depend on the plot kind);
# regex=False so the dots in the tag are matched literally.
v1_r1_inits = best_inits[best_inits.train_tag.str.contains('v1.r1', regex=False)]

for ikind in ['box', 'violin', 'boxen']:
    create_cool_catplot(df=v1_r1_inits, key=ikey, mapped_key=map_k,
                        kind=ikind, output_name=o_name, tuning_flag='v1.r1.all_neurons', tuning_folder=r1_path)

In [None]:
# select some models to filter
selected_models = ['v1.r1.mlp%i' %(ineuron) for ineuron in [2, 5, 10, 15, 20]]
print(selected_models)

In [None]:
# Check which of the selected tags are actually present in the table.
best_inits[best_inits.train_tag.isin(selected_models)].train_tag.unique()

In [None]:
# Restrict the table to the selected models once; the selection does not
# depend on the measure or on the plot kind.
selected_inits = best_inits[best_inits.train_tag.isin(selected_models)]

for ikey, (map_k, o_name) in map_key_dict.items():
    # map_key_dict may list measures that were never extracted into the table
    # (e.g. 'auc_val' has no entry in tuned_info); skip them with a message
    # instead of failing inside seaborn on a missing column.
    if ikey not in selected_inits.columns:
        print('Skipping %s: column not found in the table' % ikey)
        continue
    for ikind in ['box', 'violin', 'boxen']:
        create_cool_catplot(df=selected_inits, key=ikey, mapped_key=map_k,
                            kind=ikind, output_name=o_name, tuning_flag='v1.r1.selected_neurons', tuning_folder=r1_path)

## Filter the initializations and get the best sort

Getting the best initialization in each sort, and the best sort for each model configuration, is easy since we are using pandas.


In [None]:
# Sanity check: print the shape of the selection for every (et_bin, eta_bin, model) combination.
et_values  = best_inits['et_bin'].unique()
eta_values = best_inits['eta_bin'].unique()
# tags and model indices are paired by their order of first appearance
tag_model_pairs = list(zip(best_inits['train_tag'].unique(), best_inits['model_idx'].unique()))

for iet, ieta, (tag, midx) in itertools.product(et_values, eta_values, tag_model_pairs):
    selection = (
        (best_inits['et_bin'] == iet)
        & (best_inits['eta_bin'] == ieta)
        & (best_inits['model_idx'] == midx)
        & (best_inits['train_tag'] == tag)
    )
    print(iet, ieta, tag, midx, best_inits.loc[selection].shape)

In [None]:
# Inspect the rows of one model in a single bin (et_bin == 2, eta_bin == 0).
# NOTE(review): the tag is 'v1.r0.mlp2', but only 'v1.r1' is filled above (the v1.r0
# fill is commented out), so this selection looks like it is always empty — confirm
# whether 'v1.r1.mlp2' was intended.
best_inits[(best_inits.train_tag == 'v1.r0.mlp2') & (best_inits.et_bin == 2.) & (best_inits.eta_bin == 0.)]

After filtering the sorts there must be only one entry left for each model configuration in each bin.

In [None]:
# Filter the sorts of best_inits using 'max_sp_op' as the figure of merit.
# NOTE(review): presumably keeps a single (best) sort per model and bin — confirm against kolmov.
best_sorts = m_cv.filter_sorts( best_inits , 'max_sp_op')
print(len(best_sorts))

In [None]:
# Display the filtered table (the whole frame is rendered; use .head() if it gets large).
best_sorts

## Get the cross-validation table

In [None]:
# Dump one beamer table per operating point for the five selected network sizes.
table_tags = ['v1.r1.mlp%i' % n_neurons for n_neurons in (2, 5, 10, 15, 20)]

for op in ('tight', 'medium', 'loose', 'vloose'):
    m_cv.dump_beamer_table(
        best_inits,
        [op],
        'v1_r1_' + op,
        title='%s Tunings (v1-r1)' % op,
        tags=list(table_tags),  # fresh list on every call, as in the literal form
    )

In [None]:
# Integrate the results of the 'v1.r1.mlp2' tuning.
# NOTE(review): presumably aggregates the counts over all bins — confirm against kolmov.
m_cv.integrate(best_inits, 'v1.r1.mlp2')

## Plot monitoring training curves

In [None]:
# Plot the training (monitoring) curves from best_inits and best_sorts.
# NOTE(review): 'monitoring_curves' is presumably the output folder/base name — confirm.
m_cv.plot_training_curves( best_inits, best_sorts , 'monitoring_curves' )

## Plot ROC Curves

In [None]:
# ROC curves of the five selected network sizes for the bin et_bin=2, eta_bin=0.
roc_tags = ['v1.r1.mlp%i' % n_neurons for n_neurons in (2, 5, 10, 15, 20)]

m_cv.plot_roc_curves(
    best_sorts,
    roc_tags,
    list(roc_tags),  # second list repeats the tags (presumably the legend labels — confirm)
    'roc_curve.png',
    display=True,
    colors=get_color_fader('blue', 'red', 5),
    et_bin=2,
    eta_bin=0,
    xmin=-0.005,
    xmax=.25,
    ymin=0.9,
    ymax=1.005,
    fontsize=20,
    figsize=(7, 7),
)