# Run pema

Authors:
 - Angevaare, Joran <j.angevaare@nikhef.nl> (based on the Peak_Classification_Tester) 


## This notebook ##

**Goal**
 - Simulate waveforms and compare how the acceptance of S1s and S2s changes as a function of the chosen clustering and classification parameters.

**Known issues**
 -


**Start up ``strax`` + load tools**

In [None]:
import pema
import os

In [None]:
# Link pema's initialisation script into the working directory (only when nothing
# named init.py exists there yet) and run it inside this kernel.
if not os.path.exists('init.py'):
    # pema_init.py is looked up in the `bin` directory next to the pema package directory
    init = os.path.join(pema.__path__[0], '..', 'bin', "pema_init.py")
    !ln -s $init init.py
# NOTE(review): later cells use names (e.g. straxen, plt, np, pd, tqdm) that are not
# imported in this notebook; presumably init.py provides them -- confirm.
%run init.py

In [None]:
# Root directory below which figures, processed data and raw data are stored.
# NOTE(review): this is an absolute, machine-specific path -- adapt it to your setup.
base_dir = '/mnt/d/pema/'
# The pema version is part of the name, so outputs of different versions do not mix
data_name = f'pema_test_{pema.__version__}'
fig_dir = os.path.join(base_dir, f'figures_summary_{data_name}')
data_dir = os.path.join(base_dir, 'processed_data')
raw_data_dir = os.path.join(base_dir, 'raw_data')
# The simulation instructions are written to (and read from) this CSV file
instructions_csv = f"./inst_{data_name}.csv"

# Output naming: labels used in the plots for the two configurations that are compared
default_label = 'Normal clustering'
custom_label = 'Changed clustering'

**Initialize the waveform simulator with instructions**

In [None]:
# Six consecutive, arbitrarily chosen run numbers as zero-padded six character
# strings. Having several runs allows running jobs in parallel, and the gains
# are taken from CMT.
run_list = [str(run_number).zfill(6) for run_number in range(11665, 11665 + 6)]
run_list

In [None]:
# Just some id which allows CMT to load; the first run of the list is taken.
# NOTE(review): run_id is not referenced again in the visible cells -- verify it is still needed.
run_id = run_list[0]

In [None]:
# Instructions for the waveform simulation. Setting up instructions like this may
# take a while.
# NOTE(review): the original comment stopped mid-sentence ("You can set e.g."); it is
# unclear which setting was suggested to speed this up -- TODO confirm.
instructions = dict(
    event_rate=100, # Don't make too large -> overlapping truth info
    chunk_size=5, # keep large -> less overhead but takes more RAM
    nchunk=100, # set to 100
    photons_low=1, #PE
    photons_high=100, #PE
    electrons_low=1,
    electrons_high=100,
    tpc_radius=straxen.tpc_r,
    tpc_length=148.1, # TPC length approx
    drift_field = 18.5, # V/cm VERIFY!
    timing = 'uniform', # Double S1 peaks uniform over time
)
# Pattern-map files (resolved to absolute paths via ntauxfiles) that are handed to the
# context as 'fax_config_override' further down in this notebook.
fax_override = {
    's1_pattern_map' : ntauxfiles.get_abspath('XENONnT_s1_xyz_patterns_LCE_corrected_qes_MCva43fa9b_wires.pkl'),
    's2_pattern_map' : ntauxfiles.get_abspath('XENONnT_s2_xy_patterns_LCE_corrected_qes_MCva43fa9b_wires.pkl')}


## Write instructions to CSV

In [None]:
# Write the instructions to the CSV file; pema.rand_instructions is passed as the
# function to get the instructions from.
pema.inst_to_csv(instructions, instructions_csv, get_inst_from=pema.rand_instructions)

In [None]:
# Configuration overrides for the simulation context.
# TODO can we add noise?
config_update = {
    'detector': 'XENONnT',
    # The simulator reads the instructions from the CSV file (as absolute path)
    'fax_file': os.path.abspath(instructions_csv),
    'fax_config': 'fax_config_nt_low_field.json',
    'fax_config_override': fax_override,
}

In [None]:
# Context used for the default configuration throughout this notebook
st = pema.pema_context(
    base_dir=base_dir,
    config_update=config_update,
    raw_dir=raw_data_dir,
    data_dir=data_dir,
)

### Optional: set the config to use different defaults
Useful when iteratively optimizing parameters.

In [None]:
# st.set_config(
# {'s2_merge_max_duration': 30000,
#  's2_merge_max_gap': 5000})

### Make raw-records and peaklets with the default configuration. Process each run in a separate job

In [None]:
# Shell commands that initialize the software environment (here: activating the conda
# environment "strax"). The paths are user specific, so you need to change this.
# NOTE(review): in the visible cells this string is only referenced by the
# commented-out job.exec_dali(...) call.
environ_init = '''eval "$(/home/angevaare/software/Miniconda3/bin/conda shell.bash hook)"
conda activate strax
export PATH=/home/angevaare/software/Miniconda3/envs/strax/bin:$PATH'''
environ_init

In [None]:
# Start one job per run that targets 'records' and 'peaklets'; no config
# overrides are passed (config={}), i.e. the settings of st are used.
job_registry=[]
for r in run_list:
    print(r)
    job = pema.ProcessRun(st, run_id=r, target=('records', 'peaklets'), config={})
    job_registry.append(job)

    # Build the command for this job and execute it via exec_local
    cmd, job_name = job.make_cmd()
    job.exec_local(cmd, job_name)

In [None]:
# When working locally: call communicate() on the log_file of the job that was started last.
# NOTE(review): presumably log_file is a subprocess handle and this blocks until that
# job has finished; the earlier jobs are not waited for here -- confirm.
job_registry[-1].log_file.communicate();

In [None]:
# Check if all the files are stored: the all_stored(show_key=True) result of every
# registered job is concatenated into a single overview.
pd.concat([j.all_stored(show_key=True) for j in job_registry])

# Submit the jobs
Let's start all the jobs for the requested configs


We are going to compare with another config and run jobs to create the associated data.

In [None]:
# Settings of the alternative ("custom") configuration that is compared to the default
summary_config = {
    's2_merge_max_duration': 30000,
    's2_merge_max_gap': 5000,
    'peaklet_gap_threshold': 525,
}

In [None]:
# Check if all the runs we need are available at the record level; only runs with
# stored 'records' are used from here on.
selected_runs = [r for r in run_list if st.is_stored(r, 'records')]
# Report the selected runs and which percentage of the requested runs they are
print(f'Doing runs:\n{selected_runs}\n{len(selected_runs)/len(run_list)*100:.1f}%')
# True only when every requested run is available.
# NOTE(review): all_runs is not used in the visible cells.
all_runs = len(selected_runs) == len(run_list)

In [None]:
# The configs we need to take into account: the config of the context st and
# the custom settings from summary_config.
confs = [st.config, summary_config]

In [None]:
# Run one job per config; every job gets all selected runs and the targets below.
job_registry = []
target = ('raw_records', 'records', 'truth_matched', 'match_acceptance_extended')
# NOTE(review): RAM, queue_max, check_que_after and part are only referenced by the
# commented-out batch-submission code further down.
RAM = 15000
queue_max = 200
check_que_after = 50
part = 'xenon1t'

for i, conf in enumerate(tqdm(confs, desc='configs')):
    job = pema.ProcessRun(st, run_id=selected_runs, target=target, config=conf)
    job_registry.append(job)

    cmd, job_name = job.make_cmd()
    # Disabled alternative: submit the job to a batch queue instead of running locally.
    # NOTE(review): as written the call below lacks a comma after
    # `bash_activate = environ_init` and after `partition = part`; fix before re-enabling.
#         job.exec_dali(cmd,
#                       job_name,
#                       bash_activate = environ_init
#                      ram = RAM,
#                      partition = part
#                      max_hours= '04:00:00')
    # Execute locally and call communicate() on the job's log_file before the next config
    job.exec_local(cmd, job_name)
    job.log_file.communicate()
    # Disabled: wait while more than queue_max jobs are in the queue of partition `part`
#         if i % check_que_after:
#             q = !squeue -u `echo $USER` | grep $part
#             while len(q)> queue_max:
#                 q = !squeue -u `echo $USER` | grep $part
#                 print(f'waiting 10s, queue is full. {len(q)}')
#                 time.sleep(10)

In [None]:
cmd

#### Check the progress of the data

In [None]:
pd.concat([j.all_stored(show_key=True) for j in job_registry])

### Load the simulated data for the "default" and "custom" configurations

In [None]:
summary_config

In [None]:
# Make a second context from st and apply the custom settings to it, so that
# st2 is used for the "custom" data and st for the "default" data.
# NOTE(review): presumably new_context() leaves st itself untouched -- confirm.
st2 = st.new_context()
st2.set_config(summary_config)

In [None]:
# Now load the data: the simulation truth and the peak_basics of the selected runs,
# the latter once from the default context (st) and once from the custom one (st2).
truth = st.get_array(selected_runs, 'truth', progress_bar = False)
data_default = st.get_array(selected_runs, 'peak_basics', progress_bar = False)
data_custom = st2.get_array(selected_runs, 'peak_basics', progress_bar = False)

In [None]:
# There should be some difference
len(data_custom), len(data_default)

## Basic checks, what did we simulate?
Let's see for a second what kind of data we have simulated.

In [None]:
# Compare, per type, the area distribution in the truth with the areas found in the
# default and in the custom data. One figure is made for each type.
dpe_fraction = 0.219 # see fax_file
area_t = truth['n_photon']
area_d = data_default['area']
area_c = data_custom['area']
nbins = 50
# range_ = [0,1e4]

# (areas, types, legend suffix) of every sample that is drawn in each figure.
# The truth holds photon counts, which are scaled by (1 + dpe_fraction) before
# they are compared to the areas in the data.
area_samples = (
    (area_t * (1 + dpe_fraction), truth['type'], ' in truth'),
    (area_d, data_default['type'], ' in data'),
    (area_c, data_custom['type'], ' in data-custom'),
)
for typ in tqdm([1,2,4]):
    # Use the same name in the legend and in the title; a hard-coded "S{typ}" would
    # read "S4" for type 4.
    type_name = wfsim.RawData.symtype(typ)
    plt.figure(figsize=(7,4))
    for areas, sample_types, suffix in area_samples:
        plt.hist(areas[sample_types == typ], bins=nbins,
                 label=type_name + suffix, alpha=0.7)
    plt.xlabel("area [PE]")
    plt.legend()

    plt.yscale('log')
    plt.title(f'{type_name} area distribution')
#     pema.save_canvas(f'truth_vs_data_sim_s{typ}', save_dir=fig_dir)
    plt.show()

In [None]:
def dist_xyz(data, si=2, bins=50, **kwargs):
    """Show where the entries of one type are located in the detector.

    Two 2D histograms are drawn side by side: z versus R^2 (= x^2 + y^2) on the
    left and y versus x on the right. On the right panel a circle with radius
    ``straxen.tpc_r`` is drawn around the origin.

    :param data: array with the fields 'type', 'x', 'y' and 'z'
    :param si: value of data['type'] that is selected (default 2)
    :param bins: passed as ``bins`` to both ``plt.hist2d`` calls
    :param kwargs: further keyword arguments for both ``plt.hist2d`` calls
    """
    # Select once instead of re-indexing the data for every coordinate
    selected = data[data['type'] == si]
    x, y, z = selected['x'], selected['y'], selected['z']

    _, (ax_r2_z, ax_xy) = plt.subplots(1, 2, figsize=(13,6))

    plt.sca(ax_r2_z)
    plt.hist2d(x**2 + y**2, z, bins=bins, **kwargs)
    # R^2 is a squared length, so its unit is cm^2 (not cm)
    plt.xlabel('$R^{2}$ [cm$^{2}$]')
    plt.ylabel('z [cm]')

    plt.sca(ax_xy)
    ax_xy.set_aspect('equal')
    plt.hist2d(x, y, bins=bins, **kwargs)
    plt.xlabel('x')
    plt.ylabel('y')
    ax_xy.set_facecolor('lightgrey')
    # Outline of the TPC radius, drawn on top of the histogram
    ax_xy.add_artist(plt.Circle(
        (0, 0),
        straxen.tpc_r,
        edgecolor='c',
        facecolor='none',
        zorder=5,
        linewidth=1))

In [None]:
# Spatial distribution of the simulated S2s (type 2 in the truth) on a logarithmic colour scale
dist_xyz(truth, si=2, norm=LogNorm())
plt.suptitle('S2 distribution')

## New matching! 
Get the matched acceptance peaks from the plugin

In [None]:
# for j in job_registry:
#     j.purge_below()

In [None]:
# Load 'match_acceptance_extended' for the selected runs from the default context (st)
# and from the custom context (st2). These two arrays are used by the plots below.
default_acceptence = st.get_array(selected_runs, 'match_acceptance_extended')
custom_acceptence = st2.get_array(selected_runs, 'match_acceptance_extended')

### Plotting the results

In [None]:
def si_acceptance(si, binedges, on_axis='n_photon', nbins=50):
    """Overlay the acceptance of the default and the custom data for one type.

    Reads the notebook-level arrays ``default_acceptence`` and
    ``custom_acceptence`` and labels them with ``default_label`` and
    ``custom_label`` respectively.

    :param si: value of the 'type' field to select
    :param binedges: passed on to ``pema.summary_plots.acceptance_plot``
    :param on_axis: name of the field the acceptance is plotted against
    :param nbins: passed as ``nbins`` to ``pema.summary_plots.acceptance_plot``
    """
    compared = (
        (default_acceptence, default_label),
        (custom_acceptence, custom_label),
    )
    for acceptance, label in compared:
        of_type = acceptance[acceptance['type'] == si]
        pema.summary_plots.acceptance_plot(
            of_type, on_axis, binedges, nbins=nbins, plot_label=label)
    plt.ylabel('Arb. Acceptance')
    plt.title(f"S{si} acceptance")
    plt.legend()

In [None]:
# S1 acceptance; this figure is only shown
si_acceptance(1, [0,100])
plt.show()

# S2 acceptance; this figure is also saved to fig_dir
si_acceptance(2, [0,200])
pema.save_canvas('improved_s2_acceptance', save_dir=fig_dir)


In [None]:
def _plot_matching_panel(ax, acceptance, label, si, on_axis, nbins, plot_range):
    """Draw the peak-matching histogram of one configuration on ``ax``.

    Only entries of type ``si`` with ``plot_range[0] < on_axis < plot_range[1]``
    are used. Returns the number of selected entries.
    """
    plt.sca(ax)
    sel = ((acceptance['type'] == si)
           & (acceptance[on_axis] > plot_range[0])
           & (acceptance[on_axis] < plot_range[1])
          )
    pema.summary_plots.plot_peak_matching_histogram(acceptance[sel], on_axis, bin_edges = nbins)
    # Name the configuration in the top left corner of the panel
    plt.text(0.05,0.95,
             label,
             transform=plt.gca().transAxes,
             ha = 'left',
             va = 'top',
             bbox=dict(boxstyle="round", fc="w")
            )
    plt.legend(loc=(1.01,0))
    plt.xlim(*plot_range)
    return np.sum(sel)


def acceptance_summary(si, on_axis, axis_label, nbins = 100, plot_range = (0, 200), save_name=''):
    """Make a three panel summary of the acceptance for one type and save it.

    Top panel: peak-matching histogram of ``default_acceptence``; middle panel:
    the same for ``custom_acceptence``; bottom panel: the acceptance of both as
    drawn by ``pema.summary_plots.acceptance_plot``. The panels share their x-axis.
    The figure is saved to ``fig_dir`` as ``{si}_acceptance_detailed_{save_name}``.

    :param si: value of the 'type' field to select
    :param on_axis: name of the field that is put on the x-axis
    :param axis_label: label of the (shared) x-axis
    :param nbins: number of bins, used for all three panels
    :param plot_range: (low, high) range of the x-axis
    :param save_name: suffix of the name under which the figure is saved
    """
    _, axes = plt.subplots(3, 1, figsize=(10,12), sharex=True)

    _plot_matching_panel(axes[0], default_acceptence, default_label,
                         si, on_axis, nbins, plot_range)
    n_custom = _plot_matching_panel(axes[1], custom_acceptence, custom_label,
                                    si, on_axis, nbins, plot_range)
    print(f'cust {n_custom}')

    # Bottom panel: acceptance of both configurations on top of each other
    plt.sca(axes[2])
    for acceptance, label in ((default_acceptence, default_label),
                              (custom_acceptence, custom_label)):
        mask = acceptance['type'] == si
        pema.summary_plots.acceptance_plot(acceptance[mask], on_axis, plot_range, nbins=nbins,
                                           plot_label=label)
    plt.legend(loc=(1.01,0))
    plt.ylabel('Arb. acceptance fraction')
    plt.xlim(*plot_range)
    plt.xlabel(axis_label)
    plt.ylim(0,1)

    # No vertical space between the panels since they share the x-axis
    plt.subplots_adjust(hspace=0)
    plt.suptitle(f'S{si} Acceptance', y=0.9)
    pema.save_canvas(f'{si}_acceptance_detailed_{save_name}', save_dir=fig_dir)


In [None]:
# S1 acceptance as function of the number of simulated photons
acceptance_summary(
    si=1,
    on_axis='n_photon',
    axis_label='N photons simulated',
    nbins=100,
    plot_range=(0, 30),
    save_name='tot_compare',
)

In [None]:
# S2 acceptance as function of the number of simulated photons
acceptance_summary(
    si=2,
    on_axis='n_photon',
    axis_label='N photons simulated',
    nbins=100,
    plot_range=(0, 250),
    save_name='tot_compare',
)

In [None]:
# S2 acceptance as function of the simulated z
acceptance_summary(
    si=2,
    on_axis='z',
    axis_label='z (simulated) [cm]',
    nbins=75,
    plot_range=(-160, 10),
    save_name='tot_compare',
)

# Look at the difference in the waveforms

In [None]:
# Select the entries for which the custom data has rec_bias > 1 and compare the
# outcomes of the default (st) and the custom (st2) configuration for those entries.
# NOTE(review): the mask is computed on custom_acceptence but also indexes
# default_acceptence; this assumes both arrays have the same length and ordering
# (presumably one entry per simulated peak) -- confirm.
mask = custom_acceptence['rec_bias'] > 1

pema.compare_outcomes(st, default_acceptence[mask],
                      st2, custom_acceptence[mask],
                      only_different=False,
#                       fig_dir=os.path.join(fig_dir, 'total_config'),
                      max_peaks=20)

# Plot bias reconstruction and acceptance

In [None]:
def bias_recons(si):
    """Plot 'rec_bias' and 'acceptance_fraction' per outcome for one type and save the figure.

    Uses the notebook-level array ``custom_acceptence``. One row of panels is
    made for every unique value of its 'outcome' field: the left column shows a
    histogram of 'rec_bias' (as points), the right column a histogram of
    'acceptance_fraction' (as steps). The figure is saved to ``fig_dir`` as
    ``s{si}_bias_recon_per_outcome``.

    :param si: value of the 'type' field to select
    """
    dat = custom_acceptence[custom_acceptence['type'] == si]
    labels = np.unique(dat['outcome'])
    if len(labels) == 0:
        # plt.subplots cannot make a figure with zero rows
        print(f'No entries of type {si} in custom_acceptence, nothing to plot')
        return

    # squeeze=False keeps axes_rec two dimensional, also when there is only one outcome
    _, axes_rec = plt.subplots(len(labels), 2, figsize=(10,12), squeeze=False)
    bias_axes, acceptance_axes = axes_rec.T
    # Colors of the default property cycle (public API); every outcome gets the
    # same color in both columns.
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    for li, label in enumerate(labels):
        of_outcome = dat[dat['outcome'] == label]
        color = colors[li % len(colors)]

        # Left column: distribution of the reconstruction bias
        plt.sca(bias_axes[li])
        y, x = np.histogram(of_outcome['rec_bias'],
                            range=[0,1.2],
                            bins=25,
                           )
        x = (x[1:] + x[:-1])/2  # bin centers
        plt.scatter(x, y, c = color, label=label)
        plt.legend(loc='upper left', fontsize='small')
        plt.axvline(0.65)
        plt.ylabel('counts')

        # Right column: distribution of the acceptance fraction
        plt.sca(acceptance_axes[li])
        y, x = np.histogram(of_outcome['acceptance_fraction'],
                            range=[-1.25,1.25],
                            bins=5,
                           )
        x = (x[1:] + x[:-1])/2  # bin centers
        plt.plot(x, y, c = color, label=label, ds='steps-mid')
        # Put the y-axis on the right so it does not overlap with the left column
        plt.gca().yaxis.set_label_position("right")
        plt.gca().yaxis.tick_right()
        plt.ylabel('counts')

    # Only the bottom panels get an x-label
    bias_axes[-1].set_xlabel('Bias fraction')
    acceptance_axes[-1].set_xlabel('Acceptance fraction')
    plt.subplots_adjust(hspace=0, wspace=0.05)
    plt.suptitle(f'S{si} bias reconstruction & arb. acceptance', y=0.9)
    pema.save_canvas(f's{si}_bias_recon_per_outcome', save_dir=fig_dir)

In [None]:
# Make (and save) the per-outcome overview for the S1s and for the S2s
bias_recons(1)
bias_recons(2)

In [None]:
# Histogram of the acceptance_fraction for every run separately, one figure per run.
# NOTE(review): this loops over run_list (not selected_runs) and uses the default context st.
for r in run_list:
    print(r)
    plt.hist(st.get_array(r, 'match_acceptance_extended')['acceptance_fraction'], label=r)
    plt.yscale('log')
    plt.title(r)
    plt.show()
    print()

In [None]:
# Inspect the last run with the custom config: show the entries that have the
# outcome 'found' although their rec_bias is below 0.11.
df = st2.get_df(run_list[-1], 'match_acceptance_extended')
df[(df['rec_bias']<0.11)&(df['outcome']=='found')]

In [None]:
df

In [None]:
# Plot the 'id' field of peaks_extended against the row index (last run, custom config)
plt.plot( st2.get_df(run_list[-1], 'peaks_extended')['id'])
plt.ylim(bottom=0)

In [None]:
# Plot the 'matched_to' field of match_acceptance_extended against the row index
# (last run, custom config)
plt.plot( st2.get_df(run_list[-1], 'match_acceptance_extended')['matched_to'])
plt.ylim(bottom=0)

In [None]:
# Plot the 'matched_to' field of match_acceptance_extended against the row index
# (last run, custom config).
# NOTE(review): this cell is identical to the cell directly before it -- consider removing one.
plt.plot( st2.get_df(run_list[-1], 'match_acceptance_extended')['matched_to'])
plt.ylim(bottom=0)