# File processor

csv_decoder.py is a packaged version of Josh's methods for reading the .csv files generated by his C++ showering code.
save_and_load.py saves all essential lists/images used in the analysis in NumPy's np.save (.npy) format and loads them back.

This notebook shows example usage of the methods in csv_decoder.py and save_and_load.py. It reads the .csv files containing particle eta, phi, pt, ... produced by Josh's showering program, performs a fast detector simulation to generate detector and jet images, and saves the results in the faster-to-load np.save format. The save(), load() and load_cluster() methods are defined in save_and_load.py.

In [1]:
#TODO store all variables from skype and p3 of paper

# Import local libraries
import csv_decoder
import save_and_load
import importlib
import numpy as np

In [2]:
# Image size settings shared by the jet images and the event images;
# both values are passed on as width=/height= in the cells below.
width, height = 40, 40

In [3]:
# Read the custom-showered .csv files of each sample (hj, vh, vbf).
# Every load_events call yields, among other things, an event_list (the
# collection of 4-vectors of the event constituents) and the event images.
# NOTE: event_list is a LIST of 2D NUMPY ARRAYS; the arrays have different
# lengths along axis=0 and the same length along axis=1.

print('Loading hj events')
(hj_event_list, hj_mass_list, hj_image_list,
 hj_higgs_list, hj_weight_list, num_hj_files) = csv_decoder.load_events(
    path="/home/ffu/higgs-classifier/showering/ggh-hj-csv/",
    contains=".csv",
    pt_cut=1,
    width=width,
    height=height,
)

print('Loading vh events')
(vh_event_list, vh_mass_list, vh_image_list,
 vh_higgs_list, vh_weight_list, num_vh_files) = csv_decoder.load_events(
    path="/home/ffu/higgs-classifier/showering/vh-csv",
    contains=".csv",
    pt_cut=1,
    width=width,
    height=height,
)

print('Loading vbf events')
(vbf_event_list, vbf_mass_list, vbf_image_list,
 vbf_higgs_list, vbf_weight_list, num_vbf_files) = csv_decoder.load_events(
    path="/home/ffu/higgs-classifier/showering/vbf-csv",
    contains=".csv",
    pt_cut=1,
    width=width,
    height=height,
)
# TEMP: 


Loading hj events
Loading .csv event files from /home/ffu/higgs-classifier/showering/ggh-hj-csv/ containing '.csv'
List of files is: ['13701_seed_66352.csv', '21491_seed_50579.csv', '10821_seed_37915.csv', '25036_seed_78488.csv', '10706_seed_50891.csv', '32446_seed_7248.csv', '19981_seed_26367.csv', '11204_seed_61004.csv', '8907_seed_85653.csv', '11716_seed_94420.csv', '27338_seed_72629.csv', '2147_seed_48618.csv', '3841_seed_34994.csv', '16660_seed_12854.csv', '26068_seed_52269.csv', '84_seed_55366.csv', '527_seed_878.csv', '11277_seed_59254.csv', '30639_seed_5902.csv', '2131_seed_29459.csv', '19998_seed_63437.csv', '12396_seed_95743.csv', '29564_seed_30321.csv', '31653_seed_14633.csv', '22123_seed_29941.csv', '946_seed_36549.csv', '15844_seed_79439.csv', '26040_seed_14541.csv', '8973_seed_457.csv', '31789_seed_36820.csv', '12510_seed_61346.csv', '20393_seed_74068.csv', '32500_seed_291.csv', '23341_seed_102774.csv', '13584_seed_94711.csv', '22335_seed_21109.csv', '20740_seed_23852.csv

46files processed.
Currently reading: 31691_seed_85450.csv
47files processed.
Currently reading: 220_seed_59440.csv
48files processed.
Currently reading: 16753_seed_37474.csv
49files processed.
Currently reading: 7132_seed_21785.csv
50files processed.
Loading vbf events
Loading .csv event files from /home/ffu/higgs-classifier/showering/vbf-csv containing '.csv'
List of files is: ['5442_seed_34705.csv', '5995_seed_103548.csv', '18933_seed_92189.csv', '9254_seed_304.csv', '10553_seed_79140.csv', '16709_seed_65204.csv', '24538_seed_17239.csv', '24544_seed_8166.csv', '24306_seed_35393.csv', '10693_seed_25340.csv', '16598_seed_90397.csv', '28986_seed_75424.csv', '593_seed_12299.csv', '25818_seed_99275.csv', '14591_seed_74873.csv', '20836_seed_38854.csv', '10246_seed_19846.csv', '14231_seed_12859.csv', '28381_seed_11666.csv', '5650_seed_18400.csv', '28777_seed_4427.csv', '10158_seed_69863.csv', '21495_seed_74508.csv', '13470_seed_47530.csv', '17673_seed_77300.csv', '26001_seed_87517.csv', '1

In [4]:
# Sanity check: show every 100th row of the first VH event array.
first_vh_event = vh_event_list[0]
print(first_vh_event[::100])

[[ 1.87976e+01  1.60030e-01  1.44452e+00  1.39570e-01 -2.11000e+02
   1.00000e+00]
 [ 2.19699e-01  6.45319e-02  9.38926e-01  0.00000e+00  2.20000e+01
   0.00000e+00]]


In [5]:
# Re-import csv_decoder so that edits made to the module since the first
# import are picked up without restarting the kernel.
importlib.reload(csv_decoder)

# Report the number of entries in each sample's mass list.
print("Dataset sizes:")
print(
    'VH: ', len(vh_mass_list),
    ', Hj: ', len(hj_mass_list),
    ', VBF: ', len(vbf_mass_list)
)

# Cluster each event_list into jets; the results are stored under
# vh/hj/vbf_event_list_clustered.
print('Clustering')
vh_event_list_clustered = csv_decoder.cluster_event(vh_event_list)
hj_event_list_clustered = csv_decoder.cluster_event(hj_event_list)
vbf_event_list_clustered = csv_decoder.cluster_event(vbf_event_list)



Dataset sizes:
VH:  75011 , Hj:  54279 , VBF:  41871
Clustering


In [6]:
# Find the 0.8 non-leading higgs jets and cluster them into 0.2 jets.
# recluster_event returns a pair; only the first element is kept, the second
# is discarded into `junk`.
print('Reclustering')
vh_reclustered, junk = csv_decoder.recluster_event(
    vh_event_list_clustered, vh_higgs_list
)
hj_reclustered, junk = csv_decoder.recluster_event(
    hj_event_list_clustered, hj_higgs_list
)
vbf_reclustered, junk = csv_decoder.recluster_event(
    vbf_event_list_clustered, vbf_higgs_list
)

Reclustering


In [8]:

# Produce the jet images of every sample, then zero-center and normalize.
print('Producing jet images')
vh_recluster_images = csv_decoder.return_jet_image_list(
    vh_event_list, vh_reclustered, 0.8, width=width, height=height
)
hj_recluster_images = csv_decoder.return_jet_image_list(
    hj_event_list, hj_reclustered, 0.8, width=width, height=height
)
vbf_recluster_images = csv_decoder.return_jet_image_list(
    vbf_event_list, vbf_reclustered, 0.8, width=width, height=height
)

# Zero-center and normalize; the three samples are handed over together as
# one tuple, first for the event images and then for the jet images.
# NOTE: the *_image_list names are overwritten with the processed images, so
# re-running this cell feeds already-processed event images back into
# zero_center_and_normalize.
vh_image_list, hj_image_list, vbf_image_list = csv_decoder.zero_center_and_normalize(
    (vh_image_list, hj_image_list, vbf_image_list)
)
vh_recluster_images, hj_recluster_images, vbf_recluster_images = csv_decoder.zero_center_and_normalize(
    (vh_recluster_images, hj_recluster_images, vbf_recluster_images)
)

Producing jet images
Number of events with only one constituent in leading jet: 22
Number of events with only one constituent in leading jet: 8
Number of events with only one constituent in leading jet: 23


In [11]:
# Re-import save_and_load so that recent edits to the module take effect.
importlib.reload(save_and_load)

# Save the event list, mass list, higgs list, weight list, event images and
# reclustered jet images of each sample (vh, hj, vbf).
save_and_load.save(
    'vh-hj-vbf', 'vh',
    vh_event_list, vh_mass_list, vh_higgs_list, vh_weight_list,
    vh_image_list, vh_recluster_images
)
save_and_load.save(
    'vh-hj-vbf', 'hj',
    hj_event_list, hj_mass_list, hj_higgs_list, hj_weight_list,
    hj_image_list, hj_recluster_images
)
save_and_load.save(
    'vh-hj-vbf', 'vbf',
    vbf_event_list, vbf_mass_list, vbf_higgs_list, vbf_weight_list,
    vbf_image_list, vbf_recluster_images
)


In [16]:
# Round-trip test of saving and loading: read back everything that was saved
# for the three samples and compare the VH copies with the in-memory originals.
importlib.reload(save_and_load)

# Every loaded object gets a `_new` suffix so that it never shadows the
# in-memory original it is compared against.
(vh_event_list_new, vh_mass_list_new, vh_higgs_list_new,
 vh_weight_list_new, vh_image_list_new, vh_recluster_images_new) = save_and_load.load('vh-hj-vbf', 'vh')
# Fix: the last target used to be `hj_recluster_images` (no `_new`), which
# silently replaced the in-memory hj jet images with the copy read from disk.
(hj_event_list_new, hj_mass_list_new, hj_higgs_list_new,
 hj_weight_list_new, hj_image_list_new, hj_recluster_images_new) = save_and_load.load('vh-hj-vbf', 'hj')
(vbf_event_list_new, vbf_mass_list_new, vbf_higgs_list_new,
 vbf_weight_list_new, vbf_image_list_new, vbf_recluster_images_new) = save_and_load.load('vh-hj-vbf', 'vbf')

# The event list is compared event by event (one np.array_equal call per
# event); the count of matching events is reported next to the overall verdict.
event_list_correct_count = 0
for i, original_event in enumerate(vh_event_list):
    event_list_correct_count += int(np.array_equal(vh_event_list_new[i], original_event))
event_list_test = event_list_correct_count == len(vh_event_list)

print(event_list_test)
print(event_list_correct_count)
print(np.array_equal(vh_mass_list_new, vh_mass_list))
print(np.array_equal(vh_higgs_list_new, vh_higgs_list))
print(np.array_equal(vh_weight_list_new, vh_weight_list))
print(np.array_equal(vh_image_list_new, vh_image_list))
print(np.array_equal(vh_recluster_images_new, vh_recluster_images))

True
75011
True
True
True
True
True


# Legacy code for hbb-qcd (no longer supported) 

In [None]:
# NOTE: NO LONGER SUPPORTED
# Legacy hbb-qcd pipeline: load Josh's background/signal samples, cluster,
# recluster, build jet images and normalize them.
# Reading in Josh's files; background is a single large file (pT cut must be 1 here for Josh's sample)
# This read produces event_list (collection of raw vectors) and event images
#
# This will be used to test saving mechanisms.
# THIS PART IS NO LONGER COMPATIBLE WITH NEW ANALYSIS CODE! NEW CODE READS WEIGHT AND HIGGS JET INFO.
# (The calls below unpack four values from load_events and pass a single
# argument to recluster_event, unlike the vh/hj/vbf cells of this notebook.)
print('Loading background events')
background_event_list,background_mass_list,background_image_list,num_background_files = \
    csv_decoder.load_events(path="/data1/users/jzlin/MLM/background_7413/",\
                contains="actual_actual",max_files=1,pt_cut=1, width=width, height=height)
# Overrides the count returned by load_events with a hard-coded value; it is
# used in the background weight calculation of the next cell.
# NOTE(review): presumably the number of files merged into the single large
# background file -- confirm.
num_background_files = 15693

print('Loading signal events')
# max_read is set to the number of background events that were loaded.
signal_event_list,signal_mass_list,signal_image_list,num_signal_files = \
    csv_decoder.load_events(path="/data1/users/jzlin/MLM/heavy_signal/",\
                contains="actual_signal",max_read = len(background_event_list),pt_cut=1, width=width, height=height)

# TEMP: 

# Check size of dataset
print(len(background_mass_list),len(signal_mass_list))


# This is not used
# Boolean mask selecting background entries with 115 < mass < 135.
background_mass_window = np.logical_and(np.array(background_mass_list) > 115,np.array(background_mass_list) < 135)

# Cluster events_lists into jets. The results are named background/signal_event_list_clustered
print('Clustering')
background_event_list_clustered = csv_decoder.cluster_event(background_event_list)
signal_event_list_clustered = csv_decoder.cluster_event(signal_event_list)

# Reclustering the events (i.e. clustering within events)
print('Reclustering')
background_reclustered = csv_decoder.recluster_event(background_event_list_clustered)
signal_reclustered = csv_decoder.recluster_event(signal_event_list_clustered)

# Produce jet images, then zero-center and normalize
print('Producing jet images')
background_recluster_images = csv_decoder.return_fine_image_list_reclustered(background_event_list,
                                                           background_reclustered,0.8, width=width, height=height)
signal_recluster_images = csv_decoder.return_fine_image_list_reclustered(signal_event_list,
                                                           signal_reclustered,0.8, width=width, height=height)

# Zero centering and normalizing
# Background and signal are processed together as a pair; the input names are
# overwritten with the processed images.
background_image_list, signal_image_list = csv_decoder.zero_center_and_normalize_pair(background_image_list,signal_image_list)
background_recluster_images, signal_recluster_images = csv_decoder.zero_center_and_normalize_pair(background_recluster_images, signal_recluster_images)

In [None]:
# Weight calculation for Josh's sample
# Legacy hbb-qcd cell: depends on the variables produced by the legacy
# loading cell above (num_background_files, num_signal_files, event/mass/image lists).
backgroundCross = 2.048e-06 # Cross-section of processes in millibarns, NOT USED

actual_background_cross=2.84e-9 # In barns, used in background weight
average_number_accepted=2162 # Used in background weight

# Signal cross section and accepted-event count are each the average of two values.
actual_signal_cross = np.average([1.738e-14,1.7277e-14]) # Used in signal weight
signal_accepted = np.average([8708-189,8827-172]) # Used in signal weight 

background_weight = actual_background_cross*35.9*1e15/(average_number_accepted*num_background_files)
signal_weight = actual_signal_cross*35.9*1e15/(signal_accepted*num_signal_files)
# Weight is calculated by cross section * 35.9(integrated luminosity) * 1e15 / # total event 
# NOTE(review): the original comment here read `cross_sec*L_int = dN/dt`; with
# an integrated luminosity the product is the expected event count,
# cross_sec * L_int = N (dN/dt would need the instantaneous luminosity).
# NOTE(review): presumably 35.9 is in fb^-1 and 1e15 converts barns to
# femtobarns -- confirm the units.

# Testing saving time for Josh's samples
# Time: 19.26s
# Uses the old save()/load() signatures that take only the 'hbb-qcd' name plus
# the background/signal objects.
save_and_load.save('hbb-qcd', background_event_list, signal_event_list, background_mass_list, signal_mass_list,\
        background_weight, signal_weight, background_image_list, signal_image_list,\
        background_recluster_images, signal_recluster_images)

# Testing reading for .npy files of Josh's samples
new_background_event_list, new_signal_event_list, new_background_mass_list, new_signal_mass_list,\
        new_background_weight, new_signal_weight, new_background_image_list, new_signal_image_list,\
        new_background_recluster_images, new_signal_recluster_images = save_and_load.load('hbb-qcd')
# Compare each reloaded object with its in-memory original; the two event
# lists are loaded but not compared here.
print(np.array_equal(new_background_mass_list, background_mass_list))
print(np.array_equal(new_signal_mass_list, signal_mass_list))
print(np.array_equal(new_background_weight, background_weight))
print(np.array_equal(new_signal_weight, signal_weight))
print(np.array_equal(new_background_image_list, background_image_list))
print(np.array_equal(new_signal_image_list, signal_image_list))
print(np.array_equal(new_background_recluster_images, background_recluster_images))
print(np.array_equal(new_signal_recluster_images, signal_recluster_images))