# File processor

csv_decoder.py packages Josh's methods for reading the .csv files generated by his C++ showering code.
save_and_load.py saves and loads all essential lists/images used in the analysis in NumPy's np.save (.npy) format.

This notebook demonstrates the use of the methods in csv_decoder.py and save_and_load.py. It reads .csv files containing particle eta, phi, pt, ... produced by Josh's showering program, performs a fast detector simulation to generate detector & jet images, and saves the results in the faster np.save format. The save(), load() and load_cluster() methods are defined in save_and_load.py.

In [20]:
# Import local libraries
import csv_decoder
import save_and_load
import numpy as np

In [3]:
# Width and height settings shared by the jet images and the event images
width, height = 40, 40

In [9]:
# reading in event files from a given folder.
def generate_images(path, contains=".csv", max_read=float('inf'), max_files=float('inf'),
                    weighted=0, pt_cut=1, width_param=width, height_param=height):
    """Read event .csv files from a folder and build event and jet images.

    The events are loaded with csv_decoder.load_events, clustered with
    csv_decoder.cluster_event, reclustered with csv_decoder.recluster_event,
    and finally turned into jet images with
    csv_decoder.return_fine_image_list_reclustered.

    Parameters
    ----------
    path
        Folder handed to csv_decoder.load_events.
    contains
        Forwarded to csv_decoder.load_events (presumably a filename filter;
        confirm against csv_decoder).
    max_read, max_files, weighted
        Forwarded unchanged to csv_decoder.load_events.
    pt_cut
        Forwarded to csv_decoder.load_events. Defaults to 1.
    width_param, height_param
        Image width/height, used for BOTH the event images produced by
        load_events and the jet images. Default to the notebook-level
        ``width`` and ``height`` settings.

    Returns
    -------
    tuple
        ``(event_list, mass_list, weight, image_list, recluster_images)``.
        ``weight`` is a placeholder and is always -1 (the weight calculation
        is unfinished).
    """
    # Reading in files. The file count returned by load_events is not needed
    # here, hence the underscore-prefixed name.
    event_list, mass_list, image_list, _num_files = csv_decoder.load_events(
            path=path,
            contains=contains,
            max_read=max_read,
            max_files=max_files,
            weighted=weighted,
            # Forward the caller's cut instead of a hard-coded 1, so that the
            # pt_cut argument actually takes effect.
            pt_cut=pt_cut,
            width=width_param,
            height=height_param)

    # Cluster the event list into jets
    print('Clustering')
    event_list_clustered = csv_decoder.cluster_event(event_list)

    # Reclustering the events (i.e. clustering within events)
    print('Reclustering')
    reclustered = csv_decoder.recluster_event(event_list_clustered)

    # Produce jet images. Zero-centering and normalizing are NOT done here;
    # the caller is expected to do that afterwards.
    print('Producing jet images')
    recluster_images = csv_decoder.return_fine_image_list_reclustered(
            event_list,
            reclustered,
            0.8,  # forwarded positionally; meaning is defined by csv_decoder (TODO confirm)
            # Use the function's own size parameters rather than the module
            # globals, so jet images match the event images built above.
            width=width_param,
            height=height_param)

    # Weight calculation, UNFINISHED! -1 is returned as a placeholder.
    weight = -1

    return event_list, mass_list, weight, image_list, recluster_images


In [None]:
# Load Josh's samples. The background is a single large file, and the pT cut
# must be 1 for Josh's sample. Each read produces an event_list (collection of
# raw vectors) together with the event images.

# These samples are also used to test the saving mechanisms.

print('Loading background events')
(background_event_list,
 background_mass_list,
 background_image_list,
 num_background_files) = csv_decoder.load_events(
    path="/data1/users/jzlin/MLM/background_7413/",
    contains="actual_actual",
    max_files=1,
    pt_cut=1,
    width=width,
    height=height)
# The file count returned by the read is replaced with a fixed value.
num_background_files = 15693

print('Loading signal events')
# Read at most as many signal events as there are background events.
(signal_event_list,
 signal_mass_list,
 signal_image_list,
 num_signal_files) = csv_decoder.load_events(
    path="/data1/users/jzlin/MLM/heavy_signal/",
    contains="actual_signal",
    max_read=len(background_event_list),
    pt_cut=1,
    width=width,
    height=height)

# TEMP:

# Report the size of both datasets
print(len(background_mass_list), len(signal_mass_list))


# Boolean mask selecting background masses strictly between 115 and 135.
# This mask is not used.
background_mass_window = np.logical_and(
    np.array(background_mass_list) > 115,
    np.array(background_mass_list) < 135)

# Cluster the event lists into jets; results are stored as
# background/signal_event_list_clustered.
print('Clustering')
background_event_list_clustered = csv_decoder.cluster_event(
    background_event_list)
signal_event_list_clustered = csv_decoder.cluster_event(
    signal_event_list)

# Recluster the events (i.e. clustering within events)
print('Reclustering')
background_reclustered = csv_decoder.recluster_event(
    background_event_list_clustered)
signal_reclustered = csv_decoder.recluster_event(
    signal_event_list_clustered)

# Build the jet images; they are zero-centered and normalized further below.
print('Producing jet images')
background_recluster_images = csv_decoder.return_fine_image_list_reclustered(
    background_event_list,
    background_reclustered,
    0.8,
    width=width,
    height=height)
signal_recluster_images = csv_decoder.return_fine_image_list_reclustered(
    signal_event_list,
    signal_reclustered,
    0.8,
    width=width,
    height=height)

# Zero-center and normalize: first the event images, then the jet images,
# each time treating background and signal as a pair.
(background_image_list,
 signal_image_list) = csv_decoder.zero_center_and_normalize_pair(
    background_image_list,
    signal_image_list)
(background_recluster_images,
 signal_recluster_images) = csv_decoder.zero_center_and_normalize_pair(
    background_recluster_images,
    signal_recluster_images)

In [None]:
# Event weights for Josh's sample
backgroundCross = 2.048e-06  # Cross-section of processes in millibarns, NOT USED

# Inputs to the background weight
actual_background_cross = 2.84e-9  # In barns
average_number_accepted = 2162

# Inputs to the signal weight (each is the average of two values)
actual_signal_cross = np.average([1.738e-14, 1.7277e-14])
signal_accepted = np.average([8708 - 189, 8827 - 172])

# weight = cross section * 35.9 (integrated luminosity) * 1e15
#          / (accepted count * number of files)
# The product cross_sec * L_int is an expected number of events N (not a rate);
# dividing by the total event count turns it into a per-event weight.
# The 1e15 factor presumably converts the luminosity to inverse barns -- confirm.
background_weight = actual_background_cross*35.9*1e15/(
    average_number_accepted*num_background_files)
signal_weight = actual_signal_cross*35.9*1e15/(
    signal_accepted*num_signal_files)

# Test how long saving takes for Josh's samples
# Time: 19.26s
save_and_load.save(
    'hbb-qcd',
    background_event_list, signal_event_list,
    background_mass_list, signal_mass_list,
    background_weight, signal_weight,
    background_image_list, signal_image_list,
    background_recluster_images, signal_recluster_images)

# Test reading the .npy files of Josh's samples back in
(new_background_event_list, new_signal_event_list,
 new_background_mass_list, new_signal_mass_list,
 new_background_weight, new_signal_weight,
 new_background_image_list, new_signal_image_list,
 new_background_recluster_images,
 new_signal_recluster_images) = save_and_load.load('hbb-qcd')

# Round-trip check: each line prints True when the reloaded object equals the
# in-memory one. The event lists themselves are not compared here.
print(np.array_equal(new_background_mass_list, background_mass_list))
print(np.array_equal(new_signal_mass_list, signal_mass_list))
print(np.array_equal(new_background_weight, background_weight))
print(np.array_equal(new_signal_weight, signal_weight))
print(np.array_equal(new_background_image_list, background_image_list))
print(np.array_equal(new_signal_image_list, signal_image_list))
print(np.array_equal(new_background_recluster_images, background_recluster_images))
print(np.array_equal(new_signal_recluster_images, signal_recluster_images))

In [10]:
# Run the full read / cluster / jet-image chain on the VBF and VH samples,
# using the default settings of generate_images.
(vbf_event_list,
 vbf_mass_list,
 vbf_weight,
 vbf_image_list,
 vbf_recluster_images) = generate_images(
    path='/home/ffu/higgs-classifier/samples/vbf-csv/')
(vh_event_list,
 vh_mass_list,
 vh_weight,
 vh_image_list,
 vh_recluster_images) = generate_images(
    path='/home/ffu/higgs-classifier/samples/vh-csv/')

Loading .csv event files from /home/ffu/higgs-classifier/samples/vbf-csv/ containing ".csv"
List of files is: ['18785.csv_seed_35134.csv', '18811.csv_seed_89257.csv', '14476.csv_seed_55984.csv', '23458.csv_seed_36303.csv', '20603.csv_seed_31091.csv', '29482.csv_seed_36569.csv', '21204.csv_seed_54727.csv', '20755.csv_seed_65604.csv', '11662.csv_seed_48142.csv', '16875.csv_seed_52464.csv', '32557.csv_seed_3475.csv', '29973.csv_seed_94611.csv', '2811.csv_seed_46037.csv', 'shower.log', '25983.csv_seed_4370.csv', '11898.csv_seed_7268.csv', '1897.csv_seed_29944.csv', '21260.csv_seed_31664.csv', '19431.csv_seed_45453.csv', '9444.csv_seed_60674.csv', '7065.csv_seed_76323.csv', '12664.csv_seed_2591.csv', '942.csv_seed_25797.csv', '28338.csv_seed_91335.csv', '20731.csv_seed_68406.csv', '6579.csv_seed_63483.csv', '14591.csv_seed_590.csv', '13089.csv_seed_69302.csv', '23383.csv_seed_18721.csv', '15197.csv_seed_2718.csv', '3393.csv_seed_951.csv', '5161.csv_seed_104568.csv', '27011.csv_seed_4938.csv

11files processed.
Currently reading: 9616.csv_seed_88480.csv
9616.csv_seed_88480.csv
12files processed.
Currently reading: 7462.csv_seed_81435.csv
7462.csv_seed_81435.csv
13files processed.
Currently reading: 24404.csv_seed_35910.csv
24404.csv_seed_35910.csv
14files processed.
Currently reading: 14145.csv_seed_58825.csv
14145.csv_seed_58825.csv
15files processed.
Currently reading: 22005.csv_seed_51967.csv
22005.csv_seed_51967.csv
16files processed.
Currently reading: 9799.csv_seed_96263.csv
9799.csv_seed_96263.csv
17files processed.
Currently reading: 17667.csv_seed_25156.csv
17667.csv_seed_25156.csv
18files processed.
Currently reading: 2390.csv_seed_48839.csv
2390.csv_seed_48839.csv
19files processed.
Currently reading: 3657.csv_seed_36217.csv
3657.csv_seed_36217.csv
20files processed.
Currently reading: 11574.csv_seed_51397.csv
11574.csv_seed_51397.csv
21files processed.
Currently reading: 7217.csv_seed_92294.csv
7217.csv_seed_92294.csv
22files processed.
Currently reading: 15121.

In [25]:
# Zero-center and normalize the VBF and VH event images together.
# Earlier variant, kept for reference:
#vbf_image_list, vh_image_list = csv_decoder.zero_center_and_normalize_pair(vbf_image_list, vh_image_list)
vbf_image_list, vh_image_list = csv_decoder.zero_center_and_normalize(
    (vbf_image_list, vh_image_list))

# Report how many events each sample contains
print(len(vbf_event_list))
print(len(vh_event_list))

393
1209
