## DSC180B Group11 Project Report

 #### This notebook demonstrates different visual plots for exploratory data analysis (EDA) and dataset validation

In [1]:
# Necessary imports

import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
from pprint import pprint

sys.path.insert(0, 'src')
from load_data import path_generator, load_jet_features, load_num_sv
from mass_distribution import mass_distribution
from sv_mass_distribution import sv_mass_distribution

%matplotlib inline

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the helper modules imported from `src` are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

----

### Generate sample jet data 

For performing EDA, we will use only a limited number of QCD and Signal jets to keep memory usage and runtime manageable. The source files are located in `train_mass_qcd` for QCD jets and in `train_mass_hbb` for Signal jets; each of these directories is further split into a number of subdirectories. On average, the `.root` files for QCD data are far smaller than those for Signal data, possibly because the QCD data is partitioned into smaller files than the Signal data. According to our calculation, approximately $55$ `.root` files of QCD jets are as large as $8$ `.root` files of Signal jets in terms of file size (in bytes). Hence, by default, `path_generator()` generates $55$ random QCD `.root` files and $8$ random Signal `.root` files.

In [3]:
# Select the EDA subset of input files for each jet class
qcd_eda_sets = path_generator('qcd', eda=True)
signal_eda_sets = path_generator('signal', eda=True)

n_qcd_files, n_signal_files = len(qcd_eda_sets), len(signal_eda_sets)
print(f'Loaded {n_qcd_files} QCD files and {n_signal_files} Signal files')

Loaded 55 QCD files and 8 Signal files


----

### EDA\#1 Jet mass distribution

Our regression model predicts the mass, or weight, of particle jets based on information learned from features of the jet itself and of the other components within the jet. These jets can be largely categorized into two distinct groups: `QCD` and `Signal`. Some signal jets are known to be relatively heavier than QCD jets due to the type of their resulting decay products and their expected lifespan, which allows longer-living particles to be heavier.

##### Part1 -- Validation

Before exploring, we need to ensure each jet belongs to one and only one jet type. 

In [None]:
# Load the jet features of the sampled QCD files and preview the first rows
df_qcd = load_jet_features(qcd_eda_sets)
display(df_qcd.head())

n_qcd_jets = len(df_qcd)
print('\n', f'{n_qcd_jets} randomly generated QCD jet samples')

In [None]:
# Load the jet features of the sampled Signal files and preview the first rows
df_signal = load_jet_features(signal_eda_sets)
display(df_signal.head())

n_signal_jets = len(df_signal)
print('\n', f'{n_signal_jets} randomly generated Signal jet samples')

In [None]:
# QCD
# Only the label columns are needed for this check.
# They are taken as the positional run of columns from `fj_isQCDb` through
# `fj_isQCDothers` (inclusive) -- this assumes the label columns sit next to
# each other in the frame, in that order.

IS_QCDb, IS_QCDothers = 'fj_isQCDb', 'fj_isQCDothers'

all_attrs = list(df_qcd.columns)
start_idx, end_idx = all_attrs.index(IS_QCDb), all_attrs.index(IS_QCDothers) + 1

qcd_labels = all_attrs[start_idx:end_idx]

In [None]:
# Restrict the QCD frame to its label columns and preview the result
df_qcd_labels = df_qcd.loc[:, qcd_labels]
display(df_qcd_labels.head())

In [49]:
# We want each jet to correspond to exactly one type.
# Summing the label columns across a row counts how many labels are set for
# that jet; the check passes only if that count is 1 for every row.
# (Checking that the row sums share a single unique value is not sufficient:
# it would also report True if every jet had zero labels, or every jet had two.)
# Having `True` in the print statement below proves this.

labels_per_qcd_jet = df_qcd_labels.sum(axis=1)

print(f'Each jet corresponds to exactly one type: {(labels_per_qcd_jet == 1).all()}')

Each jet corresponds to exactly one type: True


In [50]:
# Number of jets per QCD type, largest class first

qcd_type_counts = df_qcd_labels.sum(axis=0).sort_values(ascending=False)
display(qcd_type_counts.to_frame(name='Count'))

Unnamed: 0,Count
fj_isQCDothers,251314
fj_isQCDlep,89640
fj_isQCDcc,29023
fj_isQCDc,28964
fj_isQCDb,7205
fj_isQCDbb,6805


In [51]:
# Signal jets
# Only the label columns are needed for this check.
# They are taken as the positional run of columns from `fj_H_bb` through
# `fj_H_qq` (inclusive) -- this assumes the label columns sit next to each
# other in the frame, in that order.

IS_HBB, IS_HQQ = 'fj_H_bb', 'fj_H_qq'

all_attrs = list(df_signal.columns)
start_idx, end_idx = all_attrs.index(IS_HBB), all_attrs.index(IS_HQQ) + 1

signal_labels = all_attrs[start_idx:end_idx]

In [52]:
df_signal_labels = df_signal[signal_labels]

# For EDA we only include signal jets of types H_bb, H_cc and H_qq; the
# original analysis selects these three as the most common Higgs decay types.
HIGGS_DECAY_LABELS = ['fj_H_bb', 'fj_H_cc', 'fj_H_qq']

# Keep the rows that have at least one of the three labels set
has_target_decay = df_signal_labels[HIGGS_DECAY_LABELS].eq(1).any(axis=1)
df_signal_labels = df_signal_labels[has_target_decay]
print(f'Before filtering: {df_signal_labels.shape[0]} rows', '\n')

# Drop observations that are associated with more than a single type.
# The per-row label count lives in a standalone Series instead of being written
# into the filtered frame as a scratch column: assigning a new column to a
# boolean-mask slice triggers pandas' SettingWithCopyWarning.
labels_per_signal_jet = df_signal_labels[HIGGS_DECAY_LABELS].sum(axis=1)
df_signal_labels = df_signal_labels[labels_per_signal_jet == 1]
print(f'After filtering: {df_signal_labels.shape[0]} rows')

Before filtering: 479898 rows 

After filtering: 479856 rows


In [53]:
# We want each jet to correspond to exactly one type.
# The row-wise sum of the label columns counts the labels set for each jet, and
# the check passes only if that count is 1 for every row. (A single unique
# row sum alone would also be reported for "all zero" or "all two".)

labels_per_signal_row = df_signal_labels.sum(axis=1)

print(f'Each jet corresponds to exactly one type: {(labels_per_signal_row == 1).all()}')

Each jet corresponds to exactly one type: True


In [54]:
# Number of jets per signal type, largest class first

signal_type_counts = df_signal_labels.sum(axis=0).sort_values(ascending=False)
display(signal_type_counts.to_frame(name='Count'))

Unnamed: 0,Count
fj_H_qq,160226
fj_H_bb,159898
fj_H_cc,159732


##### Part2 -- Distribution of the jet mass for each jet type

In [55]:
# Filtering using the validation results

# Row labels of the signal jets that remain in the validated label frame
signal_idx = df_signal_labels.index.tolist()
# Keep only those rows of the signal feature table (axis=0 selects by row label).
# NOTE(review): this rebinds `df_signal`, so the unfiltered frame is no longer
# available after this cell. It also presumably relies on the row index being
# unique -- confirm against `load_jet_features`.
df_signal = df_signal.filter(items=signal_idx, axis=0)

In [56]:
# Tag every jet with its class in a `Type` column so QCD and signal jets stay
# distinguishable, then stack the two frames row-wise into one dataset.

for frame, jet_type in ((df_qcd, 'QCD'), (df_signal, 'Signal')):
    frame['Type'] = jet_type

df_qcd_and_signal = pd.concat((df_qcd, df_signal), axis='index')
display(df_qcd_and_signal.head())

Unnamed: 0,fj_pt,fj_eta,fj_phi,fj_mass,fj_msoftdrop,fj_deepTagMD_H4qvsQCD,fj_deepTag_HvsQCD,fj_PN_H4qvsQCD,fj_PN_XbbvsQCD,fj_genjetmsd,...,fj_genWstar_decay,fj_evt_met_covxx,fj_evt_met_covxy,fj_evt_met_covyy,fj_evt_met_dphi,fj_evt_met_pt,fj_evt_met_sig,fj_evt_pupmet_pt,fj_evt_pupmet_dphi,Type
0,577.0,1.253662,0.694458,463.25,459.25,-1000.0,-1000.0,0.015396,0.002554,443.75,...,-99.0,892.0,606.0,764.0,-3.049438,77.593201,4.257812,82.249359,-3.041138,QCD
1,487.75,-0.114594,-2.475586,69.625,32.5625,-1000.0,-1000.0,0.002708,0.010614,37.34375,...,-99.0,892.0,606.0,764.0,0.120605,77.593201,4.257812,82.249359,0.128906,QCD
2,338.5,0.770996,0.057434,44.125,0.154175,-1000.0,-1000.0,0.012904,0.081559,2.058594,...,-99.0,1692.0,66.5,518.0,-2.951965,49.905506,1.589844,204.212372,-3.056458,QCD
3,409.25,0.35083,-2.502441,78.625,5.953125,-1000.0,-1000.0,1.1e-05,2e-06,7.757812,...,-99.0,1568.0,1040.0,1388.0,1.37085,17.787409,0.67334,22.904118,0.955811,QCD
4,398.0,0.250732,1.083984,213.375,216.5,-1000.0,-1000.0,0.000108,2e-06,230.125,...,-99.0,1568.0,1040.0,1388.0,-2.215576,17.787409,0.67334,22.904118,-2.630615,QCD


In [57]:
# Run the project's `mass_distribution` helper on the combined QCD + Signal frame.
# NOTE(review): the saved output of this cell is
# `NameError: name 'avg_mass_signal' is not defined`. That name is not used
# anywhere in this notebook, so the error presumably originates inside
# `mass_distribution` -- TODO fix it there and re-run this cell.
hist, summary = mass_distribution(df_qcd_and_signal)

NameError: name 'avg_mass_signal' is not defined

In [58]:
# Mean and median of `fj_genjetmsd` for each jet class
(
    df_qcd_and_signal
    .groupby('Type')
    .agg(
        avg_jetmass=('fj_genjetmsd', 'mean'),
        med_jetmass=('fj_genjetmsd', 'median'),
    )
)

Unnamed: 0_level_0,avg_jetmass,med_jetmass
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
QCD,125.939079,57.875
Signal,180.705322,109.4375
