## DSC180B Group11 Project Report

#### This notebook demonstrates visual plots for exploratory data analysis (EDA) and dataset validation

In [7]:
# Necessary imports

import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
from pprint import pprint

sys.path.insert(0, 'src')
from load_data import path_generator, load_jet_features, load_num_sv
from mass_distribution import mass_distribution
from sv_mass_distribution import sv_mass_distribution

----

### Generate sample jet data 

For EDA, we use only a limited number of QCD and Signal jets to keep memory use and runtime manageable. The source files are located in `train_mass_qcd` for QCD jets and in `train_mass_hbb` for Signal jets; each of those directories is further split into a number of subdirectories. On average, a `.root` file of QCD data is far smaller than one of Signal data, possibly because the QCD data is stored in smaller partitions than the Signal data. According to our calculation, approximately $55$ `.root` files of QCD jets are as large as $8$ `.root` files of Signal jets in terms of file size in bytes. Hence, by default, `path_generator()` generates $55$ random QCD `.root` files and $8$ random Signal `.root` files.

In [10]:
# Generate the QCD and Signal file selections used for EDA
qcd_eda_sets, signal_eda_sets = (
    path_generator('qcd', eda=True),
    path_generator('signal', eda=True),
)

print(
    f'Loaded {len(qcd_eda_sets)} QCD files and '
    f'{len(signal_eda_sets)} Signal files'
)

Loaded 55 QCD files and 8 Signal files


----

### EDA\#1 Jet mass distribution

Our regression model predicts the mass, or weight, of particle jets based on information learned from features of the jet itself and of the other components it contains. These jets can be largely categorized into two distinct groups: `QCD` and `Signal`. Some signal jets are known to be relatively heavier than QCD jets due to the type of resulting decay products and their expected lifespan, which allows longer-living particles to be heavier.

##### Part1 -- Validation

Before exploring, we need to ensure each jet belongs to one and only one jet type. 

In [5]:
# Load the jet features for the selected QCD files and preview the first rows
df_qcd = load_jet_features(qcd_eda_sets)
display(df_qcd.head(n=5))
print('\n', f'{len(df_qcd)} randomly generated QCD jet samples')

Unnamed: 0,fj_pt,fj_eta,fj_phi,fj_mass,fj_msoftdrop,fj_deepTagMD_H4qvsQCD,fj_deepTag_HvsQCD,fj_PN_H4qvsQCD,fj_PN_XbbvsQCD,fj_genjetmsd,...,fj_genW_decay,fj_genWstar_decay,fj_evt_met_covxx,fj_evt_met_covxy,fj_evt_met_covyy,fj_evt_met_dphi,fj_evt_met_pt,fj_evt_met_sig,fj_evt_pupmet_pt,fj_evt_pupmet_dphi
0,552.0,-0.283264,-0.547363,355.0,355.0,-1000.0,-1000.0,0.000158,0.013931,366.0,...,-99.0,-99.0,2040.0,-1116.0,1668.0,-0.424683,34.307938,0.481445,46.918556,-0.228882
1,482.75,0.016449,2.612793,284.25,286.25,-1000.0,-1000.0,0.474515,0.001554,287.75,...,-99.0,-99.0,2040.0,-1116.0,1668.0,2.698346,34.307938,0.481445,46.918556,2.894147
2,553.0,-0.2677,-2.07373,164.375,3.195312,-1000.0,-1000.0,0.000376,0.003152,3.703125,...,-99.0,-99.0,1280.0,1244.0,3496.0,3.122314,54.951088,0.779297,59.462116,3.141357
3,481.25,-0.625488,1.160156,142.25,13.710938,-1000.0,-1000.0,2.7e-05,0.000314,20.15625,...,-99.0,-99.0,1280.0,1244.0,3496.0,-0.111572,54.951088,0.779297,59.462116,-0.092529
4,507.75,-2.178711,1.963623,57.4375,1.329102,-1000.0,-1000.0,1.4e-05,0.00442,1.149414,...,-99.0,-99.0,810.0,2.476562,806.0,2.56175,40.881451,2.070312,76.528282,2.505353



 413325 randomly generated QCD jet samples


In [6]:
# Load the jet features for the selected Signal files and preview the first rows
df_signal = load_jet_features(signal_eda_sets)
display(df_signal.head(n=5))
print('\n', f'{len(df_signal)} randomly generated Signal jet samples')

Unnamed: 0,fj_pt,fj_eta,fj_phi,fj_mass,fj_msoftdrop,fj_deepTagMD_H4qvsQCD,fj_deepTag_HvsQCD,fj_PN_H4qvsQCD,fj_PN_XbbvsQCD,fj_genjetmsd,...,fj_genW_decay,fj_genWstar_decay,fj_evt_met_covxx,fj_evt_met_covxy,fj_evt_met_covyy,fj_evt_met_dphi,fj_evt_met_pt,fj_evt_met_sig,fj_evt_pupmet_pt,fj_evt_pupmet_dphi
0,1599.0,-0.137848,1.955322,95.0625,30.21875,-1000.0,-1000.0,0.007365,0.999586,28.640625,...,-99.0,-99.0,1492.0,-61.75,1452.0,2.755353,45.343674,1.418945,56.79369,3.061017
1,1507.0,0.254028,-1.185547,145.875,16.59375,-1000.0,-1000.0,0.000732,0.999319,27.71875,...,-99.0,-99.0,1492.0,-61.75,1452.0,-0.386963,45.343674,1.418945,56.79369,-0.081299
2,1218.0,-1.104492,0.587646,179.75,29.265625,-1000.0,-1000.0,7.7e-05,0.008649,30.0,...,-99.0,-99.0,1052.0,-31.0,1188.0,-2.641846,67.014511,3.982422,42.740677,-2.358398
3,1210.0,0.209106,-2.534668,201.0,24.90625,-1000.0,-1000.0,0.001788,0.992894,27.625,...,-99.0,-99.0,1052.0,-31.0,1188.0,0.480469,67.014511,3.982422,42.740677,0.763916
4,1577.0,-0.291626,1.657227,140.625,33.25,-1000.0,-1000.0,0.000995,0.99986,29.984375,...,-99.0,-99.0,830.0,-456.0,7008.0,3.116193,319.941223,14.554688,346.265839,-3.126465



 509020 randomly generated Signal jet samples


In [11]:
# QCD
# Only the label columns are needed for this check. They are taken as the
# contiguous run of columns from `fj_isQCDb` through `fj_isQCDothers`.

IS_QCDb, IS_QCDothers = 'fj_isQCDb', 'fj_isQCDothers'

all_attrs = list(df_qcd.columns)
start_idx = all_attrs.index(IS_QCDb)
end_idx = 1 + all_attrs.index(IS_QCDothers)  # +1 so the slice keeps the last label

qcd_labels = all_attrs[slice(start_idx, end_idx)]

In [12]:
# Keep only the QCD label columns and preview them
df_qcd_labels = df_qcd.loc[:, qcd_labels]
display(df_qcd_labels.head(n=5))

Unnamed: 0,fj_isQCDb,fj_isQCDbb,fj_isQCDc,fj_isQCDcc,fj_isQCDlep,fj_isQCDothers
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,1,0,0,0


In [13]:
# We want each jet to correspond to exactly one type, i.e. the label columns
# of every row must sum to exactly 1.
# The row sums are compared against 1 directly: only checking that all rows
# share a single distinct sum would also report `True` if every jet had
# zero labels or two labels.

qcd_labels_per_jet = df_qcd_labels.sum(axis=1)
print(f'Each jet corresponds to exactly one type:\
 {bool((qcd_labels_per_jet == 1).all())}')

Each jet corresponds to exactly one type: True


In [14]:
# Number of jets carrying each QCD label, most frequent first

display(
    df_qcd_labels
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename('Count')
    .to_frame()
)

Unnamed: 0,Count
fj_isQCDothers,252118
fj_isQCDlep,89702
fj_isQCDcc,28917
fj_isQCDc,28763
fj_isQCDb,7034
fj_isQCDbb,6791


In [15]:
# Signal jets
# Only the label columns are needed for this check. They are taken as the
# contiguous run of columns from `fj_H_bb` through `fj_H_qq`.

IS_HBB, IS_HQQ = 'fj_H_bb', 'fj_H_qq'

all_attrs = list(df_signal.columns)
start_idx = all_attrs.index(IS_HBB)
end_idx = 1 + all_attrs.index(IS_HQQ)  # +1 so the slice keeps the last label

signal_labels = all_attrs[slice(start_idx, end_idx)]

In [16]:
df_signal_labels = df_signal[signal_labels]

# Restrict the EDA to signal jets labelled H_bb, H_cc or H_qq
# (per the original analysis note: these are the most common particles
# Higgs bosons decay into).
has_target_label = (
    (df_signal_labels['fj_H_bb'] == 1) |
    (df_signal_labels['fj_H_cc'] == 1) |
    (df_signal_labels['fj_H_qq'] == 1)
)
df_signal_labels = df_signal_labels[has_target_label]
print(f'Before filtering: {df_signal_labels.shape[0]} rows', '\n')

# Drop observations that are associated with more than a single type.
# The per-row label count is kept as a standalone Series instead of being
# written into a scratch column on the filtered frame; assigning a column to
# a filtered slice is what triggers pandas' SettingWithCopyWarning.
n_target_labels = (
    df_signal_labels['fj_H_bb']
    + df_signal_labels['fj_H_cc']
    + df_signal_labels['fj_H_qq']
)
df_signal_labels = df_signal_labels[n_target_labels == 1]
print(f'After filtering: {df_signal_labels.shape[0]} rows')

Before filtering: 484912 rows 

After filtering: 484872 rows


In [17]:
# We want each jet to correspond to exactly one type, i.e. the label columns
# of every row must sum to exactly 1.
# The row sums are compared against 1 directly: only checking that all rows
# share a single distinct sum would also report `True` if every jet had
# zero labels or two labels.

signal_labels_per_jet = df_signal_labels.sum(axis=1)
print(f'Each jet corresponds to exactly one type:\
 {bool((signal_labels_per_jet == 1).all())}')

Each jet corresponds to exactly one type: True


In [18]:
# Number of jets carrying each signal label, most frequent first

display(
    df_signal_labels
    .sum(axis=0)
    .sort_values(ascending=False)
    .rename('Count')
    .to_frame()
)

Unnamed: 0,Count
fj_H_bb,161845
fj_H_cc,161602
fj_H_qq,161425


##### Part2 -- Distribution of the jet mass for each jet type

In [20]:
# Keep only the signal rows whose labels survived the validation step

signal_idx = list(df_signal_labels.index)
df_signal = df_signal.filter(axis=0, items=signal_idx)

In [21]:
# Create a temporary `Type` label to differentiate QCD jets from signal jets,
# then stack the QCD dataset on top of the signal dataset.

df_qcd['Type'] = 'QCD'
df_signal['Type'] = 'Signal'

# `ignore_index=True` gives the combined frame a fresh 0..n-1 row index.
# Both inputs carry their own row labels (each preview starts at 0), so a
# plain concat would leave duplicate labels in the result.
df_qcd_and_signal = pd.concat([df_qcd, df_signal], axis=0, ignore_index=True)
display(df_qcd_and_signal.head())

Unnamed: 0,fj_pt,fj_eta,fj_phi,fj_mass,fj_msoftdrop,fj_deepTagMD_H4qvsQCD,fj_deepTag_HvsQCD,fj_PN_H4qvsQCD,fj_PN_XbbvsQCD,fj_genjetmsd,...,fj_genWstar_decay,fj_evt_met_covxx,fj_evt_met_covxy,fj_evt_met_covyy,fj_evt_met_dphi,fj_evt_met_pt,fj_evt_met_sig,fj_evt_pupmet_pt,fj_evt_pupmet_dphi,Type
0,552.0,-0.283264,-0.547363,355.0,355.0,-1000.0,-1000.0,0.000158,0.013931,366.0,...,-99.0,2040.0,-1116.0,1668.0,-0.424683,34.307938,0.481445,46.918556,-0.228882,QCD
1,482.75,0.016449,2.612793,284.25,286.25,-1000.0,-1000.0,0.474515,0.001554,287.75,...,-99.0,2040.0,-1116.0,1668.0,2.698346,34.307938,0.481445,46.918556,2.894147,QCD
2,553.0,-0.2677,-2.07373,164.375,3.195312,-1000.0,-1000.0,0.000376,0.003152,3.703125,...,-99.0,1280.0,1244.0,3496.0,3.122314,54.951088,0.779297,59.462116,3.141357,QCD
3,481.25,-0.625488,1.160156,142.25,13.710938,-1000.0,-1000.0,2.7e-05,0.000314,20.15625,...,-99.0,1280.0,1244.0,3496.0,-0.111572,54.951088,0.779297,59.462116,-0.092529,QCD
4,507.75,-2.178711,1.963623,57.4375,1.329102,-1000.0,-1000.0,1.4e-05,0.00442,1.149414,...,-99.0,810.0,2.476562,806.0,2.56175,40.881451,2.070312,76.528282,2.505353,QCD


In [26]:
# Call the project helper `mass_distribution` on the combined QCD + Signal
# frame and unpack its two return values.
# NOTE(review): in the saved run this call failed with
# `NameError: name 'avg_mass_signal' is not defined`. The helper's source is
# not visible in this notebook -- confirm whether it expects that name to
# already exist before it is called.
hist, summary = mass_distribution(df_qcd_and_signal)

NameError: name 'avg_mass_signal' is not defined

In [24]:
# Mean and median of `fj_genjetmsd` for each jet type
stat = df_qcd_and_signal.groupby('Type').agg(
    avg_jetmass=('fj_genjetmsd', 'mean'),
    med_jetmass=('fj_genjetmsd', 'median'),
)
display(stat)

Unnamed: 0_level_0,avg_jetmass,med_jetmass
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
QCD,125.412399,57.59375
Signal,181.311661,110.1875


In [25]:
# Pull the per-type summary values out of `stat`
avg_mass_qcd = stat.at['QCD', 'avg_jetmass']
avg_mass_signal = stat.at['Signal', 'avg_jetmass']

med_mass_qcd = stat.at['QCD', 'med_jetmass']
med_mass_signal = stat.at['Signal', 'med_jetmass']

# Two-line caption placed on the histogram (5 significant digits each)
text = (
    f'Average mass of Signal jets: {avg_mass_signal:.5}\n'
    f'Average mass of QCD jets: {avg_mass_qcd:.5}'
)

In [None]:
# Stacked histogram of `fj_genjetmsd`, coloured by jet type, drawn with
# seaborn's `histplot`.

# One definition of the bin edges, shared by the histogram bins and the
# x-axis ticks so the two cannot drift apart.
MASS_BIN_EDGES = range(0, 1250, 125)

sns.set(context='notebook', rc={'figure.figsize': (14, 8)},
        style='ticks', palette='pastel')
ax = sns.histplot(x='fj_genjetmsd', data=df_qcd_and_signal, hue='Type',
                  bins=MASS_BIN_EDGES, multiple='stack')

ax.set_title('Distribution of jet mass by jet type',
             fontdict={'size': 15, 'weight': 'bold'})
ax.set_xlabel('Generator-level soft drop mass')
ax.set_ylabel('Counts')
ax.set_xticks(MASS_BIN_EDGES)

# Position the caption in axes-fraction coordinates so it stays inside the
# plot whatever the bar heights are (the sampled files, and hence the counts,
# vary between runs).
ax.text(0.5, 0.85, text, transform=ax.transAxes)

# Hide the top and right spines
sns.despine(ax=ax)