DNN training module for Hµµ analysis
===

In [None]:
!mkdir output/trained_models/

In [None]:
def extract_unbinned_data(path, classes, the_channel, max_events=3000):
    """Build one DataFrame per (sample, region) from unbinned accumulators.

    The file at ``path`` is loaded with ``util.load`` and is expected to map
    keys of the form ``{var}_unbin_{sample}_c_{channel}_r_{region}`` to
    objects exposing a ``.value`` attribute (presumably coffea column
    accumulators -- confirm against the producer of the file).

    Parameters
    ----------
    path : str
        Location of the saved processor output.
    classes : dict
        Maps a class name to the list of sample names belonging to it. The
        position of a class in this mapping becomes its integer
        ``class_idx``.
    the_channel : str
        Channel whose accumulators are read.
    max_events : int or None, optional
        Upper bound on the number of rows kept per frame (the first rows are
        kept). ``None`` disables the cap. Defaults to 3000.

    Returns
    -------
    dict
        ``'{channel}_channel_{sample}_{region}'`` -> DataFrame with one
        column per variable plus ``class`` and ``class_idx``.

    Raises
    ------
    KeyError
        If an accumulator is missing for a requested
        variable/sample/channel/region combination.
    IndexError
        If a key containing ``'unbin'`` does not follow the expected naming
        scheme.
    """
    proc_out = util.load(path)

    # Collect the variable and region names encoded in the accumulator keys.
    variables = set()
    regions = set()
    for key in proc_out.keys():
        if 'unbin' not in key:
            continue
        var, tail = key.split('_unbin_')[0], key.split('_unbin_')[1]
        variables.add(var)
        regions.add(tail.split('_c_')[1].split('_r_')[1])

    # Sorted so column order and frame order do not depend on set ordering.
    variables = sorted(variables)
    regions = sorted(regions)

    dfs = {}
    for cls_idx, (cls, smp_list) in enumerate(classes.items()):
        for s in smp_list:
            for r in regions:
                lbl = f'{the_channel}_channel_{s}_{r}'
                frame = pd.DataFrame(columns=variables + ['class', 'class_idx'])
                for v in variables:
                    frame[v] = proc_out[f'{v}_unbin_{s}_c_{the_channel}_r_{r}'].value
                frame['class'] = cls
                frame['class_idx'] = cls_idx
                if max_events is not None:
                    frame = frame.iloc[:max_events]
                dfs[lbl] = frame
                print(lbl)
    return dfs

def scale_data(inputs, *, train=None, test=None,
               scaler_path="output/trained_models/scalers"):
    """Standardize the selected columns using training-set statistics.

    Each column is shifted by the training mean and divided by the training
    standard deviation. The same statistics are applied to the test set and
    written to disk with ``np.save`` so they can be reused later.

    Parameters
    ----------
    inputs : list of str
        Column names to scale.
    train, test : DataFrame, optional
        Data to scale. When omitted, the module-level ``x_train`` /
        ``x_test`` are used, which is how the notebook calls this function.
    scaler_path : str, optional
        Destination passed to ``np.save`` (``.npy`` is appended by NumPy).

    Returns
    -------
    tuple
        ``(training_data, testing_data)`` holding the scaled columns.
    """
    train = x_train if train is None else train
    test = x_test if test is None else test

    x_mean = np.mean(train[inputs].values, axis=0)
    x_std = np.std(train[inputs].values, axis=0)
    # A constant feature has zero spread; dividing by it would produce
    # NaN/inf, so leave such a feature centred but unscaled.
    x_std = np.where(x_std == 0, 1.0, x_std)

    training_data = (train[inputs] - x_mean) / x_std
    testing_data = (test[inputs] - x_mean) / x_std
    # Persist the statistics actually applied, including the zero-std guard.
    np.save(scaler_path, [x_mean, x_std])
    return training_data, testing_data


In [None]:
# Binary classification setup: class name -> list of sample names.
# extract_unbinned_data numbers the classes in dictionary order, so here
# 'background' gets class_idx 0 and 'signal' gets class_idx 1.
inputs_binary = {
    'background': ['dy', 'ewk_lljj_mll50_mjj120', 'ttjets_dl'],
    'signal': ['ggh_amcPS', 'vbf_amcPS'],
}

# Multi-class alternative with one class per process. It is not referenced
# by the other cells visible in this notebook.
# NOTE(review): 'ewk' differs from the 'ewk_lljj_mll50_mjj120' sample name
# used in inputs_binary -- confirm which name the accumulator keys use.
inputs_multi = {
    'ggh': ['ggh_amcPS'],
    'vbf': ['vbf_amcPS'],
    'dy': ['dy'],
    'ewk': ['ewk']
}

# Columns fed to the network; the list length sets the model input dimension.
training_features = ['dimuon_mass', 'dimuon_pt', 'dimuon_eta', 'dimuon_dEta', 'mu1_pt', 'mu2_pt']

In [None]:
from coffea import util
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

load_path = 'output/test_dask.coffea'

# Class definition used for this training (class name -> sample names).
classes = inputs_binary

# TODO: parallelize loading
df_dict = extract_unbinned_data(load_path, classes, 'vbf')

# Stack the per-sample frames into one table; the dictionary keys become the
# outer level of the resulting index.
df = pd.concat(df_dict)

# 60% of the events go to training, the rest to testing.
x_train, x_test, y_train, y_test = train_test_split(df[training_features], df['class_idx'], train_size=0.6, shuffle=True)

# Report how many training events ended up in each class. Class indices
# follow the order of the keys in `classes`.
for cls_idx, cls_name in enumerate(classes):
    train_evts = len(y_train[y_train == cls_idx])
    print(f"{train_evts} training events in class {cls_name}")

# scale data
x_train, x_test = scale_data(training_features)

In [None]:
from keras.models import Model
from keras.layers import Dense, Activation, Input, Dropout, Concatenate, Lambda, BatchNormalization
from keras import backend as K

# Build the classifier: a fully connected network with one sigmoid output.
input_dim = len(training_features)
label = 'test'
inputs = Input(shape=(input_dim,), name=f'{label}_input')

# Three identical hidden stages:
# Dense(100, tanh) -> Dropout(0.2) -> BatchNormalization.
x = inputs
for layer_idx in (1, 2, 3):
    x = Dense(100, name=f'{label}_layer_{layer_idx}', activation='tanh')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)

outputs = Dense(1, name=f'{label}_output', activation='sigmoid')(x)

# Assemble, compile for binary classification and print the layer summary.
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.summary()

In [None]:
# Fit the network on the scaled training set, then write it to disk.
# validation_split=0.2 holds out a fifth of the training data for the
# per-epoch validation metrics recorded in `history`.
history = model.fit(
    x_train,
    y_train,
    epochs=500,
    batch_size=2048,
    verbose=1,
    validation_split=0.2,
    shuffle=True,
)
model.save('output/trained_models/test.h5')

In [None]:
import matplotlib.pyplot as plt
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

# Older Keras releases record the accuracy metric under 'acc'; newer ones use
# 'accuracy'. Pick whichever key this training run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Left panel: accuracy per epoch. Right panel: loss per epoch.
for ax, key, name in ((ax1, acc_key, 'accuracy'), (ax2, 'loss', 'loss')):
    ax.plot(history.history[key])
    ax.plot(history.history[f'val_{key}'])
    ax.set_title(f'model {name}')
    ax.set_ylabel(name)
    ax.set_xlabel('epoch')
    # The second curve comes from the validation split held out during
    # fitting, not from the separate x_test sample.
    ax.legend(['train', 'validation'], loc='best')

f.subplots_adjust(wspace=0.3)