# Simple neural network

## Config

In [4]:
### CONFIG ###
# Name of the input pickle file.
thefile = 'prepared-20181001_prefix-pNN_input_2tag_Sig_BKGs.pickle'
### END ###

# Run switches.
do3tag, doTrainNetwork = False, True

# Hyper-parameters. layer_size, nepochs and batch_size are encoded in
# `modeltag`; dropout is not.
layer_size, dropout = 1, None
nepochs, batch_size = 50, 128
modeltag = 'nn_l{0}e{1}b{2}'.format(layer_size, nepochs, batch_size)

# File names that carry the model tag.
netfile = "model_{0}.h5".format(modeltag)
graphfile = "graph_{0}.png".format(modeltag)
file_performance = "perf_{0}.pickle".format(modeltag)

# Scaler file names are fixed (independent of the model tag).
file_minmax_scaler, file_quantile_scaler = "minmax_scaler.pickle", "quantile_scaler.pickle"

## Import

In [5]:
import argparse
import os
import re
from glob import glob
import numpy as np
import pandas as pd
import sys

import pickle

from sklearn.utils import shuffle
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.externals import joblib

from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout
from keras.regularizers import l1_l2
from keras.optimizers import SGD
from keras import backend as K
from keras.utils import plot_model

ModuleNotFoundError: No module named 'keras'

## Assistant functions

In [3]:
import logging
import sys

#############################################################
def setup_custom_logger(name, logfile='log.txt'):
    """Return the logger called `name`, writing to a file and to stdout.

    The logger level is set to DEBUG. On the first call for a given name a
    file handler (opened in 'w' mode, i.e. truncating `logfile`) and a
    stdout stream handler are attached, both using the same timestamped
    format. Later calls with the same name return the already-configured
    logger unchanged.

    Parameters
    ----------
    name : str
        Logger name passed to `logging.getLogger`.
    logfile : str, optional
        Path of the log file; defaults to 'log.txt'.

    Returns
    -------
    logging.Logger
    """
    import logging
    import sys

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so
    # attaching handlers on every call (e.g. when a notebook cell is re-run)
    # would duplicate every message and re-truncate the log file.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')

    file_handler = logging.FileHandler(logfile, mode='w')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    screen_handler = logging.StreamHandler(stream=sys.stdout)
    screen_handler.setFormatter(formatter)
    logger.addHandler(screen_handler)

    return logger

#############################################################
def get_model(n_invars, n_hidden_nodes, regularization=None, dropout=None):
    """Build a fully connected network with a single sigmoid output.

    Parameters
    ----------
    n_invars : number of input variables (size of the input layer).
    n_hidden_nodes : a list with one node count per hidden layer; any
        non-list value is treated as a single hidden layer of that size.
    regularization : passed as `kernel_regularizer` to every hidden layer.
    dropout : if truthy, a `Dropout(dropout)` layer follows each hidden layer.

    Returns
    -------
    An (uncompiled) `Model` mapping the input layer to the output node.
    """
    if isinstance(n_hidden_nodes, list):
        layer_widths = n_hidden_nodes
    else:
        layer_widths = [n_hidden_nodes]

    inputs = Input(shape=(n_invars,))

    # Chain the hidden layers one after another, starting from the input.
    hidden = inputs
    for width in layer_widths:
        hidden = Dense(width, activation="relu",
                       kernel_regularizer=regularization)(hidden)
        if dropout:
            hidden = Dropout(dropout)(hidden)

    outputs = Dense(1, activation="sigmoid")(hidden)
    return Model(inputs, outputs)
##########################################################
def preprocess(X, quantile_trsf=None, minmax_trsf=None):
    """Scale a 2-D array, treating its last column separately.

    All columns but the last go through `quantile_trsf`; the last column
    goes through `minmax_trsf`.

    * If neither transformer is given, a `QuantileTransformer` and a
      `MinMaxScaler` are created and fitted on `X` first.
    * If both are given, they are applied as they are (no fitting).
    * If only one of them is given, nothing is done and None is returned.

    Returns
    -------
    (quantile_trsf, minmax_trsf, X_transformed), or None as described above.
    """
    quantile_given = quantile_trsf is not None
    minmax_given = minmax_trsf is not None
    if quantile_given != minmax_given:
        # Exactly one transformer supplied: unsupported combination.
        return None

    if not quantile_given:
        quantile_trsf = QuantileTransformer()
        minmax_trsf = MinMaxScaler()
        quantile_trsf.fit(X[:, :-1])
        # Keep the last column 2-D, as the scaler works on columns.
        minmax_trsf.fit(X[:, -1][:, np.newaxis])

    leading_cols = quantile_trsf.transform(X[:, :-1])
    last_col = minmax_trsf.transform(X[:, -1][:, np.newaxis])
    combined = np.hstack([leading_cols, last_col])

    return quantile_trsf, minmax_trsf, combined
##########################################################


#############################################################
import numpy as np

# map (mA, mH) -> DSID, per production mode
# Outer key: production mode. Inner key: (mA, mH) pair. Value: DSID.
dict_mAmH_dsid = {
    'ggF_llbb': {
        # mA = 230 .. 350
        (230, 130): 306939,
        (250, 130): 306940,
        (230, 150): 306941,
        (300, 130): 306942,
        (300, 150): 306943,
        (300, 200): 306944,
        (350, 250): 306945,
        # mA = 400
        (400, 130): 306946,
        (400, 200): 306948,
        (400, 250): 344587,
        # mA = 500
        (500, 130): 306952,
        (500, 200): 306955,
        (500, 300): 306958,
        (500, 350): 308468,
        (500, 400): 306959,
        # mA = 600
        (600, 130): 306962,
        (600, 300): 306966,
        (600, 400): 344588,
        (600, 450): 308469,
        (600, 500): 306967,
        # mA = 700
        (700, 130): 306968,
        (700, 200): 306970,
        (700, 300): 306972,
        (700, 400): 306973,
        (700, 500): 306974,
        (700, 600): 308568,
        # mA = 800
        (800, 130): 308569,
        (800, 300): 308570,
        (800, 500): 344589,
        (800, 700): 308571,
    },
}

# Smallest / largest mH and mA values appearing in the table above.
mH_min, mH_max = 130, 700
mA_min, mA_max = 230, 800

# mAmH -> DSID list
#############################################################
# mAmH_list is a list of mA,mH pair
# prod: ggF_llbb, bbA_llbb ...
def mAmH2disd( mAmH_list, prod ):
    """Translate a list of (mA, mH) pairs into the matching list of DSIDs.

    `prod` selects the production-mode sub-table of `dict_mAmH_dsid`.
    The output keeps the order of `mAmH_list`. A pair (or production mode)
    missing from the table raises KeyError when it is looked up.
    """
    dsids = []
    for mA, mH in mAmH_list:
        # Look the table up per pair, so an empty list never touches it.
        dsids.append(dict_mAmH_dsid[prod][(mA, mH)])
    return dsids

# random sampling of background mA and mH
#############################################################
# nm: name of the mass column to fill, 'mA' or 'mH'
# df: the whole DataFrame (signal + background); modified in place
def sample_bkg_mass(nm, df, method=0, seed=None):
    """Overwrite column `nm` of the background rows with random masses.

    Background rows are those where `df.IsSignal` is False. `df` is modified
    in place and nothing is returned.

    Parameters
    ----------
    nm : str
        Column to fill, 'mA' or 'mH'.
    df : pandas.DataFrame
        Must provide a boolean `IsSignal` column and the column `nm`.
    method : int, optional
        0 (default): draw with replacement from the values currently stored
        in column `nm` of the background rows.
        Anything else: draw uniformly between the module-level bounds for
        that mass (`mA_min`..`mA_max` or `mH_min`..`mH_max`).
    seed : int or None, optional
        If not None, seeds NumPy's global random generator first.

    Raises
    ------
    ValueError
        In uniform mode, if `nm` is neither 'mA' nor 'mH'.
    """
    is_bkg = ~df.IsSignal
    n_bkg = is_bkg.sum()

    # Compare against None so that seed=0 is honoured as well.
    if seed is not None:
        np.random.seed(seed)

    if method == 0:
        # NOTE(review): this resamples the background rows' own values.
        # If the intent is to mimic the signal mass distribution, the
        # source population should probably be df[nm][df.IsSignal] - confirm.
        m_bkg = np.random.choice(df[nm][is_bkg], size=n_bkg)
    elif nm == 'mA':
        # Each mass uses its own [min, max] range.
        m_bkg = np.random.uniform(mA_min, mA_max, size=n_bkg)
    elif nm == 'mH':
        m_bkg = np.random.uniform(mH_min, mH_max, size=n_bkg)
    else:
        # Without this, the column would silently be filled with None.
        raise ValueError(
            "sample_bkg_mass: nm must be 'mA' or 'mH', got {0!r}".format(nm))

    df.loc[is_bkg, nm] = m_bkg

# choose signal samples
#############################################################
# mAmH_list is a list of mA,mH pair
# prod: ggF_llbb, bbA_llbb ...
def choose_signal_samples( df, mAmH_list, prod ):
    """Keep every background row and only the requested signal samples.

    A signal row (`df.IsSignal` True) is kept when its `label` is one of the
    DSIDs that `mAmH2disd(mAmH_list, prod)` returns; rows with `IsSignal`
    False are always kept. The number of signal rows before and after the
    selection is printed.

    Returns the filtered DataFrame.
    """
    print('choose_signal_samples: signal tot # =',df.IsSignal.sum())

    wanted_dsids = mAmH2disd( mAmH_list, prod )
    wanted_signal = (df.IsSignal) & (df.label.isin(wanted_dsids))
    background = ~df.IsSignal
    selected = df[ wanted_signal | background ]

    print('choose_signal_samples: signal tot chosen to use # =',selected.IsSignal.sum() )
    return selected

# list input variables
#############################################################
def input_variables( do3tag, df ):
    """Return the network input columns of `df` as a 2-D array.

    The columns are, in order: pt/eta/phi of lep0, lep1, jet0 and jet1,
    then (only when `do3tag` is truthy) pt/eta/phi of jet2, and finally
    "mA" and "mH".
    """
    columns = ["lep0pt","lep0eta","lep0phi",
               "lep1pt","lep1eta","lep1phi",
               "jet0pt","jet0eta","jet0phi",
               "jet1pt","jet1eta","jet1phi"]
    if do3tag:
        # The third jet is only part of the 3-tag input set.
        columns += ["jet2pt","jet2eta","jet2phi"]
    columns += ["mA","mH"]
    return df[columns].values
