# Imports and Helper Functions

In [None]:
import multiprocessing
num_available_cpus = multiprocessing.cpu_count()

print("Number of available CPUs:", num_available_cpus)

import sys

import math
import time
import tqdm

import numpy as np
import scipy as sp
from scipy import stats
from scipy.spatial import ConvexHull

import itertools
import logging
import matplotlib.pyplot as plt

import pandas as pd
import h5py

from sklearn import metrics

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.distributions import MultivariateNormal
import torch.utils.data as utils

from argparse import ArgumentParser
import re

sys.path.append("../new_flows")
from flows import RealNVP, Planar, MAF
from models import NormalizingFlowModel

In [None]:
from nflows.flows.base import Flow
from nflows.flows.autoregressive import MaskedAutoregressiveFlow
from nflows.distributions.normal import StandardNormal
from nflows.transforms.base import CompositeTransform
from nflows.transforms.autoregressive import MaskedAffineAutoregressiveTransform, MaskedPiecewiseQuadraticAutoregressiveTransform, MaskedPiecewiseRationalQuadraticAutoregressiveTransform
from nflows.transforms.permutations import ReversePermutation

In [None]:
from helper_functions import *

# Load and process bkg samples

In [None]:
# Number of sample batches requested from LAPS_train for the background load;
# also scales the DataLoader batch size further down.
num_batches = 1

# Kinematic selection thresholds.
# NOTE(review): these three names are not referenced anywhere else in this
# file — confirm whether they are still needed.
Mjj_cut = 1200
pt_cut = 550
eta_cut = None

In [None]:
# Release any cached GPU memory left over from a previous run of the notebook.
torch.cuda.empty_cache()

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device =", device)

# On GPU, make newly created tensors default to CUDA floats; on CPU just say so.
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    print('cpu')

# Let intra-op parallelism use every CPU reported by multiprocessing.
torch.set_num_threads(num_available_cpus)

print("Number of threads:", torch.get_num_threads())
print("Number of interop threads:", torch.get_num_interop_threads())

In [None]:
# Load the QCD background training sample through the project helper.
# NOTE(review): presumably bkg_data is the normalized feature matrix and
# bkg_unnorm_data the raw one — confirm against helper_functions.LAPS_train.
bkg_data, bkg_unnorm_data, bkg_masses = LAPS_train(sample_type = 'qcdbkg', num_batches = num_batches)
num_bkg_samples = bkg_data.shape[0]

# Per-feature mean/std of the unnormalized background; passed as `inp_meanstd`
# when the signal sample is loaded later in this file.
bkg_mean = np.mean(bkg_unnorm_data, axis=0)
bkg_std = np.std(bkg_unnorm_data, axis=0)

In [None]:
print(num_bkg_samples)

# Quick look at the mass distribution of the loaded background events.
plt.hist(bkg_masses, bins=50)
plt.show()

In [None]:
# Tensor view of the background features; no further selection is applied.
total_PureBkg = torch.tensor(bkg_data)
total_PureBkg_selection = total_PureBkg

In [None]:
total_PureBkg_selection.shape

In [None]:
# Batch size scales with the number of loaded batches.
bs = 10000 * num_batches
# Shuffled loader drives training; the unshuffled one iterates the same tensor
# and is not used elsewhere in this file.
bkgAE_train_iterator = utils.DataLoader(total_PureBkg_selection, batch_size=bs, shuffle=True) 
bkgAE_test_iterator = utils.DataLoader(total_PureBkg_selection, batch_size=bs)

# Build the bkg-trained model

In [None]:
# Flow architecture and training settings for the background-trained model.
num_features = 14        # dimensionality of each input event
hidden_features = 56     # hidden width passed to every autoregressive transform

num_layers = 4           # number of (permutation, autoregressive transform) pairs
# NOTE(review): num_blocks_per_layer only appears in the checkpoint filename in
# this file; it is never passed to a transform — confirm that is intended.
num_blocks_per_layer = 4
#num_iter = 10000
num_iter = 1000          # maximum number of passes over the training loader
print_interval = 20      # emit the verbose progress report every this many passes

#Current flow_type options: 'MAF', 'NSQUAD' (neural spline quadratic), 'NSRATQUAD' (neural spline rational quadratic)
flow_type = 'NSQUAD'

# When False, the training cell never writes a checkpoint to disk.
save_models = False

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device =", device)

In [None]:
# Show the element type of the loaded background features.
print(type(bkg_data[0,0]))

In [None]:
# Containers filled by the training cell: best loss keyed by hidden width, and
# the trained flow objects.
bkg_loss_dict = dict()
bkg_flow_list = []

In [None]:
filename = 'Pure_NF_%s_k%s_hf%s_nbpl%s' % (flow_type, num_layers, hidden_features, num_blocks_per_layer)

print("FCNN Hidden Layer Width: ", hidden_features)

print('------------------------------------')

bkg_base_dist = StandardNormal(shape=[num_features])

bkg_transforms = []
for _ in range(num_layers):
    bkg_transforms.append(ReversePermutation(features=num_features))
    if flow_type == 'MAF': 
        bkg_transforms.append(MaskedAffineAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features))
    elif flow_type == 'NSQUAD': 
        bkg_transforms.append(MaskedPiecewiseQuadraticAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features, tail_bound = 3.0, tails='linear'))
    elif flow_type == 'NSRATQUAD': 
        bkg_transforms.append(MaskedPiecewiseRationalQuadraticAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features, tail_bound = 3.0, tails='linear'))

bkg_transform = CompositeTransform(bkg_transforms)

bkg_flow = Flow(bkg_transform, bkg_base_dist)
#print(bkg_flow)

bkg_optimizer = optim.Adam(bkg_flow.parameters())

bkg_tick = time.time()

bkg_min_loss = 999999
bkg_best_flow = None

cur_losses = []
patience_count = 0

for i in range(num_iter):
    
    terminate = False

    for batch_idx, x in enumerate(bkgAE_train_iterator):

        #x, y = datasets.make_moons(1024, noise=.1)
        #x = bkg_tr_data
        #x = torch.tensor(x, dtype=torch.float32)

        bkg_optimizer.zero_grad()
        loss = -bkg_flow.log_prob(inputs=x)[0].mean()
        loss.backward()
        bkg_optimizer.step()

        if batch_idx == len(bkgAE_train_iterator) - 1 :

            xline = torch.linspace(-2, 2)
            yline = torch.linspace(-2, 2)
            xgrid, ygrid = torch.meshgrid(xline, yline)
            xyinput = torch.cat([xgrid.reshape(-1, 1), ygrid.reshape(-1, 1)], dim=1)

            with torch.no_grad():
                #bkg_zgrid = bkg_flow.log_prob(xyinput)[0].exp().reshape(100, 100)
                bkg_zgrid = -bkg_flow.log_prob(x)[0].cpu().numpy()

                #print(bkg_zgrid.shape)
                #bkg_zgrid = bkg_zgrid[bkg_zgrid < 10]
                #print(bkg_zgrid.shape)
                #print(bkg_zgrid[:5])

            if (i + 1) % print_interval == 0: 
                print('bkg Iteration {} Complete'.format(i + 1))

            bkg_print_loss = loss.detach().cpu().numpy()
            cur_losses.append(bkg_print_loss) 
            print('Loss: ', bkg_print_loss)

            if bkg_print_loss < bkg_min_loss: 
                patience_count = 0
                bkg_best_flow = bkg_flow
                if save_models: 
                    torch.save(bkg_flow, "new_sample_flows/M3000_TV/bkg_%s.pt" % (filename))
                bkg_min_loss = bkg_print_loss
                if (i + 1) % print_interval == 0: 
                    print('SAVING MODEL')
            else: 
                patience_count += 1
                if (i + 1) % print_interval == 0: 
                    print('NOT SAVING MODEL (PATIENCE = %s)' % patience_count)
                if patience_count == 10: 
                    terminate = True
                    break

            bkg_tock = time.time()

            if (i + 1) % print_interval == 0: 
                print('Time: ', bkg_tock - bkg_tick)
                print('------------------------------------')
                !nvidia-smi

            #plt.contourf(xgrid.numpy(), ygrid.numpy(), bkg_zgrid.numpy())
            #plt.title('iteration {}'.format(i + 1))
            #plt.show()

            #plt.hist(bkg_zgrid, bins=30, histtype='step')
            #plt.title('iteration {}'.format(i + 1))
            #plt.show()
            
    if terminate: 
        break

bkg_loss_dict[hidden_features] = float(bkg_min_loss)
bkg_flow_list.append(bkg_best_flow)

print('------------------------------------')

In [None]:
# Training curve: one point per pass over the loader (loss of its final batch).
plt.plot(cur_losses)
plt.xlabel('Epoch')
plt.ylabel('Bkg-trained loss')
plt.show()

# Load and process sig samples

In [None]:
# Number of sample batches requested from LAPS_train for the signal load.
num_batches = 1

# NOTE(review): these thresholds are not referenced anywhere else in this file.
Mjj_cut = 1200
pt_cut = 550
eta_cut = None

In [None]:
# Load the Wp3000 signal sample, handing the background mean/std to the helper
# via `inp_meanstd`.
# NOTE(review): presumably so the signal is normalized with the background
# statistics — confirm against helper_functions.LAPS_train.
sig_data, sig_unnorm_data, sig_masses = LAPS_train(sample_type = 'Wp3000', num_batches = num_batches, inp_meanstd = (bkg_mean, bkg_std))
num_sig_samples = sig_data.shape[0]

# Random 98% / 2% split: the larger part trains the signal flow, the remainder
# is held out and mixed into the test set later in this file.
sig_training_data_percentage = 98
num_sig_training_samples = int(sig_training_data_percentage * num_sig_samples / 100)

# The same permutation indexes features, unnormalized features and masses so
# the three arrays stay row-aligned.
indices = np.random.permutation(num_sig_samples)
sig_training_indices, sig_testing_indices = indices[:num_sig_training_samples], indices[num_sig_training_samples:]
sig_training_data, sig_testing_data = sig_data[sig_training_indices], sig_data[sig_testing_indices]
sig_unnorm_training_data, sig_unnorm_testing_data = sig_unnorm_data[sig_training_indices], sig_unnorm_data[sig_testing_indices]
sig_training_masses, sig_testing_masses = sig_masses[sig_training_indices], sig_masses[sig_testing_indices]

# Sanity output: total, training count, testing count, and whether they add up.
print(num_sig_samples)
print(num_sig_training_samples)
print(sig_testing_data.shape[0])
print(num_sig_training_samples + sig_testing_data.shape[0] == num_sig_samples)

In [None]:
print(num_sig_training_samples)

# Mass distribution of the signal events used for training.
plt.hist(sig_training_masses, bins=50)
plt.show()

In [None]:
# Tensor view of the signal training features; no further selection is applied.
total_PureSig = torch.tensor(sig_training_data)
total_PureSig_selection = total_PureSig

In [None]:
total_PureSig_selection.shape

In [None]:
# Batch size scales with the number of loaded batches.
bs = 3500 * num_batches
# Shuffled loader drives training; the unshuffled one is not used elsewhere in
# this file.
sigAE_train_iterator = utils.DataLoader(total_PureSig_selection, batch_size=bs, shuffle=True) 
sigAE_test_iterator = utils.DataLoader(total_PureSig_selection, batch_size=bs)

# Build the sig-trained model

In [None]:
# Flow architecture and training settings for the signal-trained model
# (same values as used for the background model).
num_features = 14        # dimensionality of each input event
hidden_features = 56     # hidden width passed to every autoregressive transform

num_layers = 4           # number of (permutation, autoregressive transform) pairs
# NOTE(review): num_blocks_per_layer only appears in the checkpoint filename in
# this file; it is never passed to a transform — confirm that is intended.
num_blocks_per_layer = 4
#num_iter = 10000
num_iter = 1000          # maximum number of passes over the training loader
print_interval = 20      # emit the verbose progress report every this many passes

#Current flow_type options: 'MAF', 'NSQUAD' (neural spline quadratic), 'NSRATQUAD' (neural spline rational quadratic)
flow_type = 'NSQUAD'

# When False, the training cell never writes a checkpoint to disk.
save_models = False

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device =", device)

In [None]:
# Show the element type of the signal training features.
print(type(sig_training_data[0,0]))

In [None]:
# Containers filled by the training cell: best loss keyed by hidden width, and
# the trained flow objects.
sig_loss_dict = dict()
sig_flow_list = []

In [None]:
filename = 'Pure_NF_%s_k%s_hf%s_nbpl%s' % (flow_type, num_layers, hidden_features, num_blocks_per_layer)

print("FCNN Hidden Layer Width: ", hidden_features)

print('------------------------------------')

sig_base_dist = StandardNormal(shape=[num_features])

sig_transforms = []
for _ in range(num_layers):
    sig_transforms.append(ReversePermutation(features=num_features))
    if flow_type == 'MAF': 
        sig_transforms.append(MaskedAffineAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features))
    elif flow_type == 'NSQUAD': 
        sig_transforms.append(MaskedPiecewiseQuadraticAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features, tail_bound = 3.0, tails='linear'))
    elif flow_type == 'NSRATQUAD': 
        sig_transforms.append(MaskedPiecewiseRationalQuadraticAutoregressiveTransform(features=num_features, 
                                                          hidden_features=hidden_features, tail_bound = 3.0, tails='linear'))

sig_transform = CompositeTransform(sig_transforms)

sig_flow = Flow(sig_transform, sig_base_dist)
#print(sig_flow)

sig_optimizer = optim.Adam(sig_flow.parameters())

sig_tick = time.time()

sig_min_loss = 999999
sig_best_flow = None

cur_losses = []
patience_count = 0

for i in range(num_iter):
    
    terminate = False

    for batch_idx, x in enumerate(sigAE_train_iterator):

        #x, y = datasets.make_moons(1024, noise=.1)
        #x = sig_tr_data
        #x = torch.tensor(x, dtype=torch.float32)

        sig_optimizer.zero_grad()
        loss = -sig_flow.log_prob(inputs=x)[0].mean()
        loss.backward()
        sig_optimizer.step()

        if batch_idx == len(sigAE_train_iterator) - 1 :

            xline = torch.linspace(-2, 2)
            yline = torch.linspace(-2, 2)
            xgrid, ygrid = torch.meshgrid(xline, yline)
            xyinput = torch.cat([xgrid.reshape(-1, 1), ygrid.reshape(-1, 1)], dim=1)

            with torch.no_grad():
                #sig_zgrid = sig_flow.log_prob(xyinput)[0].exp().reshape(100, 100)
                sig_zgrid = -sig_flow.log_prob(x)[0].cpu().numpy()

                #print(sig_zgrid.shape)
                #sig_zgrid = sig_zgrid[sig_zgrid < 10]
                #print(sig_zgrid.shape)
                #print(sig_zgrid[:5])
                
            if (i + 1) % print_interval == 0: 
                print('sig Iteration {} Complete'.format(i + 1))

            sig_print_loss = loss.detach().cpu().numpy()
            cur_losses.append(sig_print_loss) 
            print('Loss: ', sig_print_loss)

            if sig_print_loss < sig_min_loss: 
                patience_count = 0
                sig_best_flow = sig_flow
                if save_models: 
                    torch.save(sig_flow, "new_sample_flows/M3000_TV/sig_%s.pt" % (filename))
                sig_min_loss = sig_print_loss
                if (i + 1) % print_interval == 0: 
                    print('SAVING MODEL')
            else: 
                patience_count += 1
                if (i + 1) % print_interval == 0: 
                    print('NOT SAVING MODEL (PATIENCE = %s)' % patience_count)
                if patience_count == 10: 
                    terminate = True
                    break

            sig_tock = time.time()

            if (i + 1) % print_interval == 0: 
                print('Time: ', sig_tock - sig_tick)
                print('------------------------------------')
                !nvidia-smi

            #plt.contourf(xgrid.numpy(), ygrid.numpy(), sig_zgrid.numpy())
            #plt.title('iteration {}'.format(i + 1))
            #plt.show()

            #plt.hist(sig_zgrid, bins=30, histtype='step')
            #plt.title('iteration {}'.format(i + 1))
            #plt.show()
            
    if terminate: 
        break

sig_loss_dict[hidden_features] = float(sig_min_loss)
sig_flow_list.append(sig_best_flow)

print('------------------------------------')

In [None]:
# Training curve: one point per pass over the loader (loss of its final batch).
plt.plot(cur_losses)
plt.xlabel('Epoch')
plt.ylabel('Sig-trained Loss')
plt.show()

# Bkg-trained post-training analysis

In [None]:
# Use small square figures for the analysis plots that follow.
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# Tensor copies of the background and signal-training features.
# NOTE(review): neither name is referenced again in this file.
bkg_test = torch.tensor(bkg_data)
sig_test = torch.tensor(sig_training_data)

In [None]:
def get_tpr_fpr_bkgtr(sigloss, bkgloss, bins=None):
    """Scan loss thresholds and return signal/background efficiencies.

    For the background-trained model an event is tagged as signal-like when
    its loss is strictly greater than the threshold.

    Parameters
    ----------
    sigloss : array-like
        Per-event losses of the signal events.
    bkgloss : array-like
        Per-event losses of the background events.
    bins : array-like, optional
        Thresholds to scan. Defaults to ``np.linspace(0, 100, 10001)``.
        Losses outside the scanned range are never separated by any cut, so
        pass a wider grid when losses can be negative or exceed 100.

    Returns
    -------
    tpr, fpr : list of float
        Fraction of signal / background events above each threshold, in the
        order of ``bins``.

    Raises
    ------
    ValueError
        If either loss array is empty (the efficiencies would be undefined).
    """
    sigloss = np.asarray(sigloss)
    bkgloss = np.asarray(bkgloss)
    if sigloss.size == 0 or bkgloss.size == 0:
        raise ValueError("sigloss and bkgloss must both be non-empty")
    if bins is None:
        bins = np.linspace(0, 100, 10001)

    n_sig = len(sigloss)
    n_bkg = len(bkgloss)
    tpr = [np.count_nonzero(sigloss > cut) / n_sig for cut in bins]
    fpr = [np.count_nonzero(bkgloss > cut) / n_bkg for cut in bins]

    return tpr, fpr

In [None]:
# Reload the background-trained flow from disk and recompute its mean loss.
new_bkg_flow_list = []

new_bkg_loss_dict = dict()

filename = 'Pure_NF_%s_k%s_hf%s_nbpl%s' % (flow_type, num_layers, hidden_features, num_blocks_per_layer)

# NOTE(review): the training cell only writes this file when save_models is
# True, and save_models is set to False above — so this loads whatever
# checkpoint already exists on disk (or fails if none does).
# NOTE(review): torch.load unpickles a full model object; only load trusted files.
new_bkg_flow = torch.load("new_sample_flows/M3000_TV/bkg_%s.pt" % (filename))

new_bkg_flow_list.append(new_bkg_flow)

# Mean negative log-probability of the background sample under the loaded flow.
new_bkg_loss = -new_bkg_flow.log_prob(bkg_data)[0].mean().detach().cpu().numpy()
new_bkg_loss_dict[hidden_features] = new_bkg_loss

# Compare the loss recorded during training with the loss of the reloaded model.
print(bkg_loss_dict)
print(new_bkg_loss_dict)

In [None]:
# Draw as many synthetic events from the flow as there are background events.
num_samples_tr = int(bkg_data.shape[0])

with torch.no_grad(): 
    bkg_samples = new_bkg_flow_list[-1].sample(num_samples_tr).detach().cpu().numpy()
#bkg_samples = new_bkg_flow_list[0].sample(num_samples_tr).detach().cpu().numpy()
print(bkg_samples.shape)
# Row indices of samples with any feature outside [-10, 10]; a row appears once
# per offending feature, which np.delete tolerates.
bkg_samples_bad_indices = np.argwhere((bkg_samples < -10) | (bkg_samples > 10))[:,0]
print(bkg_samples_bad_indices[:25])
new_bkg_samples = np.delete(bkg_samples, bkg_samples_bad_indices, axis = 0)

print(new_bkg_samples.shape)

print("Hidden Features: ", hidden_features)

# One title per input feature, in column order.
plot_titles = [r'$M_{j1}$', r'Jet 1 $\tau_{21}$', r'Jet 1 $\tau_{32}$', r'Jet 1 $\tau_{43}$', r'Jet 1 $\tau_s$', r'Jet 1 $P_b$', r'Jet 1 $n_{pf}$', 
              r'$M_{j2}$', r'Jet 2 $\tau_{21}$', r'Jet 2 $\tau_{32}$', r'Jet 2 $\tau_{43}$', r'Jet 2 $\tau_s$', r'Jet 2 $P_b$', r'Jet 2 $n_{pf}$',]

# Overlay, feature by feature, the input distribution and the flow's samples.
for index in range(num_features): 
    n, bins, patches = plt.hist(bkg_data[:, index], bins=50, histtype='step', label='Input distribution')
    plt.hist(new_bkg_samples[:, index], bins=bins, histtype='step', label='NF estimated density')
    # The legend is drawn only on the plots at index 4 and 11.
    if index % 7 == 4: 
        plt.legend(loc=(1.04,0.85))
    plt.title(plot_titles[index])
    plt.show()
    
# plt.hist returns the bin edges; they are reused above so both histograms share the same binning.

In [None]:
# Per-event losses (negative log-probability) of background and signal-training
# events under the background-trained flow.
bkgtr_bkgloss = -new_bkg_flow.log_prob(bkg_data)[0].detach().cpu().numpy()
bkgtr_sigloss = -new_bkg_flow.log_prob(sig_training_data)[0].detach().cpu().numpy()
print(bkgtr_bkgloss.shape)
print(bkgtr_sigloss.shape)

In [None]:
# Efficiencies over the threshold scan (signal-like = loss above the cut).
tpr, fpr = get_tpr_fpr_bkgtr(bkgtr_sigloss, bkgtr_bkgloss)
tpr_np, fpr_np = np.array(tpr), np.array(fpr)

# Drop thresholds with zero background efficiency so 1/fpr stays finite.
nonzero_idx = np.nonzero(fpr_np)

tpr_inverse = tpr_np[nonzero_idx]
fpr_inverse = 1/fpr_np[nonzero_idx]

In [None]:
# Signal efficiency versus background rejection (1 / background efficiency).
plt.plot(tpr_inverse,fpr_inverse)
plt.xlabel(r'$\epsilon_{sig}$',fontsize=15)
plt.ylabel(r'$1/\epsilon_{bkg}$',fontsize=15)
plt.yscale('log')
plt.title('Bkg-trained Pure NF Model')
plt.show()

In [None]:
# ROC curve and its area.
# NOTE(review): the scan only covers thresholds in [0, 100]; if losses fall
# outside that range the curve does not span the full (0,0)-(1,1) range and the
# AUC is truncated accordingly — confirm the loss range.
bkgtr_auc = metrics.auc(fpr,tpr)
plt.plot(fpr,tpr)
plt.xlabel(r'$\epsilon_{bkg}$',fontsize=15)
plt.ylabel(r'$\epsilon_{sig}$',fontsize=15)
plt.title('Bkg-trained Pure NF Model (AUC = %s)' % round(bkgtr_auc,3))
plt.show()

# Sig-trained post-training analysis

In [None]:
def get_tpr_fpr_sigtr(sigloss, bkgloss, bins=None):
    """Scan loss thresholds and return signal/background efficiencies.

    For the signal-trained model an event is tagged as signal-like when its
    loss is strictly less than the threshold.

    Parameters
    ----------
    sigloss : array-like
        Per-event losses of the signal events.
    bkgloss : array-like
        Per-event losses of the background events.
    bins : array-like, optional
        Thresholds to scan. Defaults to ``np.linspace(0, 100, 10001)``.
        Losses outside the scanned range are never separated by any cut, so
        pass a wider grid when losses can be negative or exceed 100.

    Returns
    -------
    tpr, fpr : list of float
        Fraction of signal / background events below each threshold, in the
        order of ``bins``.

    Raises
    ------
    ValueError
        If either loss array is empty (the efficiencies would be undefined).
    """
    sigloss = np.asarray(sigloss)
    bkgloss = np.asarray(bkgloss)
    if sigloss.size == 0 or bkgloss.size == 0:
        raise ValueError("sigloss and bkgloss must both be non-empty")
    if bins is None:
        bins = np.linspace(0, 100, 10001)

    n_sig = len(sigloss)
    n_bkg = len(bkgloss)
    tpr = [np.count_nonzero(sigloss < cut) / n_sig for cut in bins]
    fpr = [np.count_nonzero(bkgloss < cut) / n_bkg for cut in bins]

    return tpr, fpr

In [None]:
# Reload the signal-trained flow from disk and recompute its mean loss.
new_sig_flow_list = []

new_sig_loss_dict = dict()

filename = 'Pure_NF_%s_k%s_hf%s_nbpl%s' % (flow_type, num_layers, hidden_features, num_blocks_per_layer)

# NOTE(review): the training cell only writes this file when save_models is
# True, and save_models is set to False above — so this loads whatever
# checkpoint already exists on disk (or fails if none does).
# NOTE(review): torch.load unpickles a full model object; only load trusted files.
new_sig_flow = torch.load("new_sample_flows/M3000_TV/sig_%s.pt" % (filename))

new_sig_flow_list.append(new_sig_flow)

# Mean negative log-probability of the signal training sample under the loaded flow.
new_sig_loss = -new_sig_flow.log_prob(sig_training_data)[0].mean().detach().cpu().numpy()
new_sig_loss_dict[hidden_features] = new_sig_loss

# Compare the loss recorded during training with the loss of the reloaded model.
print(sig_loss_dict)
print(new_sig_loss_dict)

In [None]:
# Draw as many synthetic events from the flow as there are signal training events.
num_samples_tr = int(sig_training_data.shape[0])

with torch.no_grad(): 
    sig_samples = new_sig_flow_list[-1].sample(num_samples_tr).detach().cpu().numpy()
#sig_samples = new_sig_flow_list[0].sample(num_samples_tr).detach().cpu().numpy()
print(sig_samples.shape)
# Row indices of samples with any feature outside [-10, 10]; a row appears once
# per offending feature, which np.delete tolerates.
sig_samples_bad_indices = np.argwhere((sig_samples < -10) | (sig_samples > 10))[:,0]
print(sig_samples_bad_indices[:25])
new_sig_samples = np.delete(sig_samples, sig_samples_bad_indices, axis = 0)

print(new_sig_samples.shape)

print("Hidden Features: ", hidden_features)

# One title per input feature, in column order.
plot_titles = [r'$M_{j1}$', r'Jet 1 $\tau_{21}$', r'Jet 1 $\tau_{32}$', r'Jet 1 $\tau_{43}$', r'Jet 1 $\tau_s$', r'Jet 1 $P_b$', r'Jet 1 $n_{pf}$', 
              r'$M_{j2}$', r'Jet 2 $\tau_{21}$', r'Jet 2 $\tau_{32}$', r'Jet 2 $\tau_{43}$', r'Jet 2 $\tau_s$', r'Jet 2 $P_b$', r'Jet 2 $n_{pf}$',]

# Overlay, feature by feature, the input distribution and the flow's samples.
for index in range(num_features): 
    n, bins, patches = plt.hist(sig_training_data[:, index], bins=50, histtype='step', label='Input distribution')
    plt.hist(new_sig_samples[:, index], bins=bins, histtype='step', label='NF estimated density')
    # The legend is drawn only on the plots at index 4 and 11.
    if index % 7 == 4: 
        plt.legend(loc=(1.04,0.85))
    plt.title(plot_titles[index])
    plt.show()
    
# plt.hist returns the bin edges; they are reused above so both histograms share the same binning.

In [None]:
# Per-event losses (negative log-probability) of background and signal-training
# events under the signal-trained flow.
sigtr_bkgloss = -new_sig_flow.log_prob(bkg_data)[0].detach().cpu().numpy()
sigtr_sigloss = -new_sig_flow.log_prob(sig_training_data)[0].detach().cpu().numpy()
print(sigtr_bkgloss.shape)
print(sigtr_sigloss.shape)

In [None]:
# Efficiencies over the threshold scan (signal-like = loss below the cut).
tpr, fpr = get_tpr_fpr_sigtr(sigtr_sigloss, sigtr_bkgloss)
tpr_np, fpr_np = np.array(tpr), np.array(fpr)

# Drop thresholds with zero background efficiency so 1/fpr stays finite.
nonzero_idx = np.nonzero(fpr_np)

tpr_inverse = tpr_np[nonzero_idx]
fpr_inverse = 1/fpr_np[nonzero_idx]

In [None]:
# Signal efficiency versus background rejection (1 / background efficiency).
plt.plot(tpr_inverse,fpr_inverse)
plt.xlabel(r'$\epsilon_{sig}$',fontsize=15)
plt.ylabel(r'$1/\epsilon_{bkg}$',fontsize=15)
plt.yscale('log')
plt.title('Sig-trained Pure NF Model')
plt.show()

In [None]:
# ROC curve and its area.
# NOTE(review): the scan only covers thresholds in [0, 100]; if losses fall
# outside that range the curve does not span the full (0,0)-(1,1) range and the
# AUC is truncated accordingly — confirm the loss range.
sigtr_auc = metrics.auc(fpr,tpr)
plt.plot(fpr,tpr)
plt.xlabel(r'$\epsilon_{bkg}$',fontsize=15)
plt.ylabel(r'$\epsilon_{sig}$',fontsize=15)
plt.title('Sig-trained Pure NF Model (AUC = %s)' % round(sigtr_auc,3))
plt.show()

# Bump Hunt CSV File Converter

In [None]:
# Number of sample batches to load for the test-time background and signal.
# NOTE(review): num_sig_batches is not referenced anywhere else in this file.
num_bkg_batches = 2
num_sig_batches = 1

In [None]:
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# Load the QCD background test sample; this rebinds bkg_data, bkg_unnorm_data
# and bkg_masses, replacing the training-sample arrays of the same names.
bkg_data, bkg_unnorm_data, bkg_masses = LAPS_test(sample_type = 'qcdbkg', num_batches = num_bkg_batches)

# NOTE(review): bkg_mean / bkg_std are overwritten here with statistics of the
# TEST background. The Wp3000 sample was loaded earlier with the training
# background statistics, while the samples loaded further down receive these
# new values as `inp_meanstd` — confirm the two sets of samples are meant to be
# treated differently.
bkg_mean = np.mean(bkg_unnorm_data, axis=0)
bkg_std = np.std(bkg_unnorm_data, axis=0)

In [None]:
# Assemble a shuffled, labelled test set from test background plus held-out signal.
num_bkg_samples = bkg_data.shape[0]
num_sig_samples = sig_testing_data.shape[0]
sig_sample_percentage = 100 * num_sig_samples / (num_bkg_samples + num_sig_samples)

# Append a label column: 0 for background, 1 for signal ("WL" = with label).
bkg_data_WL = np.concatenate((bkg_data, np.zeros((num_bkg_samples,1), dtype='float32')), axis=1)
sig_testing_data_WL = np.concatenate((sig_testing_data, np.ones((num_sig_samples,1), dtype='float32')), axis=1)

test_data = np.concatenate((bkg_data_WL, sig_testing_data_WL), axis=0)
test_masses = np.concatenate((bkg_masses, sig_testing_masses), axis=0)
#test_data = bkg_data_WL
#test_masses = bkg_masses
# Prepend the masses as the first column so one shuffle keeps mass, features
# and label aligned; axis=1 concatenation requires the masses to be 2-D.
test_data_WM = np.concatenate((test_masses, test_data), axis=1)

# In-place shuffle of the rows.
np.random.shuffle(test_data_WM)

# Split the shuffled matrix back into mass (first column), label (last column)
# and features (everything in between).
test_masses = test_data_WM[:,0]
test_labels = test_data_WM[:,-1]
test_data = test_data_WM[:,1:-1]

num_test_samples = test_data.shape[0]

print("Num bkg samples: ", num_bkg_samples)
print("Num sig samples: ", num_sig_samples)
print("Num test samples: ", num_test_samples)
print("Signal Percentage: ", sig_sample_percentage)

In [None]:
# Export the shuffled test masses.
df_mass = pd.DataFrame(np.ndarray.tolist(test_masses))
df_mass.to_csv('csv_files/test_masses.csv')

In [None]:
# Load both trained flows from disk.
# NOTE(review): these files are only written by the training cells when
# save_models is True; torch.load unpickles full model objects, so only load
# trusted files.
filename = 'Pure_NF_%s_k%s_hf%s_nbpl%s' % (flow_type, num_layers, hidden_features, num_blocks_per_layer)
bkg_model = torch.load("new_sample_flows/M3000_TV/bkg_%s.pt" % (filename))
sig_model = torch.load("new_sample_flows/M3000_TV/sig_%s.pt" % (filename))

In [None]:
# Row positions of background (label 0) and signal (label 1) test events;
# np.where returns these as one-element tuples of index arrays.
bkg_loss_indices = np.where(test_labels==0)
sig_loss_indices = np.where(test_labels==1)

In [None]:
# Loss of every test event under the background-trained flow.
bkgtr_test_losses = -bkg_model.log_prob(test_data)[0].detach().cpu().numpy()
print(bkgtr_test_losses.shape)

In [None]:
bkgtr_bkg_losses = bkgtr_test_losses[bkg_loss_indices]
print(bkgtr_bkg_losses.shape)

bkgtr_sig_losses = bkgtr_test_losses[sig_loss_indices]
print(bkgtr_sig_losses.shape)

In [None]:
df_bkgloss = pd.DataFrame(np.ndarray.tolist(bkgtr_test_losses))
df_bkgloss.to_csv('csv_files/bkgtr_test_losses.csv')

In [None]:
# Loss of every test event under the signal-trained flow.
sigtr_test_losses = -sig_model.log_prob(test_data)[0].detach().cpu().numpy()
print(sigtr_test_losses.shape)

In [None]:
sigtr_bkg_losses = sigtr_test_losses[bkg_loss_indices]
print(sigtr_bkg_losses.shape)

sigtr_sig_losses = sigtr_test_losses[sig_loss_indices]
print(sigtr_sig_losses.shape)

In [None]:
df_sigloss = pd.DataFrame(np.ndarray.tolist(sigtr_test_losses))
df_sigloss.to_csv('csv_files/sigtr_test_losses.csv')

In [None]:
# Mass spectra of events passing a fixed background-loss cut and a sliding
# signal-loss cut.
bkgloss_cut = 15
# Despite the "low_" prefix, this selects events whose background-model loss
# is ABOVE the cut.
low_bkgloss_indices = np.where(bkgtr_test_losses > bkgloss_cut)[0]
for index in range(2,8):  
    
    # Signal-loss cut takes the values 10, 15, ..., 35.
    sigloss_cut = 5*index
    low_sigloss_indices = np.where(sigtr_test_losses < sigloss_cut)[0]
    # Events passing both cuts, and the true-background subset of them.
    low_loss_indices = np.intersect1d(low_bkgloss_indices, low_sigloss_indices)
    low_loss_test_masses = test_masses[low_loss_indices]
    low_loss_bkg_masses = test_masses[np.intersect1d(low_loss_indices, bkg_loss_indices)]
    
    # Reuse the first histogram's bin edges so both spectra share the same binning.
    n, bins, patches = plt.hist(low_loss_test_masses, bins=50, histtype = 'step', label = 'bkg + sig')
    plt.hist(low_loss_bkg_masses, bins = bins, histtype = 'step', label = 'bkg only')
    
    plt.xlabel('bkgloss > %s, sigloss < %s' % (bkgloss_cut, sigloss_cut))
    plt.legend()
    plt.show()

In [None]:
# Larger canvas for the two-dimensional loss-space plot.
plt.rcParams["figure.figsize"] = (10,10)

In [None]:
# Each test event placed at (background-model loss, signal-model loss),
# coloured by its true label; axes are clipped to [0, 100].
plt.scatter(bkgtr_bkg_losses, sigtr_bkg_losses, s=2, label = 'Bkg samples')
plt.scatter(bkgtr_sig_losses, sigtr_sig_losses, s=2, label = 'Sig samples')
plt.xlim(0,100)
plt.ylim(0,100)
plt.xlabel("bkg-Trained Model Loss")
plt.ylabel("sig-Trained Model Loss")
plt.title("Testing Data QUAK Space")
plt.legend()
plt.show()

# Load and process RSGraviton3000 samples

In [None]:
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# Load the RSGraviton3000 sample, passing the current bkg_mean / bkg_std to the
# helper as `inp_meanstd`.
RSGraviton3000_data, _, _ = LAPS_test(sample_type = 'RSGraviton3000', num_batches = 1, inp_meanstd = (bkg_mean, bkg_std))
num_RSGraviton3000_samples = RSGraviton3000_data.shape[0]

# Keep the same fraction of events as the held-out share of the Wp3000 split
# (100 minus the training percentage).
RSGraviton3000_testing_data_percentage = 100 - sig_training_data_percentage
num_RSGraviton3000_testing_samples = int(RSGraviton3000_testing_data_percentage * num_RSGraviton3000_samples / 100)
print('Number of testing samples: %s' % num_RSGraviton3000_testing_samples)

# Random subset of that size.
indices = np.random.permutation(num_RSGraviton3000_samples)
RSGraviton3000_testing_indices = indices[:num_RSGraviton3000_testing_samples]
RSGraviton3000_testing_data = RSGraviton3000_data[RSGraviton3000_testing_indices]

In [None]:
# Losses of the subset under the background-trained flow.
bkgtr_RSGraviton3000_losses = -bkg_model.log_prob(RSGraviton3000_testing_data)[0].detach().cpu().numpy()

In [None]:
# Losses of the subset under the signal-trained flow.
sigtr_RSGraviton3000_losses = -sig_model.log_prob(RSGraviton3000_testing_data)[0].detach().cpu().numpy()

# Load and process Qstar3000 samples

In [None]:
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# Load the Qstar3000 sample, passing the current bkg_mean / bkg_std to the
# helper as `inp_meanstd`.
Qstar3000_data, _, _ = LAPS_test(sample_type = 'Qstar3000', num_batches = 1, inp_meanstd = (bkg_mean, bkg_std))
num_Qstar3000_samples = Qstar3000_data.shape[0]

# Keep the same fraction of events as the held-out share of the Wp3000 split
# (100 minus the training percentage).
Qstar3000_testing_data_percentage = 100 - sig_training_data_percentage
num_Qstar3000_testing_samples = int(Qstar3000_testing_data_percentage * num_Qstar3000_samples / 100)
print('Number of testing samples: %s' % num_Qstar3000_testing_samples)

# Random subset of that size.
indices = np.random.permutation(num_Qstar3000_samples)
Qstar3000_testing_indices = indices[:num_Qstar3000_testing_samples]
Qstar3000_testing_data = Qstar3000_data[Qstar3000_testing_indices]

In [None]:
# Losses of the subset under the background-trained flow.
bkgtr_Qstar3000_losses = -bkg_model.log_prob(Qstar3000_testing_data)[0].detach().cpu().numpy()

In [None]:
# Losses of the subset under the signal-trained flow.
sigtr_Qstar3000_losses = -sig_model.log_prob(Qstar3000_testing_data)[0].detach().cpu().numpy()

# Load and process Wkk3000 samples

In [None]:
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# Load the Wkk3000 sample, passing the current bkg_mean / bkg_std to the helper
# as `inp_meanstd`.
Wkk3000_data, _, _ = LAPS_test(sample_type = 'Wkk3000', num_batches = 1, inp_meanstd = (bkg_mean, bkg_std))
num_Wkk3000_samples = Wkk3000_data.shape[0]

# Keep the same fraction of events as the held-out share of the Wp3000 split
# (100 minus the training percentage).
Wkk3000_testing_data_percentage = 100 - sig_training_data_percentage
num_Wkk3000_testing_samples = int(Wkk3000_testing_data_percentage * num_Wkk3000_samples / 100)
print('Number of testing samples: %s' % num_Wkk3000_testing_samples)

# Random subset of that size.
indices = np.random.permutation(num_Wkk3000_samples)
Wkk3000_testing_indices = indices[:num_Wkk3000_testing_samples]
Wkk3000_testing_data = Wkk3000_data[Wkk3000_testing_indices]

In [None]:
# Losses of the subset under the background-trained flow.
bkgtr_Wkk3000_losses = -bkg_model.log_prob(Wkk3000_testing_data)[0].detach().cpu().numpy()

In [None]:
# Losses of the subset under the signal-trained flow.
sigtr_Wkk3000_losses = -sig_model.log_prob(Wkk3000_testing_data)[0].detach().cpu().numpy()

# Master QUAK Space 

In [None]:
# Upper axis limits for the loss-space plots; events beyond them are not shown.
x_bad_loss_cutoff = 100
y_bad_loss_cutoff = 100

In [None]:
plt.rcParams["figure.figsize"] = (10,10)

In [None]:
# Append a single (0, 0) point to the background losses.
# NOTE(review): the purpose is not evident from this file, and the cell is not
# idempotent — every re-run appends another zero to both arrays.
bkgtr_bkg_losses = np.append(bkgtr_bkg_losses, np.array([0,]))
sigtr_bkg_losses = np.append(sigtr_bkg_losses, np.array([0,]))

In [None]:
# Scatter of every sample in (background-model loss, signal-model loss) space.
plt.scatter(bkgtr_bkg_losses, sigtr_bkg_losses, s=2, label = 'QCD bkg samples')
plt.scatter(bkgtr_sig_losses, sigtr_sig_losses, s=2, label = r'''W'$\rightarrow$tB' (M=3000) sig samples''')
plt.scatter(bkgtr_Wkk3000_losses, sigtr_Wkk3000_losses, s=2, label = r'''Wkk$\rightarrow$WR$\rightarrow$W+WW (M=3000) sig samples''')
plt.scatter(bkgtr_RSGraviton3000_losses, sigtr_RSGraviton3000_losses, s=2, label = r'''G$\rightarrow$gg (M=3000) sig samples''')
plt.scatter(bkgtr_Qstar3000_losses, sigtr_Qstar3000_losses, s=2, label = r'''q*$\rightarrow$qW (M=3000) sig samples''')
plt.xlim(0, x_bad_loss_cutoff)
plt.ylim(0, y_bad_loss_cutoff)
plt.xlabel('QCD Bkg Model Loss')
plt.ylabel(r'''W'$\rightarrow$tB' (M=3000) Sig Model Loss''')
plt.title('Testing Data QUAK Space (Scatter Plot)')
plt.legend()
plt.show()

In [None]:
num_bins = 5000

# Pool the losses of all samples for a single density map.
bkgtr_all_losses = np.concatenate((bkgtr_bkg_losses, bkgtr_sig_losses, bkgtr_Wkk3000_losses, bkgtr_RSGraviton3000_losses, bkgtr_Qstar3000_losses))
sigtr_all_losses = np.concatenate((sigtr_bkg_losses, sigtr_sig_losses, sigtr_Wkk3000_losses, sigtr_RSGraviton3000_losses, sigtr_Qstar3000_losses))

# NOTE(review): no `range` is given, so the 5000 bins per axis span the full
# extent of the data and only afterwards is the view clipped to the cutoffs;
# the visible resolution therefore depends on the largest losses — confirm this
# is intended.
plt.hist2d(bkgtr_all_losses, sigtr_all_losses, cmap = plt.cm.jet, bins=num_bins)
plt.colorbar()
plt.xlabel('QCD Bkg Model Loss')
plt.ylabel(r'''W'$\rightarrow$tB' (M=3000) Sig Model Loss''')
plt.title('Testing Data QUAK Space (Heat Map)')
plt.xlim(0, x_bad_loss_cutoff)
plt.ylim(0, y_bad_loss_cutoff)
plt.show()

# Normalized Input Variable Density Histograms

In [None]:
plt.rcParams["figure.figsize"] = (5,5)

In [None]:
# One title per input feature, in column order.
plot_titles = [r'$M_{j1}$', r'Jet 1 $\tau_{21}$', r'Jet 1 $\tau_{32}$', r'Jet 1 $\tau_{43}$', r'Jet 1 $\tau_s$', r'Jet 1 $P_b$', r'Jet 1 $n_{pf}$', 
              r'$M_{j2}$', r'Jet 2 $\tau_{21}$', r'Jet 2 $\tau_{32}$', r'Jet 2 $\tau_{43}$', r'Jet 2 $\tau_s$', r'Jet 2 $P_b$', r'Jet 2 $n_{pf}$',]

# Density-normalized histogram of every feature for each sample, all sharing
# the bin edges of the background histogram.
# NOTE(review): the Wp3000 entry is labelled W'->WZ here but W'->tB' in the
# QUAK-space plots — confirm which decay label is correct.
# NOTE(review): the Wp3000 curve uses the held-out testing subset, whereas the
# other three signals use their full loaded samples rather than their
# *_testing_data subsets — confirm this is intended.
for index in range(num_features): 
    n, bins, patches = plt.hist(bkg_data[:, index], bins=30, histtype='step', density=True, label='QCD bkg samples')
    plt.hist(sig_testing_data[:, index], bins=bins, histtype='step', density=True, label=r'''W'$\rightarrow$WZ (M=3000) sig samples''')
    plt.hist(Wkk3000_data[:, index], bins=bins, histtype='step', density=True, label=r'''Wkk$\rightarrow$WR$\rightarrow$W+WW (M=3000) sig samples''')
    plt.hist(RSGraviton3000_data[:, index], bins=bins, histtype='step', density=True, label=r'''G$\rightarrow$gg (M=3000) sig samples''')
    plt.hist(Qstar3000_data[:, index], bins=bins, histtype='step', density=True, label=r'''q*$\rightarrow$qW (M=3000) sig samples''')
    # The legend is drawn only on the plots at index 4 and 11.
    if index % 7 == 4: 
        plt.legend(loc=(1.04,0.64))
    plt.title(plot_titles[index])
    plt.show()