In [1]:
import h5py
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.externals import joblib
import pickle
import argparse
import os

DATA_DIR = '/bigdata/shared/analysis/'
BACKGROUND = ['DYJets','Other','QCD','SingleTop','TTJets','WJets','ZInv']
SIGNAL = ['T2qq_900_850','T2qq_850_800','T2qq_800_750','T2qq_750_700','T2qq_700_650',
          'T2qq_650_600','T2qq_600_550','T2qq_550_500','T2qq_500_450', 'T2qq_450_400']


In [2]:
def remove_field_name(a, name):
    names = list(a.dtype.names)
    if name in names:
        names.remove(name)
    b = a[names]
    return b

In [49]:
Signal = h5py.File(DATA_DIR+SIGNAL[1]+'.h5','r')['Data'][:] # 1 = 850-800
features = (list(Signal.dtype.names))
print (features)
descr = Signal.dtype.descr
for i in range(len(descr)):
    if 'f4' not in descr[i][1]: descr[i] = (descr[i][0], np.float32)
print (descr)

['label', 'weight', 'alphaT', 'dPhiMinJetMET', 'dPhiRazor', 'HT', 'jet1MT', 'leadingJetCISV', 'leadingJetPt', 'MET', 'MHT', 'MR', 'MT2', 'nSelectedJets', 'Rsq', 'subleadingJetPt']
[('label', '<f4'), ('weight', '<f4'), ('alphaT', '<f4'), ('dPhiMinJetMET', '<f4'), ('dPhiRazor', '<f4'), ('HT', '<f4'), ('jet1MT', '<f4'), ('leadingJetCISV', '<f4'), ('leadingJetPt', '<f4'), ('MET', '<f4'), ('MHT', '<f4'), ('MR', '<f4'), ('MT2', '<f4'), ('nSelectedJets', <class 'numpy.float32'>), ('Rsq', '<f4'), ('subleadingJetPt', '<f4')]


In [52]:
for feature in features:
    if 'label' == feature or 'weight' == feature: continue

    print ("Removing {}".format(feature))
    from training import to_regular_array

    def remove_outlier(arr):
        # Will remove outlier according to each feature.
        print ("Removing outlier")

        def clip(arr, lower, sub_lower, upper, sub_upper):
            arr[arr < lower] = float(sub_lower)
            arr[arr > upper] = float(sub_upper)
            return arr

        arr['alphaT'] = clip(arr['alphaT'], 0, 0, 10, 10)
        arr['dPhiMinJetMET'] = clip(arr['dPhiMinJetMET'], -np.pi, 0, np.pi, 0)
        arr['dPhiRazor'] = clip(arr['dPhiRazor'], -np.pi, 0, np.pi, 0)
        arr['HT'] = clip(arr['HT'], 0, 0, 3000, 3000)
        arr['jet1MT'] = clip(arr['jet1MT'], 0, 0, 3000, 3000)
        arr['leadingJetCISV'] = clip(arr['leadingJetCISV'], 0, 0, 1, 1)
        arr['leadingJetPt'] = clip(arr['leadingJetPt'], 0, 0, 2000, 2000)
        arr['MET'] = clip(arr['MET'], 0, 0, 5000, 5000)
        arr['MHT'] = clip(arr['MHT'], 0, 0, 2000, 2000)
        arr['MR'] = clip(arr['MR'], 0, 0, 5000, 5000)
        arr['MT2'] = clip(arr['MT2'], 0, 0, 5000, 5000)
        arr['nSelectedJets'] = clip(arr['nSelectedJets'], 0, 0, 20, 20)
        arr['Rsq'] = clip(arr['Rsq'], 0, 0, 2, 2)
        arr['subleadingJetPt'] = clip(arr['subleadingJetPt'], 0, 0, 2000, 2000)

        return arr

    def split_dataset(dataset):
        np.random.shuffle(dataset)
        train_index = int(0.6 * dataset.shape[0])
        val_index = int(train_index*4/3)
        training_set = dataset[:train_index]
        val_set = dataset[train_index:val_index]
        test_set = dataset[val_index:]
        return training_set, val_set, test_set

    for i,bkg in enumerate(BACKGROUND):
        _file = h5py.File(DATA_DIR+'/'+bkg+'.h5','r')
        _background = to_regular_array(remove_field_name(remove_outlier(_file['Data'][:]), feature))
        if i == 0: Background = np.copy(_background)
        else: Background = np.vstack((Background, _background))
    Signal = to_regular_array(remove_field_name(remove_outlier(h5py.File(DATA_DIR+SIGNAL[0]+'.h5','r')['Data'][:]),feature))

    def to_regular_array(struct_array):
        # There is an integer column (nSelectedJets) in the structured array. 
        # Need to convert to float before converting to regular array
        dt = struct_array.dtype.descr
        for i in range(len(dt)):
            if 'f4' not in dt[i][1]:
                dt[i] = (dt[i][0],np.float32)
        converted = np.array(struct_array, dtype=dt)
        return converted.view((np.float32, len(converted.dtype.names)))
    
    from training import clean_dataset
    Signal = clean_dataset(Signal)
    Background = clean_dataset(Background)
    
    train_bkg, val_bkg, test_bkg = split_dataset(Background)
    train_sn, val_sn, test_sn = split_dataset(Signal)

    train_weight = float(train_sn.shape[0])/float(train_bkg.shape[0])
    val_weight = float(val_sn.shape[0])/float(val_bkg.shape[0])

    def random_frac(arr, fraction, label = ''):
        print("Getting a fraction of {} from {}".format(fraction,label))
        print("- Initial shape: ")
        print(arr.shape)
        sample_size = int(fraction*arr.shape[0])
        print("- Sampling {} entries".format(sample_size))
        idx = np.random.randint(arr.shape[0],size=sample_size)
        out = arr[idx,:]
        print("- Output shape: ")
        print(out.shape)
        return out


    train_bkg = random_frac(train_bkg, train_weight, "Training Background")
    val_bkg = random_frac(val_bkg, train_weight, "Validation Background")

    with h5py.File(DATA_DIR+"/FeatureRemoval/Undersampling_Dataset_No_{}.h5".format(feature),'w') as outfile:
        print(train_sn.shape)
        print(train_bkg.shape)
        train_dataset = np.vstack((train_sn, train_bkg))
        val_dataset = np.vstack((val_sn, val_bkg))
        test_dataset = np.vstack((test_sn, test_bkg))
        print("Finished stacking")
        outfile.create_dataset("Training", data=train_dataset)
        outfile.create_dataset("Validation", data=val_dataset)
        outfile.create_dataset("Test",data=test_dataset)


Removing alphaT
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Before cleaning: (14765, 15)
After cleaning: (14765, 15)
Before cleaning: (51627989, 15)
After cleaning: (51627989, 15)
Getting a fraction of 0.0002859882880710085 from Training Background
- Initial shape: 
(30976793, 15)
- Sampling 8859 entries
- Output shape: 
(8859, 15)
Getting a fraction of 0.0002859882880710085 from Validation Background
- Initial shape: 
(10325597, 15)
- Sampling 2952 entries
- Output shape: 
(2952, 15)
(8859, 15)
(8859, 15)
Finished stacking
Removing dPhiMinJetMET
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Removing outlier
Before cleaning: (14765, 15)
After cleaning: (14765, 15)
Before cleaning: (51627989, 15)
After cleaning: (51627805, 15)
Getting a fraction of 0.0002859893036320254 from Training Background
- Initial shape: 
(30976683, 1

Removing outlier
Removing outlier
Removing outlier
Before cleaning: (14765, 15)
After cleaning: (14765, 15)
Before cleaning: (51627989, 15)
After cleaning: (51627805, 15)
Getting a fraction of 0.0002859893036320254 from Training Background
- Initial shape: 
(30976683, 15)
- Sampling 8859 entries
- Output shape: 
(8859, 15)
Getting a fraction of 0.0002859893036320254 from Validation Background
- Initial shape: 
(10325561, 15)
- Sampling 2952 entries
- Output shape: 
(2952, 15)
(8859, 15)
(8859, 15)
Finished stacking


In [48]:
% ls /bigdata/shared/analysis/FeatureRemoval

Undersampling_Dataset_No_alphaT.h5
Undersampling_Dataset_No_dPhiMinJetMET.h5
Undersampling_Dataset_No_dPhiRazor.h5
Undersampling_Dataset_No_HT.h5
Undersampling_Dataset_No_jet1MT.h5
Undersampling_Dataset_No_leadingJetCISV.h5
Undersampling_Dataset_No_leadingJetPt.h5
Undersampling_Dataset_No_MET.h5
Undersampling_Dataset_No_MHT.h5
Undersampling_Dataset_No_MR.h5
Undersampling_Dataset_No_MT2.h5
Undersampling_Dataset_No_nSelectedJets.h5
Undersampling_Dataset_No_Rsq.h5
Undersampling_Dataset_No_subleadingJetPt.h5


In [47]:
% rm /bigdata/shared/analysis/FeatureRemoval/Undersampling_Dataset_No_weight.h5 /bigdata/shared/analysis/FeatureRemoval/Undersampling_Dataset_No_label.h5