In [1]:
import os
import time
import numpy as np
import xgboost as xgb
from copy import deepcopy
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Compact array display: two decimals, fixed-point (no scientific notation), 150-character lines.
np.set_printoptions(
    precision=2,
    linewidth=150,
    suppress=True,
)

In [3]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields NaN where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    # Pre-fill with NaN and divide only where defined, so an undefined rate
    # is reported as NaN without numpy emitting a RuntimeWarning.
    result = np.full(denominator.shape, np.nan, dtype=float)
    np.divide(numerator, denominator, out=result, where=denominator != 0)
    return result


def perf_measure(y_true, y_pred):
    """Compute per-class, one-vs-rest classification rates.

    The confusion matrix of ``y_true`` against ``y_pred`` is built with
    ``confusion_matrix`` and, for every class (one entry per row/column of
    that matrix), the true/false positive/negative counts are derived.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded unchanged to ``confusion_matrix``.
    y_pred : array-like
        Predicted labels, forwarded unchanged to ``confusion_matrix``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(PPV, TPR, FSCORE, FNR, FPR, TNR)`` as float arrays:

        * ``PPV``    - precision, ``TP / (TP + FP)``
        * ``TPR``    - recall / sensitivity, ``TP / (TP + FN)``
        * ``FSCORE`` - ``2 * PPV * TPR / (PPV + TPR)``
        * ``FNR``    - false negative rate, ``FN / (TP + FN)``
        * ``FPR``    - false positive rate (fall-out), ``FP / (FP + TN)``
        * ``TNR``    - specificity, ``TN / (TN + FP)``

        A rate whose denominator is zero (e.g. precision of a class that is
        never predicted) is NaN.
    """
    cnf_matrix = confusion_matrix(y_true, y_pred)

    # Diagonal = correct predictions; column sums = predicted counts;
    # row sums = actual counts.
    TP = np.diag(cnf_matrix).astype(float)
    FP = cnf_matrix.sum(axis=0).astype(float) - TP
    FN = cnf_matrix.sum(axis=1).astype(float) - TP
    TN = float(cnf_matrix.sum()) - (FP + FN + TP)

    # Specificity or true negative rate
    TNR = _safe_ratio(TN, TN + FP)
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = _safe_ratio(TP, TP + FN)
    # Precision or positive predictive value
    PPV = _safe_ratio(TP, TP + FP)
    # Fall out or false positive rate
    FPR = _safe_ratio(FP, FP + TN)
    # False negative rate
    FNR = _safe_ratio(FN, TP + FN)

    # Harmonic mean of precision and recall; NaN when either is undefined
    # or when both are zero.
    FSCORE = _safe_ratio(2 * PPV * TPR, PPV + TPR)

    return PPV, TPR, FSCORE, FNR, FPR, TNR

### Loading Synthetic Data

In [4]:
# Every array lives under matrices/ and shares this file-name prefix.
name_of_particle = 'JetHTs'
matrix_prefix = "matrices/" + name_of_particle

# Feature matrices and label vectors for the train / validation / test splits.
X_train = np.load(f"{matrix_prefix}_train.npy")
y_train = np.load(f"{matrix_prefix}_y_train.npy")
X_val = np.load(f"{matrix_prefix}_val.npy")
y_val = np.load(f"{matrix_prefix}_y_val.npy")
X_test = np.load(f"{matrix_prefix}_test.npy")
y_test = np.load(f"{matrix_prefix}_y_test.npy")

# Drop the last three columns of every feature matrix.
# NOTE(review): presumably non-feature metadata columns - confirm against the data producer.
X_train, X_val, X_test = (features[:, :-3] for features in (X_train, X_val, X_test))

# V is the number of remaining feature columns.
_, V = X_train.shape
# NOTE(review): K looks like the number of classes - confirm.
K = 2
V

169

In [5]:
# Random-forest configurations to evaluate, one tuple per run:
#   (n_estimators, max_depth, max_leaf_nodes, min_samples_leaf)
# Other configurations previously listed here:
#   (120, 20, 50, 5), (120, 40, 50, 5), (50, 20, 25, 5), (60, 20, 50, 3)
setting_rndFor = [
    (60, 20, 50, 5),
]

In [6]:
# Train and evaluate one random forest per configuration in setting_rndFor.
# Each iteration overwrites clf, the predictions and the metrics, so only the
# results of the last configuration remain available to later cells.
for i in setting_rndFor:
    # Name the fields instead of indexing the tuple positionally.
    n_estimators, max_depth, max_leaf_nodes, min_samples_leaf = i

    start = time.time()

    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=0)
    clf.fit(X_train, y_train)
    labels_pred_RndFor = clf.predict(X_test)
    # Despite its name this holds the whole result of
    # precision_recall_fscore_support, not just the F1 score.
    # average='macro' is the unweighted mean over classes, so it does not
    # take label imbalance into account.
    f1_score_RndFor = precision_recall_fscore_support(y_test, labels_pred_RndFor, average='macro')
    end = time.time()
    # Wall-clock seconds for fit + predict + macro scoring.
    print(end-start)

    # Per-class rates; a bare expression inside a loop body is never
    # displayed by the notebook, so the values are shown in a later cell.
    PPVr, TPRr, FSCOREr, FNRr, FPRr, TNRr = perf_measure(y_true=y_test, y_pred=labels_pred_RndFor)

50.97291970252991


In [7]:
# Per-class random-forest metrics on the test split, in the order
# (precision, recall, F-score, false negative rate, false positive rate, specificity).
rf_metrics = perf_measure(y_test, labels_pred_RndFor)
PPVr, TPRr, FSCOREr, FNRr, FPRr, TNRr = rf_metrics
rf_metrics

(array([1.  , 0.99]),
 array([1.  , 0.66]),
 array([1.  , 0.79]),
 array([0.  , 0.34]),
 array([0.34, 0.  ]),
 array([0.66, 1.  ]))

In [8]:
# Confusion matrix of the random-forest predictions on the test split
# (rows: true labels, columns: predicted labels).
cnf_matrix = confusion_matrix(y_test, labels_pred_RndFor)
cnf_matrix

array([[27647,     2],
       [  131,   253]])

In [None]:
# NOTE(review): scratch value left in an unexecuted cell. It matches
# 2 / (2 + 27647) from the confusion matrix shown above, i.e. FP / (FP + TN)
# for the second class - confirm, then compute it from the matrix or remove the cell.
7.23353467e-05

### Evaluation and Feature Role Investigation

In [None]:
# XGBoost classifier configured with the multi-class soft-probability
# objective over two classes.
xgb_params = {
    "objective": "multi:softprob",
    "max_depth": 10,
    "n_estimators": 160,
    "num_class": 2,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

In [None]:
# (Optional) put the %%time cell magic on the first line of this cell to time it.
# Fit on the training split, then predict labels for the test split.
labels_pred_XGb = xgb_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class XGBoost metrics on the test split, in the order
# (precision, recall, F-score, false negative rate, false positive rate, specificity).
xgb_metrics = perf_measure(y_test, labels_pred_XGb)
PPV, TPR, FSCORE, FNR, FPR, TNR = xgb_metrics
xgb_metrics