In [None]:
# Global Setup!
# Training configuration read by the cells further down.
isWeighted = False
nTrees = 1000
maxDepth = 3

# Switch for the per-feature histograms and the correlation matrices.
plotVariables = True

# Version tag of the form "<base>_<nTrees>_<maxDepth>"; it is used to name
# the plot and save directories.
version = "_".join(["Final", str(nTrees), str(maxDepth)])

print(version)

In [None]:
# Make the PandoraBDT helper module importable and set the base directories.
# NOTE(review): an earlier comment here mentioned nicing the process so it can
# run on many cores at low priority, but no such call (e.g. os.nice) is made
# in this cell.
import os

# pandoraMVADir is the prefix for the module search path and for the output
# directories; dataDir is the prefix for the training file.
import sys
pandoraMVADir = ""
#dataDir       = "../"
dataDir       = "./"

# With pandoraMVADir == "" this appends '.' (the current working directory)
# to the module search path. The commented-out line is the alternative
# location inside a LArMachineLearningData checkout.
#sys.path.append(pandoraMVADir + 'LArMachineLearningData/scripts')
sys.path.append(pandoraMVADir + '.')

# Import pandora libraries
from importlib import reload
from PandoraBDT import *

# Import concatenation tool
from itertools import chain

# Import relevant SKLearn libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics

# Set global params
# testTrainFraction is handed to Sample() when the data are split into train
# and test samples.
testTrainFraction = 0.5
# NOTE(review): nCores is not referenced by any cell visible here; -1 is
# presumably meant as scikit-learn's "use all cores" n_jobs value -- confirm.
nCores = -1

In [None]:
# Data and Name
trainingFile = dataDir + 'training_output_hierarchy.txt'
#trainingFile = dataDir + 'training_outputnoChargeInfo.txt'
BDTName = "PfoCharacterisation"

# Directories: one for plots and one for trained models, both keyed by the
# BDT name and the version tag.
plotsDir = pandoraMVADir + 'bdt/plots/' + BDTName + '/' + version + '/'
saveDir  = pandoraMVADir + 'bdt/trained/' + BDTName + '/' + version + '/'
print(plotsDir)
print(saveDir)

# exist_ok=True makes the cell safely re-runnable without a separate
# exists() check (which is racy), and raises FileExistsError straight away if
# the path exists but is not a directory instead of failing later on save.
os.makedirs(plotsDir, exist_ok=True)
os.makedirs(saveDir, exist_ok=True)

In [None]:
# Set background and signal label names
# The three lists are parallel: entry k describes the class whose label value
# is signalDefs[k], i.e. label 0 -> 'Shower' (red), label 1 -> 'Track' (blue).
params = {
    'labelNames': ['Shower','Track'],
    'signalDefs': [0, 1],
    'signalCols': ['r', 'b']
}

# Create the base BDT to vary the params from and compare to
# AdaBoostClassifier / DecisionTreeClassifier are not imported by name in this
# notebook; presumably they arrive through `from PandoraBDT import *`.
# The weak learner (a depth-limited decision tree) is passed positionally;
# random_state is fixed so repeated fits are reproducible.
baseBDT = AdaBoostClassifier(DecisionTreeClassifier(max_depth=maxDepth),algorithm='SAMME', 
                             random_state=42, n_estimators=nTrees)

In [None]:
# Load the data
# LoadData is not defined in this notebook (presumably provided by PandoraBDT).
# It returns the loaded data together with the feature and example counts;
# the second argument is presumably the field delimiter -- confirm.
data, nFeatures, nExamples = LoadData(trainingFile, ',')

In [None]:
# Separate the loaded data into the feature matrix and the label vector
features, labels = SplitTrainingSet(data, nFeatures)

# Split into train and test samples
xTrain, yTrain, xTest, yTest = Sample(features, labels, testTrainFraction)

# Partition the full feature set by true label (0 = background, 1 = signal)
backgroundFeatures = features[labels==0]
signalFeatures = features[labels==1]

# Report the number of features per example, the overall shape and the
# per-class example counts
print (len(features[0]))
print (np.shape(features))
print('Total: {}, signal: {} and background: {}'.format(
    len(features), len(signalFeatures), len(backgroundFeatures)))

In [None]:
# Construct the Pandas dataframe
# One entry per feature, keyed by its integer column index, plus the labels
allDict = dict((i, features[:, i]) for i in range(nFeatures))
allDict['Labels'] = labels

# A single dataframe holds both classes; rows are told apart by 'Labels'
df = pd.DataFrame(data=allDict)

In [None]:
# Show the assembled dataframe as the cell output.
df

In [None]:
def DrawVariablesDF(df, params, topdir, save=True):
    """Draw one normalised, per-class histogram for every feature column.

    Every column of ``df`` except 'Labels' is plotted on its own figure, with
    one histogram per distinct value of 'Labels' (50 bins, density-normalised).

    Args:
        df: DataFrame with the feature columns and a 'Labels' column.
        params: dict providing 'signalCols' (histogram colours) and
            'labelNames' (legend entries), both ordered like the sorted
            label values.
        topdir: directory the images are written to; a trailing path
            separator is optional.
        save: when True, write 'Feature_<column>.png' and '.pdf' to topdir.
    """
    for column in df:
        if column == 'Labels':
            continue
        fig, ax = plt.subplots()
        # Pivoting on 'Labels' yields one sub-column per class for this
        # feature, so plot.hist overlays one histogram per class.
        df.pivot(columns='Labels')[column].plot.hist(bins=50, alpha=0.5, color=params['signalCols'], edgecolor='k', density=True, ax=ax)
        ax.legend(params['labelNames'])
        ax.set_xlabel(column)

        plt.tight_layout()
        if save:
            # os.path.join copes with topdir given with or without a trailing
            # separator; plain concatenation silently produced
            # '<topdir>Feature_...' in the latter case.
            stem = os.path.join(topdir, 'Feature_' + str(column))
            plt.savefig(stem + '.png')
            plt.savefig(stem + '.pdf')
        plt.show()
        # Close this specific figure so figures do not pile up in the loop.
        plt.close(fig)

In [None]:
# Make plots drawing the variables for signal/background
if plotVariables:
    DrawVariablesDF(df, params, plotsDir, save=True)

In [None]:
def Correlation(df, label, topdir, save=False):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose columns are correlated with ``DataFrame.corr()``.
        label: figure title; with spaces replaced by underscores it is also
            the stem of the output file names.
        topdir: directory the images are written to; a trailing path
            separator is optional.
        save: when True, write '<label>.png' and '<label>.pdf' to topdir.
    """
    # A 'Labels' column that takes a single value in this sample (the case
    # when the caller passes a per-class subset) has zero variance, so every
    # correlation involving it is NaN. Leave it out rather than render an
    # empty row and column. The caller's frame is not modified.
    if 'Labels' in df.columns and df['Labels'].nunique() <= 1:
        df = df.drop(columns=['Labels'])

    plt.figure(figsize=(10, 10))
    plt.title(label)

    ax = sns.heatmap(df.corr(), cmap='coolwarm', vmax=1.0, vmin=-1.0,
                     annot=True, square=True, fmt='.2g')

    # Flip the y axis so the first column is at the bottom.
    ax.invert_yaxis()

    if save:
        stem = os.path.join(topdir, label.replace(" ", "_"))
        plt.savefig(stem + ".png", bbox_inches='tight')
        plt.savefig(stem + ".pdf", bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
# Make correlation matrices, one per class (index 0, then index 1)
if plotVariables:
    for classIndex in (0, 1):
        Correlation(df[df['Labels']==params['signalDefs'][classIndex]],
                    params['labelNames'][classIndex] + ' Correlation Matrix',
                    plotsDir, save = True)

In [None]:
# Reference BDT with controlled hyperparams
# Fit on the training sample only; the test sample is kept for the
# evaluation cells below.
baseBDT.fit(xTrain,yTrain)

In [None]:
# Plot ROC curves
fig, ax = plt.subplots()
# metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2. Prefer RocCurveDisplay.from_estimator and fall back to the old helper
# only on installs that predate it.
if hasattr(getattr(metrics, 'RocCurveDisplay', None), 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(baseBDT, xTest, yTest, ax=ax)
else:
    metrics.plot_roc_curve(baseBDT, xTest, yTest, ax=ax)

plt.title("ROC Curves")
# Draw the x axis running from high to low values.
ax.invert_xaxis()
ax.legend()
ax.grid()

plt.savefig(plotsDir + '/' + "roc.png", bbox_inches='tight')
plt.savefig(plotsDir + '/' + "roc.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Plot Confusion Matrices
fig, ax = plt.subplots()
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer ConfusionMatrixDisplay.from_estimator and fall back to the
# old helper only on installs that predate it.
if hasattr(getattr(metrics, 'ConfusionMatrixDisplay', None), 'from_estimator'):
    metrics.ConfusionMatrixDisplay.from_estimator(baseBDT, xTest, yTest,
                                                  display_labels=params['labelNames'],
                                                  ax=ax, normalize='true')
else:
    metrics.plot_confusion_matrix(baseBDT, xTest, yTest, display_labels=params['labelNames'],
                                  ax=ax, normalize='true')
# Reverse the order of the predicted-label axis.
ax.invert_xaxis()
plt.title("Confusion matrix (True Normalised)")

plt.savefig(plotsDir + '/' + "confusion_matrix.png", bbox_inches='tight')
plt.savefig(plotsDir + '/' + "confusion_matrix.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Print more detailed performance info
# Class predictions for the held-out test sample.
bdtPredicted = baseBDT.predict(xTest)

# State which label value is which class, since the report below lists the
# classes by label value rather than by name.
print ("Background (0): ", params['labelNames'][0])
print ("Signal (1): ", params['labelNames'][1])
print ("BDT:\n", metrics.classification_report(yTest, bdtPredicted))

In [None]:
# Plot importance of features
# Features are identified by their column index in the feature matrix.
importanceDF = pd.DataFrame({
    'Features': range(len(features[0])),
    'Importance Score': baseBDT.feature_importances_,
})

# Print the table in ascending order of importance, then draw it in the same
# order as a horizontal bar chart.
print (importanceDF.sort_values(by='Importance Score'))
ax = (
    importanceDF
    .sort_values(by='Importance Score')
    .plot.barh(x='Features', y='Importance Score')
)

plt.savefig(plotsDir + '/' + "feature_importance.png", bbox_inches='tight')
plt.savefig(plotsDir + '/' + "feature_importance.pdf", bbox_inches='tight')

In [None]:
import scipy.stats as sci

def PlotBdtScores2(bdtModel, X_test, Y_test, X_train, Y_train, title, parameters, topDir, save=False):
    """Draw an overtraining check comparing train and test BDT score distributions.

    For each class listed in ``parameters['SignalDefinition']`` the training
    scores are drawn as a filled, density-normalised histogram and the test
    scores as points at the bin centres, both over the fixed range (-1, 1).
    The figure is annotated with the signal efficiency and background
    rejection at ``parameters['OptimalBinCut']``, two-sample KS statistics and
    p-values (test vs. train, for labels 1 and 0) and ``bdtModel.score`` on
    the test sample. The KS results and an '&'-separated summary row are
    printed afterwards.

    Args:
        bdtModel: fitted classifier exposing ``decision_function`` and ``score``.
        X_test, Y_test: test features and their true labels (1 or 0).
        X_train, Y_train: training features and their true labels (1 or 0).
        title: appended to the figure title; with spaces replaced by
            underscores it is also the stem of the output file names.
        parameters: dict providing 'SignalDefinition', 'ClassNames',
            'PlotColors' (parallel lists), 'nBins', 'OptimalBinCut' (bin index)
            and 'OptimalScoreCut'.
        topDir: directory the images are written to.
        save: when True, write '<title>.pdf' and '<title>.png' to topDir.
    """
    # Coerce the labels so the boolean masks below also work for plain lists.
    Y_test = np.asarray(Y_test)
    Y_train = np.asarray(Y_train)

    # Testing BDT Using Remainder of Training Sample
    test_results = bdtModel.decision_function(X_test)
    train_results = bdtModel.decision_function(X_train)

    test_results_signal = test_results[Y_test == 1]
    train_results_signal = train_results[Y_train == 1]
    test_results_background = test_results[Y_test == 0]
    train_results_background = train_results[Y_train == 0]

    fig, ax = plt.subplots()

    ax.set_title('Overtraining Test: ' + title)

    sigEff = 0
    bkgRej = 0

    for i, n, g in zip(parameters['SignalDefinition'], parameters['ClassNames'], parameters['PlotColors']):
        # Training sample: filled, normalised histogram.
        ax.hist(train_results[Y_train == i],
                bins=parameters['nBins'],
                range=(-1, 1),
                facecolor=g,
                label='%s' % n,
                alpha=.5,
                density=True,
                edgecolor='k')

        # Test sample: identical binning, drawn as points at the bin centres.
        counts, bin_edges = np.histogram(test_results[Y_test == i],
                                         range=(-1, 1), bins=parameters['nBins'], density=True)

        bin_centres = (bin_edges[:-1] + bin_edges[1:])/2.
        ax.errorbar(bin_centres, counts, fmt='o', color=g)

        # All bins have the same width, so ratios of summed densities equal
        # ratios of entries inside the histogram range.
        if i == 1:
            # Fraction of test signal at or above the cut bin.
            sigEff = counts[parameters['OptimalBinCut']:].sum() / counts.sum()
        elif i == 0:
            # Fraction of test background below the cut bin.
            bkgRej = counts[:parameters['OptimalBinCut']].sum() / counts.sum()

    signalKSTest, ksSig = sci.ks_2samp(
        test_results_signal, train_results_signal)
    backgroundKSTest, ksBck = sci.ks_2samp(
        test_results_background, train_results_background)

    score = bdtModel.score(X_test,Y_test)

    # float(): the '{:.2}' precision spec is rejected for integer values.
    scoreCut = float(parameters['OptimalScoreCut'])

    plt.text(0.88, 0.5, "Sig Eff: {:.2%}\nBkg Rej: {:.2%}\nScore Cut: {:.2}\n\nSig KS: {:.2}\nBack KS: {:.2}\nSig P: {:.2}\nBck P: {:.2}\n\nScore: {:.4} "
             .format(sigEff, bkgRej, scoreCut, signalKSTest, backgroundKSTest, ksSig, ksBck, score),
             horizontalalignment='center',
             verticalalignment='center',
             transform=ax.transAxes)

    # Add 10% headroom above the tallest bin for the legend.
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, y1, y2 * 1.1))
    plt.legend(loc='upper right')
    plt.ylabel('Samples')
    plt.xlabel('Score')
    plt.tight_layout()

    if save:
        stem = os.path.join(topDir, title.replace(" ", "_"))
        plt.savefig(stem + '.pdf')
        plt.savefig(stem + '.png')

    plt.show()
    plt.close()

    print("KS Signal:     "+str(signalKSTest)+" with P value: "+str(ksSig))
    print("KS BackGround: "+str(backgroundKSTest)+" with P value: "+str(ksBck))

    # Only the fixed template goes through str.format; the title-derived label
    # is concatenated afterwards so braces in the title cannot break (or be
    # interpreted by) the formatting.
    rowLabel = title.replace("Vertex Vertex ","").replace("Vertex Region ","").replace("_","").replace(" ","")
    rowValues = ' & {score:.4} & {signalKSTest:.2} (p={ksSig:.2}) & {backgroundKSTest:.2} (p={ksBck:.2})'.format(
        score=score*100, signalKSTest=signalKSTest, ksSig=ksSig, backgroundKSTest=backgroundKSTest, ksBck=ksBck)
    print(rowLabel + rowValues)

In [None]:
import PandoraBDT
reload (PandoraBDT)
from PandoraBDT import *

# Settings for the significance-cut search and the overtraining plot.
# 'SignalDefinition', 'ClassNames' and 'PlotColors' are parallel lists. The
# names and colours are taken from `params` so the legend matches this
# notebook's classes (label 1 -> Track, label 0 -> Shower) instead of the
# vertex-selection names, and the tree settings mirror the values baseBDT was
# actually built with.
parameters = {
  'ClassNames': [params['labelNames'][1], params['labelNames'][0]],
  'SignalDefinition': [1, 0],
  'PlotColors': [params['signalCols'][1], params['signalCols'][0]],
  'nBins': 100,
  'PlotStep': 1.0,
  'OptimalBinCut': 0,
  'OptimalScoreCut': 0.0,
  'nTrees': nTrees,
  'TreeDepth': maxDepth
}

# FindOptimalSignificanceCut is presumably expected to fill in the
# 'OptimalBinCut' / 'OptimalScoreCut' entries used by the plot -- confirm.
FindOptimalSignificanceCut(baseBDT, xTest, yTest, parameters)
# NOTE(review): the 'Vertex Region ' title prefix is kept as-is because the
# output file names are derived from the title and PlotBdtScores2 strips this
# prefix from its printed summary row.
PlotBdtScores2(baseBDT, xTest, yTest, xTrain, yTrain, 'Vertex Region ' + version, parameters, plotsDir, True)