In [None]:
# Notebook set-up: lower the process priority, make the LArMachineLearningData
# scripts importable, pull in the scikit-learn tools and set global options.
import os
import sys

# Nice the process so it can run with lots of cores on low priority
os.nice(20)

# Locate LArMachineLearningData below $MY_TEST_AREA. os.path.join is used so the
# path is correct whether or not MY_TEST_AREA ends with a separator; the empty
# final component keeps the trailing separator that later string concatenation
# (pandoraMVADir + '<file name>') relies on.
pandoraMVADir = os.path.join(os.environ['MY_TEST_AREA'], 'LArMachineLearningData', '')
pandoraScriptsDir = os.path.join(pandoraMVADir, 'scripts')

# Only extend sys.path once, so re-running this cell does not pile up duplicates
if pandoraScriptsDir not in sys.path:
    sys.path.append(pandoraScriptsDir)

# The star import supplies the helper functions used throughout the notebook as
# well as the np / pd / plt / sns and classifier names the later cells use.
from PandoraBDT import *

# Import relevant SKLearn stuff
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics

# Set global params
testTrainFraction = 0.5  # fraction handed to Sample() when splitting train/test
nCores = -1              # n_jobs value for scikit-learn (-1 = use all cores)

In [None]:
# Analysis-specific settings: input file, output name, feature names, plotting
# options and the reference classifier / cross-validation scheme.
trainingFile = pandoraMVADir + 'SVM_training_data_pfocharacterisation_example.txt'
BDTName = "PFOCharBDT"

# Display names of the input features; entry i labels column i of the feature
# array when the DataFrame is built.
featureNames = [
    'Length',
    'Straight Line Diff Mean',
    'Max Fit Gap Length',
    'Sliding Linear Fit RMS',
    'Vertex Distance',
    'PCA Secondary-Primary EigenValue Ratio',
    'PCA Tertiary-Primary EigenValue Ratio',
    'Opening Angle Diff',
]

# Label names / values and plotting options passed to the plotting helpers,
# plus the hyperparameters (nTrees, TreeDepth) of the reference BDT below.
params = dict(
    labelNames=['True Shower','True Track'],
    signalDefs=[0, 1],
    signalCols=['r', 'b'],
    nBins=100,
    PlotStep=1.0,
    OptimalBinCut=50,
    OptimalScoreCut=0.5,
    nTrees=100,
    TreeDepth=3,
    logY=False,
)

# Reference BDT: the grid search and validation curves vary its parameters and
# the later cells compare against it.
weakLearner = DecisionTreeClassifier(max_depth=params['TreeDepth'])
baseBDT = AdaBoostClassifier(weakLearner, algorithm='SAMME',
                             n_estimators=params['nTrees'], random_state=42)

# Five stratified 80/20 shuffle splits shared by every cross-validated scan
# (fixed seed for reproducibility)
cv = StratifiedShuffleSplit(test_size=0.2, n_splits=5, random_state=42)

In [None]:
# Load the comma-separated training file, split it into features and labels
# and shuffle the examples using the PandoraBDT helpers.
data, nFeatures, nExamples = LoadData(trainingFile, ',')
featuresOrg, labelsOrg = SplitTrainingSet(data, nFeatures)
features, labels = Randomize(featuresOrg, labelsOrg, True)

# Split into train and test samples
xTrain, yTrain, xTest, yTest = Sample(features, labels, testTrainFraction)

# Split into signal and background based on the true labels
signalFeatures = features[labels==1]
backgroundFeatures = features[labels==0]

# Summarise what was loaded
print (len(featureNames))
print (np.shape(features))
print('Total: '+str(len(features))+', signal: '+
      str(len(signalFeatures))+' and background: '+
      str(len(backgroundFeatures)))

# Enforce (rather than just print) that there is exactly one name per feature
# column: a mismatch would otherwise mislabel or drop columns when the
# DataFrame is built, or only fail much later in the feature-importance plot.
nFeatureColumns = np.shape(features)[1]
if len(featureNames) != nFeatureColumns:
    raise ValueError('featureNames has ' + str(len(featureNames)) +
                     ' entries but the training data has ' +
                     str(nFeatureColumns) + ' feature columns')

In [None]:
# Collect one named column per feature, followed by the true labels, and wrap
# the result in a single pandas DataFrame used by the plotting cells.
allDict = {}
for iFeature in range(nFeatures):
    allDict[featureNames[iFeature]] = features[:, iFeature]
allDict['Labels'] = labels

df = pd.DataFrame(data=allDict)

In [None]:
# Plot the input variables for signal and background using the DrawVariablesDF
# helper, which receives the full DataFrame (features + 'Labels') and the
# label / plotting options held in params.
DrawVariablesDF(df, params)

In [None]:
# Correlation matrices, one per true label. The 'Labels' column is removed
# first so that only the features are correlated with each other.
# Note: dfSig holds signalDefs[0] and dfBck holds signalDefs[1]; the plot
# titles come from the matching labelNames entries.
firstLabelMask = df['Labels'] == params['signalDefs'][0]
secondLabelMask = df['Labels'] == params['signalDefs'][1]

dfSig = df.loc[firstLabelMask].drop(columns='Labels')
dfBck = df.loc[secondLabelMask].drop(columns='Labels')

firstTitle = params['labelNames'][0] + ' Correlation Matrix'
secondTitle = params['labelNames'][1] + ' Correlation Matrix'

CorrelationDF(dfSig, firstTitle)
CorrelationDF(dfBck, secondTitle)

In [None]:
# Joint distribution of two chosen variables, coloured by true label.
# Change xMetric / yMetric to compare a different pair of columns.
xMetric = 'Vertex Distance'
yMetric = 'PCA Secondary-Primary EigenValue Ratio'

# Restrict each axis to the central 2%-98% quantile range so that outliers
# do not dominate the view
xLimits = (np.quantile(df[xMetric], 0.02), np.quantile(df[xMetric], 0.98))
yLimits = (np.quantile(df[yMetric], 0.02), np.quantile(df[yMetric], 0.98))

sns.jointplot(data=df, x=xMetric, y=yMetric, hue='Labels', xlim=xLimits, ylim=yLimits)

In [None]:
# Plot every pairwise combination of the DataFrame columns, coloured by the
# 'Labels' column. The number of panels grows with the square of the number of
# variables, so this is slow and hard to read when there are many of them.
sns.pairplot(df, hue='Labels')

In [None]:
# Size of the hyperparameter grid: number of tree depths and number of
# ensemble sizes to try
depthRange = 3
treeRange = 3

# Depths 1..depthRange in unit steps; tree counts in powers of ten
# (1, 10, ..., 10**(treeRange-1)).
# A linear alternative would be:
#   np.linspace(100, 100*treeRange, treeRange, dtype=int)
depthArray = np.linspace(start=1, stop=depthRange, num=depthRange, dtype=int)
treeArray = np.logspace(start=0, stop=treeRange-1, num=treeRange, dtype=int)

# Print arrays for debugging
print ("Depth Array:", depthArray)
print ("Tree Array: ", treeArray)

# Parameter grid: depth of the weak learner against number of boosted trees
paramGrid = {
    'base_estimator__max_depth': depthArray,
    'n_estimators': treeArray,
}

# Configure (but do not yet run) the cross-validated grid search; the best
# model is refitted and the training scores are kept for the overtraining plots
grid = GridSearchCV(baseBDT, param_grid=paramGrid, cv=cv, n_jobs=nCores,
                    refit=True, return_train_score=True, verbose=9)


In [None]:
# Fit every point of the grid on the training sample
grid.fit(xTrain, yTrain)

bestResult = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.2f"% bestResult)

# Collect the per-point results in a DataFrame, giving the two scanned
# parameters short column names for the pivot tables that follow
gridResults = pd.DataFrame(grid.cv_results_).rename(columns={
    "param_base_estimator__max_depth": "MaxDepth",
    "param_n_estimators": "NTrees",
})

In [None]:
# Reshape the flat grid-search results into MaxDepth x NTrees tables.
# Keyword arguments are required here: pandas >= 2.0 no longer accepts
# positional arguments to DataFrame.pivot.
testScores = gridResults.pivot(index="MaxDepth", columns="NTrees", values="mean_test_score")
testStd = gridResults.pivot(index="MaxDepth", columns="NTrees", values="std_test_score")
trainScores = gridResults.pivot(index="MaxDepth", columns="NTrees", values="mean_train_score")

# Gap between training and validation score, per grid point
trainTestDiff = trainScores - testScores


def _drawScoreHeatmap(scoreTable, title, outputFile):
    """Draw one annotated heatmap of a MaxDepth x NTrees table and save it.

    scoreTable -- pivoted DataFrame (rows: MaxDepth, columns: NTrees)
    title      -- title drawn above the heatmap
    outputFile -- file name handed to plt.savefig
    """
    plt.figure(figsize=(4, 4), constrained_layout=True)
    sns.heatmap(scoreTable, cmap='bwr', linewidths=0, annot=True)
    plt.title(title)
    # Flip the y axis so that MaxDepth increases upwards
    plt.gca().invert_yaxis()
    plt.savefig(outputFile)
    plt.show()


_drawScoreHeatmap(testScores, 'Validation accuracy: Test', 'TestScores.pdf')
# This plot shows the std of the *test* score; it was previously saved under the
# misleading name 'TrainStds.pdf'.
_drawScoreHeatmap(testStd, 'Validation accuracy: Std Test Score', 'TestStds.pdf')
_drawScoreHeatmap(trainScores, 'Validation accuracy: Train', 'TrainScores.pdf')
_drawScoreHeatmap(trainTestDiff, 'Validation accuracy: Train Test Diff', 'TrainTestDiff.pdf')

In [None]:
# Train the reference BDT (fixed hyperparameters taken from params) on the
# training sample, so it can be compared with the grid-search result below.
baseBDT.fit(xTrain,yTrain)

In [None]:
# Overlay the test-sample ROC curves of the grid-search result and the
# reference BDT on a single set of axes.
# NOTE(review): metrics.plot_roc_curve only exists in older scikit-learn
# releases (removed in 1.2); RocCurveDisplay.from_estimator is the replacement
# if the environment is ever upgraded.
fig, ax = plt.subplots()
for classifier in (grid, baseBDT):
    metrics.plot_roc_curve(classifier, xTest, yTest, ax=ax)

plt.title("ROC Curves")
# Draw the x axis reversed
ax.invert_xaxis()
ax.legend()
ax.grid()

In [None]:
# Confusion matrix of the grid-search result on the test sample, with every
# row normalised to the number of true examples of that class.
# NOTE(review): metrics.plot_confusion_matrix only exists in older scikit-learn
# releases (removed in 1.2); ConfusionMatrixDisplay.from_estimator replaces it.
fig, ax = plt.subplots()
confusionOptions = dict(display_labels=params['labelNames'], normalize='true', ax=ax)
metrics.plot_confusion_matrix(grid, xTest, yTest, **confusionOptions)

# Reverse the order of the predicted-label axis
ax.invert_xaxis()
plt.title("Confusion matrix (True Normalised)")
plt.show()

In [None]:
# Per-class precision / recall / f1 on the test sample for the reference BDT
# and for the best grid-search model
bdtPredicted = baseBDT.predict(xTest)
gridPredicted = grid.predict(xTest)

# Remind the reader which label value corresponds to which class name
for labelIndex, labelHeading in enumerate(("Background (0): ", "Signal (1): ")):
    print (labelHeading, params['labelNames'][labelIndex])

for reportHeading, predictedLabels in (("BDT:\n", bdtPredicted), ("Grid:\n", gridPredicted)):
    print (reportHeading, metrics.classification_report(yTest, predictedLabels))

In [None]:
# Cross-validated score of the reference BDT as a function of the number of
# training examples. The first point of the linspace (a fraction of 0.0) is
# dropped because an empty training set is not a valid size.
train_sizes_array = np.linspace(0.0,1, 20)

train_sizes, train_scores, test_scores = learning_curve(
    baseBDT, features, labels, cv=cv,
    train_sizes=train_sizes_array[1:], n_jobs=nCores, verbose=9)

# Average and spread over the cross-validation splits (one row per size)
mean_train_scores = train_scores.mean(axis=1)
std_train_scores = train_scores.std(axis=1)

mean_test_scores = test_scores.mean(axis=1)
std_test_scores = test_scores.std(axis=1)

In [None]:
# Draw the learning curve: mean score as a line with a +/- one standard
# deviation band, for the training and the validation splits
fig, ax = plt.subplots()
plt.title("Training Progression")
plt.xlabel("Number of Training Examples")
plt.ylabel("Score")

for meanScores, stdScores, curveLabel, curveColour in (
        (mean_train_scores, std_train_scores, 'Train Score', 'b'),
        (mean_test_scores, std_test_scores, 'Test Score', 'r')):
    plt.plot(train_sizes, meanScores, label=curveLabel, color=curveColour)
    plt.fill_between(train_sizes, meanScores - stdScores, meanScores + stdScores,
                     alpha=0.1, color=curveColour)

plt.grid()
plt.legend()

In [None]:
# Validation curve over the ccp_alpha parameter of the weak learner:
# 11 evenly spaced values from 0 to 0.001
cppalplhaArray = np.linspace(0,0.001,11)

train_scores, test_scores = validation_curve(
    baseBDT, features, labels, cv=cv,
    param_name='base_estimator__ccp_alpha', param_range=cppalplhaArray,
    n_jobs=nCores, verbose=9)

# Average and spread over the cross-validation splits (one row per value)
mean_train_scores = train_scores.mean(axis=1)
std_train_scores = train_scores.std(axis=1)

mean_test_scores = test_scores.mean(axis=1)
std_test_scores = test_scores.std(axis=1)

print ("Means: "+str(mean_test_scores)+" and std. "
       +str(std_test_scores))

In [None]:
# Plot the ccp_alpha scan: mean score with a +/- one standard deviation band
# for the training and validation splits.
# (Call plt.xscale('log') before the legend for a logarithmic x axis.)
for meanScores, stdScores, curveLabel, curveColour in (
        (mean_train_scores, std_train_scores, 'Train Score', 'b'),
        (mean_test_scores, std_test_scores, 'Test Score', 'r')):
    plt.plot(cppalplhaArray, meanScores, label=curveLabel, color=curveColour)
    plt.fill_between(cppalplhaArray, meanScores - stdScores, meanScores + stdScores,
                     alpha=0.1, color=curveColour)

plt.grid()
plt.legend()

In [None]:
# Validation curve over the boosting learning rate:
# 15 evenly spaced values from 0.1 to 1.5
learningRateArray = np.linspace(0.1,1.5,15)

train_scores, test_scores = validation_curve(
    baseBDT, features, labels, cv=cv,
    param_name='learning_rate', param_range=learningRateArray,
    n_jobs=nCores, verbose=9)

# Average and spread over the cross-validation splits (one row per value)
mean_train_scores = train_scores.mean(axis=1)
std_train_scores = train_scores.std(axis=1)

mean_test_scores = test_scores.mean(axis=1)
std_test_scores = test_scores.std(axis=1)

print ("Means: "+str(mean_test_scores)+" and std. "
       +str(std_test_scores))

In [None]:
# Plot the learning-rate scan: mean score with a +/- one standard deviation
# band for the training and validation splits.
# (Call plt.xscale('log') before the legend for a logarithmic x axis.)
for meanScores, stdScores, curveLabel, curveColour in (
        (mean_train_scores, std_train_scores, 'Train Score', 'b'),
        (mean_test_scores, std_test_scores, 'Test Score', 'r')):
    plt.plot(learningRateArray, meanScores, label=curveLabel, color=curveColour)
    plt.fill_between(learningRateArray, meanScores - stdScores, meanScores + stdScores,
                     alpha=0.1, color=curveColour)

plt.grid()
plt.legend()

In [None]:
# Rank the input features by the importance score of the fitted reference BDT,
# print the table and draw it as a horizontal bar chart (least important first)
importanceDF = pd.DataFrame({'Features': featureNames, 'Importance Score':baseBDT.feature_importances_})

rankedImportance = importanceDF.sort_values(by=['Importance Score'])
print (rankedImportance)
ax = rankedImportance.plot(kind='barh', x='Features', y='Importance Score')

In [None]:
# Show the names of all parameters of the reference BDT (including the nested
# weak-learner ones); these are the names accepted as param_name in the
# validation_curve scans and as keys of the grid-search parameter grid.
baseBDT.get_params().keys()

In [None]:
# Re-import PandoraBDT so that edits made to the module while the kernel is
# running are picked up, then refresh the star-imported names.
import PandoraBDT
from importlib import reload

reload (PandoraBDT)
from PandoraBDT import *

# Print the shapes of the test and train samples before plotting
for shapeCheckArray in (xTest, yTest, xTrain, yTrain):
    print (np.shape(shapeCheckArray))

# Compare the reference BDT's response on the test and train samples.
# NOTE(review): the 'Vertex Region' argument looks like it was carried over
# from a different BDT notebook -- confirm it is the intended title here.
PlotBdtKSScores(baseBDT, xTest, yTest, xTrain, yTrain, 'Vertex Region', params)

In [None]:
# Persist the trained reference BDT twice, using the PandoraBDT helpers:
# as <BDTName>.xml and as <BDTName>.pkl
xmlFileName = BDTName+".xml"
pklFileName = BDTName+".pkl"

WriteXmlFile(xmlFileName, baseBDT, BDTName)
SerializeToPkl(pklFileName, baseBDT)