# General Techniques
This notebook collects some general techniques applicable to all classifiers

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, roc_curve, auc
from scipy.stats import ks_2samp
from sklearn.utils import shuffle
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import Datasets
import Plotting
import Tools

In [None]:
# Default size (width, height in inches) for every figure drawn below.
# pyplot exposes the same rcParams object as the top-level matplotlib package.
plt.rcParams['figure.figsize'] = (10, 5)

### Output Distributions, ROC Curves and Overtraining
Let's consider again the spiral dataset but with the distributions overlapping more

In [None]:
# Independent training and testing samples, generated with identical settings
# per class (signal: a=0.2, s=0.0; background: a=-0.2, s=0.2).
sig_train = Datasets.gen_spiral(a=0.2, s=0.0, n=1000, w=0.15)
sig_test = Datasets.gen_spiral(a=0.2, s=0.0, n=1000, w=0.15)

bkg_train = Datasets.gen_spiral(a=-0.2, s=0.2, n=1000, w=0.15)
bkg_test = Datasets.gen_spiral(a=-0.2, s=0.2, n=1000, w=0.15)

# Boosted decision tree: AdaBoost (SAMME) over 100 trees of depth 3
ABDT = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3),
    n_estimators=100,
    algorithm="SAMME",
)

# Only the training samples are passed to the training helper
Tools.train_mva(ABDT, sig_train, bkg_train)

In [None]:
# Evaluate the trained BDT on the held-out testing samples
# (decision boundary drawn together with the test points)
Tools.evaluate_mva(ABDT, sig_test, bkg_test)
plt.show()

In [None]:
def plot_output(clf,sig,bkg,sample=None):
    """Plot normalised histograms of the classifier output for signal and background.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``decision_function``.
    sig, bkg : objects with a ``.values`` attribute (e.g. pandas DataFrames)
        Signal and background events to evaluate.
    sample : str, optional
        Name of the sample being plotted (e.g. ``'train'`` or ``'test'``).
        When given it is appended to the legend labels.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_bins = 40

    sig_output = clf.decision_function(sig.values)
    bkg_output = clf.decision_function(bkg.values)

    # Shared range so that both histograms use identical bin edges
    d_min = min(sig_output.min(), bkg_output.min())
    d_max = max(sig_output.max(), bkg_output.max())

    # The legend must not claim 'train': this helper is sample-agnostic and is
    # also used on testing data. The caller names the sample explicitly.
    suffix = ' ' + sample if sample else ''

    plt.hist(bkg_output, bins=n_bins, range=(d_min, d_max), color='tab:orange',
             label='bkg' + suffix, alpha=0.6, density=True)
    plt.hist(sig_output, bins=n_bins, range=(d_min, d_max), color='tab:blue',
             label='sig' + suffix, alpha=0.6, density=True)

    plt.xlabel('Classifier output')
    plt.legend()
    plt.show()

    return

In [None]:
# Classifier output distributions for the testing samples
plot_output(ABDT, sig_test, bkg_test)

In [None]:
def calc_roc(clf,sig,bkg):
    """Draw the ROC curve of ``clf`` on the given samples and return its area.

    Signal rows are labelled 1 and background rows 0. The curve is drawn as
    signal efficiency (true-positive rate) on x against one minus background
    efficiency (false-positive rate) on y.

    Parameters
    ----------
    clf : fitted classifier exposing ``decision_function``.
    sig, bkg : objects with ``.values`` and ``.index`` (e.g. pandas DataFrames).

    Returns
    -------
    The value returned by ``auc(fpr, tpr)``.
    """
    # Stack signal first, then background, with matching truth labels
    features = np.concatenate((sig.values, bkg.values))
    truth = np.concatenate((np.ones(len(sig.index)), np.zeros(len(bkg.index))))

    scores = clf.decision_function(features)

    bkg_eff, sig_eff, _ = roc_curve(truth.ravel(), scores.ravel())
    roc_auc = auc(bkg_eff, sig_eff)

    plt.figure()
    plt.plot(sig_eff, 1 - bkg_eff, color='tab:orange', lw=2,
             label='ROC Curve (area = %0.2f)' % roc_auc)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Sig Eff')
    plt.ylabel('1 - Bkg Eff')
    plt.legend(loc='best')

    plt.show()

    return roc_auc

In [None]:
# ROC curve and area under it, evaluated on the testing samples
test_auc = calc_roc(ABDT, sig_test, bkg_test)
print("Test AUC: ", test_auc)

#### Identifying Overtraining
One way to potentially identify overtraining is to compare the MVA output distributions for the training and testing samples. Ideally these will be similar, i.e. the output of the MVA for the training dataset closely matches its output for the testing dataset.

In [None]:
def plot_compare_outputs(clf,sig_train,bkg_train,sig_test,bkg_test):
    """Overlay train and test classifier-output distributions and print KS tests.

    Training outputs are drawn as filled, normalised histograms. Testing
    outputs are drawn as points at the bin centres, using the same bin edges.
    A two-sample Kolmogorov-Smirnov test between the training and testing
    outputs is printed separately for signal and for background.

    Parameters
    ----------
    clf : fitted classifier exposing ``decision_function``.
    sig_train, bkg_train, sig_test, bkg_test : objects with a ``.values``
        attribute (e.g. pandas DataFrames) holding the events to evaluate.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_bins = 40

    sig_train_output = clf.decision_function(sig_train.values)
    bkg_train_output = clf.decision_function(bkg_train.values)

    sig_test_output = clf.decision_function(sig_test.values)
    bkg_test_output = clf.decision_function(bkg_test.values)

    # Range covers all four samples: np.histogram ignores values outside the
    # bin edges, so a train-only range would silently drop test events.
    all_outputs = (sig_train_output, bkg_train_output, sig_test_output, bkg_test_output)
    d_min = min(out.min() for out in all_outputs)
    d_max = max(out.max() for out in all_outputs)

    # Only the bin edges are needed from the training histograms
    _, bins, _ = plt.hist(bkg_train_output, bins=n_bins, range=(d_min, d_max),
                          color='tab:orange', label='bkg train', alpha=0.6, density=True)
    plt.hist(sig_train_output, bins=n_bins, range=(d_min, d_max),
             color='tab:blue', label='sig train', alpha=0.6, density=True)

    bin_centers = (bins[:-1] + bins[1:]) / 2
    sig_te, _ = np.histogram(sig_test_output, bins=bins, density=True)
    bkg_te, _ = np.histogram(bkg_test_output, bins=bins, density=True)

    plt.plot(bin_centers, bkg_te, 'o', c='tab:orange', label='bkg test', alpha=0.9, markeredgecolor='k')
    plt.plot(bin_centers, sig_te, 'o', c='tab:blue', label='sig test', alpha=0.9, markeredgecolor='k')

    # The KS test compares two samples of observations, so it must be given the
    # unbinned classifier outputs. Passing histogram bin heights (and mixing
    # the background-train histogram with the signal-test one) is not a valid
    # train/test comparison.
    print("KS sig (train vs test):", ks_2samp(sig_train_output, sig_test_output))
    print("KS bkg (train vs test):", ks_2samp(bkg_train_output, bkg_test_output))

    plt.xlabel('Classifier output')
    plt.legend()
    plt.show()

    return

The Kolmogorov-Smirnov (KS) test provides a test statistic to evaluate whether the two samples (training and testing) are drawn from the same underlying distribution.
You can consider the __p-value__ and compare it to a level of significance $\boldsymbol{\alpha}$ (we usually use $\boldsymbol{\alpha}$__=0.05__ or $\boldsymbol{\alpha}$__=0.02__): if the __p-value__ is less than $\boldsymbol{\alpha}$, we reject the hypothesis that the two samples come from the same distribution, i.e. it is very likely the two distributions are different.

In [None]:
# Train-vs-test comparison of the BDT output for both classes
plot_compare_outputs(ABDT, sig_train, bkg_train, sig_test, bkg_test)

### Cross-validation

In [None]:
def errorVsSize(clf,cv,sig,bkg,njobs,train_sizes=np.linspace(.1, 1.0, 10),random_state=None):
    """Plot a learning curve: cross-validated score versus number of training samples.

    Signal (label 1) and background (label 0) are stacked, shuffled and passed
    to ``learning_curve``. The mean score over the CV splits is drawn for the
    training and the test folds, with a band of +/- one standard deviation.

    Parameters
    ----------
    clf : estimator passed to ``learning_curve``.
    cv : cross-validation strategy passed to ``learning_curve``.
    sig, bkg : objects with ``.values`` and ``.index`` (e.g. pandas DataFrames).
    njobs : int
        Forwarded as ``n_jobs`` to ``learning_curve``.
    train_sizes : array-like, optional
        Forwarded as ``train_sizes`` to ``learning_curve``.
    random_state : optional
        Forwarded to ``shuffle`` so the event ordering can be made reproducible.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    X = np.concatenate( [sig.values,bkg.values] )
    y = np.concatenate( [np.ones(len(sig.index)),np.zeros(len(bkg.index))] )

    # The stacked array is ordered signal-then-background; shuffle so a
    # non-shuffling `cv` does not get single-class folds.
    X,y = shuffle(X,y,random_state=random_state)

    train_sizes, train_scores, test_scores = learning_curve(clf, X, y, cv=cv, n_jobs=njobs, train_sizes=train_sizes)

    # learning_curve returns scores (higher is better), i.e. the "1-Error"
    # shown on the y axis, so the names and legend say score rather than error.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    plt.xlabel("Training Samples")
    plt.ylabel("1-Error")

    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training Score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Test Score")

    plt.legend(loc="best")
    plt.show()

    return

In [None]:
# Fresh signal and background samples for the learning-curve study
sig = Datasets.gen_spiral(a=0.2, s=0.0, n=1000, w=0.15)
bkg = Datasets.gen_spiral(a=-0.2, s=0.2, n=1000, w=0.15)

# New, unfitted BDT with the same configuration as before (rebinds ABDT)
ABDT = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3),
    n_estimators=100,
    algorithm="SAMME",
)

# 20 random train/test splits, each holding out 20% of the events
cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
errorVsSize(ABDT, cv, sig, bkg, 4)