# Multi GS Classifier using Random Forests

### Attempt to classifier GS events with a Random Forest

### Labels: 0 = noGS, 1 = GSb , 2 = GSbb

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('/Users/rohan/research/gs_classifier/output_multigs.csv')
print(df.shape)

feats = df.iloc[:,0:12]
y = df.isgs

(164656, 13)


In [3]:
print(feats.shape)
feats.head(5)

(164656, 12)


Unnamed: 0,weight,dr_bb,bb_pt,bb_m,jet1_csv,jet1_pt,jet1_eta,jet1_phi,jet2_csv,jet2_pt,jet2_eta,jet2_phi
0,0.068587,0.947933,806.513,418.969666,0.970258,469.232178,-1.552434,0.988524,0.906826,340.613373,-0.6225,1.172366
1,0.065783,1.710818,345.10672,202.336594,0.993821,47.740227,1.110587,-1.902524,0.958397,346.806488,0.763946,-0.227192
2,0.06915,0.821434,699.727417,307.803986,0.996421,344.196381,0.981522,-0.86086,0.901411,418.045929,1.034307,-0.041124
3,0.069805,1.968648,235.376617,302.447571,0.987,153.347351,0.517652,0.096551,0.862928,140.702484,-0.972188,1.383389
4,0.068559,1.061551,743.231018,418.841614,0.907696,374.111206,0.631812,1.520363,0.853196,370.016937,-0.425185,1.618583


In [4]:
print(y.shape)
y.head(5)

(164656,)


0    2
1    2
2    0
3    1
4    0
Name: isgs, dtype: int64

## Compute a baseline metric
### Here using a simple cut on the dr_bb distribution
#### In previous iteration of the RPV search we used dr_bb < 1.6 to classify gs

In [114]:
print("# of GS predicted as GS:         {}").format(
    round(float(sum(df[df.dr_bb<1.6].weight*(df[df.dr_bb<1.6].isgs>0))),3))
print("# of GS not predicted as GS:     {}").format(
    round(float(sum(df[df.dr_bb>=1.6].weight*(df[df.dr_bb>=1.6].isgs>0))),3))
print("# of not GS predicted as GS:     {}").format(
    round(float(sum(df[df.dr_bb<1.6].weight*(df[df.dr_bb<1.6].isgs==0))),3))
print("# of not GS predicted as not GS: {}").format(
    round(float(sum(df[df.dr_bb>=1.6].weight*(df[df.dr_bb>=1.6].isgs==0))),3))

# of GS predicted as GS:         345.099
# of GS not predicted as GS:     313.982
# of not GS predicted as GS:     314.699
# of not GS predicted as not GS: 417.843


In [123]:
cm_cut = np.ndarray((2,2), buffer=np.array([345.099, 313.982, 314.699, 417.843]))
print("Accuracy: {}").format((cm_cut.trace()/cm_cut.sum()).round(3))
print("Confusion matrix:")
print(cm_cut)

print("\n Normalized confusion matrix:")
print((cm_cut/cm_cut.sum(axis=1)[:, np.newaxis]).round(3))

Accuracy: 0.548
Confusion matrix:
[[ 345.099  313.982]
 [ 314.699  417.843]]

 Normalized confusion matrix:
[[ 0.524  0.476]
 [ 0.43   0.57 ]]


## Split data in train, test, and validation sets

In [5]:
from sklearn.model_selection import train_test_split
feats_train, feats_test, y_train, y_test = train_test_split(feats, y, train_size=0.6, random_state=354)
feats_test, feats_val, y_test, y_val = train_test_split(feats_test, y_test, train_size=0.5, random_state=3874)

weights_train, X_train = feats_train.weight, feats_train.iloc[:,1:12]
weights_test , X_test  = feats_test.weight ,  feats_test.iloc[:,1:12]
weights_val  , X_val   = feats_val.weight  ,  feats_val.iloc[:,1:12]

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Configure RF
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train, sample_weight=weights_train.values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [21]:
# Make predictions
pred_train = rf.predict(X_train)
pred_test = rf.predict(X_test)

In [23]:
# Collapse confusion matrix into noGS vs GS
def get_cm_gs(mat, norm=False):
    if mat.shape != (3,3): print("Matrix shape must be (3,3)")
        
    cm_gs = mat
    cm_gs[1,:]+=cm_gs[2,:]
    cm_gs[:,1]+=cm_gs[:,2]
    cm_gs = cm_gs[:2,:2]
    
    if norm:
        return(cm_gs/cm_gs.sum(axis=1)[:, np.newaxis])
    else:
        return(cm_gs)

In [25]:
from sklearn.metrics import confusion_matrix

#Print results
# Train accuracy
cm_train = confusion_matrix(y_train, pred_train, sample_weight=weights_train.values)
print("Train Multi-GS Accuracy: {}").format((cm_train.trace()/cm_train.sum()).round(3))
print(cm_train/cm_train.sum(axis=1)[:, np.newaxis])
print("\n")

cm_gs_train = get_cm_gs(cm_train)
print("Train GS Accuracy: {}").format((cm_gs_train.trace()/cm_gs_train.sum()).round(3))
print(cm_gs_train/cm_gs_train.sum(axis=1)[:, np.newaxis])
print("\n")

# Test accuracy
cm_test = confusion_matrix(y_test, pred_test, sample_weight=weights_test.values)
print("Test Accuracy: {}").format((cm_test.trace()/cm_test.sum()).round(3))
print(cm_test/cm_test.sum(axis=1)[:, np.newaxis])
print("\n")

cm_gs_test = get_cm_gs(cm_test)
print("Train GS Accuracy: {}").format((cm_gs_test.trace()/cm_gs_test.sum()).round(3))
print(cm_gs_test/cm_gs_test.sum(axis=1)[:, np.newaxis])

Train Multi-GS Accuracy: 1.0
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]


Train GS Accuracy: 1.0
[[ 1.  0.]
 [ 0.  1.]]


Test Accuracy: 0.581
[[ 0.73542835  0.17566554  0.0889061 ]
 [ 0.5338034   0.37625723  0.08993938]
 [ 0.40422675  0.11013163  0.48564162]]


Train GS Accuracy: 0.627
[[ 0.73542835  0.26457165]
 [ 0.49043536  0.50956464]]


### Now regularize

In [142]:
from sklearn.model_selection import GridSearchCV

rf_grid = RandomForestClassifier()

params = {'max_depth' : range(1,5),
          'min_impurity_split' : range(0,30,5),
          'criterion' : ['gini','entropy'],
          'n_estimators' : [100],
          'n_jobs' : [2]
         }

grid_search = GridSearchCV(estimator=rf_grid,
                           param_grid=params)


In [166]:
rf_grids = []
for ix, depth in enumerate(range(2,17,2)):
    for iy, impurity in enumerate(np.arange(0,0.4,0.1)):
    
        print("({}, {}): DEPTH = {}, IMPURITY = {}").format(ix, iy,depth,impurity)

        rf_grid = RandomForestClassifier(n_estimators=500, n_jobs=3, max_depth=depth, min_impurity_split=impurity)
        rf_grid.fit(X_train, y_train, sample_weight=weights_train.values)

        rf_grids.append(rf_grid)
        
        # Print results
        # Train accuracy
#        cm_train = confusion_matrix(y_train, rf_grid.predict(X_train), sample_weight=weights_train.values)
#        print("Train Multi-GS Accuracy: {}").format((cm_train.trace()/cm_train.sum()).round(3))
#        print(cm_train/cm_train.sum(axis=1)[:, np.newaxis])
#        print("\n")

        cm_gs_train = get_cm_gs(cm_train)
        print("Train GS Accuracy: {}").format((cm_gs_train.trace()/cm_gs_train.sum()).round(3))
#        print(cm_gs_train/cm_gs_train.sum(axis=1)[:, np.newaxis])
#        print("\n")

        # Test accuracy
        cm_test = confusion_matrix(y_test, rf_grid.predict(X_test), sample_weight=weights_test.values)
#        print("Test Accuracy: {}").format((cm_test.trace()/cm_test.sum()).round(3))
#        print(cm_test/cm_test.sum(axis=1)[:, np.newaxis])
#        print("\n")

        cm_gs_test = get_cm_gs(cm_test)
        print("Test GS Accuracy: {}").format((cm_gs_test.trace()/cm_gs_test.sum()).round(3))
        print(cm_gs_test/cm_gs_test.sum(axis=1)[:, np.newaxis])
        print('\n')

(0, 0): DEPTH = 2, IMPURITY = 0.0
Train GS Accuracy: 0.917
Test GS Accuracy: 0.52
[[ 1.  0.]
 [ 1.  0.]]


(0, 1): DEPTH = 2, IMPURITY = 0.1
Train GS Accuracy: 0.92
Test GS Accuracy: 0.52
[[ 1.  0.]
 [ 1.  0.]]


(0, 2): DEPTH = 2, IMPURITY = 0.2
Train GS Accuracy: 0.924
Test GS Accuracy: 0.52
[[ 1.  0.]
 [ 1.  0.]]


(0, 3): DEPTH = 2, IMPURITY = 0.3
Train GS Accuracy: 0.926
Test GS Accuracy: 0.52
[[ 1.  0.]
 [ 1.  0.]]


(1, 0): DEPTH = 4, IMPURITY = 0.0
Train GS Accuracy: 0.929
Test GS Accuracy: 0.589
[[ 0.87510037  0.12489963]
 [ 0.72094464  0.27905536]]


(1, 1): DEPTH = 4, IMPURITY = 0.1
Train GS Accuracy: 0.932
Test GS Accuracy: 0.593
[[ 0.86598449  0.13401551]
 [ 0.70376765  0.29623235]]


(1, 2): DEPTH = 4, IMPURITY = 0.2
Train GS Accuracy: 0.934
Test GS Accuracy: 0.591
[[ 0.86758472  0.13241528]
 [ 0.70864006  0.29135994]]


(1, 3): DEPTH = 4, IMPURITY = 0.3
Train GS Accuracy: 0.936
Test GS Accuracy: 0.59
[[ 0.87460089  0.12539911]
 [ 0.71881194  0.28118806]]


(2, 0): DEPTH 

### Conclusion

#### Using a random forest did not improve on the decision tree.
#### Random Forests are beneficial for two reasons 1) They reduce overfitting and 2) They force the trees to learn other secondary variables
#### But the original decision tree 1) was not overfitting  and 2) there are really only a few (non-correlated) variables that discriminate, i.e. dr_bb/bb_m, jet1_csv, and jet2_csv