In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import uproot

from sklearn.metrics import roc_auc_score

In [2]:
# Open the training ROOT file (path is relative to the notebook's working
# directory) and take the object stored under the key 'tree'.
tree = uproot.open("global_train.root")['tree']

In [4]:
# Display the names available in the tree (returned as bytes objects).
tree.keys()

[b'VeloCharge',
 b'BremPIDe',
 b'CaloNeutralPrs',
 b'CaloNeutralSpd',
 b'InAccBrem',
 b'InAccSpd',
 b'CaloPrsE',
 b'InAccPrs',
 b'HcalPIDe',
 b'CaloHcalE',
 b'InAccHcal',
 b'CaloTrajectoryL',
 b'EcalPIDe',
 b'CaloNeutralEcal',
 b'CaloTrMatch',
 b'CaloElectronMatch',
 b'CaloChargedPrs',
 b'CaloChargedSpd',
 b'MuonNShared',
 b'CaloBremMatch',
 b'MuonIsLooseMuon',
 b'MuonIsMuon',
 b'MuonBkgLL',
 b'InAccEcal',
 b'MuonMuLL',
 b'TrackMatchChi2',
 b'TrackGhostProbability',
 b'TrackType',
 b'EcalPIDmu',
 b'TrackPt',
 b'TrackP',
 b'NumProtoParticles',
 b'RichAboveMuThres',
 b'NumCaloHypos',
 b'InAccMuon',
 b'TrackChi2PerDof',
 b'TrackNumDof',
 b'NumUpstreamTracks',
 b'NumLongTracks',
 b'NumDownstreamTracks',
 b'CaloEcalChi2',
 b'NumRich1Hits',
 b'NumMuonTracks',
 b'CombDLLmu',
 b'CaloSpdE',
 b'PrsPIDe',
 b'HcalPIDmu',
 b'NumSPDHits',
 b'NumVeloTracks',
 b'NumTTracks',
 b'TrackFitMatchChi2',
 b'CombDLLpi',
 b'RichDLLbt',
 b'TrackHistory',
 b'RichDLLpi',
 b'TrackCloneDist',
 b'TrackFitVeloNDoF',


In [7]:
# Read the full 'MCParticleType' column. The cells below compare its absolute
# value against 321 and 211, so it is presumably a signed PDG particle code
# -- TODO confirm against the dataset description.
pdg_id = tree.array('MCParticleType')

In [12]:
# Boolean mask over all entries: True where |MCParticleType| is 321 or 211.
# 321 is the code labelled as kaon further down (`is_k`); 211 is presumably
# the charged-pion PDG code -- confirm against the PDG numbering scheme.
sel = np.logical_or(np.abs(pdg_id) == 321, np.abs(pdg_id) == 211)

In [13]:
def get_feature(key):
    """Return the column `key` of `tree`, restricted to the entries in `sel`.

    Both `tree` and `sel` are notebook-level globals; every call reads the
    column from `tree` again and then applies the `sel` mask to it.
    """
    column = tree.array(key)
    return column[sel]

In [14]:
# Truth label for the selected entries: True where |MCParticleType| == 321,
# False for the remaining selected entries (those with |MCParticleType| == 211).
is_k = np.equal(np.abs(get_feature('MCParticleType')), 321)

In [17]:
from tqdm import tqdm_notebook as tqdm

In [25]:
# ROC AUC of every single raw feature used directly as a classifier score
# against `is_k`. Names containing b'MC' or b'Comb' are skipped (the b'MC'
# filter removes the 'MCParticleType' label itself; the reason for dropping
# b'Comb' names is not stated here -- presumably they are already-combined
# PID variables, TODO confirm).
scores = {
    name: roc_auc_score(is_k, get_feature(name))
    for name in tqdm(tree.keys())
    if not (b'MC' in name or b'Comb' in name)
}

HBox(children=(IntProgress(value=0, max=107), HTML(value='')))

In [29]:
# Rank features by how far their AUC is from chance in either direction
# (an AUC of 0.1 separates as well as 0.9, just with the sign flipped) and keep
# the 20 strongest. The list stays in ascending order: best feature last.
best20 = sorted(
    scores.items(),
    key=lambda name_auc: max(name_auc[1], 1. - name_auc[1]),
)[-20:]

In [36]:
from itertools import combinations_with_replacement as combwr

In [41]:
# Names of the 20 strongest single features, in the same order as `best20`.
# (A comprehension also copes with an empty `best20`, where
# `list(zip(*best20))[0]` would raise IndexError.)
top_features = [name for name, _ in best20]

# Read every one of those features from the tree exactly once. Calling
# get_feature() inside the pair loop would re-read both columns for each of
# the n*(n+1)/2 pairs.
top_feature_values = {name: get_feature(name) for name in top_features}

# Materialise the pairs (including each feature paired with itself) so that
# tqdm can see their number and render a real progress bar; a bare
# combinations_with_replacement iterator has no len().
feature_pairs = list(combwr(top_features, 2))

# ROC AUC of the element-wise product of each feature pair, keyed by the
# name pair in sorted order.
scores_2 = {
    tuple(sorted((f1, f2))): roc_auc_score(
        is_k, top_feature_values[f1] * top_feature_values[f2]
    )
    for f1, f2 in tqdm(feature_pairs)
}
    

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [43]:
# Show all feature pairs, strongest separation first (AUC furthest from 0.5
# in either direction).
sorted(
    scores_2.items(),
    key=lambda pair_auc: max(pair_auc[1], 1. - pair_auc[1]),
    reverse=True,
)

[((b'RichAbovePiThres', b'RichDLLk'), 0.9463163060455513),
 ((b'RichDLLk', b'VeloCharge'), 0.9363588742862012),
 ((b'RichDLLk', b'TrackPt'), 0.9356572255385232),
 ((b'RichDLLk', b'TrackFitTNDoF'), 0.9340715547107858),
 ((b'RichDLLk', b'TrackHistory'), 0.9323486587628173),
 ((b'RichDLLk', b'TrackP'), 0.9301782541798217),
 ((b'RichDLLk', b'TrackLikelihood'), 0.07355534118743048),
 ((b'InAccMuon', b'RichDLLk'), 0.9256574149367924),
 ((b'RichDLLk', b'TrackGhostProbability'), 0.913155653381006),
 ((b'RichDLLk', b'piplus_IP_OWNPV'), 0.9111231405324076),
 ((b'RichAbovePiThres', b'RichDLLbt'), 0.9033755640585059),
 ((b'RichAbovePiThres', b'RichDLLp'), 0.9003389261957406),
 ((b'RichDLLbt', b'VeloCharge'), 0.8952844633195653),
 ((b'RichDLLbt', b'TrackFitTNDoF'), 0.8949199436120304),
 ((b'RichDLLbt', b'TrackHistory'), 0.8925229110388972),
 ((b'RichDLLp', b'VeloCharge'), 0.8924522955942784),
 ((b'RichDLLp', b'TrackFitTNDoF'), 0.891647454831904),
 ((b'RichDLLp', b'TrackHistory'), 0.8894115869469426

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

In [49]:
def fit_n_score(feature_names):
    """Fit a logistic regression on quadratic features and return its ROC AUC.

    The design matrix holds one column per entry of `feature_names` followed
    by one column per product of two of them (all combinations with
    replacement, so squares are included). Columns are scaled with
    `RobustScaler` before a `LogisticRegression(solver='liblinear')` is fitted
    against the notebook-level label `is_k`.

    Parameters
    ----------
    feature_names : iterable of feature names accepted by `get_feature`.
        Repeated names are kept as given, so a pair like (f, f) yields
        duplicated columns.

    Returns
    -------
    float
        ROC AUC of the predicted class-1 probability, computed on the same
        samples the model was fitted on (an in-sample score, not a held-out
        one).
    """
    # Snapshot the names so one-shot iterables can be traversed repeatedly.
    feature_names = tuple(feature_names)

    # Read each distinct feature once; it is reused for the raw column and for
    # every product it takes part in instead of being fetched again each time.
    raw = {name: get_feature(name) for name in feature_names}

    columns = [raw[name] for name in feature_names]
    columns += [raw[f1] * raw[f2] for f1, f2 in combwr(feature_names, 2)]

    # Stack to (n_samples, n_columns), the layout scikit-learn estimators take.
    design = np.array(columns).T
    design = RobustScaler().fit_transform(design)

    model = LogisticRegression(solver='liblinear')
    model.fit(design, is_k)

    # Column 1 of predict_proba is the probability of the second class
    # (True when `is_k` is boolean).
    predictions = model.predict_proba(design)[:, 1]

    return roc_auc_score(is_k, predictions)

In [50]:
# Re-score every feature pair from `scores_2` with the quadratic
# logistic-regression model; only the pair names are needed here.
scores_quad = {pair: fit_n_score(pair) for pair in tqdm(scores_2.keys())}

HBox(children=(IntProgress(value=0, max=210), HTML(value='')))



In [52]:
# Show the pairs ordered by model AUC, highest first.
sorted(scores_quad.items(), key=lambda pair_auc: pair_auc[1], reverse=True)

[((b'RichDLLk', b'TrackP'), 0.9487246254039718),
 ((b'RichDLLk', b'TrackPt'), 0.9484843004446368),
 ((b'RichDLLe', b'RichDLLk'), 0.948304606360435),
 ((b'RichDLLk', b'RichUsedR2Gas'), 0.9478605119398218),
 ((b'RichDLLk', b'TrackFitTNDoF'), 0.9475994772562764),
 ((b'RichAbovePrThres', b'RichDLLk'), 0.9475208454636412),
 ((b'CaloHcalE', b'RichDLLk'), 0.9472042402945231),
 ((b'InAccMuon', b'RichDLLk'), 0.9471495621268424),
 ((b'RichAbovePiThres', b'RichDLLk'), 0.94709094217322),
 ((b'CaloEcalE', b'RichDLLk'), 0.9470723561181331),
 ((b'RichDLLk', b'TrackHistory'), 0.9467992307129962),
 ((b'RichDLLbt', b'RichDLLk'), 0.9467829053596701),
 ((b'RichDLLk', b'RichDLLmu'), 0.9467506428959063),
 ((b'RichDLLk', b'RichDLLp'), 0.9467321472804102),
 ((b'RichDLLk', b'TrackLikelihood'), 0.9466285561348147),
 ((b'RichDLLk', b'VeloCharge'), 0.9465583323100877),
 ((b'RichAboveKaThres', b'RichDLLk'), 0.9465198624256165),
 ((b'RichDLLk', b'TrackGhostProbability'), 0.9462770291567729),
 ((b'RichDLLk', b'RichD