In [12]:
import xgboost as xgb
import numpy as np
import multiprocessing
import pandas as pd
import matplotlib.pyplot as plt
import root_numpy

from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import clone

In [5]:
def _safe_ratio(numerator, denominator):
    """Divide two probability columns without producing ``inf`` or ``nan``.

    The denominator is floored at the smallest positive normal number of its
    floating dtype.  Every ratio whose denominator is an ordinary positive
    float is therefore exactly what plain division returns, while a zero
    denominator yields a very large finite score (or 0 for ``0 / 0``).
    """
    denominator = np.asarray(denominator)
    if not np.issubdtype(denominator.dtype, np.floating):
        denominator = denominator.astype(float)
    smallest = np.finfo(denominator.dtype).tiny
    return numerator / np.maximum(denominator, smallest)


def compute_multiclass_auc(data):
    """Fit a three-class model on one fold and return its pairwise AUCs.

    Parameters
    ----------
    data : sequence
        ``(train, test, clf)``.  ``train`` and ``test`` are DataFrames with a
        ``'target'`` column holding the labels 0, 1 and 2; every other column
        is used as a feature.  ``clf`` is an unfitted estimator exposing
        ``fit`` and ``predict_proba``; it is cloned, so the caller's object is
        never fitted.

    Returns
    -------
    list of float
        ``[class 0 vs 1, class 0 vs 2, class 1 vs 2]``.  Each entry scores the
        test rows with the ratio of two predicted-probability columns and
        gives the third class zero sample weight so it does not take part.

    Notes
    -----
    The ratio grows with the probability of the lower-numbered class, while
    the label passed as positive is the higher-numbered one, hence the
    ``1 - AUC`` flip.  This assumes ``predict_proba`` orders its columns by
    class label -- TODO confirm for the estimator in use.

    The ratios go through ``_safe_ratio`` because a predicted probability of
    exactly 0 in the denominator would otherwise give ``inf``/``nan``, which
    scikit-learn's metric input validation rejects.
    """
    train, test = data[0], data[1]
    clf = clone(data[2])

    X_train = train.drop('target', axis=1).values
    y_train = train['target'].values
    X_test = test.drop('target', axis=1).values
    y_test = test['target'].values

    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)

    # (numerator column, denominator column, positive-label threshold,
    #  class whose rows get zero weight)
    comparisons = (
        (0, 1, 0, 2),
        (0, 2, 1, 1),
        (1, 2, 1, 0),
    )
    aucs = []
    for top, bottom, threshold, left_out in comparisons:
        score = _safe_ratio(y_pred[:, top], y_pred[:, bottom])
        aucs.append(1 - roc_auc_score(y_test > threshold, score,
                                      sample_weight=(y_test != left_out) * 1))
    return aucs

def compute_auc(data):
    """Fit a binary classifier on one fold and return its ROC AUC.

    Parameters
    ----------
    data : sequence
        ``(train, test, clf)``.  Both frames carry the labels in a
        ``'target'`` column and the features in all remaining columns.
        ``clf`` is cloned before fitting, so the object passed in stays
        unfitted.

    Returns
    -------
    float
        ``roc_auc_score`` of the test labels against column 1 of
        ``predict_proba`` on the test features.
    """
    train_frame, test_frame, prototype = data[0], data[1], data[2]
    model = clone(prototype)

    def split(frame):
        # Features are everything except the label column.
        return frame.drop('target', axis=1).values, frame['target'].values

    train_features, train_labels = split(train_frame)
    test_features, test_labels = split(test_frame)

    model.fit(train_features, train_labels)

    positive_probability = model.predict_proba(test_features)[:, 1]
    return roc_auc_score(test_labels, positive_probability)

In [6]:
treename = 'tag'


def _read_flavour(path):
    """Read the ``treename`` tree of a ROOT file and drop rows containing NaN."""
    return pd.DataFrame(root_numpy.root2array(path, treename=treename)).dropna()


# One frame per jet flavour.  Only every 20th row of the b sample is kept.
data_b = _read_flavour('datasets/type=5.root')[::20].reset_index(drop=True)
data_c = _read_flavour('datasets/type=4.root').reset_index(drop=True)
data_light = _read_flavour('datasets/type=0.root').reset_index(drop=True)

data = {'b': data_b, 'c': data_c, 'light': data_light}

# Class labels used throughout the notebook: b -> 0, c -> 1, light -> 2.
for class_label, flavour in enumerate(['b', 'c', 'light']):
    data[flavour]['target'] = class_label

# Feature groups are picked by a substring of the column name.
jet_features = [name for name in data_b.columns if "Jet" in name]
sv_features = [name for name in data_b.columns if "SV" in name]

In [7]:
def _add_pairwise_features(frame, base_features, offset=0.1):
    """Add pairwise combination columns to ``frame`` in place.

    For every unordered pair ``(f1, f2)`` of ``base_features`` (a feature is
    also paired with itself) three columns are written, in this order:

    * ``f1_mult_f2``  -- product of the two columns,
    * ``f1_div_f2``   -- ``f1 / (f2 + offset)``,
    * ``2f1_plus_f2`` -- sum of the squared columns.

    ``offset`` shifts the denominator, presumably to avoid division by zero;
    note that it does not protect against values equal to ``-offset``.

    Returns the list of created column names in creation order.
    """
    created = []
    for start, first in enumerate(base_features):
        # Start at ``start`` so each unordered pair is produced exactly once.
        for second in base_features[start:]:
            left = frame[first].values
            right = frame[second].values

            product_name = first + '_mult_' + second
            ratio_name = first + '_div_' + second
            square_sum_name = '2' + first + '_plus_' + second

            frame[product_name] = left * right
            frame[ratio_name] = left / (right + offset)
            frame[square_sum_name] = (left ** 2) + (right ** 2)

            created.extend([product_name, ratio_name, square_sum_name])
    return created


# ``features`` is read later by ``learn``.  It is initialised up front so the
# name exists even when ``data`` is empty; every dataset yields the same list.
features = []
for d in data.values():
    features = _add_pairwise_features(d, sv_features)

In [8]:
# A dedicated, seeded generator makes the shuffled row order reproducible
# across kernel restarts without touching NumPy's global random state.
_shuffle_rng = np.random.RandomState(0)


def _concat_shuffled(frames, relabel=None):
    """Concatenate ``frames``, shuffle the rows and optionally remap ``'target'``.

    Parameters
    ----------
    frames : list of DataFrame
        Per-flavour frames carrying a ``'target'`` column.
    relabel : dict, optional
        Maps every original label present in ``frames`` to its new label.
        ``None`` leaves the labels untouched.

    Returns
    -------
    DataFrame
        Shuffled rows; the index keeps the (permuted) positions of the
        concatenated frame.
    """
    merged = pd.concat(frames, ignore_index=True)
    merged = merged.iloc[_shuffle_rng.permutation(len(merged))]
    if relabel is not None:
        # ``assign`` returns a new frame, which avoids chained-assignment
        # warnings on the ``iloc`` result.
        merged = merged.assign(target=merged['target'].map(relabel))
    return merged


# Original labels: b = 0, c = 1, light = 2.
# b (0) vs c (1): labels are already binary.
data_b_c = _concat_shuffled([data_b, data_c])

# b (0) vs light (1)
data_b_light = _concat_shuffled([data_b, data_light], relabel={0: 0, 2: 1})

# c (0) vs light (1)
data_c_light = _concat_shuffled([data_c, data_light], relabel={1: 0, 2: 1})

# b and c together (0) vs light (1)
data_bc_light = _concat_shuffled([data_b, data_c, data_light],
                                 relabel={0: 0, 1: 0, 2: 1})

# Full three-class problem with the original labels.
data_b_c_light = _concat_shuffled([data_b, data_c, data_light])

In [27]:
def _make_xgb(**overrides):
    """Build an ``XGBClassifier`` from the settings shared by every model here.

    ``overrides`` supplies the hyper-parameters that differ between models.
    """
    params = dict(learning_rate=0.02,
                  objective='binary:logistic',
                  min_child_weight=2)
    params.update(overrides)
    return xgb.XGBClassifier(**params)


# The four models differ only in depth, number of trees, gamma and the
# column/row sub-sampling fractions.
xclf1 = _make_xgb(max_depth=7, n_estimators=600, gamma=2.0,
                  colsample_bytree=0.55, subsample=0.55)

xclf2 = _make_xgb(max_depth=6, n_estimators=2000, gamma=2,
                  colsample_bytree=0.85, subsample=0.65)

xclf3 = _make_xgb(max_depth=5, n_estimators=1000, gamma=2.5,
                  colsample_bytree=0.85, subsample=0.65)

xclf4 = _make_xgb(max_depth=6, n_estimators=2000, gamma=2.5,
                  colsample_bytree=0.85, subsample=0.65)

In [22]:
def learn(dataset, clf, estimator, n_folds=2, n_jobs=4):
    """Cross-validate ``clf`` on ``dataset`` and average the fold scores.

    Parameters
    ----------
    dataset : DataFrame
        Must contain the columns listed in the notebook-level ``sv_features``
        and ``features`` lists plus ``'target'``.
    clf : estimator
        Passed unchanged to ``estimator`` as the third tuple element.
    estimator : callable
        Called with ``(train_frame, test_frame, clf)`` for every fold.  It is
        sent to worker processes, so it must be picklable (a module-level
        function).
    n_folds : int, optional
        Number of shuffled K-fold splits (default 2).
    n_jobs : int, optional
        Number of worker processes in the pool (default 4).

    Returns
    -------
    The mean of the per-fold results along axis 0.
    """
    # Select the model columns once instead of twice per fold.
    model_frame = dataset[sv_features + features + ['target']]
    folds = KFold(dataset.shape[0], n_folds, shuffle=True, random_state=0)
    tasks = [(model_frame.iloc[train_index], model_frame.iloc[test_index], clf)
             for train_index, test_index in folds]

    pool = multiprocessing.Pool(n_jobs)
    try:
        output = pool.map(estimator, tasks)
    except BaseException:
        # Do not leave workers running when a fold fails or is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Wait for the workers to exit so no processes are leaked.
        pool.join()
    return np.mean(output, axis=0)


In [20]:
# Three-class run: averages the pairwise values returned by
# compute_multiclass_auc over the cross-validation folds.
learn(dataset=data_b_c_light, clf=xclf1, estimator=compute_multiclass_auc)

array([ 0.95139229,  0.9888575 ,  0.98096004])