In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade, EasyEnsemble
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
import h5py
import keras
from sklearn.utils import class_weight
from keras.utils import to_categorical
from keras.optimizers import *
from keras.regularizers import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier


%matplotlib inline

KFOLD_SEED = 42


def rtb_confusion_matrix(test_labels, y_preds):
    m = confusion_matrix(test_labels, y_preds)
    
    print("================================")
    print("Confusion Matrix:")
    print("True Negative = %d" % m[0][0])
    print("False Negative = %d" % m[1][0])
    print("True Positive = %d" % m[1][1])
    print("False Positive = %d" % m[0][1])


def rtb_f1_score(test_labels, y_preds):
    f = f1_score(test_labels, y_preds)
    print("================================")
    print("f1 score = %0.3f" % f)


def print_metrics(true_labels, y_preds, y_scores, is_train=True):
    if is_train:
        print("--------train---------")
    else:
        print("--------test---------")
    
    rtb_confusion_matrix(true_labels, y_preds)
    rtb_f1_score(true_labels, y_preds)
    print("================================")
    print("ROC AUC Score = %0.3f" % roc_auc_score(true_labels, y_scores.argmax(axis=-1)))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
input_path = '~/data/biddings.csv'
data = pd.read_csv(input_path)
print(data.shape)

train = data[:800000]
test = data[800000:]

sample = train.sample(frac=1)
features = sample.drop('convert', axis=1).values
labels = sample.convert.ravel()

test_features = test.drop('convert', axis=1).values
test_labels = test.convert.ravel()

(1000000, 89)


In [12]:
lr = LogisticRegression(penalty='l2', random_state=KFOLD_SEED, verbose=2)

model = lr.fit(features, labels)
predicted_scores = model.predict_proba(test_features)
predicted_labels = model.predict(test_features)
print(predicted_scores.shape, predicted_labels.shape)

print_metrics(test_labels, predicted_labels, predicted_scores, is_train=False)


[LibLinear](200000, 2) (200000,)
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
ROC AUC Score = 0.500


  'precision', 'predicted', average, warn_for)


In [3]:
def logistic_regression():
    lr = LogisticRegression(penalty='l2', random_state=KFOLD_SEED, verbose=2)
    return lr
    

def ensembler_test(classifier_fn, ensemblers):
    rus = RandomUnderSampler(ratio={0: 1531*10, 1: 1531}, random_state=KFOLD_SEED)
    X_us, y_us = rus.fit_sample(features, labels)
    
    for i, e in enumerate(ensemblers):
        print("fitting sample")
        X_res, y_res = e.fit_sample(X_us, y_us)
        print(X_res.shape, y_res.shape)
        clf = classifier_fn()
        print("training")
        
        for j, X_train in enumerate(X_res):
            model = clf.fit(X_train, y_res[j])
        
        predicted_scores = model.predict_proba(test_features)
        predicted_labels = model.predict(test_features)
        
        print("Ensembler %d" % i)
        print_metrics(test_labels, predicted_labels, predicted_scores, is_train=False)

EasyEnsemble and decision tree are consistenly the best

In [4]:
ee = EasyEnsemble(random_state=KFOLD_SEED)
bc = BalanceCascade(random_state=KFOLD_SEED)

dt = DecisionTreeClassifier(random_state=KFOLD_SEED)
bc_dt = BalanceCascade(estimator=dt, random_state=KFOLD_SEED)

rf = RandomForestClassifier(n_jobs=-1, random_state=KFOLD_SEED, verbose=1)
bc_rf = BalanceCascade(estimator=rf, random_state=KFOLD_SEED)

xgbc = XGBClassifier()
bc_xgbc = BalanceCascade(estimator=xgbc, random_state=KFOLD_SEED)


ensemblers = [bc_xgbc, ee, bc, bc_dt, bc_rf]
ensembler_test(logistic_regression, ensemblers)

fitting sample


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


(17, 3062, 88) (17, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 102476
False Negative = 162
True Positive = 215
False Positive = 97147
f1 score = 0.004
ROC AUC Score = 0.542
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 1
--------test---------
Confusion Matrix:
True Negative = 126466
False Negative = 115
True Positive = 262
False Positive = 73157
f1 score = 0.007
ROC AUC Score = 0.664
fitting sample
(19, 3062, 88) (19, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ens

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_j

(15, 3062, 88) (15, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 4
--------test---------
Confusion Matrix:
True Negative = 130478
False Negative = 145
True Positive = 232
False Positive = 69145
f1 score = 0.007
ROC AUC Score = 0.635


Try to tune EasyEnsemble by adjusting subsets. It does not affect f1-score or ROC AUC socre
```
ee = EasyEnsemble(n_subsets = 100, random_state=KFOLD_SEED)
ensembler_test(logistic_regression, [ee])

ee = EasyEnsemble(n_subsets = 4, random_state=KFOLD_SEED)
ensembler_test(logistic_regression, [ee])
```
Both result in:
```
================================
Confusion Matrix:
True Negative = 127470
False Negative = 132
True Positive = 245
False Positive = 72153
================================
f1 score = 0.007
================================
ROC AUC Score = 0.644
```

Tune max subset for BalanceCascade with DecisionTreeClassifier

10 is the best

In [19]:
dt_20 = DecisionTreeClassifier(random_state=KFOLD_SEED)
bc_dt_20 = BalanceCascade(estimator=dt_20, n_max_subset=20, random_state=KFOLD_SEED)

dt_10 = DecisionTreeClassifier(random_state=KFOLD_SEED)
bc_dt_10 = BalanceCascade(estimator=dt_10, n_max_subset=10, random_state=KFOLD_SEED)

dt_5 = DecisionTreeClassifier(random_state=KFOLD_SEED)
bc_dt_5 = BalanceCascade(estimator=dt_5, n_max_subset=5, random_state=KFOLD_SEED)

ensemblers = [bc_dt_20, bc_dt_10, bc_dt_5]
ensembler_test(logistic_regression, ensemblers)

fitting sample
(17, 3062, 88) (17, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 134778
False Negative = 150
True Positive = 227
False Positive = 64845
f1 score = 0.007
ROC AUC Score = 0.639
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 1
--------test---------
Confusion Matrix:
True Negative = 131774
False Negative = 133
True Positive = 244
False Positive = 67849
f1 score = 0.007
ROC AUC Score = 0.654
fitting sample
(5, 3062, 88) (5, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 2
--------test---------
Confusion Matrix:
True Negative = 128482
False Negative = 138
True Positive = 239
False Positive = 71141
f1 sc

Tune max features, overall it doesn't affect ROC AUC much, but 17 features is slightly higher than others

In [25]:
# 70 features
dt_08 = DecisionTreeClassifier(max_features=0.8, random_state=KFOLD_SEED)
bc_dt_08 = BalanceCascade(estimator=dt_08, n_max_subset=10, random_state=KFOLD_SEED)

# 35 features
dt_04 = DecisionTreeClassifier(max_features=0.4, random_state=KFOLD_SEED)
bc_dt_04 = BalanceCascade(estimator=dt_04, n_max_subset=10, random_state=KFOLD_SEED)

# 17 features
dt_02 = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
bc_dt_02 = BalanceCascade(estimator=dt_02, n_max_subset=10, random_state=KFOLD_SEED)

# Auto is sqrt(n_features) ~= 9
dt_auto = DecisionTreeClassifier(max_features='auto', random_state=KFOLD_SEED)
bc_dt_auto = BalanceCascade(estimator=dt_auto, n_max_subset=10, random_state=KFOLD_SEED)

ensemblers = [bc_dt_08, bc_dt_04, bc_dt_02, bc_dt_auto]
ensembler_test(logistic_regression, ensemblers)

fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131814
False Negative = 132
True Positive = 245
False Positive = 67809
f1 score = 0.007
ROC AUC Score = 0.655
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 1
--------test---------
Confusion Matrix:
True Negative = 133049
False Negative = 142
True Positive = 235
False Positive = 66574
f1 score = 0.007
ROC AUC Score = 0.645
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 2
--------test---------
Confusion Matrix:
True Negative = 131934
False Negative = 126
True Positive = 251
False Positive = 67689
f1 score = 0.007
ROC AUC 

No need to tune decision tree's class weight, since ensembler already ensures both classes have equal samples

Min samples at leaves do not seem to matter

In [30]:
dt_min_samples_50 = DecisionTreeClassifier(min_samples_leaf=50, max_features=0.2, random_state=KFOLD_SEED)
bc_dt_min_samples_50 = BalanceCascade(estimator=dt_min_samples_50, n_max_subset=10, random_state=KFOLD_SEED)

dt_min_samples_20 = DecisionTreeClassifier(min_samples_leaf=20, max_features=0.2, random_state=KFOLD_SEED)
bc_dt_min_samples_20 = BalanceCascade(estimator=dt_min_samples_20, n_max_subset=10, random_state=KFOLD_SEED)

dt_min_samples_10 = DecisionTreeClassifier(min_samples_leaf=10, max_features=0.2, random_state=KFOLD_SEED)
bc_dt_min_samples_10 = BalanceCascade(estimator=dt_min_samples_10, n_max_subset=10, random_state=KFOLD_SEED)

dt_min_samples_5 = DecisionTreeClassifier(min_samples_leaf=5, max_features=0.2, random_state=KFOLD_SEED)
bc_dt_min_samples_5 = BalanceCascade(estimator=dt_min_samples_5, n_max_subset=10, random_state=KFOLD_SEED)

ensemblers = [bc_dt_min_samples_50, bc_dt_min_samples_20, bc_dt_min_samples_10, bc_dt_min_samples_5]
ensembler_test(logistic_regression, ensemblers)

fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 133422
False Negative = 154
True Positive = 223
False Positive = 66201
f1 score = 0.007
ROC AUC Score = 0.630
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 1
--------test---------
Confusion Matrix:
True Negative = 130890
False Negative = 152
True Positive = 225
False Positive = 68733
f1 score = 0.006
ROC AUC Score = 0.626
fitting sample
(10, 3062, 88) (10, 3062)
training
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Ensembler 2
--------test---------
Confusion Matrix:
True Negative = 131893
False Negative = 133
True Positive = 244
False Positive = 67730
f1 score = 0.007
ROC AUC 

Class weight has to be balanced!
If negative class is heavier, both TPR and FNR decrease, but TPR decrease causes more harm to ROC AUC.
If positive class is heavier, the observation is the opposite

In [33]:
def create_lr_proxy(class_weight='balanced'):
    def create_lr():
        return LogisticRegression(penalty='l2', class_weight=class_weight)
    return create_lr

dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
bc = BalanceCascade(estimator=dt, n_max_subset=10, random_state=KFOLD_SEED)

ensembler_test(create_lr_proxy({0: 2, 1: 1}), [bc])
ensembler_test(create_lr_proxy(), [bc])
ensembler_test(create_lr_proxy({0: 1, 1: 2}), [bc])
ensembler_test(create_lr_proxy({0: 1, 1: 4}), [bc])

fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 187970
False Negative = 288
True Positive = 89
False Positive = 11653
f1 score = 0.015
ROC AUC Score = 0.589
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131934
False Negative = 126
True Positive = 251
False Positive = 67689
f1 score = 0.007
ROC AUC Score = 0.663
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 38409
False Negative = 25
True Positive = 352
False Positive = 161214
f1 score = 0.004
ROC AUC Score = 0.563
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 13244
False Negative = 6
True Positive = 371
False Positive = 186379
f1 score = 0.004
ROC AUC Score = 0.525


Try L1 regularization, C=1.0 is just right.

In [36]:
def create_lr_proxy(C=1.0):
    def create_lr():
        return LogisticRegression(penalty='l1', C=C, random_state=KFOLD_SEED)
    return create_lr

dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
bc = BalanceCascade(estimator=dt, n_max_subset=10, random_state=KFOLD_SEED)

ensembler_test(create_lr_proxy(2.0), [bc])
ensembler_test(create_lr_proxy(1.0), [bc])
ensembler_test(create_lr_proxy(0.8), [bc])
ensembler_test(create_lr_proxy(0.5), [bc])
ensembler_test(create_lr_proxy(0.2), [bc])

fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131967
False Negative = 125
True Positive = 252
False Positive = 67656
f1 score = 0.007
ROC AUC Score = 0.665
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131859
False Negative = 124
True Positive = 253
False Positive = 67764
f1 score = 0.007
ROC AUC Score = 0.666
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131884
False Negative = 124
True Positive = 253
False Positive = 67739
f1 score = 0.007
ROC AUC Score = 0.666
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Confusion Matrix:
True Negative = 131745
False Negative = 126
True Positive = 251
False Positive = 67878
f1 score = 0.007
ROC AUC Score = 0.663
fitting sample
(10, 3062, 88) (10, 3062)
training
Ensembler 0
--------test---------
Conf