In [91]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade, EasyEnsemble
from sklearn.linear_model import SGDClassifier, LogisticRegression
import h5py
import keras
from sklearn.utils import class_weight
from keras.utils import to_categorical
from keras.optimizers import *
from keras.callbacks import *
from keras.regularizers import *
from keras.initializers import *
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier


%matplotlib inline

KFOLD_SEED = 42

def shuffle(features, labels):
    p = np.random.permutation(len(features))
    return features[p], labels[p]


def keras_confusion_matrix(test_labels_1d, predicted_labels):
    m = confusion_matrix(test_labels_1d, predicted_labels)

    print("Confusion Matrix:")
    print("True Negative = %d" % m[0][0])
    print("False Negative = %d" % m[1][0])
    print("True Positive = %d" % m[1][1])
    print("False Positive = %d" % m[0][1])
    

def keras_f1_score(test_labels_1d, predicted_labels):
    f = f1_score(test_labels_1d, predicted_labels)
    print("f1 score = %0.3f" % f)

    
def print_xgb_metrics(test_labels_1d, y_scores, score_to_label_threshold=0.5):
    predicted_labels = np.array([])
    for s in y_scores:
        if s > score_to_label_threshold:
            predicted_labels = np.append(predicted_labels, 1)
        else:
            predicted_labels = np.append(predicted_labels, 0)

    print(predicted_labels.shape)
    
    keras_confusion_matrix(test_labels_1d, predicted_labels)
    keras_f1_score(test_labels_1d, predicted_labels)
    print("ROC Score = %0.3f" % roc_auc_score(test_labels_1d, predicted_labels))


def print_metrics(test_labels_1d, y_scores, is_train=False, score_to_label_threshold=None):
    if score_to_label_threshold is None:
        predicted_labels = y_scores.argmax(axis=-1)
    else:
        print(y_scores[:,1][0:5])
        predicted_labels = np.array([])
        for s in y_scores[:,1]:
            if s > score_to_label_threshold:
                predicted_labels = np.append(predicted_labels, 1)
            else:
                predicted_labels = np.append(predicted_labels, 0)
    
    if is_train:
        print("---------train---------")
    else:
        print("---------test---------")
    
    keras_confusion_matrix(test_labels_1d, predicted_labels)
    keras_f1_score(test_labels_1d, predicted_labels)
    print("ROC Score = %0.3f" % roc_auc_score(test_labels_1d, predicted_labels))

In [2]:
input_path = '~/data/biddings.csv'
data = pd.read_csv(input_path)
print(data.shape)

train = data[:800000]
test = data[800000:]

sample = train.sample(frac=1)
features = sample.drop('convert', axis=1).values
labels = sample.convert.ravel()
categorical_labels = to_categorical(sample.convert.ravel(), 2)

test_features = test.drop('convert', axis=1).values
test_labels = test.convert.ravel()
categorical_test_labels = to_categorical(test.convert.ravel(), 2)

(1000000, 89)


In [3]:
dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
bc = BalanceCascade(estimator=dt, n_max_subset=10, random_state=KFOLD_SEED)

def deep_ensemble_merged(model_fn, model_fit_fn, ensembler, smote=None):
    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels)
    print(X_res.shape, y_res.shape)
    
    model = model_fn()
    print("training")

    # Merge sample batches
    Xs = None
    ys = None
    for i, X_train in enumerate(X_res):
        if Xs is None:
            Xs = np.array(X_res[i])
            ys = np.array(y_res[i])
            print(Xs.shape, ys.shape)
        else:
            Xs = np.concatenate((Xs, np.array(X_res[i])))
            ys = np.concatenate((ys, np.array(y_res[i])))
    
    print(Xs.shape, ys.shape)
    shuffle(Xs, ys)
    
    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)
    
    shuffle(Xs, ys)
    ys = to_categorical(ys, 2)
    model = model_fit_fn(model, Xs, ys)

    predicted_scores = model.predict(test_features, verbose=1)
    print(predicted_scores.shape)
    print_metrics(test_labels, predicted_scores, is_train=False)
    return model


def deep_ensemble(model_fn, model_fit_fn, ensembler, smote=None):
    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels)
    print(X_res.shape, y_res.shape)
    
    model = model_fn()
    print("training")

    for j, X_train in enumerate(X_res):
        if smote is not None:
            X, y = smote.fit_sample(X_train, y_res[j])
            y = to_categorical(y, 2)
            model = model_fit_fn(model, X, y)
        else:
            y = to_categorical(y_res[j], 2)
            model = model_fit_fn(model, X_train, y)

    predicted_scores = model.predict(test_features, verbose=1)
    print(predicted_scores.shape)
    print_metrics(test_labels, predicted_scores, is_train=False)
    return model
    

## Basic Neural Net

In [4]:
batch_size = 16
epochs = 1

Bare minimum: 0.644 <br>
10 Epochs: 0.655 <br>
With l2(0.01): 0.660

In [6]:
def base_model():
    model = Sequential()
    model.add(Dense(88, kernel_initializer='glorot_uniform', input_shape=(88,)))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=0.01),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=10,
#               validation_split=0.2,
              verbose=1)
    return model

model = deep_ensemble(base_model, model_fit, bc)

fitting sample
(20, 3062, 88) (20, 3062)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 88)                7832      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 178       
Total params: 8,010
Trainable params: 8,010
Non-trainable params: 0
_________________________________________________________________
training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/1

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(200000, 2)
---------test---------
Confusion Matrix:
True Negative = 124780
False Negative = 128
True Positive = 249
False Positive = 74843
f1 score = 0.007
ROC Score = 0.643


With validation split, we can see that there is overfitting

In [22]:
def base_model():
    model = Sequential()
    model.add(Dense(88, kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.01),
                    input_shape=(88,)))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=0.01),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=10,
#               validation_split=0.2,
              verbose=1)
    return model

model = deep_ensemble_merged(base_model, model_fit, bc)

fitting sample
(10, 3062, 88) (10, 3062)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 88)                7832      
_________________________________________________________________
dense_27 (Dense)             (None, 2)                 178       
Total params: 8,010
Trainable params: 8,010
Non-trainable params: 0
_________________________________________________________________
training
(3062, 88) (3062,)
(30620, 88) (30620,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(200000, 2)
---------test---------
Confusion Matrix:
True Negative = 132306
False Negative = 126
True Positive = 251
False Positive = 67317
f1 score = 0.007
ROC Score = 0.664


In [72]:
def base_model():
    model = Sequential()
    model.add(Dense(88, activation='relu',
                    kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.01),
                    input_shape=(88,)))
    model.add(Dropout(0.1))
    model.add(Dense(64, kernel_initializer=RandomNormal(mean=2.0), kernel_regularizer=l2(0.01)))
    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=0.01, decay=0.02),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=20,
              callbacks=[EarlyStopping(patience=2)],
              validation_split=0.2,
              verbose=1)
    return model

smote = SMOTE(ratio={0: 306200, 1: 306200}, n_jobs=-1, random_state=KFOLD_SEED)
model = deep_ensemble_merged(base_model, model_fit, bc)

fitting sample
(10, 3062, 88) (10, 3062)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_34 (Dense)             (None, 88)                7832      
_________________________________________________________________
dropout_23 (Dropout)         (None, 88)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 64)                5696      
_________________________________________________________________
dropout_24 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 2)                 130       
Total params: 13,658
Trainable params: 13,658
Non-trainable params: 0
_________________________________________________________________
training
(3062, 88) (3062,)
(30620, 88) (30620,)
Train on 24496 samples, validate on 6124 samples

In [69]:
w = model.get_layer(name='dense_26').get_weights()
print(w[0].shape, w[1].shape)
print(np.mean(w[0]), np.mean(w[1]))

(88, 64) (64,)
0.0011367296 -0.04796268


In [31]:
predicted_scores = model.predict(test_features)
predicted_labels = predicted_scores.argmax(axis=-1)

print(predicted_scores.shape, predicted_labels.shape)

false_positive_scores = np.array([])
false_negative_scores = np.array([])
true_positive_scores = np.array([])
true_negative_scores = np.array([])

for i, s in enumerate(predicted_scores):
    # False positive
    if predicted_labels[i] == 1 and test_labels[i] == 0:
        false_positive_scores = np.append(false_positive_scores, s[1])
    # False negative
    elif predicted_labels[i] == 0 and test_labels[i] == 1:
        false_negative_scores = np.append(false_negative_scores, s[1])
    # True positive
    elif predicted_labels[i] == 1 and test_labels[i] == 1:
        true_positive_scores = np.append(true_positive_scores, s[1])
    # True negative
    else:
        true_negative_scores = np.append(true_negative_scores, s[1])

(200000, 2) (200000,)


In [39]:
# If threshold = 0.60, can catch an additional 10% of false positives
print(np.percentile(false_positive_scores, 20), np.median(false_positive_scores))
print(np.median(false_negative_scores))
# If threshold = 0.66, can capture an additional 10% of true positives
print(np.percentile(true_positive_scores, 20), np.median(true_positive_scores))
print(np.median(true_negative_scores))

0.5354089736938477 0.6013307273387909
0.3806043416261673
0.5638539671897889 0.6640639901161194
0.34379687905311584


In [54]:
print_metrics(test_labels, predicted_scores, score_to_label_threshold=0.52)

[0.5714545  0.5183983  0.46809098 0.18678115 0.38346967]
---------test---------
Confusion Matrix:
True Negative = 146422
False Negative = 158
True Positive = 219
False Positive = 53201
f1 score = 0.008
ROC Score = 0.657


### Xgboost

In [89]:
dtrain = xgb.DMatrix(features, labels)
dtest = xgb.DMatrix(test_features, test_labels)

params = {'max_depth':3, 'eta':0.1, 'objective':'binary:logistic',
         'nthread': 4, 'eval_metric':'auc'}
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 20
bst = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=5)


[0]	eval-auc:0.5	train-auc:0.500325
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 5 rounds.
[1]	eval-auc:0.5	train-auc:0.500325
[2]	eval-auc:0.499997	train-auc:0.500648
[3]	eval-auc:0.499997	train-auc:0.500648
[4]	eval-auc:0.499997	train-auc:0.500648
[5]	eval-auc:0.633761	train-auc:0.641557
[6]	eval-auc:0.641761	train-auc:0.65134
[7]	eval-auc:0.646147	train-auc:0.656013
[8]	eval-auc:0.646284	train-auc:0.65658
[9]	eval-auc:0.646614	train-auc:0.657321
[10]	eval-auc:0.648449	train-auc:0.658278
[11]	eval-auc:0.650978	train-auc:0.659813
[12]	eval-auc:0.650799	train-auc:0.660001
[13]	eval-auc:0.652829	train-auc:0.663415
[14]	eval-auc:0.656153	train-auc:0.664207
[15]	eval-auc:0.655381	train-auc:0.664069
[16]	eval-auc:0.654507	train-auc:0.665269
[17]	eval-auc:0.653276	train-auc:0.664303
[18]	eval-auc:0.653398	train-auc:0.665308
[19]	eval-auc:0.653341	train-auc:0.665401


In [90]:
y_scores = bst.predict(dtest)

print(y_scores.shape)
print(y_scores[0:5])
print_xgb_metrics(test_labels, y_scores)

(200000,)
[0.06797221 0.06806614 0.06659859 0.06659859 0.06668172]
(200000,)
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
ROC Score = 0.500


  'precision', 'predicted', average, warn_for)


In [92]:
xgb_model = XGBClassifier()
bc = BalanceCascade(estimator=xgb_model, n_max_subset=10, random_state=KFOLD_SEED)