In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade, EasyEnsemble
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
import h5py
import keras
from sklearn.utils import class_weight
from keras.utils import to_categorical
from keras.optimizers import *
from keras.regularizers import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

KFOLD_SEED = 42

def keras_confusion_matrix(test_labels_1d, predicted_labels):
    m = confusion_matrix(test_labels_1d, predicted_labels)

    print("Confusion Matrix:")
    print("True Negative = %d" % m[0][0])
    print("False Negative = %d" % m[1][0])
    print("True Positive = %d" % m[1][1])
    print("False Positive = %d" % m[0][1])
    

def keras_f1_score(test_labels_1d, predicted_labels):
    f = f1_score(test_labels_1d, predicted_labels)
    print("f1 score = %0.3f" % f)


def print_metrics(test_labels_1d, y_scores, is_train=False):
    predicted_labels = y_scores.argmax(axis=-1)
    if is_train:
        print("---------train---------")
    else:
        print("---------test---------")
    
    keras_confusion_matrix(test_labels_1d, predicted_labels)
    keras_f1_score(test_labels_1d, predicted_labels)
    print("ROC Score = %0.3f" % roc_auc_score(test_labels_1d, predicted_labels))

In [3]:
input_path = '~/data/biddings.csv'
data = pd.read_csv(input_path)
print(data.shape)

train = data[:800000]
test = data[800000:]

sample = train.sample(frac=1)
features = sample.drop('convert', axis=1).values
labels = sample.convert.ravel()
categorical_labels = to_categorical(sample.convert.ravel(), 2)

test_features = test.drop('convert', axis=1).values
test_labels = test.convert.ravel()
categorical_test_labels = to_categorical(test.convert.ravel(), 2)

(1000000, 89)


In [40]:
dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
bc = BalanceCascade(estimator=dt, n_max_subset=10, random_state=KFOLD_SEED)

def deep_ensemble(model_fn, model_fit_fn, ensembler, smote=None):
    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels)
    print(X_res.shape, y_res.shape)
    
    model = model_fn()
    print("training")

    for j, X_train in enumerate(X_res):
        if smote is not None:
            X, y = smote.fit_sample(X_train, y_res[j])
            y = to_categorical(y, 2)
            model = model_fit_fn(model, X, y)
        else:
            y = to_categorical(y_res[j], 2)
            model = model_fit_fn(model, X_train, y)

    predicted_scores = model.predict(test_features, verbose=1)
    print(predicted_scores.shape)
    print_metrics(test_labels, predicted_scores, is_train=False)
    return model
    

## Basic Neural Net

In [24]:
batch_size = 16
epochs = 1

Bare minimum: 0.644 <br>
10 Epochs: 0.655 <br>
With l2(0.01): 0.660

In [26]:
def base_model():
    model = Sequential()
    model.add(Dense(88, kernel_initializer='glorot_uniform', input_shape=(88,)))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=0.01),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=10,
#               validation_split=0.2,
              verbose=1)
    return model

model = deep_ensemble(base_model, model_fit, bc)

fitting sample
(10, 3062, 88) (10, 3062, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 88)                7832      
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 178       
Total params: 8,010
Trainable params: 8,010
Non-trainable params: 0
_________________________________________________________________
training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(200000, 2)
---------test---------
Confusion Matrix:
True Negative = 128160
False Negative = 125
True Positive = 252
False Positive = 71463
f1 score = 0.007
ROC Score = 0.655


With validation split, we can see that there is overfitting

In [29]:
def base_model():
    model = Sequential()
    model.add(Dense(88, kernel_initializer='glorot_uniform', kernel_regularizer=l2(0.01),
                    input_shape=(88,)))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=0.01),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=10,
#               validation_split=0.2,
              verbose=1)
    return model

model = deep_ensemble(base_model, model_fit, bc)

fitting sample
(10, 3062, 88) (10, 3062, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 88)                7832      
_________________________________________________________________
dense_30 (Dense)             (None, 2)                 178       
Total params: 8,010
Trainable params: 8,010
Non-trainable params: 0
_________________________________________________________________
training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 

Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(200000, 2)
---------test---------
Confusion Matrix:
True Negative = 129023
False Negative = 123
True Positive = 254
False Positive = 70600
f1 score = 0.007
ROC Score = 0.660


In [43]:
def base_model():
    model = Sequential()
    model.add(Dense(88, activation='tanh',
                    kernel_initializer='glorot_uniform', kernel_regularizer=l1(0.01),
                    input_shape=(88,)))
    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=0.01),
                  metrics=['accuracy'])
    return model;

def model_fit(model, X, y):
    model.fit(X, y,
              batch_size=batch_size,
              epochs=10,
              validation_split=0.2,
              verbose=1)
    return model

smote = SMOTE(ratio={0: 10000, 1: 10000}, n_jobs=-1, random_state=KFOLD_SEED)
model = deep_ensemble(base_model, model_fit, bc, smote)

fitting sample
(10, 3062, 88) (10, 3062)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_52 (Dense)             (None, 88)                7832      
_________________________________________________________________
dense_53 (Dense)             (None, 64)                5696      
_________________________________________________________________
dense_54 (Dense)             (None, 2)                 130       
Total params: 13,658
Trainable params: 13,658
Non-trainable params: 0
_________________________________________________________________
training


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  n_samples_majority))
  n_samples_majority))


Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(200000, 2)
---------test---------
Confusion Matrix:
True Negative = 173375
False Negative = 244
True Positive = 133
False Positive = 26248
f1 score = 0.010
ROC Score = 0.611
