In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.plotting import plot_decision_regions
import h5py
import keras
from sklearn.utils import class_weight
from keras.utils import to_categorical
from keras.optimizers import *
from keras.regularizers import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_recall_fscore_support

%matplotlib inline

KFOLD_SEED = 42

def shuffle(features, labels):
    p = np.random.permutation(len(features))
    return features[p], labels[p]

def rtb_confusion_matrix(test_labels, y_preds):
    m = confusion_matrix(test_labels[:,1], y_preds.argmax(axis=-1))
    
    print("================================")
    print("Confusion Matrix:")
    print("True Negative = %d" % m[0][0])
    print("False Negative = %d" % m[1][0])
    print("True Positive = %d" % m[1][1])
    print("False Positive = %d" % m[0][1])


def rtb_f1_score(test_labels, y_preds):
    f = f1_score(test_labels[:,1], y_preds.argmax(axis=-1))
    print("================================")
    print("f1 score = %0.3f" % f)


def rtb_precision_recall(test_labels, y_preds):
    precision, recall, fbeta_score, support = precision_recall_fscore_support(
        test_labels[:,1], y_preds.argmax(axis=-1))
    print("================================")
    print("Precision = %0.3f, Recall = %0.3f" % (np.mean(precision), np.mean(recall)))
    return precision, recall


def print_metrics(true_labels, y_preds, is_train=True):
    if is_train:
        print("--------train---------")
    else:
        print("--------test---------")
    
    rtb_confusion_matrix(true_labels, y_preds)
    rtb_f1_score(true_labels, y_preds)
    rtb_precision_recall(true_labels, y_preds)
    print("================================")
    print("ROC AUC Score = %0.3f" % roc_auc_score(true_labels, y_preds))

In [3]:
input_path = '~/data/biddings.csv'
data = pd.read_csv(input_path)
print(data.shape)

train = data[:800000]
test = data[800000:]

sample = train.sample(frac=1)
features = sample.drop('convert', axis=1).values
labels = to_categorical(sample.convert.ravel(), 2)

test_features = test.drop('convert', axis=1).values
test_labels = to_categorical(test.convert.ravel(), 2)

(1000000, 89)


In [4]:
# Data prep
'''
when sample weights is balanced, model predicts everything as positive. This is too strong
'''
sample_weights = class_weight.compute_sample_weight(
    class_weight={0:1, 1:100},
    y=labels[:,1])
class_weights = class_weight.compute_class_weight('balanced', np.unique(labels[:,1]), labels[:,1])

print(class_weights)

[  0.50095871 261.26714566]


In [44]:
batch_size = 32
epochs = 5

# Build model
def create_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(88,),
                    activation='relu',
                    kernel_regularizer=l2(0.01),
                    kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(32,
                    activation='relu',
                    kernel_regularizer=l2(0.01),
                    kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='binary_crossentropy',
    #               loss_weights=class_weights,
                  optimizer=SGD(lr=0.1, decay=0.02),
                  metrics=['accuracy'])
    return model

In [65]:
n_batches = 10

def gen_batches(features, labels, n_batches=10, ratio=10):
    positive_samples = sample[sample.convert == 1]
    negative_samples = sample[sample.convert == 0]
    
    positive_size = positive_samples.shape[0]
    
    print(positive_size)
    
#     batches = np.empty(shape=(n_batches, positive_size * 2, 89))
    feature_batches = np.empty(shape=(n_batches, positive_size*(ratio+1), 88))
    label_batches = np.empty(shape=(n_batches, positive_size*(ratio+1), 2))

    for i in range(n_batches):
        b = negative_samples[i*n_batches : (positive_size*ratio + i*n_batches)]
        shuffled = pd.concat([b, positive_samples]).sample(frac=1)
        shuffled_features = shuffled.drop('convert', axis = 1).values
        shuffled_labels = to_categorical(shuffled.convert.ravel(), 2)
        
        print(shuffled_features.shape, shuffled_labels.shape)
        np.append(feature_batches, shuffled_features)
        np.append(label_batches, shuffled_labels)
    
    return feature_batches, label_batches


def train_batched_models(feature_batches, label_batches):
    models = []
    for i, fb in enumerate(feature_batches):
        model = create_model()
        print(fb.shape)

        lb = label_batches[i]
        
        print(fb.shape, lb.shape)
    
        model.fit(fb, lb,
            batch_size=batch_size,
#             class_weight={0:1, 1:5},
    #       sample_weight=sample_weights,
            epochs=epochs,
            callbacks=[keras.callbacks.EarlyStopping()],
            validation_split=0.2,
            verbose=1)
        models.append(model)
    return models


def predict_batched_models(models, test_features, test_labels, n_batches=10):
    cum_preds = None
    for model in models:
        test_preds = model.predict(test_features, verbose=1)
        print(test_preds.shape)
        print(test_preds[0:10])
        print_metrics(test_labels, test_preds, is_train=False)
        
        if cum_preds is None:
            cum_preds = test_preds
            print(cum_preds.shape)
        else:
            cum_preds += test_preds
    
    return cum_preds/n_batches


feature_batches, label_batches = gen_batches(features, labels, n_batches=n_batches)
models = train_batched_models(feature_batches, label_batches)

1531
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
(16841, 88) (16841, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_208 (Dense)            (None, 64)                5696      
_________________________________________________________________
dropout_139 (Dropout)        (None, 64)                0         
_________________________________________________________________
dense_209 (Dense)            (None, 32)                2080      
_________________________________________________________________
dropout_140 (Dropout)        (None, 32)                0         
_________________________________________________________________
dense_210 (Dense)            (None, 2)                 66        
Total params: 7,842
Trainable params: 7

Train on 13472 samples, validate on 3369 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_223 (Dense)            (None, 64)                5696      
_________________________________________________________________
dropout_149 (Dropout)        (None, 64)                0         
_________________________________________________________________
dense_224 (Dense)            (None, 32)                2080      
_________________________________________________________________
dropout_150 (Dropout)        (None, 32)                0         
_________________________________________________________________
dense_225 (Dense)            (None, 2)                 66        
Total params: 7,842
Trainable params: 7,842
Non-trainable params: 0
_________________________________________________________________
(16841, 88)
(16841, 88) (16841, 2)
Train 

Train on 13472 samples, validate on 3369 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [66]:
predicted_test_scores = predict_batched_models(models, test_features, test_labels, n_batches=n_batches)
print(predicted_test_scores.shape)
print(predicted_test_scores[0:10])

print_metrics(test_labels, predicted_test_scores, is_train=False)

# print(test_labels.argmax(axis=-1).shape, predicted_test_labels.shape)

(200000, 2)
[[0.6916608  0.30833918]
 [0.44448575 0.5555142 ]
 [0.52824044 0.47175953]
 [0.48681757 0.51318246]
 [0.44983444 0.5501656 ]
 [0.55644184 0.44355822]
 [0.5795436  0.4204565 ]
 [0.4676635  0.53233653]
 [0.544575   0.45542502]
 [0.5532304  0.44676957]]
--------test---------
Confusion Matrix:
True Negative = 148904
False Negative = 263
True Positive = 114
False Positive = 50719
f1 score = 0.004
Precision = 0.500, Recall = 0.524
ROC AUC Score = 0.528
(200000, 2)
(200000, 2)
[[0.28798667 0.71201336]
 [0.332157   0.66784304]
 [0.43195635 0.56804365]
 [0.39905155 0.60094845]
 [0.46526214 0.5347378 ]
 [0.3623842  0.63761574]
 [0.52184844 0.47815156]
 [0.4619948  0.5380052 ]
 [0.51808023 0.48191977]
 [0.4178925  0.5821075 ]]
--------test---------
Confusion Matrix:
True Negative = 16877
False Negative = 28
True Positive = 349
False Positive = 182746
f1 score = 0.004
Precision = 0.500, Recall = 0.505
ROC AUC Score = 0.517
(200000, 2)
[[0.756174   0.24382599]
 [0.37576675 0.62423325]
 