In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.plotting import plot_decision_regions
import h5py
import keras
from sklearn.utils import class_weight
from keras.utils import to_categorical
from keras.optimizers import *
from keras.regularizers import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support

%matplotlib inline

KFOLD_SEED = 42

def shuffle(features, labels):
    p = np.random.permutation(len(features))
    return features[p], labels[p]

def rtb_confusion_matrix(test_labels, y_preds):
    m = confusion_matrix(test_labels[:,1], y_preds.argmax(axis=-1))
    
    print("================================")
    print("Confusion Matrix:")
    print("True Negative = %d" % m[0][0])
    print("False Negative = %d" % m[1][0])
    print("True Positive = %d" % m[1][1])
    print("False Positive = %d" % m[0][1])


def rtb_f1_score(test_labels, y_preds):
    f = f1_score(test_labels[:, 1], y_preds.argmax(axis=-1))
    print("================================")
    print("f1 score = %0.3f" % f)


def rtb_precision_recall(test_labels, y_preds):
    precision, recall, fbeta_score, support = precision_recall_fscore_support(
        test_labels[:, 1], y_preds.argmax(axis=-1))
    print("================================")
    print("Precision = %0.3f, Recall = %0.3f" % (np.mean(precision), np.mean(recall)))
    return precision, recall


def print_metrics(true_labels, y_preds, is_train=True):
    if is_train:
        print("--------train---------")
    else:
        print("--------test---------")
    
    rtb_confusion_matrix(true_labels, y_preds)
    rtb_f1_score(true_labels, y_preds)
    rtb_precision_recall(true_labels, y_preds)
    print("================================")
    print("ROC AUC Score = %0.3f" % roc_auc_score(true_labels, y_preds))

In [12]:
input_path = '~/data/biddings.csv'
data = pd.read_csv(input_path)
print(data.shape)

train = data[:800000]
test = data[800000:]

sample = train.sample(frac=1)
features = sample.drop('convert', axis=1).values
labels = to_categorical(sample.convert.ravel(), 2)

test_features = test.drop('convert', axis=1).values
test_labels = to_categorical(test.convert.ravel(), 2)

(1000000, 89)


In [43]:
# Data prep
'''
when sample weights is balanced, model predicts everything as positive. This is too strong
'''
sample_weights = class_weight.compute_sample_weight(
    class_weight={0:1, 1:100},
    y=labels[:,1])
class_weights = class_weight.compute_class_weight('balanced', np.unique(labels[:,1]), labels[:,1])

print(class_weights)

[  0.50095871 261.26714566]


In [59]:
batch_size = 32
epochs = 5

# Build model
def create_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(88,),
                    activation='relu',
                    kernel_regularizer=l2(0.01),
                    kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(32,
                    activation='relu',
                    kernel_regularizer=l2(0.01),
                    kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
    #               loss_weights=class_weights,
                  optimizer=SGD(lr=0.1, decay=0.02),
                  metrics=['accuracy'])
    return model

In [70]:
def gen_batches(features, labels, n_batches=10):
    positive_samples = sample[sample.convert == 1]
    negative_samples = sample[sample.convert == 0]
    
    print(positive_samples.size, negative_samples.size)
    batches = []
    for i in range(n_batches):
        b = negative_samples[i*n_batches : (positive_samples.size + i*n_batches)]
        print(b.size)
        shuffled = pd.concat([b, positive_samples]).sample(frac=1)
        print(shuffled.size)
        batches.append([shuffled])
    
    return np.array(batches)


def train_batched_models(batches):
    models = []
    for b in batches:
        model = create_model()
        batch_features = b.drop('convert', axis=1).values
        batch_labels = to_categorical(b.convert.ravel(), 2)
        
        model.fit(batch_features, batch_labels,
            batch_size=batch_size,
    #         class_weight={0:1, 1:400},
    #       sample_weight=sample_weights,
            epochs=epochs,
            callbacks=[keras.callbacks.EarlyStopping()],
            validation_split=0.2,
            verbose=1)
        models.append(model)
    return models


def predict_batched_models(models, test_features, test_labels):
    cum_preds = None
    for model in models:
        test_preds = model.predict(test_features, verbose=1)
        print_metrics(test_labels, test_preds, is_train=False)
        
        if cum_preds is None:
            cum_preds = test_preds
            print(cum_preds.shape)
        else:
            cum_preds += test_preds
    
    return np.mean(cum_preds, axis=1)


batches = gen_batches(features, labels)
models = train_batched_models(batches)

# predicted_test_scores = predict_batched_models(models)
# predicted_test_labels = map(lambda s: 1 if s > 0.5 else 0, predicted_test_labels)

# print_metrics(test_labels, predicted_test_labels, is_train=False)

136259 71063741
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310
12127051
12263310


ValueError: cannot copy sequence with size 137790 to array axis with dimension 89

In [66]:
predicted_test_scores = predict_batched_models(models, test_features, test_labels)
predicted_test_labels = map(lambda s: 1 if s > 0.5 else 0, predicted_test_scores)

print_metrics(test_labels, predicted_test_labels, is_train=False)

--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


ROC AUC Score = 0.618
(200000, 2)
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500
ROC AUC Score = 0.645
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500
ROC AUC Score = 0.628
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500
ROC AUC Score = 0.597
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500
ROC AUC Score = 0.641
--------test---------
Confusion Matrix:
True Negative = 199623
False Negative = 377
True Positive = 0
False Positive = 0
f1 score = 0.000
Precision = 0.499, Recall = 0.500
ROC AUC Score = 0.

NameError: name 'predicted_test_labels' is not defined

In [48]:
train_preds = model.predict(features, verbose=1) 
print_metrics(labels, train_preds, is_train=True)

test_preds = model.predict(test_features, verbose=1)
print_metrics(test_labels, test_preds, is_train=False)

--------train---------
Confusion Matrix:
True Negative = 738988
False Negative = 1214
True Positive = 317
False Positive = 59481
f1 score = 0.010
Precision = 0.502, Recall = 0.566
ROC AUC Score = 0.668
--------test---------
Confusion Matrix:
True Negative = 184489
False Negative = 299
True Positive = 78
False Positive = 15134
f1 score = 0.010
Precision = 0.502, Recall = 0.566
ROC AUC Score = 0.664


In [53]:
print(test_preds.shape)
print(test_preds[0:3:,1])
print(test_preds[0:3].argmax(axis=-1))

(200000, 2)
[0.36920437 0.44836164 0.3320884 ]
[0 0 0]


In [67]:
gen_batches(features, labels)

NameError: name 'positive_samples' is not defined