In [1]:
import numpy as np
import keras
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import BatchNormalization

Using TensorFlow backend.


In [2]:
def load_data(filename, skiprows = 1):
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [3]:
train_data = load_data("train_2008.csv")
test_data = load_data("test_2012.csv")

In [4]:
# converts column c of data to array where each row is a 0-1 vector
def ohe_col(train, test, c):
    cats = np.unique(np.vstack((np.reshape(train[:,c], (train.shape[0], 1)), np.reshape(test[:, c], (test.shape[0], 1)))))
    print(cats)
    print(len(cats))
    d = {}
    for i in range(len(cats)):
        d[cats[i]] = i
    train_col, test_col = np.zeros((train.shape[0], len(cats))), np.zeros((test.shape[0], len(cats)))
    for i in range(train.shape[0]):
        train_col[i, d[train[i, c]]] = 1
    for i in range(test.shape[0]):
        test_col[i, d[test[i, c]]] = 1
    return train_col, test_col

In [5]:
train_ohe, test_ohe = train_data, test_data
cs = [4, 10, 13, 26, 31, 36, 43, 53, 61, 65, 68, 118, 127, 131, 182, 233, 498, 601, 605, 608, 617, 644, 664, 675, 698, 704]
for c in cs:
    train_col, test_col = ohe_col(train_ohe, test_ohe, c)
    train_ohe = np.hstack((np.hstack((train_ohe[:,:c], train_col)), train_ohe[:,c+1:]))
    test_ohe = np.hstack((np.hstack((test_ohe[:,:c], test_col)), test_ohe[:,c+1:]))

[  1.   2.   4. 201. 203.]
5
[1. 2. 3.]
3
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
12
[-3. -2. -1.  1.  2.]
5
[-1.  0.  1.]
3
[-1.  1.  2.]
3
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
10
[1. 2. 3. 4. 5. 6. 7. 8.]
8
[-1.  1.  2.]
3
[0. 2. 3.]
3
[83001. 83002. 83003. 83004. 83006. 83011. 83021. 83031. 83041. 83051.
 83101. 83111. 83131. 83141. 83201. 83241. 83251. 83261. 83262. 85001.
 85002. 85003. 85011. 85021. 85031. 85041. 85231. 85241. 85251. 85261.
 89001. 89002. 89003. 89004. 89011. 89012. 89021. 89031. 89251. 89252.
 89261. 89262. 91001. 91002. 91003. 91011. 91021. 91031. 91251. 91261.]
50
[-3. -2. -1.  1.  2.]
5
[1. 2. 3. 4.]
4
[11. 12. 13. 14. 15. 16. 21. 22. 23. 31. 32. 33. 34. 35. 41. 42. 43. 44.
 45. 46. 47. 51. 52. 53. 54. 55. 56. 57. 58. 59. 61. 62. 63. 64. 71. 72.
 73. 74. 81. 82. 83. 84. 85. 86. 87. 88. 91. 92. 93. 94. 95.]
51
[ 1.  2.  4.  5.  6.  8.  9. 10. 11. 12. 13. 15. 16. 17. 18. 19. 20. 21.
 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37.

In [6]:
print(np.unique(train_data[:, 54]))
print(np.unique(train_ohe[:, 712]))

[0. 1. 2. 3. 4.]
[0. 1. 2. 3. 4.]


In [7]:
# divide the training data
X = train_ohe[:, 3:-1]
Y = train_ohe[:, -1] 
test = test_ohe[:, 3:]

In [8]:
print(X.shape)
print(test.shape)
f = X.shape[1]

(64667, 1037)
(82820, 1037)


In [9]:
# Normalize both the training and testing distribution according to the training data
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

In [10]:
model = Sequential()

model.add(Dense(f, input_shape=(f,)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(int(f / 2)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(int(f / 4)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(int(f / 8)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))

model.add(Dense(int(f / 16)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.1))

model.add(Dense(int(f / 32)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.1))

model.add(Dense(int(f / 64)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.1))

model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1037)              1076406   
_________________________________________________________________
batch_normalization_1 (Batch (None, 1037)              4148      
_________________________________________________________________
activation_1 (Activation)    (None, 1037)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1037)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 518)               537684    
_________________________________________________________________
batch_normalization_2 (Batch (None, 518)               2072      
_________________________________________________________________
activation_2 (Activation)    (None, 518)               0         
__________

In [11]:
fold = 5
tot_train = 0
tot_test = 0
tot_train_auc = 0 
tot_test_auc = 0

kf = KFold(n_splits=fold)
for train_index, test_index in kf.split(X):
    print(".", end="")
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    fit = model.fit(X_train, Y_train, batch_size=64, epochs=10, verbose=0)
    score_train = model.evaluate(X_train, Y_train, verbose=0)
    score_test = model.evaluate(X_test, Y_test, verbose=0)
    # We only keep track of the accuracy
    tot_train += score_train[1]
    tot_test += score_test[1]
    tot_train_auc += roc_auc_score(Y_train, model.predict(X_train, batch_size=64))
    tot_test_auc += roc_auc_score(Y_test, model.predict(X_test, batch_size=64))

print('\nTrain accuracy:', tot_train/fold)
print('Test accuracy:', tot_test/fold)
print('Train AUC:', tot_train_auc/fold)
print('Test AUC:', tot_test_auc/fold)

.....
Train accuracy: 0.9245166815561401
Test accuracy: 0.8432757945727227
Train AUC: 0.9621030240037808
Test AUC: 0.8737245872693601


In [12]:
fit = model.fit(X, Y, batch_size=64, epochs=20, verbose=1)
score = model.evaluate(X, Y, verbose=0)
print("Training accuracy:", score[1])
train_results = model.predict(X, batch_size=64)
print("Training AUC:", roc_auc_score(Y, train_results))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training accuracy: 0.9909072633646219
Training AUC: 0.9996648756885868


In [13]:
test_results = model.predict(test, batch_size=64)

In [14]:
prob_ones = np.hstack((np.reshape(test_data[:, 0], (test_data.shape[0], 1)), test_results))

In [15]:
# Saving the test_prob as a csv file in the proper format
np.savetxt("predictionsOHE2012_16.csv", prob_ones, fmt = '%d,%21.20f', delimiter=',', header = 'id,target', comments='')