In [None]:
import numpy as np
import keras
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import BatchNormalization

In [None]:
def load_data(filename, skiprows = 1):
    """Read a comma-separated numeric file into a NumPy array.

    Parameters
    ----------
    filename : path or file-like accepted by ``np.loadtxt``
        Location of the CSV data.
    skiprows : int, optional
        Number of leading lines to discard before parsing (default 1,
        i.e. a single header line).

    Returns
    -------
    numpy.ndarray
        The parsed values, exactly as returned by ``np.loadtxt``.
    """
    parsed = np.loadtxt(filename, delimiter=',', skiprows=skiprows)
    return parsed

In [None]:
# Read the training file first, then the test file, each with the default
# one-line header skip of load_data.
train_data, test_data = (
    load_data(csv_name) for csv_name in ("train_2008.csv", "test_2008.csv")
)

In [None]:
# Split the raw arrays: columns 3..381 are the model inputs, and column 382
# of the training array is the target the network is fitted against.
feature_columns = slice(3, 382)
target_column = 382

X, Y = train_data[:, feature_columns], train_data[:, target_column]
test = test_data[:, feature_columns]

In [None]:
# Sanity check: show the training and test input shapes, one per line.
print(X.shape, test.shape, sep="\n")

In [None]:
# Standardise the inputs. The scaler is fitted on the training inputs only,
# and those same statistics are then applied to both the training and the
# test inputs.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X, test = scaler.transform(X), scaler.transform(test)

In [None]:
# Fully connected binary classifier. Every hidden stage is
# Dense -> BatchNormalization -> ReLU -> Dropout; the pairs below give the
# Dense width and the dropout rate of each stage, in order.
hidden_stages = [
    (379, 0.3),
    (189, 0.2),
    (94, 0.2),
    (47, 0.1),
    (24, 0.1),
    (12, 0.1),
]

model = Sequential()
for stage_index, (width, drop_rate) in enumerate(hidden_stages):
    if stage_index == 0:
        # Only the first layer declares the input size (379 values per row).
        model.add(Dense(width, input_shape=(379,)))
    else:
        model.add(Dense(width))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))

# Single sigmoid unit producing the probability output.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [None]:
# K-fold cross-validation: report accuracy and ROC AUC averaged over the
# folds, on both the fitted split and the held-out split.
fold = 5
tot_train = 0
tot_test = 0
tot_train_auc = 0
tot_test_auc = 0

kf = KFold(n_splits=fold)
for train_index, test_index in kf.split(X):
    print(".", end="")
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train a freshly initialised copy of the architecture for every fold.
    # Fitting the shared `model` directly would carry its weights over from
    # one fold to the next, so from the second fold on the "held-out" rows
    # would already have been trained on and the test scores would be
    # optimistically biased. clone_model creates new layers with new weights;
    # the clone is compiled with the same settings as `model`.
    fold_model = keras.models.clone_model(model)
    fold_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    fit = fold_model.fit(X_train, Y_train, batch_size=64, epochs=10, verbose=0)

    score_train = fold_model.evaluate(X_train, Y_train, verbose=0)
    score_test = fold_model.evaluate(X_test, Y_test, verbose=0)
    # We only keep track of the accuracy (index 1; index 0 is the loss)
    tot_train += score_train[1]
    tot_test += score_test[1]

    # predict() returns an (n, 1) column; flatten it to the 1-D score vector
    # expected for binary ROC AUC.
    train_scores = fold_model.predict(X_train, batch_size=64).ravel()
    test_scores = fold_model.predict(X_test, batch_size=64).ravel()
    tot_train_auc += roc_auc_score(Y_train, train_scores)
    tot_test_auc += roc_auc_score(Y_test, test_scores)

print('\nTrain accuracy:', tot_train/fold)
print('Test accuracy:', tot_test/fold)
print('Train AUC:', tot_train_auc/fold)
print('Test AUC:', tot_test_auc/fold)

In [None]:
# Fit on the complete training set. Note that Keras `fit` continues from
# whatever weights `model` currently holds; it does not re-initialise them.
fit = model.fit(X, Y, epochs=20, batch_size=64, verbose=1)

# Metrics below are measured on the same rows the model was fitted on, so
# they describe training fit only.
score = model.evaluate(X, Y, verbose=0)
print("Training accuracy:", score[1])

train_results = model.predict(X, batch_size=64)
print("Training AUC:", roc_auc_score(Y, train_results))

In [None]:
# Sigmoid outputs of the fitted model for every row of the scaled test inputs.
test_results = model.predict(x=test, batch_size=64)

In [None]:
# Two-column array: first column of the raw test file (written out as `id`)
# next to the model's predicted output for that row.
prob_ones = np.column_stack((test_data[:, 0], test_results))

In [None]:
# Write prob_ones to CSV with an `id,target` header line: the id as an
# integer, the prediction with 20 decimal places. comments='' stops NumPy
# from prefixing the header with '# '.
np.savetxt(
    "predictions6.csv",
    prob_ones,
    fmt=['%d', '%21.20f'],
    delimiter=',',
    header='id,target',
    comments='',
)