In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import matplotlib
matplotlib.rcParams['font.size'] = 18

import keras 
import tensorflow as tf

from keras import models, layers, optimizers, losses, metrics, callbacks

Using TensorFlow backend.


# Data

In [2]:
# Locations of the Numerai training and tournament CSV files
train_path = 'numerai_datasets/numerai_training_data.csv'
test_path = 'numerai_datasets/numerai_tournament_data.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Preview the first rows of the training frame
train.head()

Unnamed: 0,id,era,data_type,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
0,n2b2e3dd163cb422,era1,train,0.43487,0.44645,0.25802,0.37149,0.62235,0.67451,0.68103,...,0.52962,0.42439,0.5168,0.46297,0.57426,0.57946,0.49646,0.48968,0.54194,1
1,n177021a571c94c8,era1,train,0.50038,0.39216,0.38394,0.51213,0.3666,0.46911,0.68204,...,0.51669,0.48445,0.57587,0.5986,0.67558,0.45577,0.80908,0.50287,0.61629,0
2,n7830fa4c0cd8466,era1,train,0.47416,0.34143,0.39528,0.46337,0.72953,0.45962,0.47869,...,0.41458,0.34804,0.29058,0.51382,0.36389,0.80602,0.39253,0.41821,0.58679,0
3,nc594a184cee941b,era1,train,0.48759,0.55903,0.43987,0.38834,0.4465,0.46389,0.70749,...,0.28776,0.42881,0.55402,0.53695,0.48793,0.62432,0.52898,0.49009,0.49557,0
4,nc5ab8667901946a,era1,train,0.23433,0.55499,0.47849,0.5699,0.64945,0.47152,0.62085,...,0.64405,0.32416,0.33193,0.58065,0.44587,0.4777,0.4402,0.47895,0.57978,0


In [None]:
# Preview the first rows of the tournament frame loaded above
test.head()

In [None]:
def model(train, test):
    """Fit an ExtraTrees classifier on PCA-reduced features and score `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'target' column and one or more columns whose name
        contains "feature".
    test : pandas.DataFrame
        Must contain an 'id' column and the same feature columns as `train`.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' and 'probability' (predicted probability of the second
        class), indexed like `test`.
    """
    features = [col for col in train.columns if "feature" in col]
    X = train[features]
    Y = train['target']
    X_prediction = test[features]
    ids = test['id']

    # Fit the projection on the training rows only, then reuse it for `test`
    pca = PCA(n_components=0.95)
    X = pca.fit_transform(X)
    X_prediction = pca.transform(X_prediction)

    # Named `clf` so the estimator does not shadow this function's own name
    clf = ExtraTreesClassifier(n_estimators = 100, verbose = 1, n_jobs=-1)
    # clf = LogisticRegression(n_jobs=-1)
    clf.fit(X, Y)

    probabilities = clf.predict_proba(X_prediction)[:, 1]

    # Attach the probabilities positionally to the ids. The previous index-based
    # join produced NaNs whenever `test` did not carry a default 0..n-1 index.
    results_df = pd.DataFrame({
        'id': ids,
        'probability': pd.Series(probabilities, index=ids.index),
    })

    return results_df

In [None]:
# Run the ExtraTrees pipeline defined above and write the id/probability table to CSV.
# NOTE(review): the Keras section later rebinds the name `model` to a network object,
# so this cell only works while `model` still refers to the function.
results = model(train, test)
results.to_csv('numerai_extratrees.csv', index=False)

# Keras Neural Network

In [None]:
def plot_history(history):
    """Draw training and validation loss against epoch number.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping with 'loss' and 'val_loss'
        sequences of equal length (one value per epoch).
    """
    recorded = history.history
    val_curve = recorded['val_loss']
    train_curve = recorded['loss']
    # Epochs are numbered from 1 for display
    epoch_numbers = list(range(1, len(val_curve) + 1))

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.plot(epoch_numbers, train_curve, 'bo-', label='training loss')
    ax.plot(epoch_numbers, val_curve, 'ro-', label='validation loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Log Loss')
    ax.set_title('Training Curves')
    ax.legend()
    plt.show()
    

In [3]:
# Feature columns are those whose name contains "feature"
features = [f for f in list(train) if "feature" in f]
X = train[features]
Y = train['target']
X_test = test[features]

# Rows of the tournament file flagged as validation are used as the hold-out set.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
is_validation = test['data_type'] == 'validation'
X_valid = test.loc[is_validation, features]
Y_valid = test.loc[is_validation, 'target']
ids = test['id']

In [None]:
# Both transformers are fitted on the training rows only and then applied,
# in the same order (PCA first, then min-max scaling), to every split.
# NOTE: this cell overwrites X / X_valid / X_test, so it must not be re-run
# without first re-running the cell that builds them.
pca = PCA(n_components = 0.99)
scaler = MinMaxScaler()

X = scaler.fit_transform(pca.fit_transform(X))
X_valid = scaler.transform(pca.transform(X_valid))
X_test = scaler.transform(pca.transform(X_test))

In [None]:
# Shape of the training matrix after PCA and scaling
X.shape

In [None]:
# Shape of the validation matrix after PCA and scaling
X_valid.shape

In [None]:
# Fully connected network: ELU hidden layers of width 32-64-128-64-32 with
# 50% dropout after each of the first four, and a single sigmoid output unit.
# NOTE: this rebinds `model`, which earlier in the notebook names a function.
network_layers = [
    layers.Dense(32, activation = 'elu', input_dim = X.shape[1]),
    layers.Dropout(0.5),
    layers.Dense(64, activation = 'elu'),
    layers.Dropout(0.5),
    layers.Dense(128, activation = 'elu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation = 'elu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation = 'elu'),
    layers.Dense(1, activation = 'sigmoid'),
]
model = models.Sequential(network_layers)

# Binary cross-entropy is used both as the training loss and as the reported metric
model.compile(
    loss = losses.binary_crossentropy,
    metrics = [losses.binary_crossentropy],
    optimizer = optimizers.SGD(),
)

model.summary()

In [None]:
# Checkpoint the model whenever validation loss improves, and stop training
# after 3 consecutive epochs without improvement.
callback_list = [
    callbacks.ModelCheckpoint(filepath='models/numerai_nn.hdf5', save_best_only = True, monitor = 'val_loss'),
    callbacks.EarlyStopping(monitor = 'val_loss', patience = 3),
]

# validation_data is passed as the documented (x_val, y_val) tuple; a list is
# not handled consistently across Keras versions.
history = model.fit(X, Y, epochs = 25, batch_size = 1024,
                    validation_data = (X_valid, Y_valid), callbacks = callback_list)

plot_history(history)

In [None]:
# Restore the checkpoint written during training before predicting
model.load_weights('models/numerai_nn.hdf5')
# Score the transformed tournament features. `X_prediction` exists only as a
# local inside the ExtraTrees helper, so the notebook-level name is `X_test`.
p_array = model.predict(X_test)
# The network has a single output unit; take that column as a flat vector
p = p_array[:, 0]

In [None]:
# Pair each tournament id with the network's predicted probability and write the submission file
results = pd.DataFrame({'id': ids, 'probability': p})
results.to_csv('numerai_nn.csv', index=False)

# Traditional Methods

In [None]:
# Correlation of every numeric column with the target, sorted ascending.
# Non-numeric columns are dropped explicitly because DataFrame.corr() raises
# on string columns in pandas >= 2.0 (older versions skipped them silently).
corrs = train.select_dtypes(include=[np.number]).corr()['target'].sort_values()

In [None]:
# Keep the columns whose absolute correlation with the target exceeds 0.01
strong_mask = np.abs(corrs.values) > 0.01
top_corrs = corrs[strong_mask]

# The target correlates perfectly with itself, so drop it from the feature names
top_corrs_names = top_corrs.index.tolist()
top_corrs_names.remove('target')

In [None]:
# Use only the columns selected by correlation with the target.
# features = [f for f in list(train) if "feature" in f]
features = top_corrs_names
X = train[features]
Y = train['target']
X_test = test[features]

# Validation rows of the tournament file form the hold-out set.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
is_validation = test['data_type'] == 'validation'
X_valid = test.loc[is_validation, features]
Y_valid = test.loc[is_validation, 'target']
ids = test['id']

# Logistic Regression

In [None]:
# 3-fold cross-validated search over 50 regularisation strengths.
# The scorer is named 'neg_log_loss' in scikit-learn; the old 'log_loss'
# alias was removed, so passing it raises an invalid-scoring error.
lr_model = LogisticRegressionCV(n_jobs=-1, cv = 3, Cs=50, scoring = 'neg_log_loss')
lr_model.fit(X, Y)

In [None]:
# Probability of the second class for each validation row
lr_valid_pred = lr_model.predict_proba(X_valid)[:, 1]
lr_valid_loss = log_loss(Y_valid, lr_valid_pred)
print('Validation Log Loss using Logistic Regression = {:0.6f}.'.format(lr_valid_loss))

In [None]:
# Score the full tournament set and write the id/probability submission file
lr_pred = lr_model.predict_proba(X_test)[:, 1]
results = pd.DataFrame({'id': ids}).assign(probability=lr_pred)
results.to_csv('submissions/101/lr_cv.csv', index=False)

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the same X / Y as the logistic regression
gnb = GaussianNB()
gnb.fit(X, Y)

In [None]:
# Probability of the second class for each validation row
gnb_valid_pred = gnb.predict_proba(X_valid)[:, 1]
# The label names the model actually being scored (it previously said "Logistic Regression")
print('Validation Log Loss using Gaussian Naive Bayes = {:0.6f}.'.format(log_loss(Y_valid, gnb_valid_pred)))

In [None]:
# Score the full tournament set with Naive Bayes and write the submission file
gnb_pred = gnb.predict_proba(X_test)[:, 1]
results = pd.DataFrame({'id': ids}).assign(probability=gnb_pred)
results.to_csv('submissions/101/gnb.csv', index=False)

In [None]:
# Fixed 80/20 blend of the logistic-regression and Naive Bayes validation probabilities
combined_valid = 0.8 * lr_valid_pred + 0.2 * gnb_valid_pred
combined_valid_loss = log_loss(Y_valid, combined_valid)
print('Validation Log Loss Combined = {:0.6f}.'.format(combined_valid_loss))

In [None]:
# Apply the same 80/20 blend to the tournament predictions and write the submission file
combined_pred = 0.8 * lr_pred + 0.2 * gnb_pred
results = pd.DataFrame({'id': ids}).assign(probability=combined_pred)
results.to_csv('submissions/101/lr_gnb.csv', index=False)

In [None]:
# Switch back to every column whose name contains "feature".
features = [f for f in list(train) if "feature" in f]
# features = top_corrs_names
X = train[features]
Y = train['target']
X_test = test[features]

# Validation rows of the tournament file form the hold-out set.
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
is_validation = test['data_type'] == 'validation'
X_valid = test.loc[is_validation, features]
Y_valid = test.loc[is_validation, 'target']
ids = test['id']

In [None]:
# Same cross-validated logistic regression, now fitted on all feature columns.
# The scorer is named 'neg_log_loss' in scikit-learn; the old 'log_loss'
# alias was removed, so passing it raises an invalid-scoring error.
lr_model2 = LogisticRegressionCV(n_jobs=-1, cv = 3, Cs=50, scoring = 'neg_log_loss')
lr_model2.fit(X, Y)

In [None]:
# Validation log loss of the all-features logistic regression
lr_valid_pred2 = lr_model2.predict_proba(X_valid)[:, 1]
lr_valid_loss2 = log_loss(Y_valid, lr_valid_pred2)
print('Validation Log Loss using Logistic Regression = {:0.6f}.'.format(lr_valid_loss2))

In [None]:
# Re-score the first logistic regression for comparison. `lr_model` was fitted
# on the `top_corrs_names` columns, while X_valid now holds every feature, so
# select the matching columns (in the fitted order) before predicting.
lr_valid_pred = lr_model.predict_proba(X_valid[top_corrs_names])[:, 1]
print('Validation Log Loss using Logistic Regression = {:0.6f}.'.format(log_loss(Y_valid, lr_valid_pred)))

In [None]:
# Baseline: log loss of predicting 0.5 for every validation row
log_loss(Y_valid, np.full(len(Y_valid), 0.5))

In [None]:
# Closed-form value of the log loss for a constant 0.5 prediction, to compare with the cell above
-np.log(0.5)

In [None]:
# `lr_model` was fitted on the `top_corrs_names` columns, while X_test now
# holds every feature; select the matching columns before predicting.
p = lr_model.predict_proba(X_test[top_corrs_names])[:, 1]
results = pd.DataFrame({'id': ids, 'probability': p})
results.to_csv('submissions/numerai4.csv', index=False)

# Support Vector Machine

In [4]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [5]:
# Untransformed feature matrices for the SVM, using the current `features` list
full_X = train[features]
full_X_test = test[features]
# `.loc` replaces `.ix`, which was removed in pandas 1.0
full_X_valid = test.loc[test['data_type'] == 'validation', features]

In [None]:
# probability=True is needed so that predict_proba can be called on the fitted model later.
# NOTE(review): fitting an SVC on the full training set is likely slow — confirm this is feasible.
svm = SVC(probability=True)
svm.fit(full_X, Y)

In [None]:
# Validation log loss of the support vector classifier
svm_valid_pred = svm.predict_proba(full_X_valid)[:, 1]
svm_valid_loss = log_loss(Y_valid, svm_valid_pred)
print('Validation Log Loss using Support Vector Classifier = {:0.6f}.'.format(svm_valid_loss))