In [8]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

from sklearn.metrics import log_loss

In [3]:
def format_data():
    train = pd.read_csv('data/numerai_training_data.csv')
    test = pd.read_csv('data/numerai_tournament_data.csv')
    
    features = [f for f in list(train) if "feature" in f]
    X = train[features]
    Y = train.target
    X_test = test[features]
    ids = test['id']
    
    X_valid = test.ix[test['data_type'] == 'validation', features]
    Y_valid = test.ix[test['data_type'] == 'validation', 'target']
    
    return X, Y, X_valid, Y_valid, X_test, ids

X, Y, X_valid, Y_valid, X_test, ids = format_data()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()


In [4]:
# Function to calculate log loss on the validation set
def calc_log_loss(valid_predictions, name):
    log_loss_value = log_loss(Y_valid, valid_predictions)
    print('Log Loss for {}: {:.6f}.'.format(name, log_loss_value))

In [12]:
# Function to write a submission file to the provided filename
def write_submit(test_predictions, filename):
    results = pd.DataFrame({'id': ids, 'probability': list(test_predictions)})
    filepath = 'submissions/%s.csv' % filename
    results.to_csv(filepath, index = False)
    print('Results saved to %s' % filepath)

In [5]:
# Implement the exported pipeline from TPOT
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    PCA(iterated_power=4, svd_solver="randomized"),
    LogisticRegression(C=0.01, dual=False, penalty="l1")
)

# Fit on the training data
exported_pipeline.fit(X, Y)

Pipeline(memory=None,
     steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('pca', PCA(copy=True, iterated_power=4, n_components=None, random_state=None,
  svd_solver='randomized', tol=0.0, whiten=False)), ('logisticregression', LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
# Calculate the valiation log loss
calc_log_loss(exported_pipeline.predict_proba(X_valid)[:, 1], name = 'exported_pipeline_implementation')

Log Loss for exported_pipeline_implementation: 0.692528.


In [14]:
# Make predictions on the testing set
exported_pipeline_predictions = exported_pipeline.predict_proba(X_test)[:, 1]

# Save to a prediction file
write_submit(exported_pipeline_predictions, filename = '107_tpot_1')

Results saved to submissions/107_tpot_1.csv
