# Biomag 2016 - Competition 3: inter-subject exploit

 *Alexandre Barachant, Jean-Remi King*

## Exploit description

We identified that the order of the pictures was not truly randomized in the training set:
1. The trials consisted in shuffled sequences of 2 x 6 pictures (one of each category).
2. The overall order was identical across subjects.

Consequently, we hypothesized that these structures were preserved in the test set.
We thus assembled the predictions across subjects to increase robustness.

## Approach

The data was epoched from 400 ms to 1600 ms after the stimulus onset. An single estimator, adapted from our main pipeline, was fitted for each subject separately, and used to make a probabilistic estimate of the trials in the test set. 

Each prediction was then locally debiased and averaged across subjects.

We finally check that this approach is valid by applying a cross-validation on the test set.


In [1]:
import numpy as np
import pandas as pd

from scipy.io import loadmat

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold

from mne.decoding import UnsupervisedSpatialFilter

from pyriemann.spatialfilters import CSP
from pyriemann.tangentspace import TangentSpace
from pyriemann.estimation import HankelCovariances



def epoch_data(data, window=125, offset=0):
    """Epoch data"""
    MEG, trigger = data['planardat'],data['triggers']

    X, y = list(), list()
    
    trials = np.r_[trigger.t1, trigger.t2, trigger.t3,
                   trigger.t4, trigger.t5, trigger.t6]

    values = np.array([0]*len(trigger.t1) + [1]*len(trigger.t2) +
                      [2]*len(trigger.t3) + [3]*len(trigger.t4) +
                      [4]*len(trigger.t5) + [5]*len(trigger.t6))

    # Epoch training set
    ix = np.argsort(trials)
    trials, values = trials[ix], values[ix]
    for ii, start in enumerate(trials):
        X.append(MEG[:, slice(start + offset, start + window + offset)])
        y.append(values[ii])

    # Epoch testing set
    X_test = list()
    for t in trigger.test:
        sl = slice(t + offset, t + window + offset)
        # Stack zeros on last trials in case window is too long
        epoch = np.zeros((len(MEG), window))
        epoch[:, :len(MEG[0, sl])] = MEG[:, sl]
        X_test.append(epoch)

    # Format
    X = 1e12 * np.array(X)
    X_test = 1e12 * np.array(X_test)
    y = np.array(y) == 3

    return X, y, X_test


def local_debias(y_preds):
    """The sum of each group of 12 trials must be 1."""
    y_preds = np.array(y_preds)
    y_preds = np.reshape(y_preds, (-1, 12))
    y_preds /= np.sum(y_preds, 1)[:, np.newaxis]
    return y_preds.ravel()


def local_threshold(y_preds):
    """Find the two maxima of each group of 12 trials"""
    y_preds = np.array(y_preds)
    y_preds = y_preds.reshape(-1, 12)
    thresholds = np.zeros_like(y_preds)
    sorts = np.argsort(y_preds, axis=1)[:, -2:]
    for ii, sort in enumerate(sorts):
        thresholds[ii, sort] = 1
    return thresholds.ravel()

We here use a single estimator, because it is sufficient to obtain perfect accuracy

In [2]:
clf = make_pipeline(
    UnsupervisedSpatialFilter(PCA(70), average=False),
    HankelCovariances(delays=[1, 8, 12, 64], estimator='oas'), 
    CSP(15, log=False),
    TangentSpace('logeuclid'),
    LogisticRegression('l2')
)

Fit on train data, aggregate test predictions across subjects

In [4]:
offsets = [10, 20, 30, 40, 50]
y_preds = np.zeros(240)
for subject in range(1, 5):
    data = loadmat('./data/meg_data_%da.mat' % subject, squeeze_me=True, struct_as_record=False)
    for offset in offsets:
        X, y, X_test = epoch_data(data, window=150, offset=offset)
        clf.fit(X, y)
        y_preds += local_debias(clf.predict_proba(X_test)[:, -1])

Test our predictions in a CV

In [5]:
# discretize probabilistic prediction
y_test = local_threshold(y_preds)

cv = StratifiedKFold(y_test, 5)
n_test = 240
y_preds = np.zeros((4, n_test))

for subject in range(1, 5):
    data = loadmat('./data/meg_data_%da.mat' % subject, squeeze_me=True, struct_as_record=False)
    X, y, X_test = epoch_data(data, window=150, offset=0)

    # Use CV for validation of our hypothesis
    for train, test in cv:

        # Fit on the complete train data and a subset of the test data
        X_all = np.vstack((X, X_test[train]))
        y_all = np.hstack((y, y_test[train]))
        clf.fit(X_all, y_all)

        # Predict on remaining test data
        y_preds[subject-1, test] = clf.predict_proba(X_test[test])[:, -1]
y_preds = np.mean([local_debias(y_pred) for y_pred in y_preds], axis=0)
print(roc_auc_score(y_test, y_preds))

1.0


Save the same predictions for every subject

In [6]:
y_test = local_threshold(y_preds)
for subject in range(1, 5):
    results = pd.DataFrame(dict(Predictions=y_test))
    results.to_csv('predictions_Subject%d_exploit.csv' % subject)