# Group Analysis

Desired output figure:
- time x classification accuracy
- Within, Pre & Post alignment

In [1]:
%matplotlib inline

import os
from copy import deepcopy
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

from mne import read_epochs

from hypertools.tools.align import align

# Classification stuff
from sklearn import svm
from sklearn.model_selection import cross_val_score

## Settings

Note: Decode both to the stimulus, and to the response.

In [2]:
# Classification Settings
# Number of cross-validation folds used for the within-subject classification
k_fold = 3

# Initialize SVM classification object
#   Note: this shared instance is the one passed to `cross_val_score` for the
#   within-subject analysis; the between-subject helper builds its own classifier.
clf = svm.SVC(kernel='linear')

# Set the expected number of events per condition
#   Note: `extract_data` raises a ValueError if a subject's counts differ from this.
expected_ev_counts = [25, 25]

# Set data size
#   Note: `n_epochs` is the number of pieces `make_3d` splits continuous data into,
#   and it equals the sum of `expected_ev_counts`.
n_epochs = 50
n_chs = 128
n_times = 1001

In [3]:
# Globals

# Named reduction functions that `feature_dat` can use to collapse an axis into features
AVGS = dict(
    max=np.max,
    min=np.min,
    mean=np.mean,
    median=np.median,
)

## Helper Functions

In [4]:
# API notes:
# - work on subject level objects

In [5]:
def extract_data(dat):
    """Organize data from an MNE object into a data matrix and labels for classification.

    Parameters
    ----------
    dat : mne.Epochs object
        A subject's worth of epoched data, containing exactly two event codes.

    Returns
    -------
    data : 3d array
        Epoched data matrix: all epochs of the lower event code, followed by
        all epochs of the higher event code.
    labels : 1d array
        Label for each epoch in `data`: 0 for the lower event code, 1 for the higher.

    Raises
    ------
    ValueError
        If there are not exactly two event codes, or if the number of events
        per code does not match `expected_ev_counts`.
    """

    # Count how many events there are of each code
    ev_counts = Counter(dat.events[:, 2])

    # Exactly two conditions are required for the binary labelling below
    if len(ev_counts) != 2:
        raise ValueError('Expected exactly 2 event codes, but found {}.'.format(len(ev_counts)))

    # Sort the codes, so that a given label always maps to the same event code for every
    #   subject, rather than depending on which event type happens to be encountered first
    code_a, code_b = sorted(ev_counts)
    n_evc_a, n_evc_b = ev_counts[code_a], ev_counts[code_b]

    # Check the number of events is as expected
    if [n_evc_a, n_evc_b] != expected_ev_counts:
        raise ValueError('Number of events does not match what was expected.')

    # Generate labels: zeros for the first code, ones for the second
    labels = np.hstack([np.zeros(shape=[n_evc_a]), np.ones(shape=[n_evc_b])])

    # Organize data, stacking epochs in the same order as the labels
    #   Epochs are selected by the string form of the event code
    data = np.concatenate([dat[str(code_a)]._data, dat[str(code_b)]._data], 0)

    return data, labels


def make_2d(dat):
    """Flatten epoched data into one continuous 2D matrix.

    Parameters
    ----------
    dat : 3d array
        Epoched data matrix, as [n_epochs, n_channels, n_times].

    Returns
    -------
    2d array
        Continuous data matrix of epochs concatenated in time, as [n_channels, n_times_tot].
            Note: where n_times_tot = n_times * n_epochs
    """

    # Iterating the first axis yields one [n_channels, n_times] matrix per epoch,
    #   which are then joined end-to-end along the time axis
    per_epoch = tuple(dat)

    return np.concatenate(per_epoch, axis=1)


def make_3d(dat, n_splits=None):
    """Reorganize a 2D matrix into the 3D trial structure matrix.

    Parameters
    ----------
    dat : 2d array
        Continuous data matrix of epochs concatenated in time, as [n_channels, n_times_tot]
            Note: where n_times_tot = n_times * n_epochs
    n_splits : int, optional
        Number of equally sized epochs to split the data into.
        If not given, the global `n_epochs` setting is used.

    Returns
    -------
    3d array
        Epoched data matrix, as [n_epochs, n_channels, n_times]

    Raises
    ------
    ValueError
        If the time axis cannot be divided evenly into the requested number of epochs.
    """

    # Fall back on the notebook-level setting, to match the original behaviour
    if n_splits is None:
        n_splits = n_epochs

    # Split along time into equal pieces, then stack the pieces on a new first axis
    return np.stack(np.split(dat, n_splits, 1))
    

def btwn_subj_classication(all_data, all_labels):
    """Run leave-one-subject-out classification between subjects.

    Parameters
    ----------
    all_data : list of 3d array
        Data for each subject.
    all_labels : list of 1d array
        Labels for each subject.

    Returns
    -------
    scores : list of float
        The classification score for each held out subject, as predicted from the group.

    Raises
    ------
    ValueError
        If the number of data and label entries differ, or if there is only a
        single subject (leaving no group to train on).
    """

    if len(all_data) != len(all_labels):
        raise ValueError('Number of data and label entries does not match.')
    if len(all_data) == 1:
        raise ValueError('At least two subjects are needed for between subject classification.')

    scores = []

    for held_out, (subj_data, subj_labels) in enumerate(zip(all_data, all_labels)):

        # Select everything except the held out subject
        #   The inputs are never modified, and concatenate makes new arrays,
        #   so there is no need to copy the group data on every iteration
        train_data = [el for ind, el in enumerate(all_data) if ind != held_out]
        train_labels = [el for ind, el in enumerate(all_labels) if ind != held_out]

        # Collapse group for training the model
        group_data = feature_dat(np.concatenate(train_data, 0))
        group_labels = np.concatenate(train_labels, 0)

        # Train a fresh classifier on the group & score it on the left out subject
        model = svm.SVC(kernel='linear')
        model.fit(group_data, group_labels)
        scores.append(model.score(feature_dat(subj_data), subj_labels))

    return scores


def feature_dat(dat, avg_type='max'):
    """Convert epoched data into a 2D feature matrix, by collapsing the last axis.

    Parameters
    ----------
    dat : 3d array
        Data to convert. Presumably epoched data as [n_epochs, n_channels, n_times],
        in which case the collapsed axis is time - confirm against the caller.
    avg_type : {'max', 'min', 'mean', 'median'}
        Key into `AVGS`, selecting the function used to collapse the last axis.

    Returns
    -------
    2d array
        The result of applying the selected function across axis 2 of `dat`.
    """

    # Note: the full slice is where channel / time point selection could be added
    return AVGS[avg_type](dat[:, :, :], 2)


def print_avg(label, score):
    """Print a labelled score, displayed as a percentage with two decimal places."""

    as_percent = score * 100
    suffix = ': {:1.2f}%'.format(as_percent)

    print(label + suffix)


def print_avgs(label, scores):
    """Print a header line with the label, then one indexed line per score."""

    row_template = '\t{:1.0f} \t {:1.2f}'

    print(label + ':')
    for position, value in enumerate(scores):
        row = row_template.format(position, value)
        print(row)

## Data Organization / Loading

In [6]:
# Set data location for processed files
#   The location can be overridden by setting the HYPEREEG_DATA_PATH environment
#   variable, so the notebook is not tied to one machine's directory layout.
#   If the variable is not set, the original local path is used.
dat_path = os.environ.get('HYPEREEG_DATA_PATH',
                          '/Users/tom/Desktop/HyperEEG_Project/Data/proc/')

In [7]:
# Get list of available files
#   Sorted, since `os.listdir` returns entries in arbitrary order, and the order
#   of this list determines the subject order for everything that follows
dat_files = sorted(f_name for f_name in os.listdir(dat_path) if '.fif' in f_name)

In [8]:
# Read the epochs file of every subject, with the data loaded into memory
all_subjs = []
for f_name in dat_files:
    subj_file = os.path.join(dat_path, f_name)
    all_subjs.append(read_epochs(subj_file, preload=True, verbose=False))

# Count the subjects that were loaded
n_subjs = len(all_subjs)

In [9]:
# # TESTS
#   NOTE: this cell replaces the group loaded above with repeated references to a
#   single subject, so every 'subject' in the analyses that follow is the same data.
#   Skip this cell to run on the real group.

# Load single subject data - and fix channel subset
#   The subset size comes from the `n_chs` setting, rather than a repeated literal
dat = read_epochs(os.path.join(dat_path, dat_files[0]), preload=True, verbose=False)
dat._data = dat._data[:, 0:n_chs, :]

# Make test list of multi-subj data
#   Every entry in this list is the same object, not a copy
n_group = 7
all_subjs = [dat] * n_group
n_subjs = len(all_subjs)

## Within Subject Classification (un-aligned)

Notes:
- Update to predict across windows of the trial

In [10]:
# Extract (data, labels) for each subject, then separate them into two parallel lists
extracted = [extract_data(subj) for subj in all_subjs]
all_data = [subj_pair[0] for subj_pair in extracted]
all_labels = [subj_pair[1] for subj_pair in extracted]

In [11]:
# Cross-validated classification within each subject: one row per subject, one column per fold
within_scores = np.zeros(shape=[n_subjs, k_fold])
for s_ind, (subj_data, subj_labels) in enumerate(zip(all_data, all_labels)):
    subj_features = feature_dat(subj_data)
    within_scores[s_ind, :] = cross_val_score(clf, subj_features, subj_labels, cv=k_fold)

In [12]:
# Average over folds to get one score per subject, then over subjects for the group score
within_subj_avgs = within_scores.mean(axis=1)
within_glob_avg = within_subj_avgs.mean()

In [13]:
# Check outcome - average across all subjects
print_avg('CV Within-Subj Prediction', within_glob_avg)

CV Within-Subj Prediction: 63.66%


In [14]:
# Check performance on each subject
print_avgs('Per Subj Within Predictions', within_subj_avgs)

Per Subj Within Predictions:
	0 	 0.64
	1 	 0.64
	2 	 0.64
	3 	 0.64
	4 	 0.64
	5 	 0.64
	6 	 0.64


## Between Subject Classification (un-aligned)

In [15]:
# Run prediction between subjects - on unaligned data
btwn_scores = btwn_subj_classication(all_data, all_labels)

In [16]:
# Get average results
avg_btwn_scores = np.mean(btwn_scores)

In [17]:
# Check outcome - average across all subjects
print_avg('Btwn-Subj Prediction', avg_btwn_scores)

Btwn-Subj Prediction: 58.00%


In [18]:
# Check performance on each subject
print_avgs('Btwn Subject Classification', btwn_scores)

Btwn Subject Classification:
	0 	 0.58
	1 	 0.58
	2 	 0.58
	3 	 0.58
	4 	 0.58
	5 	 0.58
	6 	 0.58


## Alignment


In [19]:
# Data organization - extract matrices, and flatten to continuous data
#   Note: this overwrites `all_data`, so only entries that are still 3D are flattened,
#   which keeps the cell safe to run more than once
all_data = [make_2d(dat) if dat.ndim == 3 else dat for dat in all_data]

In [20]:
# Do alignment
#  Note: align assumes [n_samples x n_channels], so each matrix is transposed going in,
#  and transposed back coming out, before being split back into epochs
samples_by_chs = [subj_2d.T for subj_2d in all_data]
aligned_data = [make_3d(subj_al.T) for subj_al in align(samples_by_chs)]

## Between Subject Classification (aligned)

In [21]:
# Run prediction between subjects - on aligned data
btwn_al_scores = btwn_subj_classication(aligned_data, all_labels)

In [22]:
# Get average results
avg_btwn_al_scores = np.mean(btwn_al_scores)

In [23]:
# Check outcome - average across all subjects
print_avg('Btwn-Subj Prediction', avg_btwn_al_scores)

Btwn-Subj Prediction: 58.00%


In [24]:
# Check performance on each subject
print_avgs('Btwn Subject Classification', btwn_al_scores)

Btwn Subject Classification:
	0 	 0.58
	1 	 0.58
	2 	 0.58
	3 	 0.58
	4 	 0.58
	5 	 0.58
	6 	 0.58


#### CHECKS
Compare hyperaligned to unaligned data

In [25]:
# Compare the first two subjects, before and after alignment
print(np.all(all_data[0] == all_data[1]))
print(np.all(aligned_data[0] == aligned_data[1]))

# Compare aligned to unaligned data for the first subject
#   At this point `all_data` is 2D (continuous) while `aligned_data` is 3D (epoched),
#   so the aligned data is flattened first - comparing mismatched shapes directly
#   does not do an element-wise comparison
print(np.all(make_2d(aligned_data[0]) == all_data[0]))

True
True
False


  This is separate from the ipykernel package so we can avoid doing imports until


## Check random rotations

In [26]:
# Random [n_chs x n_chs] mixing matrix, with values drawn uniformly from [0, 1)
#   Note: a matrix drawn this way is not orthogonal, so it is a random linear
#   mixing of channels rather than a strict rotation
#   A seeded, local generator keeps this check reproducible across runs,
#   without changing the global random state
rot = np.random.RandomState(0).random_sample((n_chs, n_chs))

In [27]:
# Apply the random matrix to each subject's continuous data, then split back into epochs
#   `np.dot` returns new arrays and leaves its inputs untouched, so no copy of
#   the group data is needed beforehand
twod_dat = [np.dot(rot, dat) for dat in all_data]
twod_dat_3d = [make_3d(dat) for dat in twod_dat]

In [28]:
# Between subject classification
rand_btwn_scores = btwn_subj_classication(twod_dat_3d, all_labels)

In [29]:
# Check outcome - average across all subjects
avg_rand_btwn = np.mean(rand_btwn_scores)
print_avg('Random Btwn-Subj Prediction', avg_rand_btwn)

Random Btwn-Subj Prediction: 58.00%


## PyMVPA

Apply hyperalignment implementation from the PyMVPA package.

Note: this requires being in a Py2 environment with PyMVPA available.

In [30]:
from mvpa2.datasets.base import Dataset
from mvpa2.algorithms.hyperalignment import Hyperalignment

In [31]:
# Re-organize data into PyMVPA datasets objects
datasets = [Dataset(dat.T) for dat in all_data]

In [32]:
# Run hyperalignment, and get the transformation matrices
#   NOTE(review): `train` is called explicitly, and then the aligner is called on the
#   same datasets - confirm against the PyMVPA docs whether the explicit call is needed
hyper_aligner = Hyperalignment()
hyper_aligner.train(datasets)
# One mapper is expected per dataset, as they are zipped together when applied below
mappers = hyper_aligner(datasets)

In [33]:
# Project each dataset with its own mapper, then re-organize the result:
#   take the projected samples, transpose, and split back into epochs
aligned_datasets = [make_3d(mapper.forward(dataset).samples.T)
                    for dataset, mapper in zip(datasets, mappers)]

In [34]:
# Between subject classification after PyMVPA hyperalignment
btwn_al2_scores = btwn_subj_classication(aligned_datasets, all_labels)

In [35]:
# Check average performance
avg_btwn_al2 = np.mean(btwn_al2_scores)
print_avg('Aligned-2 Btwn Scores', avg_btwn_al2)

Aligned-2 Btwn Scores: 100.00%


In [36]:
# Check if the rotation matrices are the same
np.all(mappers[0].proj == mappers[1].proj)

True

#### Compare between the two hyperalignment implementations

In [37]:
# Check the magnitude of differences between aligned data
diff = aligned_datasets[0] - aligned_data[0]

In [38]:
print('Avg Magnitude Diff', np.mean(np.abs(diff)))
print('Avg Magnitude Data', np.mean(np.abs(aligned_datasets[0])))

('Avg Magnitude Diff', 0.7180778440607879)
('Avg Magnitude Data', 0.7180805884556001)


In [39]:
# Check number of overlapping points
from __future__ import division
np.sum(np.isclose(aligned_datasets[0], aligned_data[0])) / aligned_data[0].size

0.0

## Victory Party.

Soon...