Randomly shuffle the target labels within each chunk (subject/wave) to check that the classification result isn't spurious.

In [1]:
import pandas as pd
import numpy as np
import scipy.io
import re
import sys
import warnings
import pickle

In [2]:
from mvpa2.datasets.mri import fmri_dataset

Failed to import duecredit due to No module named 'duecredit'
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def __init__(self, shape=None, sid=None, fid=None, dtype=np.float):


In [3]:
class BehavioralDataNotFoundForBrainDataException(Exception):
    """Signals that behavioral data could not be matched to a subject."""

Replicating earlier work using PyMVPA (mvpa2). Try not to overly complicate it; the main point is just to verify that we get similar results with a different package, in order to validate prior work. We are primarily interested in validating the very high cross-validation results I got with nltools. Should aim for readable code.


Version 5 uses scikit-learn directly, bypassing mvpa2's framework altogether. We also implement a 'forced choice' scorer.

In [4]:
import mvpa2

In [5]:
import sys
import os

sys.path.append(os.path.abspath("../../ml/"))


## Load

In [6]:


# Absolute folders on the shared filesystem; later cells build their input
# and output file paths from these two strings.
nonbids_data_path = "/gpfs/projects/sanlab/shared/DEV/nonbids_data/"
ml_data_folderpath = "/gpfs/projects/sanlab/shared/DEV/nonbids_data/fMRI/ml"

In [7]:
# Subject exclusion table; the next cell filters it to Task == 'SST' and
# merges it onto the train/test markers via its SubjectId column.
include_exclude_list = pd.read_csv("../nsc_subject_exclusions.csv")

In [8]:
# Load the train/test split markers and attach the SST include/exclude flags.
test_train_df_raw = pd.read_csv(nonbids_data_path + "fMRI/ml/train_test_markers_20211027T173724.csv")
# Left merge: subjects without a row in the SST exclusion table get NaN in 'Include'.
test_train_df_raw = test_train_df_raw.merge(include_exclude_list[include_exclude_list.Task=='SST'],left_on='sub_label',right_on='SubjectId',how='left')
# Subjects with no exclusion record are treated as included.
test_train_df_raw.loc[test_train_df_raw.Include.isna(),'Include'] = True
test_train_df = test_train_df_raw[test_train_df_raw.Include==True]
# Additional subject labels removed from the training list in the next cell.
exclude_subjects = ['DEV061','DEV185','DEV187','DEV189','DEV190','DEV192','DEV198','DEV203','DEV220','DEV221']
train_subjs = test_train_df.loc[test_train_df.SplitGroup=='Train','sub_label'].tolist()#only get the train subjects; ignore those previously marked hold-out

In [9]:
# Training subjects minus the hard-coded `exclude_subjects` list.
train_subjs_selected = [ts for ts in train_subjs if (ts not in exclude_subjects)]

In [10]:
# Per-participant table; 'SID' is renamed to 'subject' and every row is
# tagged as wave 1.
# NOTE(review): `individual_differences` is not referenced in the visible cells -- confirm it is still needed.
individual_differences = pd.read_csv(ml_data_folderpath + "/data_by_ppt.csv")
individual_differences = individual_differences.rename(columns={'SID':'subject'})
individual_differences['wave']=1

We probably want to start the pipeline from the betas rather than loading from a pickle. To be continued...

In [11]:
from mvpa_pipeline_utils import get_Brain_Data_betas_as_mvpa_for_sub, import_beta_series_pymvpa2, sa_to_df

## new code

In [12]:
from sklearn.model_selection import LeaveOneGroupOut

In [72]:
def do_forced_choice(dataset):
    """Run leave-one-chunk-out classification and score it as a two-way forced choice.

    For every chunk in ``dataset.sa.chunks`` a calibrated ``SVC`` is fitted on
    the remaining chunks and applied to the held-out chunk. Besides the
    ordinary prediction, a "forced choice" prediction is made: the held-out
    sample with the highest probability for ``classes_[0]`` is labelled
    ``classes_[0]`` and the other sample is labelled ``classes_[1]``.

    Parameters
    ----------
    dataset
        Object exposing ``samples``, ``sa.targets`` and ``sa.chunks``, each
        indexable with integer index arrays. Every chunk must hold exactly
        two samples (presumably one per class -- TODO confirm with the data).

    Returns
    -------
    dict
        ``'sample_wise'``: DataFrame with one row per held-out sample and the
        columns ``chunk``, ``target_y``, ``pred_y``, ``pred_y_forced_choice``
        plus one ``pred_prob_<class>`` column per classifier class.
        ``'group_wise'``: dict mapping each chunk label to its forced-choice
        accuracy.

    Raises
    ------
    ValueError
        If a held-out chunk does not contain exactly two samples. The
        forced-choice rule is only defined for a pair of samples, so this is
        checked before the classifier is fitted.
    """
    logo = LeaveOneGroupOut()

    group_scores = {}
    sample_wise_results = []
    for train_index, test_index in logo.split(
            dataset.samples, dataset.sa.targets, dataset.sa.chunks):
        iteration_label = np.unique(dataset.sa.chunks[test_index])[0]

        # Fail early and clearly instead of after an expensive fit.
        if len(test_index) != 2:
            raise ValueError(
                "Forced choice needs exactly 2 samples per chunk; chunk "
                + repr(iteration_label) + " has " + str(len(test_index)) + "."
            )

        # Lightweight progress marker, one dot per held-out chunk.
        print(".", end="", flush=True)

        # Train/test split for this fold.
        train_X = dataset.samples[train_index]
        test_X = dataset.samples[test_index]
        train_y = dataset.sa.targets[train_index]
        test_y = dataset.sa.targets[test_index]

        # SVC has no native probability output here; wrap it in a calibrator
        # so that predict_proba is available.
        # https://mmuratarat.github.io/2019-10-12/probabilistic-output-of-svm
        sklearn_clf = CalibratedClassifierCV(SVC())
        sklearn_clf.fit(train_X, train_y)

        # Per-class probabilities and the ordinary hard prediction.
        predict_y_prob = sklearn_clf.predict_proba(test_X)
        predict_y = sklearn_clf.predict(test_X)

        # Column i of predict_proba belongs to classes_[i].
        proba_dict = {
            cls: predict_y_prob[:, i]
            for i, cls in enumerate(sklearn_clf.classes_)
        }

        class_0 = sklearn_clf.classes_[0]
        class_1 = sklearn_clf.classes_[1]

        # The sample most likely to be class_0 gets class_0; the other one
        # is forced to class_1.
        class_0_choice_index = np.argmax(proba_dict[class_0])
        forced_choice_predictions = [class_1] * len(test_y)
        forced_choice_predictions[class_0_choice_index] = class_0

        accuracy_score = np.sum(
            [pred == target for pred, target in zip(forced_choice_predictions, test_y)]
        ) / len(test_y)
        print(accuracy_score)

        group_scores[iteration_label] = accuracy_score

        # Sample-wise table for this fold.
        sample_wise_results_iter = pd.DataFrame({
            'chunk': [iteration_label] * len(test_y),
            'target_y': test_y,
            'pred_y': predict_y,
            'pred_y_forced_choice': forced_choice_predictions
        })
        # Add the class-wise probabilities.
        for cls in sklearn_clf.classes_:
            sample_wise_results_iter['pred_prob_' + cls] = proba_dict[cls]

        sample_wise_results.append(sample_wise_results_iter)

    sample_wise_results_df = pd.concat(sample_wise_results)
    return {'sample_wise': sample_wise_results_df, 'group_wise': group_scores}


In [73]:
def setup_metadata(Brain_Data_allsubs):
    """Attach ``chunks`` and ``targets`` sample attributes used for learning.

    ``chunks`` is built as ``"<subject>_<wave>"`` from the dataset's sample
    attributes, and ``targets`` is a copy of the ``condition_label`` attribute
    values.

    Parameters
    ----------
    Brain_Data_allsubs
        Dataset whose ``sa`` collection provides ``subject``, ``wave`` and
        ``condition_label``. It is modified in place.

    Returns
    -------
    The same dataset object, with ``sa['chunks']`` and ``sa['targets']`` set.
    """
    attribute_df = sa_to_df(Brain_Data_allsubs.sa)
    # One chunk label per sample; 'subject' is expected to hold strings so
    # that it can be concatenated with the stringified wave.
    chunk = attribute_df['subject'] + "_" + attribute_df['wave'].astype(str)
    Brain_Data_allsubs.sa['chunks'] = list(chunk)
    Brain_Data_allsubs.sa['targets'] = list(Brain_Data_allsubs.sa['condition_label'].value)
    return Brain_Data_allsubs

## whole brain

In [15]:
from sklearn.model_selection import LeaveOneGroupOut

In [16]:
from sklearn.svm import SVC
#from mvpa2.measures.base import CrossValidation
#from mvpa2.clfs.meta import NFoldPartitioner
#from mvpa2.clfs.svm import LinearCSVMC
from sklearn.calibration import CalibratedClassifierCV


Now let's scale that up to the full dataset.

In [17]:
# Pickled dataset that a later cell loads into `Brain_Data_allsubs`.
brain_data_filepath = ml_data_folderpath + '/SST/mvpa_Dataset_conditions_84subs_correct_cond.pkl'

In [18]:
# Path for a results pickle.
# NOTE(review): `results_filepath` is not read or written in the visible cells -- confirm it is still needed.
results_filepath=ml_data_folderpath + "/SST/ttr_mvpa2_res_v3_conditions_84subs_twoclasses_wholebrain.pkl"
#results_filepath=ml_data_folderpath + "/SST/train_test_results_" + dataset_name + "_58subs_twoclasses_pfcmask_repeat1.pkl"

def decoderConstructor(*args, **kwargs):
    """Build a ``Decoder`` with ``scoring='accuracy'`` and ``verbose=0``.

    All positional and keyword arguments are forwarded unchanged; passing
    ``scoring`` or ``verbose`` again as a keyword raises ``TypeError``.
    """
    # NOTE(review): `Decoder` is not imported in the visible cells -- confirm where it comes from.
    return Decoder(*args, scoring='accuracy', verbose=0, **kwargs)


# NOTE(review): `relevant_mask` is not referenced in the visible cells; presumably None means "no mask" for the whole-brain run -- confirm.
relevant_mask = None

In [66]:
# Load the pickled dataset from `brain_data_filepath`.
# NOTE: pickle.load can execute arbitrary code; only use it on files from a trusted source.
with open(brain_data_filepath, 'rb') as pkl_file:
    Brain_Data_allsubs = pickle.load(pkl_file)


In [67]:
# Add the 'chunks' and 'targets' sample attributes (modifies the dataset in place and returns it).
Brain_Data_allsubs = setup_metadata(Brain_Data_allsubs)

In [68]:
# Permute the target labels *within* each chunk: every chunk keeps the same
# set of labels, but which sample carries which label is random.
# - A seeded generator makes the permutation check reproducible.
# - `transform` returns values aligned with the original row order, so the
#   result can be written straight back even if rows are not sorted by chunk.
# - No chunk size is hard-coded.
SHUFFLE_SEED = 0
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)
attributes_before_shuffle = sa_to_df(Brain_Data_allsubs.sa)
targets_shuffled = (
    attributes_before_shuffle
    .groupby('chunks')['targets']
    .transform(lambda chunk_targets: shuffle_rng.permutation(chunk_targets.to_numpy()))
)
# Keep the true labels around for later comparison.
# NOTE(review): re-running this cell overwrites `targets_old` with already-shuffled labels; reload the dataset first.
Brain_Data_allsubs.sa.targets_old = Brain_Data_allsubs.sa.targets
Brain_Data_allsubs.sa.targets = list(targets_shuffled)


In [None]:
# Leave-one-chunk-out forced-choice classification on the label-shuffled dataset;
# prints one "." plus the fold accuracy per held-out chunk.
forced_choice_results = do_forced_choice(Brain_Data_allsubs)

.1.0
.0.0
.1.0
.0.0
.0.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.0.0
.0.0
.0.0
.0.0
.1.0
.0.0
.0.0
.0.0
.1.0
.1.0
.1.0
.1.0
.0.0
.1.0
.0.0
.1.0
.0.0
.0.0
.0.0
.0.0
.1.0
.0.0
.1.0
.0.0
.1.0
.1.0
.0.0
.1.0
.0.0
.0.0
.0.0
.1.0
.0.0
.1.0
.1.0
.1.0
.1.0
.0.0
.1.0
.0.0
.0.0
.0.0
.0.0
.0.0
.1.0
.1.0
.1.0
.0.0
.0.0
.0.0
.0.0
.0.0
.0.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.1.0
.

In [89]:
# Per-sample table: true label, plain prediction, forced-choice prediction and class probabilities.
forced_choice_results['sample_wise']

Unnamed: 0,chunk,target_y,pred_y,pred_y_forced_choice,pred_prob_CorrectGo,pred_prob_CorrectStop
0,DEV005_1,CorrectGo,CorrectGo,CorrectGo,0.903737,0.096263
1,DEV005_1,CorrectStop,CorrectStop,CorrectStop,0.125309,0.874691
0,DEV006_1,CorrectGo,CorrectGo,CorrectGo,0.972710,0.027290
1,DEV006_1,CorrectStop,CorrectStop,CorrectStop,0.218002,0.781998
0,DEV010_1,CorrectGo,CorrectGo,CorrectGo,0.967692,0.032308
...,...,...,...,...,...,...
1,DEV216_1,CorrectStop,CorrectStop,CorrectStop,0.294775,0.705225
0,DEV217_1,CorrectGo,CorrectGo,CorrectGo,0.940275,0.059725
1,DEV217_1,CorrectStop,CorrectStop,CorrectStop,0.107911,0.892089
0,DEV218_1,CorrectGo,CorrectStop,CorrectGo,0.488167,0.511833


In [94]:
# Mean forced-choice accuracy across all held-out chunks.
total_score = np.mean(list(forced_choice_results['group_wise'].values()))
total_score

0.9876543209876543

Alright--and how about if we don't do the forced-choice?

In [100]:
# Sample-level accuracy of the plain classifier prediction vs. the forced-choice prediction.
prediction = np.mean(forced_choice_results['sample_wise']['target_y']==forced_choice_results['sample_wise']['pred_y'])
forced_choice_prediction = np.mean(forced_choice_results['sample_wise']['target_y']==forced_choice_results['sample_wise']['pred_y_forced_choice'])