In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn import metrics

# NOTE(review): trialOrders is not referenced by any other cell visible in this notebook.
trialOrders = {}

import os

# Root directory that is searched recursively for the per-subject events .tsv files.
path = './events'

def compute_skourascore(subject_performace, idealized_performance):
    """Return the Pearson correlation between a subject's trace and the idealized trace.

    Only the correlation coefficient is returned; the p-value that
    ``scipy.stats.pearsonr`` also produces is discarded.
    """
    correlation = scipy.stats.pearsonr(subject_performace, idealized_performance)
    coefficient = correlation[0]
    return coefficient

def compute_auc_score(counterbalanced_angles, length):
    """Return the area under the angle trace as a fraction of a constant-90 reference trace.

    Both areas are computed with ``sklearn.metrics.auc`` over the sample
    positions ``0 .. length - 1``.

    Parameters
    ----------
    counterbalanced_angles : array-like with ``length`` samples
    length : int, number of samples in the trace
    """
    sample_positions = np.arange(length * 1.0)
    reference_trace = np.full((length, 1), 90)
    observed_area = metrics.auc(sample_positions, counterbalanced_angles)
    reference_area = metrics.auc(sample_positions, reference_trace)
    return observed_area / reference_area

def compute_peak_score(counterbalanced_angles):
    """Return the largest value found in the counterbalanced angle trace."""
    peak_value = np.max(counterbalanced_angles)
    return peak_value

def compute_ttp_score(counterbalanced_angles):
    """Return the position (0-based sample index) at which the trace first reaches its maximum."""
    trace = np.asarray(counterbalanced_angles)
    return trace.argmax()

def find_empty_times(data):
    """Return the row labels that delimit the trials of one events table.

    The result is ``[row before the first non-rest row] + [every " Push Button"
    row] + [row after the last non-rest row]``; consecutive entries bracket one
    trial (both end points excluded).

    Parameters
    ----------
    data : pandas.DataFrame with an ``instruction`` column.

    Returns
    -------
    list of int

    Raises
    ------
    IndexError
        If every row is a " Rest" row (there is no trial to delimit).
    """
    instruction = data['instruction']
    intermissions = data.index[(instruction == " Push Button").to_numpy()].tolist()
    # Compute the non-rest rows once; both outer boundaries are derived from them.
    non_rest_rows = data.index[(instruction != " Rest").to_numpy()].tolist()
    first_scan_index = non_rest_rows[0] - 1
    first_rest_at_end = non_rest_rows[-1] + 1
    return [first_scan_index] + intermissions + [first_rest_at_end]

def determine_trialorder(data, times, n_trials=12):
    """Describe each trial as a (polarity, instruction) pair, flattened into one list.

    For every trial the first row with ``feedback == "On"`` is inspected. Its
    ``left_text`` and ``right_text`` (first character dropped) are joined with
    "-" to form the polarity, and its ``instruction`` (first character dropped)
    is the instruction.

    Parameters
    ----------
    data : pandas.DataFrame with ``feedback``, ``left_text``, ``right_text``
        and ``instruction`` columns.
    times : sequence of int
        Trial boundaries; trial ``k`` spans the rows strictly between
        ``times[k]`` and ``times[k + 1]``.
    n_trials : int, default 12
        Number of trials to read.

    Returns
    -------
    list of str
        ``[polarity_0, instruction_0, polarity_1, instruction_1, ...]``.

    Raises
    ------
    IndexError
        If a trial contains no row with ``feedback == "On"``.
    """
    trialOrder = []
    for trialnum in range(n_trials):
        trial_rows = data.iloc[(times[trialnum] + 1):times[trialnum + 1]]
        # Build the mask from the slice itself: a mask taken from the whole
        # frame does not line up with the slice and forces pandas to reindex it.
        feedback_rows = trial_rows[trial_rows['feedback'] == "On"]
        if feedback_rows.empty:
            raise IndexError("trial " + str(trialnum) + " has no rows with feedback == \"On\"")
        first_row = feedback_rows.iloc[0]
        polarity = first_row['left_text'][1:] + "-" + first_row['right_text'][1:]
        trialOrder += [polarity, first_row['instruction'][1:]]
    return trialOrder

# Collect every file under `path` whose name contains '.tsv'.
files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.tsv' in file:
            files.append(os.path.join(r, file))
# os.walk yields entries in an arbitrary, filesystem-dependent order; sort so
# that the subject order (and everything derived from it) is the same on every run.
files.sort()
            

In [2]:
import copy
# Column layout for per-subject aggregate scores.
means_template = {'ID': [], 'down': [], 'up': [], 'both': []}

# Column layouts for per-trial scores: an 'ID' column followed by one column per
# trial position ('1'..'12' for the whole run, '1'..'6' for the down/up series).
series_template = {'ID': [], **{str(position): [] for position in range(1, 13)}}
downseries_template = {'ID': [], **{str(position): [] for position in range(1, 7)}}
upseries_template = {'ID': [], **{str(position): [] for position in range(1, 7)}}


def _blank_score_store():
    """Return a fresh, independent set of empty tables for one score type."""
    return {
        'means': copy.deepcopy(means_template),
        'series': copy.deepcopy(series_template),
        'downseries': copy.deepcopy(downseries_template),
        'upseries': copy.deepcopy(upseries_template),
    }


# One independent store per score type, so no list is shared between them.
skouradict = _blank_score_store()
aucdict = _blank_score_store()
peakdict = _blank_score_store()
ttpdict = _blank_score_store()

# Score every trial of every subject and append the results to the four score stores.
score_stores = (skouradict, aucdict, peakdict, ttpdict)
# The instruction decides which per-direction series a trial is stored in.
series_for_instruction = {"Focus": 'downseries', "Wander": 'upseries'}
valid_polarities = ('Focused-Wandering', 'Wandering-Focused')
# (instruction, polarity) pairs whose idealized needle trace rises from 90;
# for the remaining valid pairs it falls from 90.
rising_pairs = {("Focus", 'Focused-Wandering'), ("Wander", 'Wandering-Focused')}

for events_file in files:
    #parsing filename to find NKI subject ID
    subpos = events_file.find('sub-A')
    if subpos == -1:
        # str.find returns -1 when the marker is missing; slicing from there
        # would silently produce a meaningless ID.
        raise ValueError("cannot find a 'sub-A...' subject ID in " + events_file)
    subjID = events_file[(subpos + 4):(subpos + 13)]
    #reading events.tsv file as "data"
    data = pd.read_csv(events_file, sep="\t")
    times = find_empty_times(data)
    trialOrder = determine_trialorder(data, times)

    for store in score_stores:
        for series_name in ('series', 'upseries', 'downseries'):
            store[series_name]['ID'] += [subjID]

    # Next free column (1-based) in each series for this subject.
    positions = {'series': 1, 'downseries': 1, 'upseries': 1}

    for trialnum in range(12):
        #this_trial is the data just from the trial of trialnum
        #The whole trial is scored. Appending [0:16] to the slice restricts scoring
        #to the start of each trial (described in the original note as the first
        #15 TRs / 30 seconds); that restriction is currently disabled.
        this_trial = data[(times[trialnum] + 1):times[trialnum + 1]]#[0:16]
        needle = this_trial.needle_position.values
        length = len(needle)
        instruction = trialOrder[(trialnum * 2) + 1]
        polarity = trialOrder[(trialnum * 2)]

        if instruction not in series_for_instruction or polarity not in valid_polarities:
            # Stop instead of recording scores left over from a previous trial.
            raise ValueError(
                "unexpected trial in " + events_file + " (trial " + str(trialnum + 1)
                + "): instruction=" + repr(instruction) + ", polarity=" + repr(polarity)
            )

        if (instruction, polarity) in rising_pairs:
            idealized = np.linspace(90, 90 + (length - 1), num=length)
            auc_balanced = (needle - 90)
        else:
            idealized = np.linspace(90, 90 - (length - 1), num=length)
            auc_balanced = (needle - 90) * -1

        #calculating scores
        auc = compute_auc_score(auc_balanced, length)
        skourascore = compute_skourascore(needle, idealized)
        peak = compute_peak_score(auc_balanced)
        ttp = compute_ttp_score(auc_balanced)

        #storing scores in memory: once in the direction-specific series and
        #once in the full 12-trial series
        direction_series = series_for_instruction[instruction]
        direction_column = str(positions[direction_series])
        series_column = str(positions['series'])
        for store, score in ((skouradict, skourascore), (aucdict, auc), (peakdict, peak), (ttpdict, ttp)):
            store[direction_series][direction_column] += [score]
            store['series'][series_column] += [score]

        positions[direction_series] += 1
        positions['series'] += 1



In [3]:
def _sorted_series_frame(score_lists):
    """Build a DataFrame from a dict of equal-length lists and order its rows by subject ID."""
    return pd.DataFrame(score_lists).sort_values(by=['ID']).reset_index(drop=True)

#convert series dicts into dataframes, sorted by ID
#(sort_values returns a new frame, so its result must be kept; calling it
# without assigning the result leaves the frame in its original order)
#first skourascores
skouraseries = _sorted_series_frame(skouradict['series'])
skouradownseries = _sorted_series_frame(skouradict['downseries'])
skouraupseries = _sorted_series_frame(skouradict['upseries'])
#now AUC scores
aucseries = _sorted_series_frame(aucdict['series'])
aucdownseries = _sorted_series_frame(aucdict['downseries'])
aucupseries = _sorted_series_frame(aucdict['upseries'])
#now TTP (time-to-peak) scores
ttpseries = _sorted_series_frame(ttpdict['series'])
ttpdownseries = _sorted_series_frame(ttpdict['downseries'])
ttpupseries = _sorted_series_frame(ttpdict['upseries'])
#now peak scores
peakseries = _sorted_series_frame(peakdict['series'])
peakdownseries = _sorted_series_frame(peakdict['downseries'])
peakupseries = _sorted_series_frame(peakdict['upseries'])

#save the list of subject IDs that have events data
skouraseries[['ID']].to_csv("events_IDs.csv", index=False)

This section defines a function for calculating a Skouras-style "learning" score. "Learning" in this context is measured as the average of trials 4-6 minus the average of trials 1-3. This measurement is independent of the score type and can be calculated for each one.

It also calculates the 6-1 learning measure.

I define a function for calculating "learning" here.

In [4]:
#this function assumes a trial-series of length 6 (either up or down regulation, but not both)
def add_learning_measures(series):
    """Add learning-measure columns to a six-trial series frame and return it.

    The frame is modified in place. Columns added, in this order:
    ``first_avg`` (mean of trials 1-3), ``second_avg`` (mean of trials 4-6),
    ``skouras_learning`` (second_avg - first_avg), ``six_minus_one``
    (trial 6 - trial 1) and ``difference_between_measures``
    (six_minus_one - skouras_learning).
    """
    early_mean = series[['1', '2', '3']].mean(axis=1)
    series['first_avg'] = early_mean
    late_mean = series[['4', '5', '6']].mean(axis=1)
    series['second_avg'] = late_mean

    series['skouras_learning'] = late_mean - early_mean
    series['six_minus_one'] = series['6'] - series['1']

    gap = series['six_minus_one'] - series['skouras_learning']
    series['difference_between_measures'] = gap
    return series

In [5]:
%matplotlib notebook

Create functions for plotting learning curves and histograms for an arbitrary number of trials.

In [6]:
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats

def learningcurve(seriesdata, length, scoretype):
    """Plot per-subject learning curves and a histogram of last-minus-first trial scores.

    The top panel shows, for every subject, the score at each trial position
    (blue dots) and a straight-line least-squares fit through them (red). The
    bottom panel is a 4-bin histogram of ``score[last trial] - score[first trial]``.

    Parameters
    ----------
    seriesdata : dict or DataFrame
        Must contain an ``ID`` column. Assumes ``ID`` is the first column and
        the next ``length`` columns are the trial scores in order.
    length : int
        Number of trial positions to plot.
    scoretype : str
        Name of the score, used in the plot titles and labels.
    """
    trial_positions = np.arange(1, length + 1)
    # Use the builtin float: the np.float alias no longer exists in current NumPy.
    x = trial_positions.astype(float)

    series = pd.DataFrame(seriesdata)
    series = series.sort_values(by=['ID'])
    series = series.to_numpy()
    fig, ax = plt.subplots(2, 1)
    diffs = []
    for subject_row in series:
        # Column 0 is skipped (ID); the row is an object array, so cast the scores.
        y = subject_row[1:length + 1].astype(float)
        ax[0].scatter(x, y, color='b', alpha=0.1)
        # Last trial minus first trial, whatever `length` is.
        diffs.append(y[-1] - y[0])
        trend = np.poly1d(np.polyfit(x, y, 1))
        ax[0].plot(x, trend(x), "r", alpha=0.2)
    ax[1].hist(diffs, 4)

    # Label each panel explicitly; the pyplot helpers only act on the most
    # recently created axes, which here is the histogram.
    ax[0].set_xlim((1, length))
    ax[0].set_xticks(trial_positions)
    ax[0].set_xlabel('Trial Position (not the actual trial number)')
    ax[0].set_ylabel('Score')
    ax[0].set_title(scoretype + ' learning curves')

    ax[1].set_xlabel(scoretype + ' score: last trial minus first trial')
    ax[1].set_ylabel('Number of subjects')
    plt.tight_layout()

    plt.show()
    
def series_histogram(seriesdata, scoretype, length, n_bins):
    """Plot one histogram of scores per trial position, stacked vertically with shared axes.

    Parameters
    ----------
    seriesdata : dict or DataFrame with an ``ID`` column and columns '1'..str(length).
    scoretype : str, used in each panel title.
    length : int, number of trial positions (one panel each).
    n_bins : passed to ``Axes.hist`` as ``bins``.
    """
    series = pd.DataFrame(seriesdata)
    series = series.sort_values(by=['ID'])

    # squeeze=False always returns a 2-D array of axes, so a single panel
    # (length == 1) is handled the same way as several.
    fig, axs = plt.subplots(length, 1, sharey=True, sharex=True, squeeze=False)
    for trialPos, panel in enumerate(axs[:, 0]):
        panel.hist(series[str(trialPos + 1)].to_list(), bins=n_bins)
        panel.set_title(scoretype + ' Trial ' + str(trialPos + 1))
    # The x axis is shared, so only the bottom panel carries the label.
    axs[-1, 0].set_xlabel('Score')
    plt.tight_layout()
    plt.show()

In [7]:
%matplotlib notebook
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

def learning_hist(series, title, dosample=False):
    """Overlay the distributions of the two learning measures for one score type.

    Parameters
    ----------
    series : DataFrame with ``skouras_learning``, ``six_minus_one`` and (when
        ``dosample`` is true) ``difference_between_measures`` columns.
    title : str, plot title and label of the printed summary.
    dosample : bool, default False
        When true, also print the mean difference between the measures, the
        variance of each measure, and a random sample of up to 10 subjects.
    """
    if dosample:
        print(title, "score")
        print("Average difference between measures =", series['difference_between_measures'].mean(axis=0))
        print('skouras_learning measure has a variance of', np.var(series[['skouras_learning']].values))
        print('6-1 measure has a variance of', np.var(series[['six_minus_one']].values))
        # DataFrame.sample raises when asked for more rows than exist, so cap the request.
        sample_size = min(10, len(series))
        print("Here is a random sample of " + str(sample_size) + " subjects.")
        print(series[['skouras_learning','six_minus_one', 'difference_between_measures']].sample(sample_size))
    plt.figure()
    x = series[['skouras_learning']].values
    y = series[['six_minus_one']].values
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept here
    # because the installed seaborn version is not visible from this notebook.
    sns.distplot(x, kde=True, rug=True, label="avg(4,5,6)-avg(1,2,3)")
    sns.distplot(y, kde=True, rug=True, label="6 - 1")
    plt.legend()
    plt.title(title)
    plt.show()

# aucdownserieslearning = add_learning_measures(aucdownseries)
# skouradownserieslearning = add_learning_measures(skouradownseries)
# ttpdownserieslearning = add_learning_measures(ttpdownseries)
# peakdownserieslearning = add_learning_measures(peakdownseries)


# learning_hist(aucdownserieslearning, "AUC", dosample=True)
# learning_hist(skouradownserieslearning, "Skouras-Score", dosample=True)
# learning_hist(ttpdownserieslearning, "Time-to-peak", dosample=True)
# learning_hist(peakdownserieslearning, "Peak", dosample=True)

# from IPython.core.display import display, HTML
# display(HTML("<style>div.output_scroll { height: 250em; }</style>"))

Reading in age and clinical status csv file made in pmetrics.ipynb.

Going to test for interesting correlations with them and the other data.

From skouras:
    In the control group, age (M = 30.71 years; SD = 7.48; nb = 62) correlated negatively with overall DMN NF performance score (M = 0.195, SD = 0.312) with a moderate association that explained 17% of the variance, r(62)=-0.412, R2 = 0.17, P = 0.0009; Fig. 3B.
    
My output:
    In the control group, age ( M = 32.05501195912154  years; SD = 7.8142617590016386 n = 63 ) correlated negatively with overall DMN NF performance score ( M = 0.19537175723329936 SD = 0.2955934924510536 ) with an association that explained 17.79582649049908 % of the variance, r(63) = -0.4218509984639017 R2 = 0.1779582649049908 P = 0.0005741932845391171
    
parameters that matter for correlations    
-whether I use first 30 secs or whole session

-type of scoring method

-type of score aggregation method (learning or overall)

-whether I remove sleepers

-which trials are considered in the aggregation


Also, the mismatch in n values seems to be caused by a few subjects for whom we don't have ages.

In [8]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
from IPython.display import display, HTML
from scipy import stats
from scipy.stats import norm
from statsmodels.stats.diagnostic import lilliefors
import seaborn as sns

# Let pandas show up to 300 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 300)

def lillie(nums, name=""):
    """Run the statsmodels Lilliefors test on ``nums`` and print the result.

    Prints a header line containing ``name``, then the sample size, the test
    statistic (D) and the p-value looked up with ``pvalmethod='table'``.
    """
    test_result = lilliefors(nums, pvalmethod='table')
    d_statistic, p_value = test_result
    header = "Lilliefors Test " + name
    print(header)
    print("n=", len(nums), "D =", d_statistic, "p=", p_value)
    
def compare_with_age(othernums, vname="", path=False, exact=False, nosleep=True):
    """Correlate a per-subject score with age for one clinical group and plot the relation.

    Reads ``./diags_ages.csv``, keeps the subjects present in the global
    ``skouraseries``, optionally keeps only rows with ``NFB3_MRIQ_01 == 0``,
    selects the clinical group, merges with ``othernums`` on ``ID``, then
    prints a Lilliefors test of the score plus Pearson and Spearman
    correlations with age, and draws a regression plot.

    Parameters
    ----------
    othernums : DataFrame
        Must have an ``ID`` column; the column at position 1 is used as the score.
    vname : str
        Name of the score, used in messages and plot labels.
    path : bool, default False
        False selects rows whose ``DIAG_01#CODE`` is 'V71.09' (the group the
        messages call the control group); True selects rows with any other,
        non-missing code.
    exact : bool, default False
        Use the ``AGE_04`` column instead of ``bids_age``.
    nosleep : bool, default True
        Keep only rows with ``NFB3_MRIQ_01 == 0``.
    """
    age_column = 'AGE_04' if exact else 'bids_age'
    score_column = othernums.columns[1]
    group_label = "pathological-group" if path else "control-group"

    diags_ages = pd.read_csv('./diags_ages.csv')
    diags_ages = pd.merge(skouraseries[['ID']], diags_ages, how='inner', on='ID').sort_values(by=['ID']).drop_duplicates(subset=['ID']).reset_index(drop=True)
    if nosleep:
        diags_ages = diags_ages[diags_ages['NFB3_MRIQ_01'] == 0].sort_values(by=['ID']).reset_index(drop=True)
    if not path:
        group = diags_ages[diags_ages['DIAG_01#CODE'] == 'V71.09']
    else:
        # Rows without a diagnosis code compare as "!= 'V71.09'" too; drop them
        # so that subjects with unknown status are not counted as pathological
        # (same rule as the 'path' group in multi_subset_comparison).
        group = diags_ages[diags_ages['DIAG_01#CODE'] != 'V71.09'].dropna(subset=['DIAG_01#CODE'])
    group = group.sort_values(by=['ID']).reset_index(drop=True)

    # Name the group that was actually selected.
    print("Comparing ages of " + group_label + " with " + vname)

    group = group[['ID', age_column]]
    data = pd.merge(othernums, group, how='inner', on='ID').sort_values(by=['ID']).drop_duplicates(subset=['ID']).reset_index(drop=True)
    scores = data[score_column].values
    ages = data[age_column].values

    lillie(scores, vname)

    pr, pp = stats.pearsonr(scores, ages)
    sr, sp = stats.spearmanr(scores, ages)

    x, y = pd.Series(ages, name="Age"), pd.Series(scores, name=vname)
    plt.figure()
    ax = sns.regplot(x=x, y=y, label=vname)
    plt.legend()
    plt.title(vname)
    plt.show()

    print("Correlation Tests " + vname + " (ran both, since I haven't figured out yet how to check for normal distribution)")
    print("Pearson r =", pr, "R2 = ", pr ** 2, "p = ", pp)
    print("Spearman r =", sr, "R2 = ", sr ** 2, "p = ", sp)

    

# diags_ages = pd.read_csv('./diags_ages.csv')
# diags_ages = pd.merge(aucdownseries[['ID', 'skouras_learning']], diags_ages, how='inner', on='ID').sort_values(by=['ID']).drop_duplicates(subset=['ID']).reset_index(drop=True)


# diags_ages_control = diags_ages[diags_ages['DIAG_01#CODE'] == 'V71.09'].sort_values(by=['ID']).reset_index(drop=True)
# diags_ages_path = diags_ages[diags_ages['DIAG_01#CODE'] != 'V71.09'].sort_values(by=['ID']).reset_index(drop=True)

# lillie(diags_ages.AGE_04.values, "- Ages of all NFB Participants")
# lillie(diags_ages_control.AGE_04.values, "- Ages of control-group NFB Participants")
# lillie(diags_ages_path.AGE_04.values, "- Ages of pathological NFB Participants")

# print("\n\n\n\n")
# trials = aucseries.loc[: , "7":"12"]
# aucseries['scores'] = trials.mean(axis=1)
# scores_only = aucseries[['ID', 'scores']]
# compare_with_age(scores_only, "AUC overall score", nosleep=False)





Run 48-comparisons analysis (either 30secs or full-trials, can't really do both at the same time)

Variables:
score_type (AUC & Skouras)
sleepers_group (nonsleepers, sleepers, or everyone) & clinical_status_group (control, path, & both)
score_aggregation_method (slope, avg456, six_minus_one, avg456_minus_avg123)

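2 score types x 2 sleep groups x 3 clinical-status groups x 4 aggregation methods = 48 combinations, each Spearman-correlated with age (sort the results by p-value). Note that this count uses only two of the sleep groups listed above (non-sleepers and everyone).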

What is the best way to do this computationally?

Which functions?

a
in: score_type_str, df with clinical status, age, and sleep info (for a single score_type)| lists of strings for sleep and clinical_status and aggregation_types
implementation:
    create output dict - ??? structure
    for a in aggregations:
        if a is this:
            calculate it
            add to df
    temp df
    for c in clinical_statuses:
        for s in sleep_types:
            if its c:
                store appropriate subset of input df in temp df
                calculate spearman and add results with appropriate descriptor to output dict
            
out: dict

In [9]:
from IPython.core.display import display, HTML
from scipy import stats
import numpy as np
# Widen the notebook container so wide frames fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Assessment data restricted to subjects that have an AUC down-regulation
# series: one row per ID, ordered by ID, with a fresh 0..n-1 index.
diags_ages = pd.read_csv('./diags_ages.csv')
diags_ages = (
    aucdownseries[['ID']]
    .merge(diags_ages, how='inner', on='ID')
    .sort_values(by=['ID'])
    .drop_duplicates(subset=['ID'])
    .reset_index(drop=True)
)
# One subject is lost here because we have no assessment data for them at all,
# not even age. Their age could be taken from participants.tsv, but that is
# awkward from a data-hygiene perspective.



def add_slopes(series, length):
    """Add a ``slope`` column: the per-row least-squares slope across the trial columns.

    The trial columns are '1' .. str(length) and are fitted against the
    positions ``0 .. length - 1`` with a straight line. The frame is modified
    in place and returned.

    Parameters
    ----------
    series : DataFrame containing the columns '1' .. str(length).
    length : int, number of trial columns.
    """
    trial_columns = [str(position) for position in range(1, length + 1)]
    # Build the x positions once instead of once per row.
    x = np.arange(length)
    series['slope'] = series[trial_columns].apply(lambda scores: np.polyfit(x, scores, 1)[0], axis=1)
    return series

def add_avg456(series, length):
    """Add an ``avg456`` column holding the row mean of columns "4" through "6".

    Nothing is added unless ``length`` is 6. The frame is modified in place
    and returned either way.
    """
    if length != 6:
        return series
    late_trials = series.loc[:, "4":"6"]
    series['avg456'] = late_trials.mean(axis=1)
    return series

def last_minus_first(series, length):
    """Add a ``last_minus_first`` column: score of the last trial minus score of trial 1.

    The last trial is the column named str(length). The frame is modified in
    place and returned.

    Parameters
    ----------
    series : DataFrame containing the columns '1' and str(length).
    length : int, number of trials in the series.
    """
    series['last_minus_first'] = series[str(length)] - series['1']
    return series

def end_avg_minus_start_avg(series, length):
    """Add ``end_avg_minus_start_avg``: mean of trials 4-6 minus mean of trials 1-3.

    Nothing is added unless ``length`` is 6. The frame is modified in place
    and returned either way.
    """
    if length != 6:
        return series
    late_mean = series[['4', '5', '6']].mean(axis=1)
    early_mean = series[['1', '2', '3']].mean(axis=1)
    series['end_avg_minus_start_avg'] = late_mean - early_mean
    return series

def multi_subset_comparison(score_type_name, scoreseries, assessment_data, length=6, aggregation_methods=['slope', 'avg456', 'last_minus_first', 'end_avg_minus_start_avg'], sleep_groups=['nosleep', 'even_sleepers'], clinical_status_groups=['control', 'path', 'control_and_path'], alpha=0.05):
    """Spearman-correlate every aggregated score with age in every subject subset.

    Each requested aggregation is added as a column to a copy of
    ``scoreseries``. For every combination of sleep group and clinical-status
    group, the matching rows of ``assessment_data`` are merged with the
    aggregated scores on ``ID`` and each aggregation is correlated with
    ``bids_age``. Combinations with ``p < alpha`` are printed.

    Parameters
    ----------
    score_type_name : str
        Label used in the printed description.
    scoreseries : DataFrame
        Per-subject trial scores with an ``ID`` column and trial columns.
    assessment_data : DataFrame
        Needs ``ID``, ``bids_age``, ``NFB3_MRIQ_01`` and ``DIAG_01#CODE`` columns.
    length : int, default 6
        Number of trials, forwarded to the aggregation helpers.
    aggregation_methods : list of str
        Any of 'slope', 'avg456', 'last_minus_first', 'end_avg_minus_start_avg'.
    sleep_groups : list of str
        'nosleep' keeps rows with ``NFB3_MRIQ_01 == 0``; any other name keeps all rows.
    clinical_status_groups : list of str
        'control' keeps ``DIAG_01#CODE == 'V71.09'``; 'path' keeps every other
        non-missing code; any other name keeps all rows.
    alpha : float, default 0.05
        Significance threshold for printing a result.

    Returns
    -------
    DataFrame
        The copy of ``scoreseries`` with the aggregation columns added.
    """
    aggregators = {
        'slope': add_slopes,
        'avg456': add_avg456,
        'last_minus_first': last_minus_first,
        'end_avg_minus_start_avg': end_avg_minus_start_avg,
    }
    ultraseries = scoreseries.copy()

    #compute the necessary aggregation values (maybe should move this outside of the function?)
    for a in aggregation_methods:
        if a in aggregators:
            ultraseries = aggregators[a](ultraseries, length)

    aggregated_scores = ultraseries[['ID'] + list(aggregation_methods)]
    for s in sleep_groups:
        for c in clinical_status_groups:
            # Boolean indexing returns new frames, so assessment_data is never modified.
            subset_temp = assessment_data
            if s == 'nosleep':
                subset_temp = subset_temp[subset_temp['NFB3_MRIQ_01'] == 0]
            if c == 'control':
                subset_temp = subset_temp[subset_temp['DIAG_01#CODE'] == 'V71.09']
            elif c == 'path':
                subset_temp = subset_temp[subset_temp['DIAG_01#CODE'] != 'V71.09'].dropna(subset=['DIAG_01#CODE'])

            analysis_temp = pd.merge(subset_temp, aggregated_scores, how='inner', on='ID').sort_values(by=['ID']).reset_index(drop=True)
            for a in aggregation_methods:
                # A single missing age or score makes spearmanr return NaN, which
                # would silently hide the whole combination; use complete pairs only.
                pairs = analysis_temp[[a, 'bids_age']].dropna()
                if len(pairs) < 3:
                    continue
                r, p = stats.spearmanr(pairs[a].values, pairs['bids_age'].values)
                if p < alpha:
                    # n is the number of subjects that actually entered the correlation.
                    criteria = score_type_name + ' ' + c + ' ' + s + ' ' + a + ' n = ' + str(len(pairs))
                    print(criteria)
                    print("Spearman r =", r, "R2 = ", r ** 2, "p = ", p)

    return ultraseries

# Scan the AUC down-regulation series for Spearman correlations with age that reach p < 0.05.
# As the last expression of the cell, the returned frame (the scores plus the
# aggregation columns) is what the notebook displays below.
multi_subset_comparison("AUC-down", aucdownseries, diags_ages)





Unnamed: 0,ID,1,2,3,4,5,6,slope,avg456,last_minus_first,end_avg_minus_start_avg
0,A00028185,-0.074157,0.017386,0.569696,0.279486,0.019048,0.046897,0.009144,0.115144,0.121055,-0.055831
1,A00032875,-0.021696,-0.647842,0.321173,-0.338397,-0.115621,0.689224,0.128334,0.078402,0.710921,0.194524
2,A00033747,0.323009,-0.179313,-0.354922,0.505323,0.593041,0.166376,0.068404,0.42158,-0.156634,0.491988
3,A00034854,-0.000319,0.166014,0.25074,0.520836,0.27313,0.722287,0.120128,0.505418,0.722606,0.366606
4,A00035072,-0.00844,0.793946,0.80398,0.607637,0.350008,0.794943,0.071107,0.584196,0.803383,0.054367
5,A00035827,-0.147504,-0.122453,0.36003,0.71382,0.263838,-0.397775,0.007466,0.193294,-0.250271,0.16327
6,A00035840,0.03075,-0.083743,0.37461,0.190733,-0.092829,0.478667,0.057956,0.192191,0.447917,0.084985
7,A00037112,0.033101,0.512429,0.123412,-0.021919,0.069863,0.071264,-0.036635,0.039736,0.038163,-0.183244
8,A00037511,0.096261,0.144126,-0.077809,0.27492,-0.292004,0.208875,-0.011217,0.06393,0.112614,0.009738
9,A00037848,0.017385,0.060305,-0.005868,-0.016197,-0.080445,0.012271,-0.01309,-0.028124,-0.005114,-0.052064


In [11]:
# Run the same scan on the Skouras-score down-regulation series and save the returned
# frame (the scores plus the aggregation columns) to ./dependent.csv without the index.
# NOTE(review): the label is spelled "skouros-learning" and is only used in the
# printed description — confirm whether "skouras" was intended.
multi_subset_comparison("skouros-learning", skouradownseries, diags_ages).to_csv("./dependent.csv", index=False)