In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys
import json


In [2]:
# Demographics table with age and sex for each subject/session row.
dfage = pd.read_excel('../data/age.xlsx')

# Included sessions: the 'allsessions_prediction' entry of sessions.json.
with open("../sessions.json", 'r') as file:
    sessions = json.load(file).get('allsessions_prediction')

# 'invited' is True when an age is recorded for the row, False when age is NaN.
dfage['invited'] = dfage['age'].notna()

# 'included' is True when the row's sub_ses_str appears among the included sessions.
dfage['included'] = dfage['sub_ses_str'].isin(sessions)

dfage.head(5)

Unnamed: 0,sub,ses,sub_ses_str,age,sex,invited,included
0,1,1,sub-001_ses-001,13.0,m,True,True
1,1,2,sub-001_ses-002,,,False,False
2,2,1,sub-002_ses-001,6.0,f,True,True
3,2,2,sub-002_ses-002,10.0,f,True,True
4,3,1,sub-003_ses-001,8.0,f,True,True


In [3]:
# Attach the trial count of each invited session.
# 'trials' is the smallest value found in that session's event_counts.json;
# it stays NaN for sessions that were not invited or whose counts cannot be read.
dfage["trials"] = np.nan

for i, row in dfage.iterrows():
    if not row['invited']:
        continue
    counts_path = f'../data/interim/{row["sub_ses_str"]}/event_counts.json'
    try:
        with open(counts_path, 'r') as file:
            info = json.load(file)
        dfage.loc[i, 'trials'] = min(info.values())
    except (OSError, ValueError):
        # Expected failures only: missing/unreadable file (OSError), malformed
        # JSON (json.JSONDecodeError is a ValueError) or an empty mapping
        # (min() raises ValueError). The column was initialised to NaN above,
        # so nothing needs to be written here. Anything else (including
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        continue

In [4]:
# Trial-count thresholds used for the inclusion decisions below.
MIN_TRIALS_PER_SESSION = 20   # a session is flagged usable at or above this count
MIN_TRIALS_LONE_SESSION = 35  # stricter bar when the subject rests on a single session

# Session-level flag; NaN trial counts compare as False.
dfage["incl_session"] = dfage["trials"] >= MIN_TRIALS_PER_SESSION

# Subject-level decision, keyed by subject id:
#   * one row for the subject      -> needs >= MIN_TRIALS_LONE_SESSION trials
#   * several rows, all usable     -> included
#   * several rows, some usable    -> included if any row reaches MIN_TRIALS_LONE_SESSION
#   * several rows, none usable    -> excluded
d = {}
for subject in np.unique(dfage["sub"]):
    # rows belonging to this subject
    tmp = dfage[dfage["sub"] == subject]
    reaches_lone_bar = tmp["trials"] >= MIN_TRIALS_LONE_SESSION

    if len(tmp) == 1:
        # the column is called 'trials' (there is no 'n_trials' column)
        d[subject] = bool(reaches_lone_bar.iloc[0])
    elif tmp["incl_session"].all():
        d[subject] = True
    elif tmp["incl_session"].any():
        d[subject] = bool(reaches_lone_bar.any())
    else:
        d[subject] = False

dfd = pd.DataFrame(list(d.items()), columns=["sub", "incl_subject"])

# Merge the subject-level flag back onto every session row. Dropping a stale
# 'incl_subject' first keeps the cell re-runnable: without it a second run
# would produce 'incl_subject_x' / 'incl_subject_y' columns.
dfage = dfage.drop(columns="incl_subject", errors="ignore").merge(dfd, on="sub")

In [5]:
# Show every row whose session-level 'included' flag disagrees with the
# subject-level 'incl_subject' flag.
dfage[dfage["included"] != dfage["incl_subject"]]

Unnamed: 0,sub,ses,sub_ses_str,age,sex,invited,included,trials,incl_session,incl_subject
1,1,2,sub-001_ses-002,,,False,False,,False,True
9,5,2,sub-005_ses-002,,,False,False,,False,True
13,7,2,sub-007_ses-002,,,False,False,,False,True
31,16,2,sub-016_ses-002,10.0,m,True,False,,False,True
37,19,2,sub-019_ses-002,,,False,False,,False,True
49,25,2,sub-025_ses-002,10.0,m,True,False,,False,True
79,40,2,sub-040_ses-002,10.0,f,True,False,,False,True


In [6]:
# Preview the first 40 rows of the merged table.
dfage.head(40)

Unnamed: 0,sub,ses,sub_ses_str,age,sex,invited,included,trials,incl_session,incl_subject
0,1,1,sub-001_ses-001,13.0,m,True,True,126.0,True,True
1,1,2,sub-001_ses-002,,,False,False,,False,True
2,2,1,sub-002_ses-001,6.0,f,True,True,61.0,True,True
3,2,2,sub-002_ses-002,10.0,f,True,True,47.0,True,True
4,3,1,sub-003_ses-001,8.0,f,True,True,64.0,True,True
5,3,2,sub-003_ses-002,10.0,f,True,True,41.0,True,True
6,4,1,sub-004_ses-001,8.0,m,True,True,86.0,True,True
7,4,2,sub-004_ses-002,10.0,m,True,True,49.0,True,True
8,5,1,sub-005_ses-001,9.0,m,True,True,51.0,True,True
9,5,2,sub-005_ses-002,,,False,False,,False,True


In [7]:
# Write the table with trial counts and inclusion flags to CSV (index omitted).
output_csv = "../data/demographics_trials_inclusions.csv"
dfage.to_csv(output_csv, index=False)

### How many participants were invited, and how many were included?

In [8]:
# Print the same descriptive summary twice: once for the rows flagged
# 'invited' and once for the rows flagged 'included'.
filters = ["invited", "included"]

for thisFilter in filters:

    print("\n###############################################")
    print(f"{thisFilter} participants !")
    print("###############################################\n")

    # rows (sessions) for which the current flag is True
    dfinv = dfage[dfage[thisFilter].eq(True)]

    # sex of participants (each subject counted once)
    distinct_male_subjects = dfinv[dfinv['sex'] == 'm']['sub'].nunique()
    distinct_female_subjects = dfinv[dfinv['sex'] == 'f']['sub'].nunique()
    print(f"Males: {distinct_male_subjects}, Females: {distinct_female_subjects}")

    # count all sessions
    n_all = len(dfinv)
    print(f"Distinct number of sessions that were acquired (1-2 per sub): {n_all}")

    # now count the distinct subs
    n_sub = len(np.unique(dfinv["sub"]))
    print(f"Distinct number of subjects that were acquired (1-2 per sub): {n_sub}")

    # rows per subject, then how many subjects have each row count.
    # .get(..., 0) avoids a KeyError when no subject has 1 (or 2) sessions.
    n_ses = dfinv.groupby('sub').size().value_counts()
    print(f"Number of subjects with 1 session: {n_ses.get(1, 0)}")
    print(f"Number of subjects with 2 sessions: {n_ses.get(2, 0)}")

    # age statistics over sessions (a subject with 2 sessions contributes both ages)
    print(f"Mean age of subjects: {np.mean(dfinv.age)}")
    print(f"Median age of subjects: {np.median(dfinv.age)}")
    print(f"Range of age: {np.min(dfinv.age)} - {np.max(dfinv.age)}")

    # Subjects with exactly 2 sessions under the current filter.
    # Sorting by ('sub', 'ses') makes the within-subject diff independent of
    # row order (presumably a higher 'ses' is the later visit - confirm), and
    # .assign builds a new frame instead of writing into a filtered one, which
    # avoids SettingWithCopyWarning.
    dfinv2ses = (
        dfinv.groupby('sub')
        .filter(lambda x: len(x) == 2)
        .sort_values(['sub', 'ses'])
        .assign(age_diff=lambda df: df.groupby('sub')['age'].diff())
    )

    # the first row of each subject has no predecessor (NaN), hence the nan-aware reducers
    print(f"Mean age difference between 2 sessions: {np.nanmean(dfinv2ses.age_diff)}")
    print(f"Median age difference between 2 sessions: {np.nanmedian(dfinv2ses.age_diff)}")




###############################################
invited participants !
###############################################

Males: 24, Females: 20
Distinct number of sessions that were acquired (1-2 per sub): 82
Distinct number of subjects that were acquired (1-2 per sub): 44
Number of subjects with 1 session: 6
Number of subjects with 2 sessions: 38
Mean age of subjects: 8.207317073170731
Median age of subjects: 9.0
Range of age: 5.0 - 13.0
Mean age difference between 2 sessions: 3.5526315789473686
Median age difference between 2 sessions: 4.0

###############################################
included participants !
###############################################

Males: 19, Females: 19
Distinct number of sessions that were acquired (1-2 per sub): 69
Distinct number of subjects that were acquired (1-2 per sub): 38
Number of subjects with 1 session: 7
Number of subjects with 2 sessions: 31
Mean age of subjects: 8.231884057971014
Median age of subjects: 9.0
Range of age: 5.0 - 13.0
Mean age

In [None]:
# TODO: also include the minimum trial count per session from the interim folder

# TODO: also merge in how many participants had too few trials in a session, so that the session was ultimately excluded
