# **ExtraLong 2021:** Selecting Subjects for the Group Template
__Project:__    ExtraLong <br>
__Maintainer:__ Katja Zoner <br>
__Updated:__    12/10/2021 <br>

In [1]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None

In [2]:
def filterBySessionCount(df, thresh):

    # Get scan counts for each subject (bblid).
    bblid_counts = df.groupby(["subid"]).size().reset_index()
    bblid_counts.columns = ["subid", "scan_count"]
    bblid_counts.sort_values("scan_count")

    # Get list of bblids that should be included
    include_bblids = bblid_counts.subid[bblid_counts["scan_count"]>=thresh]

    # Filter dataframe to only include subjects that meet session requirements
    df = df[df["subid"].isin(include_bblids)].copy()

    # Update ntimepoints column to indicate number of sessions in ExtraLong 2021 for each subject.
    for subid in df.subid.unique():

        # Get subject's number of timepoints in ExtraLong 2021
        num_tps = bblid_counts[bblid_counts.subid == subid].scan_count.item()

        # Update ntimeponts in ExtraLong dataframe
        df.loc[df.subid==subid, 'ntimepoints'] = num_tps

    # Update timepoint column to indicate timepoint number for each subject
    df["timepoint"] = df.groupby("subid").cumcount()+1

    return df.sort_values(["subid","doscan"])

In [6]:
# Read in csv of demographics/scanner/exclusion info
fname = "csv/demographics+exclusion_datafreeze-2021_euler-212_minspan-180.csv"
df = pd.read_csv(fname)

# Combine Prisma and Prisma_fit
df.loc[df["scanner"] == "Prisma_fit", "scanner"] = "Prisma"
df.scanner.unique()

# Put age into 4-year bins
bins = [i for i in range(8,37,4)]
df["age_bin"] = pd.cut(df.scanage_years,bins)

df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude,age_bin
0,11801,5145,DAY,2011-06-06,1,2,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False,"(28, 32]"
1,11801,8591,NEFF,2013-10-23,2,2,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False,"(32, 36]"
2,12073,5906,FNDM,2011-10-21,1,2,396.0,33.000,1.0,1.0,2.0,TrioTim,-90,False,"(32, 36]"
3,12073,6752,FNDM,2012-04-19,2,2,402.0,33.500,1.0,1.0,2.0,TrioTim,-80,False,"(32, 36]"
4,12202,3764,DAY,2010-09-03,1,3,371.0,30.917,2.0,2.0,2.0,TrioTim,-62,False,"(28, 32]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,139272,8456,PNC,2013-08-30,1,3,227.0,18.917,2.0,2.0,2.0,TrioTim,-52,False,"(16, 20]"
1900,139272,8795,NODRA,2014-03-14,2,3,234.0,19.500,2.0,2.0,2.0,TrioTim,-30,False,"(16, 20]"
1901,139272,10040,PNC,2016-01-20,3,3,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False,"(20, 24]"
1902,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False,"(8, 12]"


In [3]:
# Read in old unfilted dataframe for comparison
old = pd.read_csv("csv/demographics+exclusion_datafreeze-2021_cutoff-212.csv")

old = old[old.exclude == False]
old = filterBySessionCount(old, 2)

# Combine Prisma and Prisma_fit
old.loc[old["scanner"] == "Prisma_fit", "scanner"] = "Prisma"

# Put age into 4-year bins
bins = [i for i in range(8,37,4)]
old["age_bin"] = pd.cut(old.scanage_years,bins)

old

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude,age_bin
0,11399,3468,DAY,2010-06-29,1,2,414.0,34.500,2.0,2.0,2.0,TrioTim,-86,False,"(32, 36]"
1,11399,3592,DAY,2010-07-29,2,2,415.0,34.583,2.0,2.0,2.0,TrioTim,-174,False,"(32, 36]"
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False,"(28, 32]"
3,11801,5200,FNDM,2011-06-10,2,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-66,False,"(28, 32]"
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False,"(32, 36]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,139272,8929,ONM,2014-06-16,4,6,237.0,19.750,2.0,2.0,2.0,TrioTim,-30,False,"(16, 20]"
2447,139272,10040,PNC,2016-01-20,5,6,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False,"(20, 24]"
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,21.583,2.0,2.0,2.0,Prisma,-48,False,"(20, 24]"
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False,"(8, 12]"


In [7]:
# Get age range
df.scanage_years.describe()

count    1904.000000
mean       18.323659
std         4.494183
min         8.167000
25%        15.167000
50%        18.250000
75%        21.333000
max        34.333000
Name: scanage_years, dtype: float64

In [8]:
df.value_counts(subset=["age_bin"])

age_bin 
(16, 20]    640
(20, 24]    481
(12, 16]    429
(8, 12]     169
(24, 28]    145
(28, 32]     32
(32, 36]      8
dtype: int64

In [9]:
ses_dist = pd.DataFrame(df.value_counts(subset=["age_bin","sex", "scanner"])).reset_index()
ses_dist = ses_dist.sort_values(by=["age_bin","sex","scanner"])
ses_dist.columns = ["age_bin","sex", "scanner", "count"]
ses_dist = ses_dist.reset_index(drop=True)
ses_dist['%'] = 100 * ses_dist['count'] / 2349
ses_dist['%'] = ses_dist['%'].round(2)
# ses_dist.to_csv("csv/age_by_sex_by_scanner.csv", index=False)
ses_dist

Unnamed: 0,age_bin,sex,scanner,count,%
0,"(8, 12]",1.0,TrioTim,78,3.32
1,"(8, 12]",2.0,TrioTim,91,3.87
2,"(12, 16]",1.0,Prisma,2,0.09
3,"(12, 16]",1.0,TrioTim,215,9.15
4,"(12, 16]",2.0,Prisma,3,0.13
5,"(12, 16]",2.0,TrioTim,209,8.9
6,"(16, 20]",1.0,Prisma,44,1.87
7,"(16, 20]",1.0,TrioTim,269,11.45
8,"(16, 20]",2.0,Prisma,30,1.28
9,"(16, 20]",2.0,TrioTim,296,12.6


## Get demographics data by subject

In [10]:
def by_subject(df):
    # Add other columns to subs dataframe
    subjects = df[df.timepoint == 1][["subid", "ntimepoints", "sex", "race", "ethnic"]]

    # Add columns for session count in each age bin
    bins = list(df.age_bin.unique())
    bins.reverse()
    for b in bins:
        col = f"Count {str(b)}"
        subjects[col] = 0

    # Add age at first scan, last scan, span
    for sub in subjects.subid.unique():
        first=df[df.subid == sub].scanage_years.min()
        last=df[df.subid == sub].scanage_years.max()
        subjects.loc[subjects["subid"]==sub, "age_first"] = first
        subjects.loc[subjects["subid"]==sub, "age_last"] = last
        subjects.loc[subjects["subid"]==sub, "age_span"] = last-first

        # Get list of subject's age bins at each timepoint
        tp_bins = df[df.subid == sub].age_bin

        # Get subject's most common age bin and count. Add to df.
        mode = tp_bins.value_counts().index[0]
        count = tp_bins.value_counts()[0]
        subjects.loc[subjects.subid==sub,"age_bin"] = mode
        subjects.loc[subjects.subid==sub,"max_tp_count"] = count

        # Increment count at each age bin
        for b in tp_bins:
            col = f"Count {str(b)}"
            subjects.loc[subjects["subid"]==sub, col] += 1

        # Get number of bins subject belongs to
        nbins = len(tp_bins.unique())
        subjects.loc[subjects["subid"]==sub, "nbins"] = nbins

        # Get TrioTim ses count and Prisma ses count
        t_count = len(df[(df.subid==sub) & (df.scanner == "TrioTim")])
        p_count = len(df[(df.subid==sub) & (df.scanner == "Prisma")])
        subjects.loc[subjects.subid==sub,"TrioTim_count"] = t_count
        subjects.loc[subjects.subid==sub,"Prisma_count"] = p_count


    # Convert new columns to ints
    subjects.max_tp_count = subjects.max_tp_count.astype(int)
    subjects.nbins = subjects.nbins.astype(int)
    subjects.TrioTim_count = subjects.TrioTim_count.astype(int)
    subjects.Prisma_count = subjects.Prisma_count.astype(int)

    # Re-order columns
    subjects = subjects[[
        'subid',
        'ntimepoints',
        'sex',
        'race',
        'ethnic',
        'age_first',
        'age_last',
        'age_span',
        'age_bin',
        'nbins',
        'max_tp_count',
        'TrioTim_count',
        'Prisma_count',
        'Count (8, 12]',
        'Count (12, 16]',
        'Count (16, 20]',
        'Count (20, 24]',
        'Count (24, 28]',
        'Count (28, 32]',
        'Count (32, 36]'
    ]]

    return subjects

In [12]:
subjects = by_subject(df)
# old_subjects = by_subject(old)
subjects

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
0,11801,2,1.0,1.0,2.0,30.833,33.250,2.417,"(32, 36]",2,1,2,0,0,0,0,0,0,1,1
2,12073,2,1.0,1.0,2.0,33.000,33.500,0.500,"(32, 36]",1,2,2,0,0,0,0,0,0,0,2
4,12202,3,2.0,2.0,2.0,30.917,34.333,3.416,"(28, 32]",2,2,3,0,0,0,0,0,0,2,1
7,13190,3,1.0,1.0,2.0,28.333,32.333,4.000,"(28, 32]",2,2,3,0,0,0,0,0,0,2,1
10,13473,3,2.0,4.0,2.0,26.750,29.917,3.167,"(24, 28]",2,2,3,0,0,0,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,135484,2,1.0,1.0,2.0,16.333,18.167,1.834,"(16, 20]",1,2,2,0,0,0,2,0,0,0,0
1895,138788,2,1.0,2.0,2.0,13.333,15.000,1.667,"(12, 16]",1,2,2,0,0,2,0,0,0,0,0
1897,139181,2,2.0,2.0,,13.500,20.833,7.333,"(20, 24]",2,1,1,1,0,1,0,1,0,0,0
1899,139272,3,2.0,2.0,2.0,18.917,21.333,2.416,"(16, 20]",2,2,3,0,0,0,2,1,0,0,0


In [203]:
print(f"Number of subjects with only Prisma sessions: {len(subjects[subjects.TrioTim_count == 0 ])}")
print(f"Number of subjects with only TrioTim sessions: {len(subjects[subjects.Prisma_count == 0 ])}")
print(f"Number of subjects with sessions on both scanners: {len(subjects[(subjects.TrioTim_count != 0) & (subjects.Prisma_count != 0)])}")

both = subjects[(subjects.TrioTim_count != 0) & (subjects.Prisma_count != 0)].subid
bothPrismaFirst = df[(df.subid.isin(both)) & (df.timepoint == 1) & (df.scanner == "Prisma")]
print(f"Number of subjects scanned on Prisma THEN TrioTim: {len(bothPrismaFirst)}")
# NOTE: Of the subjects scanned by both scanners (N=221), only 1 subject (with 2 sessions) was scanned on Prisma first.
# NOTE: All other subjects with both Trio and Prisma scans were scaned with TrioTim first.

Number of subjects with only Prisma sessions: 18
Number of subjects with only TrioTim sessions: 455
Number of subjects with sessions on both scanners: 221
Number of subjects scanned on Prisma THEN TrioTim: 1


## Selecting subjects for GT!!!

__Goals:__ 
- Balance scanner, sex, and age. 
- Select people that are representative of our sample (i.e. scanned longitudinally, 3+ times, not 2)
- Get a good sense of people who were scanned younger, and older.

__Three pools:__ 
1. `Prisma only`
2. `TrioTim only`
3. `Both TrioTim-Prisma` _(ignoring single case where both were used but Prisma came first)_

__In each pool:__
- Ensure subject covers at least 3 age bins. _(when possible! this cannot be accomplished for the Prisma only group)_
- Select 1 male, 1 female
- One session should be either in 8-12 or 28-32 _(this should create good mix of young-middle and middle-old)_

In [21]:
# Get scanner groups first
prisma = subjects[subjects.TrioTim_count == 0 ]
trio = subjects[subjects.Prisma_count == 0 ]
both = subjects[(subjects.TrioTim_count != 0) & (subjects.Prisma_count != 0)]
print(f"Prisma only: {len(prisma)}")
print(f"TrioTim only: {len(trio)}")
print(f"Both scanners: {len(both)}")

Prisma only: 18
TrioTim only: 455
Both scanners: 221


In [247]:
# Look at subjects scanned with Prisma only
prisma = subjects[subjects.TrioTim_count == 0 ]
prisma = prisma[prisma.age_span >= 1.75] # NOTE: using age span rather than nbins! Actually selects larger span since most subjects only have 1-2 timepoints
prisma_male = prisma[prisma.sex == 1]
prisma_female = prisma[prisma.sex == 2]

In [250]:
prisma_female.sort_values("age_first")

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
93,20082,2,2.0,1.0,2.0,17.833,19.75,1.917,"(16, 20]",1,2,0,2,0,0,2,0,0,0,0
95,20160,2,2.0,2.0,2.0,20.25,22.5,2.25,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
101,20325,2,2.0,5.0,2.0,20.583,22.583,2.0,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
97,20182,2,2.0,6.0,1.0,21.5,23.583,2.083,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
1853,132176,2,2.0,2.0,2.0,23.417,25.25,1.833,"(24, 28]",2,1,0,2,0,0,0,1,1,0,0
99,20197,2,2.0,1.0,2.0,25.5,27.667,2.167,"(24, 28]",1,2,0,2,0,0,0,0,2,0,0


In [205]:
# Manual selection
prisma_subs = [99242, 103035, 20082, 20197]
selected_prisma = prisma[prisma.subid.isin(prisma_subs)]

In [283]:
# Look at subjects scanned with TrioTim only
trio = subjects[subjects.Prisma_count == 0]
# trio = trio[trio.nbins >= 3]
# NOTE: No subjects that span 3+ age bins have a session in the older (28-32) age bin! Changing criteria to the below.
trio = trio[(trio.age_span >= 4) & (trio.ntimepoints >= 3)] 

trio_male_young = trio[(trio.sex == 1) & (trio.age_first < 12)].sort_values(by="age_first")
trio_female_young = trio[(trio.sex == 2) & (trio.age_first < 11)].sort_values(by="age_first")
trio_male_old = trio[(trio.sex == 1) & (trio["Count (28, 32]"] != 0)].sort_values(by="age_first")
trio_female_old = trio[(trio.sex == 2) & (trio["Count (28, 32]"] != 0)].sort_values(by="age_first")
# trio.sort_values("age_span", ascending=False)

In [284]:
trio_female_old


Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
36,16349,3,2.0,2.0,2.0,26.0,30.0,4.0,"(24, 28]",2,2,3,0,0,0,0,0,2,1,0


In [85]:
# Look at subjects scanned with both scanners
both = subjects[(subjects.TrioTim_count != 0) & (subjects.Prisma_count != 0)]
both = both[both.nbins >= 3]
both_male_young = both[(both.sex == 1) & (both.age_first < 11)].sort_values(by="age_first")
both_female_young = both[(both.sex == 2) & (both.age_first < 11)].sort_values(by="age_first")
both_male_old = both[(both.sex == 1) & (both.age_last > 28)].sort_values(by="age_first")
both_female_old = both[(both.sex == 2) & (both.age_last > 28)].sort_values(by="age_first")

In [277]:
both_female_old

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
364,87135,5,2.0,1.0,2.0,17.917,28.083,10.166,"(20, 24]",3,2,3,2,0,0,2,2,0,1,0
252,84354,4,2.0,5.0,1.0,20.5,28.417,7.917,"(20, 24]",3,2,3,1,0,0,0,2,1,1,0
116,80557,5,2.0,2.0,2.0,21.5,29.5,8.0,"(28, 32]",3,2,4,1,0,0,0,2,1,2,0
121,80688,3,2.0,5.0,2.0,21.917,30.0,8.083,"(28, 32]",3,1,2,1,0,0,0,1,1,1,0


In [None]:
# RANDOMIZED SELECTION
gt = pd.DataFrame()


In [285]:
# MANUAL SELECTION
gt_set = pd.DataFrame()

########### PRISMA SCANNER ONLY #############
# NOTE: Max age span for prisma only subjects is only 2.5 years. Only 1 subject has 3 timepoints.
# NOTE: Selected 2 male, 2 female with age span > 1.75. Prioritized age span / spread when possible.
gt_set = gt_set.append(selected_prisma)

########### TRIOTIM SCANNER ONLY #############

# Add male with younger session
# selected = trio_male_young[trio_male_young.subid == 105370]
selected = trio_male_young[trio_male_young.subid == 110295]
gt_set = gt_set.append(selected) 

# Add female with younger session
selected = trio_female_young[trio_female_young.subid == 98314]
gt_set = gt_set.append(selected)  

# Add male with older session (13190)
selected = trio_male_old
gt_set = gt_set.append(selected)

# Add female with older session (16349)
selected = trio_female_old 
gt_set = gt_set.append(selected)

############### BOTH SCANNERS ###############

# Add male with younger session
# selected = both_male_young[(both_male_young.subid == 96659)]
selected = both_male_young[(both_male_young.subid == 96659) | (both_male_young.subid == 97994)]
gt_set = gt_set.append(selected)  

# Add female with younger session
# selected = both_female_young[both_female_young["age_first"] < 10]
gt_set = gt_set.append(both_female_young) # NOTE: Adding two subjects here.

# Add male with older session
selected = both_male_old[both_male_old["Count (28, 32]"] == 2]
gt_set = gt_set.append(selected)

# Add female with older session
selected = both_female_old[both_female_old["Count (28, 32]"] == 2]
gt_set = gt_set.append(selected)

del gt_set["age_bin"]
del gt_set["max_tp_count"]

gt_set.columns = [
    'subid', 'ntimepoints', 'sex', 'race', 'ethnic', 
    'age_first', 'age_last', 'age_span', 'nbins', 
    'nTrioTim', 'nPrisma', 'Count (8, 12]', 
    'Count (12, 16]', 'Count (16, 20]', 'Count (20, 24]',
    'Count (24, 28]', 'Count (28, 32]', 'Count (32, 36]'
]

gt_set.sort_values(by=["age_first"]) 
# gt_set.sort_values(by=['Count (8, 12]', 'Count (12, 16]', 'Count (16, 20]', 'Count (20, 24]', 'Count (24, 28]', 'Count (28, 32]', 'Count (32, 36]'], ascending=False)


Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,nbins,nTrioTim,nPrisma,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
1729,127542,4,2.0,2.0,2.0,8.75,16.667,7.917,3,3,1,2,1,1,0,0,0,0
875,98314,4,2.0,2.0,2.0,9.833,16.5,6.667,3,4,0,1,2,1,0,0,0,0
794,95057,4,2.0,5.0,1.0,10.167,17.583,7.416,3,3,1,2,1,1,0,0,0,0
835,96659,4,1.0,1.0,2.0,10.333,20.167,9.834,3,2,2,2,1,0,1,0,0,0
866,97994,6,1.0,1.0,2.0,10.833,21.083,10.25,4,3,3,1,2,2,1,0,0,0
1238,110295,3,1.0,2.0,2.0,11.167,16.083,4.916,3,3,0,1,1,1,0,0,0,0
93,20082,2,2.0,1.0,2.0,17.833,19.75,1.917,1,0,2,0,0,2,0,0,0,0
1066,105168,6,1.0,1.0,1.0,18.167,28.667,10.5,4,2,4,0,0,1,1,2,2,0
1006,103035,3,1.0,1.0,2.0,21.0,22.75,1.75,1,0,3,0,0,0,3,0,0,0
116,80557,5,2.0,2.0,2.0,21.5,29.5,8.0,3,4,1,0,0,0,2,1,2,0


In [286]:
gt_subs = gt_set.subid.unique()
gt_sessions = df[df.subid.isin(gt_subs)]

print(f"Average age: {round(gt_sessions.scanage_years.mean(), ndigits=3)}")
print(f"Average age span: {round(gt_set.age_span.mean(), ndigits=3)}")
print(f"Average ntimepoints: {round(gt_set.ntimepoints.mean(), ndigits=3)}")
print(f"TrioTim Sessions: {gt_set.nTrioTim.sum()}")
print(f"Prisma Sessions: {gt_set.nPrisma.sum()}")

print(f"\nSession Counts by Age Bin:")
gt_set[['Count (8, 12]', 'Count (12, 16]','Count (16, 20]', 'Count (20, 24]', 'Count (24, 28]', 'Count (28, 32]', 'Count (32, 36]']].sum()

Average age: 19.871
Average age span: 5.845
Average ntimepoints: 3.643
TrioTim Sessions: 30
Prisma Sessions: 21

Session Counts by Age Bin:


Count (8, 12]     9
Count (12, 16]    8
Count (16, 20]    9
Count (20, 24]    9
Count (24, 28]    8
Count (28, 32]    7
Count (32, 36]    1
dtype: int64

In [196]:
df.scanage_years.describe()

count    1904.000000
mean       18.323659
std         4.494183
min         8.167000
25%        15.167000
50%        18.250000
75%        21.333000
max        34.333000
Name: scanage_years, dtype: float64

In [219]:

gt_sessions.scanage_years.describe()

count    37.000000
mean     21.783730
std       6.554875
min       9.250000
25%      16.500000
50%      22.750000
75%      27.667000
max      32.333000
Name: scanage_years, dtype: float64

In [192]:
subjects.age_span.describe()

count    694.000000
mean       4.197742
std        2.628178
min        0.500000
25%        1.917000
50%        3.833000
75%        5.833000
max       10.667000
Name: age_span, dtype: float64

In [176]:
gt_set.age_span.describe()

count    12.000000
mean      5.347333
std       3.145097
min       1.750000
25%       2.416750
50%       4.458000
75%       7.937750
max      10.500000
Name: age_span, dtype: float64

## OLD WORK BELOW

In [75]:
# Filter the subject pool to only consider subjects whose sessions cover 3+ age bins
pool = subjects[subjects.nbins >= 3]

# Further filter pool to df subjects who have sessions from both scanners
# pool = pool[(pool.TrioTim_count>0) & (pool.Prisma_count>0)]

# Futher filter pool to df subjects with only one session per bin?
# pool = pool[pool.max_tp_count == 1]

pool.sort_values(by=["age_first"])

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
1729,127542,4,2.0,2.0,2.0,8.750,16.667,7.917,"(8, 12]",3,2,3,1,2,1,1,0,0,0,0
1550,121050,6,1.0,2.0,2.0,9.750,18.500,8.750,"(16, 20]",3,3,3,3,2,1,3,0,0,0,0
875,98314,4,2.0,2.0,2.0,9.833,16.500,6.667,"(12, 16]",3,2,4,0,1,2,1,0,0,0,0
798,95116,4,1.0,2.0,2.0,10.083,17.833,7.750,"(8, 12]",3,2,3,1,2,1,1,0,0,0,0
1053,104563,4,2.0,1.0,2.0,10.083,16.500,6.417,"(8, 12]",3,2,4,0,2,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,87538,6,1.0,2.0,2.0,20.500,28.417,7.917,"(20, 24]",3,3,3,3,0,0,0,3,2,1,0
225,83835,6,1.0,2.0,2.0,20.667,28.583,7.916,"(20, 24]",3,4,4,2,0,0,0,4,1,1,0
1144,106802,4,1.0,1.0,2.0,21.250,28.167,6.917,"(20, 24]",3,2,3,1,0,0,0,2,1,1,0
116,80557,5,2.0,2.0,2.0,21.500,29.500,8.000,"(28, 32]",3,2,4,1,0,0,0,2,1,2,0


In [37]:
# NOTE: first criteria, subjects who's sessions span 3+ age bins. (This does cover ages 8-30!)
# GOAL: Can we select subjects with 3+ agebins, both scanners, adequately cover all age bins without inflating the session count in the middle age bins.
pool.age_first.describe()

count    102.00000
mean      15.27698
std        3.21725
min        8.75000
25%       12.85400
50%       15.12500
75%       18.22925
max       21.91700
Name: age_first, dtype: float64

In [26]:
pool.age_last.describe()

count    102.000000
mean      23.294941
std        2.984289
min       16.667000
25%       21.167000
50%       22.708000
75%       25.396000
max       30.000000
Name: age_last, dtype: float64

In [27]:
sessions = df[df.subid.isin(pool.subid.unique())]
distrib =  sessions.value_counts(subset=["age_bin", "sex"]).reset_index()
distrib.columns = ["age_bin", "sex", "session_count"]
distrib.sort_values(by=["age_bin", "sex"])

Unnamed: 0,age_bin,sex,session_count
8,"(8, 12]",1.0,19
9,"(8, 12]",2.0,10
4,"(12, 16]",1.0,42
5,"(12, 16]",2.0,39
0,"(16, 20]",1.0,72
2,"(16, 20]",2.0,69
1,"(20, 24]",1.0,69
3,"(20, 24]",2.0,68
6,"(24, 28]",1.0,28
7,"(24, 28]",2.0,23


In [28]:
sub_distrib =  pool.value_counts(subset=["age_bin", "sex"]).reset_index()
sub_distrib.columns = ["age_bin", "sex", "subject_count"]
sub_distrib.sort_values(by=["age_bin", "sex"])

Unnamed: 0,age_bin,sex,subject_count
8,"(8, 12]",1.0,3
10,"(8, 12]",2.0,2
7,"(12, 16]",1.0,3
6,"(12, 16]",2.0,3
2,"(16, 20]",1.0,15
3,"(16, 20]",2.0,13
1,"(20, 24]",1.0,21
0,"(20, 24]",2.0,21
5,"(24, 28]",1.0,9
4,"(24, 28]",2.0,9


In [29]:
# GOAL: Can we select subjects with 3+ agebins, both scanners, adequately cover all age bins without oversampling sessions from the middle age bins.
def pickSubjects(pool, random_state=None):

    # Build a random sample, balanced on age bin and sex, from the inputted pool of subjects.
    sample = pd.DataFrame()

    # For each age bin and sex combination
    for age in pool.age_bin.unique():
        for sex in pool.sex.unique():

            try:
                # Randomly select one of the elibile subjects...
                sub = pool[(pool.age_bin == age) & (pool.sex == sex)].sample(n=1, random_state=random_state)
                # ...and add to sample dataframe.
                sample = sample.append(sub)
                
            except ValueError:
                print(f"Found no eligible subjects for criteria: {age}, {sex}")

    return sample

In [256]:
selected = pickSubjects(pool, random_state=3)
selected.sort_values(by=["age_bin","sex"])

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
807,95378,4,1.0,2.0,2.0,10.833,18.667,7.834,"(8, 12]",3,2,3,1,2,1,1,0,0,0,0
1729,127542,4,2.0,2.0,2.0,8.75,16.667,7.917,"(8, 12]",3,2,3,1,2,1,1,0,0,0,0
473,88773,4,1.0,2.0,2.0,13.167,20.833,7.666,"(12, 16]",3,2,3,1,0,2,1,1,0,0,0
606,91855,5,2.0,1.0,2.0,12.25,22.167,9.917,"(12, 16]",3,3,4,1,0,3,1,1,0,0,0
1715,127236,5,1.0,1.0,2.0,17.667,24.417,6.75,"(16, 20]",3,3,4,1,0,0,3,1,1,0,0
1287,112332,4,2.0,5.0,2.0,15.083,21.833,6.75,"(16, 20]",3,2,3,1,0,1,2,1,0,0,0
1813,130908,3,1.0,1.0,2.0,13.333,21.083,7.75,"(20, 24]",3,1,2,1,0,1,1,1,0,0,0
1822,131405,3,2.0,1.0,2.0,11.833,20.667,8.834,"(20, 24]",3,1,2,1,1,1,0,1,0,0,0
1157,107055,4,1.0,1.0,2.0,18.75,26.917,8.167,"(24, 28]",3,2,2,2,0,0,1,1,2,0,0
886,98394,3,2.0,2.0,1.0,20.0,25.167,5.167,"(24, 28]",3,1,2,1,0,0,1,1,1,0,0


In [None]:
gt_set = []


In [91]:
# Filter the subject pool to only consider subjects whose sessions cover 3+ age bins
pool = subjects[subjects.nbins >= 2]

prisma = pool[pool.TrioTim_count == 0]
trio = pool[pool.Prisma_count == 0]
both = pool[(pool.TrioTim_count != 0) & (pool.Prisma_count != 0)]
print(f"Prisma Only: {len(prisma)}")
print(f"TrioTim Only: {len(trio)}")
print(f"Both: {len(both)}")

Prisma Only: 4
TrioTim Only: 305
Both: 220


In [100]:
prisma = subjects[subjects.TrioTim_count == 0]
prisma.sort_values("age_first")

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,age_first,age_last,age_span,age_bin,nbins,max_tp_count,TrioTim_count,Prisma_count,"Count (8, 12]","Count (12, 16]","Count (16, 20]","Count (20, 24]","Count (24, 28]","Count (28, 32]","Count (32, 36]"
93,20082,2,2.0,1.0,2.0,17.833,19.75,1.917,"(16, 20]",1,2,0,2,0,0,2,0,0,0,0
1195,108498,2,1.0,2.0,2.0,18.0,19.25,1.25,"(16, 20]",1,2,0,2,0,0,2,0,0,0,0
95,20160,2,2.0,2.0,2.0,20.25,22.5,2.25,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
101,20325,2,2.0,5.0,2.0,20.583,22.583,2.0,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
1006,103035,3,1.0,1.0,2.0,21.0,22.75,1.75,"(20, 24]",1,3,0,3,0,0,0,3,0,0,0
91,20011,2,2.0,1.0,2.0,21.25,21.833,0.583,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
750,93853,2,1.0,2.0,2.0,21.333,22.5,1.167,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
97,20182,2,2.0,6.0,1.0,21.5,23.583,2.083,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
103,20699,2,1.0,1.0,2.0,21.75,22.333,0.583,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
105,20792,2,1.0,1.0,2.0,21.833,23.583,1.75,"(20, 24]",1,2,0,2,0,0,0,2,0,0,0
