# **ExtraLong 2021:** Selecting Subjects for the Group Template
Project:    ExtraLong <br>
Maintainer: Katja Zoner <br>
Updated:    11/19/2021 <br>

In [44]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None

In [45]:
# Load the demographics table for the 2021 data freeze.
demo_fname = "csv/demographics_datafreeze-2021.csv"
demo = pd.read_csv(demo_fname)

# Replace the header, position by position, with the short column names
# used throughout the rest of the notebook.
demo_columns = [
    'subid', 'sesid', 'acq',
    'doscan', 'timepoint', 'ntimepoints',
    'scanage_months', 'sex', 'race', 'ethnic',
]
demo.columns = demo_columns
demo

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic
0,11399,3468,DAY,2010-06-29,1,2,414.0,2.0,2.0,2.0
1,11399,3592,DAY,2010-07-29,2,2,415.0,2.0,2.0,2.0
2,11801,5145,DAY,2011-06-06,1,3,370.0,1.0,1.0,2.0
3,11801,5200,FNDM,2011-06-10,2,3,370.0,1.0,1.0,2.0
4,11801,8591,NEFF,2013-10-23,3,3,399.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,2.0,2.0,2.0
2449,139490,8461,PNC,2013-08-30,1,2,105.0,1.0,2.0,2.0
2450,139490,10564,CONTE,2017-04-29,2,2,149.0,1.0,2.0,2.0
2451,139553,8410,PNC,2013-08-23,1,2,107.0,2.0,2.0,2.0


In [46]:
# Read in QC exclusion csv
fname = "./csv/exclusion_datafreeze-2021_cutoff-212.csv"
ex = pd.read_csv(fname)
ex

Unnamed: 0,subid,sesid,euler,exclude
0,11399,3468,-86,False
1,11399,3592,-174,False
2,11801,5145,-58,False
3,11801,5200,-66,False
4,11801,8591,-56,False
...,...,...,...,...
2448,139272,10109,-48,False
2449,139490,8461,-730,True
2450,139490,10564,-202,True
2451,139553,8410,-114,False


In [47]:
# Read in scanner model info
scanners = pd.read_csv("csv/scanner_info.csv")
scanners.scanner.unique()
scanners


Unnamed: 0,subid,sesid,scanner
0,11399,3468,TrioTim
1,11399,3592,TrioTim
2,11801,5145,TrioTim
3,11801,5200,TrioTim
4,11801,8591,TrioTim
...,...,...,...
2446,139272,10109,Prisma
2447,139490,8461,TrioTim
2448,139490,10564,TrioTim
2449,139553,8410,TrioTim


In [48]:
# Merge scanner info into demographics info
demo = pd.merge(
    demo,
    scanners, 
    on=['subid', 'sesid'],
    how="left"
)
demo

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanner
0,11399,3468,DAY,2010-06-29,1,2,414.0,2.0,2.0,2.0,TrioTim
1,11399,3592,DAY,2010-07-29,2,2,415.0,2.0,2.0,2.0,TrioTim
2,11801,5145,DAY,2011-06-06,1,3,370.0,1.0,1.0,2.0,TrioTim
3,11801,5200,FNDM,2011-06-10,2,3,370.0,1.0,1.0,2.0,TrioTim
4,11801,8591,NEFF,2013-10-23,3,3,399.0,1.0,1.0,2.0,TrioTim
...,...,...,...,...,...,...,...,...,...,...,...
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,2.0,2.0,2.0,Prisma
2449,139490,8461,PNC,2013-08-30,1,2,105.0,1.0,2.0,2.0,TrioTim
2450,139490,10564,CONTE,2017-04-29,2,2,149.0,1.0,2.0,2.0,TrioTim
2451,139553,8410,PNC,2013-08-23,1,2,107.0,2.0,2.0,2.0,TrioTim


In [49]:
# Two scans don't have scanner info in the json sidecar
demo[demo.scanner.isna()]

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanner
261,19525,10211,CONTE,2016-06-26,1,2,163.0,1.0,2.0,2.0,
608,87166,5539,PNC,2011-08-11,1,3,210.0,1.0,2.0,2.0,


In [50]:
def filterBySessionCount(df, thresh):
    """Keep subjects with at least `thresh` sessions and renumber timepoints.

    Args:
        df: Session-level DataFrame with one row per scan. Must contain the
            columns "subid" and "doscan".
        thresh: Minimum number of rows (sessions) a subject needs in `df`
            to be retained.

    Returns:
        A new DataFrame (the input is not modified), restricted to the
        retained subjects and sorted by "subid" then "doscan". The
        "ntimepoints" column holds each subject's session count in `df`, and
        "timepoint" numbers each subject's sessions 1..n in that sorted
        order. Original index labels are preserved.
    """
    # Number of sessions per subject (bblid), broadcast onto every row.
    scan_counts = df["subid"].map(df["subid"].value_counts())
    enough_sessions = scan_counts >= thresh

    # Filter to subjects that meet the session requirement. Copy so the
    # column updates below never touch the caller's frame.
    kept = df[enough_sessions].copy()

    # Update ntimepoints to the number of sessions present for each subject.
    # Positional assignment is safe: `kept` has the same rows, in the same
    # order, as the masked counts.
    kept["ntimepoints"] = scan_counts[enough_sessions].to_numpy()

    # Sort BEFORE numbering so that timepoint 1 is the earliest scan date
    # rather than whichever row happened to come first in the input.
    kept = kept.sort_values(["subid", "doscan"])
    kept["timepoint"] = kept.groupby("subid").cumcount() + 1

    return kept

In [51]:
# Join demographics (with scanner model) to the QC exclusion flags on
# subject + session, keeping sessions that appear in either table.
all_sessions = demo.merge(ex, how='outer', on=['subid', 'sesid'])

# Scan age expressed in years, rounded to three decimals.
all_sessions['scanage_years'] = (all_sessions['scanage_months'] / 12).round(3)

# Write the merged table to csv (without the index).
all_sessions.to_csv(
    "demographics+exclusion_datafreeze-2021_cutoff-212.csv",
    index=False,
)
all_sessions


Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,sex,race,ethnic,scanner,euler,exclude,scanage_years
0,11399,3468,DAY,2010-06-29,1,2,414.0,2.0,2.0,2.0,TrioTim,-86,False,34.500
1,11399,3592,DAY,2010-07-29,2,2,415.0,2.0,2.0,2.0,TrioTim,-174,False,34.583
2,11801,5145,DAY,2011-06-06,1,3,370.0,1.0,1.0,2.0,TrioTim,-58,False,30.833
3,11801,5200,FNDM,2011-06-10,2,3,370.0,1.0,1.0,2.0,TrioTim,-66,False,30.833
4,11801,8591,NEFF,2013-10-23,3,3,399.0,1.0,1.0,2.0,TrioTim,-56,False,33.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,2.0,2.0,2.0,Prisma,-48,False,21.583
2449,139490,8461,PNC,2013-08-30,1,2,105.0,1.0,2.0,2.0,TrioTim,-730,True,8.750
2450,139490,10564,CONTE,2017-04-29,2,2,149.0,1.0,2.0,2.0,TrioTim,-202,True,12.417
2451,139553,8410,PNC,2013-08-23,1,2,107.0,2.0,2.0,2.0,TrioTim,-114,False,8.917


In [52]:
# Keep only the sessions that QC did not flag for exclusion.
passed_qc = all_sessions["exclude"].eq(False)
include = all_sessions[passed_qc]

# Drop subjects left with fewer than 2 sessions and renumber their timepoints.
include = filterBySessionCount(include, 2)

In [53]:
# Get age range
include.scanage_years.describe()

count    2350.000000
mean       18.981735
std         4.767595
min         8.167000
25%        15.583000
50%        19.041500
75%        22.146000
max        34.917000
Name: scanage_years, dtype: float64

In [54]:
# Relabel "Prisma_fit" sessions as "Prisma" so both share one scanner label.
include["scanner"] = include["scanner"].replace("Prisma_fit", "Prisma")
include.scanner.unique()

array(['TrioTim', 'Prisma', nan], dtype=object)

In [55]:
[i for i in range(8,37,4)]

[8, 12, 16, 20, 24, 28, 32, 36]

In [56]:
# Assign each session to a 4-year age bin: (8, 12], (12, 16], ..., (32, 36].
bins = list(range(8, 37, 4))
include["age_binned"] = pd.cut(include["scanage_years"], bins)

In [57]:
include.value_counts(subset=["age_binned"])

age_binned
(16, 20]      730
(20, 24]      651
(12, 16]      462
(24, 28]      233
(8, 12]       191
(28, 32]       63
(32, 36]       20
dtype: int64

In [58]:
# Tabulate session counts by age bin, sex and scanner, ordered by those keys.
ses_dist = (
    include.value_counts(subset=["age_binned", "sex", "scanner"])
    .reset_index()
    .sort_values(by=["age_binned", "sex", "scanner"])
    .reset_index(drop=True)
)
ses_dist.columns = ["age_binned", "sex", "scanner", "count"]


In [59]:
# Express each cell as a percentage of all tabulated sessions. The denominator
# is taken from the counts themselves rather than a hard-coded session total,
# so the percentages stay correct if the inputs or QC cutoff change.
total_sessions = ses_dist['count'].sum()
ses_dist['%'] = (100 * ses_dist['count'] / total_sessions).round(2)
ses_dist.to_csv("csv/age_by_sex_by_scanner.csv", index=False)
ses_dist

Unnamed: 0,age_binned,sex,scanner,count,%
0,"(8, 12]",1.0,TrioTim,91,3.87
1,"(8, 12]",2.0,TrioTim,100,4.26
2,"(12, 16]",1.0,Prisma,3,0.13
3,"(12, 16]",1.0,TrioTim,228,9.71
4,"(12, 16]",2.0,Prisma,7,0.3
5,"(12, 16]",2.0,TrioTim,224,9.54
6,"(16, 20]",1.0,Prisma,58,2.47
7,"(16, 20]",1.0,TrioTim,295,12.56
8,"(16, 20]",2.0,Prisma,44,1.87
9,"(16, 20]",2.0,TrioTim,332,14.13


In [60]:
# TODO: generate age x sex x scanner distribution!
# Age == age at first scan??
# try with 2 year age bins? --> 14 bins (range 8yo - 35) 
# Incorporate sex --> 14 bins x 2 = 28 bins for sex+age 
# How do we incorporate scanner?

In [61]:
# One-column table listing each included subject exactly once.
subs = pd.DataFrame({"subid": include.subid.unique()})
subs

Unnamed: 0,subid
0,11399
1,11801
2,12073
3,12202
4,12835
...,...
773,135484
774,138788
775,139181
776,139272


In [62]:
# Build the per-subject table from each subject's first included session,
# dropping the session-specific columns listed below. The remaining columns
# (including euler) carry that first session's values.
session_level_cols = [
    "sesid", "acq", "timepoint",
    "scanage_months", "scanage_years",
    "exclude", "age_binned", "scanner", "doscan",
]
subs = include[include.timepoint == 1].drop(columns=session_level_cols)
subs


Unnamed: 0,subid,ntimepoints,sex,race,ethnic,euler
0,11399,2,2.0,2.0,2.0,-86
2,11801,3,1.0,1.0,2.0,-58
5,12073,2,1.0,1.0,2.0,-90
7,12202,3,2.0,2.0,2.0,-62
10,12835,3,1.0,1.0,2.0,-72
...,...,...,...,...,...,...
2437,135484,2,1.0,1.0,2.0,-68
2439,138788,2,1.0,2.0,2.0,-76
2441,139181,2,2.0,2.0,,-114
2443,139272,6,2.0,2.0,2.0,-52


In [63]:
# For every subject, record the youngest and oldest scan age (in years) among
# their included sessions, and the difference between the two.
for sub in subs.subid.unique():
    sub_ages = include.loc[include.subid == sub, "scanage_years"]
    first = sub_ages.min()
    last = sub_ages.max()

    sub_row = subs["subid"] == sub
    subs.loc[sub_row, "age_first"] = first
    subs.loc[sub_row, "age_last"] = last
    subs.loc[sub_row, "age_span"] = last - first

subs

Unnamed: 0,subid,ntimepoints,sex,race,ethnic,euler,age_first,age_last,age_span
0,11399,2,2.0,2.0,2.0,-86,34.500,34.583,0.083
2,11801,3,1.0,1.0,2.0,-58,30.833,33.250,2.417
5,12073,2,1.0,1.0,2.0,-90,33.000,33.500,0.500
7,12202,3,2.0,2.0,2.0,-62,30.917,34.333,3.416
10,12835,3,1.0,1.0,2.0,-72,28.750,28.833,0.083
...,...,...,...,...,...,...,...,...,...
2437,135484,2,1.0,1.0,2.0,-68,16.333,18.167,1.834
2439,138788,2,1.0,2.0,2.0,-76,13.333,15.000,1.667
2441,139181,2,2.0,2.0,,-114,13.500,20.833,7.333
2443,139272,6,2.0,2.0,2.0,-52,18.917,21.583,2.666


## Selecting subjects for the group template (GT)

In [64]:
# Scratch check of the per-scanner session counts for a single subject.
# NOTE(review): `sub` is not assigned in this cell; it is whatever value an
# earlier `for sub in ...` loop left behind, so the result depends on the
# execution order. Presumably only the second expression's value is displayed.
len(include[(include.subid==sub) & (include.scanner == "TrioTim")])
len(include[(include.subid==sub) & (include.scanner == "Prisma")])

0

In [65]:
subjects = subs.subid.unique()

# For each subject, record the age bin holding the most of their sessions, how
# many sessions fall in that bin, and how many sessions came from each scanner.
for sub in subjects:

    # All included sessions for this subject, and the subject's row in subs
    sub_sessions = include[include.subid == sub]
    sub_row = subs.subid == sub

    # value_counts() sorts by descending frequency, so the first entry is the
    # subject's most common age bin. Use .iloc for the count: the index holds
    # age intervals, so an integer key must be positional, not a label.
    bin_counts = sub_sessions.age_binned.value_counts()
    mode = bin_counts.index[0]
    count = bin_counts.iloc[0]
    subs.loc[sub_row, "age_bin"] = mode
    subs.loc[sub_row, "tp_count_in_bin"] = count

    # Get TrioTim session count and Prisma session count
    t_count = int((sub_sessions.scanner == "TrioTim").sum())
    p_count = int((sub_sessions.scanner == "Prisma").sum())
    subs.loc[sub_row, "TrioTim_count"] = t_count
    subs.loc[sub_row, "Prisma_count"] = p_count

# Convert new columns to ints
subs.tp_count_in_bin = subs.tp_count_in_bin.astype(int)
subs.TrioTim_count = subs.TrioTim_count.astype(int)
subs.Prisma_count = subs.Prisma_count.astype(int)
subs


Unnamed: 0,subid,ntimepoints,sex,race,ethnic,euler,age_first,age_last,age_span,age_bin,tp_count_in_bin,TrioTim_count,Prisma_count
0,11399,2,2.0,2.0,2.0,-86,34.500,34.583,0.083,"(32, 36]",2,2,0
2,11801,3,1.0,1.0,2.0,-58,30.833,33.250,2.417,"(28, 32]",2,3,0
5,12073,2,1.0,1.0,2.0,-90,33.000,33.500,0.500,"(32, 36]",2,2,0
7,12202,3,2.0,2.0,2.0,-62,30.917,34.333,3.416,"(28, 32]",2,3,0
10,12835,3,1.0,1.0,2.0,-72,28.750,28.833,0.083,"(28, 32]",3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2437,135484,2,1.0,1.0,2.0,-68,16.333,18.167,1.834,"(16, 20]",2,2,0
2439,138788,2,1.0,2.0,2.0,-76,13.333,15.000,1.667,"(12, 16]",2,2,0
2441,139181,2,2.0,2.0,,-114,13.500,20.833,7.333,"(20, 24]",1,1,1
2443,139272,6,2.0,2.0,2.0,-52,18.917,21.583,2.666,"(16, 20]",4,5,1


In [66]:
# Look at subject counts per bin
subs.age_bin.value_counts()

(16, 20]    239
(20, 24]    238
(12, 16]    128
(24, 28]     94
(8, 12]      47
(28, 32]     23
(32, 36]      9
Name: age_bin, dtype: int64

In [67]:
# Keep subjects whose most common age bin holds a strict majority of their
# timepoints.
majority_in_bin = subs["tp_count_in_bin"] > subs["ntimepoints"] / 2
pool = subs[majority_in_bin]
pool.age_bin.value_counts()

(16, 20]    136
(20, 24]    135
(12, 16]     82
(24, 28]     48
(8, 12]      43
(28, 32]     17
(32, 36]      8
Name: age_bin, dtype: int64

In [68]:
# Tabulate pooled subjects by age bin and sex, ordered by those keys.
distrib = (
    pool.value_counts(subset=["age_bin", "sex"])
    .reset_index()
    .sort_values(by=["age_bin", "sex"])
    .reset_index(drop=True)
)
distrib.columns = ["age_bin", "sex", "count"]
distrib


Unnamed: 0,age_bin,sex,count
0,"(8, 12]",1.0,21
1,"(8, 12]",2.0,22
2,"(12, 16]",1.0,43
3,"(12, 16]",2.0,39
4,"(16, 20]",1.0,70
5,"(16, 20]",2.0,66
6,"(20, 24]",1.0,53
7,"(20, 24]",2.0,82
8,"(24, 28]",1.0,26
9,"(24, 28]",2.0,22
