# **ExtraLong 2021:** QC - Removing Extra Sessions
__Project:__    ExtraLong <br>
__Maintainer:__ Katja Zoner <br>
__Updated:__    12/07/2021 <br>

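This notebook examines the time elapsed between consecutive sessions within each subject. 
The goal is to reduce the session counts for subjects who have many sessions within a short time frame: when sessions fall within 180 days of one another, only the session with the best (highest) Euler number is kept.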

In [437]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): presumably because later cells modify frames obtained by filtering df -- confirm.
pd.options.mode.chained_assignment = None

In [438]:
# Load the per-session table holding demographics and exclusion status.
demographics_csv = "csv/demographics+exclusion_datafreeze-2021_cutoff-212.csv"
df = pd.read_csv(demographics_csv)
df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude
0,11399,3468,DAY,2010-06-29,1,2,414.0,34.500,2.0,2.0,2.0,TrioTim,-86,False
1,11399,3592,DAY,2010-07-29,2,2,415.0,34.583,2.0,2.0,2.0,TrioTim,-174,False
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False
3,11801,5200,FNDM,2011-06-10,2,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-66,False
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,21.583,2.0,2.0,2.0,Prisma,-48,False
2449,139490,8461,PNC,2013-08-30,1,2,105.0,8.750,1.0,2.0,2.0,TrioTim,-730,True
2450,139490,10564,CONTE,2017-04-29,2,2,149.0,12.417,1.0,2.0,2.0,TrioTim,-202,True
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False


In [439]:
def filterBySessionCount(df, thresh):
    """Keep subjects with at least `thresh` sessions and renumber their timepoints.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per session. Must contain the columns ``subid`` (subject id)
        and ``doscan`` (scan date; anything that sorts chronologically).
    thresh : int
        Minimum number of rows a subject needs in ``df`` to be retained.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified), sorted by ``subid`` then
        ``doscan``, in which
        * ``ntimepoints`` is the subject's number of retained sessions, and
        * ``timepoint`` is the 1-based rank of the session within the subject,
          in ``doscan`` order.
    """
    # Number of sessions each row's subject has in the incoming frame.
    sessions_per_row = df["subid"].map(df["subid"].value_counts())

    # Keep only subjects meeting the session requirement, then order chronologically.
    # Sorting happens BEFORE numbering so that `timepoint` always follows scan date,
    # even when the incoming rows are not already in date order.
    kept = df[sessions_per_row >= thresh].sort_values(["subid", "doscan"])

    # Whole subjects are kept or dropped, so the per-subject counts are unchanged by
    # the filter; recompute them on the kept rows in one vectorised step.
    kept["ntimepoints"] = kept["subid"].map(kept["subid"].value_counts())

    # 1-based session number within each subject, following the sorted order.
    kept["timepoint"] = kept.groupby("subid").cumcount() + 1

    return kept

In [440]:
# Drop every session already flagged for exclusion.
included = df.loc[df["exclude"] == False]

# Keep subjects with at least 2 remaining sessions and renumber their timepoints.
df = filterBySessionCount(included, 2)

# Parse scan dates so that date arithmetic is possible further down.
df["doscan"] = pd.to_datetime(df["doscan"])

df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude
0,11399,3468,DAY,2010-06-29,1,2,414.0,34.500,2.0,2.0,2.0,TrioTim,-86,False
1,11399,3592,DAY,2010-07-29,2,2,415.0,34.583,2.0,2.0,2.0,TrioTim,-174,False
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False
3,11801,5200,FNDM,2011-06-10,2,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-66,False
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,139272,8929,ONM,2014-06-16,4,6,237.0,19.750,2.0,2.0,2.0,TrioTim,-30,False
2447,139272,10040,PNC,2016-01-20,5,6,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,21.583,2.0,2.0,2.0,Prisma,-48,False
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False


In [421]:
# Distribution of sessions-per-subject: take one row per subject (its first timepoint)
# and tally the ntimepoints values.
df.loc[df["timepoint"] == 1, "ntimepoints"].value_counts()

2     383
3     203
4      89
5      48
6      26
7      16
8      10
10      2
11      1
Name: ntimepoints, dtype: int64

In [412]:
# Minimum spacing to try between a scan and the previous one: 180 days.
TIME_THRESH = pd.Timedelta(180, unit="days")
TIME_THRESH

Timedelta('180 days 00:00:00')

In [441]:
# Order sessions chronologically within each subject, then record the time elapsed since
# the subject's previous scan (NaT for a subject's first session).
df = df.sort_values(by=["subid", "doscan"])
df["delta_time"] = df.groupby("subid")["doscan"].diff()
df.sort_index()

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude,delta_time
0,11399,3468,DAY,2010-06-29,1,2,414.0,34.500,2.0,2.0,2.0,TrioTim,-86,False,NaT
1,11399,3592,DAY,2010-07-29,2,2,415.0,34.583,2.0,2.0,2.0,TrioTim,-174,False,30 days
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False,NaT
3,11801,5200,FNDM,2011-06-10,2,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-66,False,4 days
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False,866 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,139272,8929,ONM,2014-06-16,4,6,237.0,19.750,2.0,2.0,2.0,TrioTim,-30,False,94 days
2447,139272,10040,PNC,2016-01-20,5,6,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False,583 days
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,21.583,2.0,2.0,2.0,Prisma,-48,False,78 days
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False,NaT


In [442]:
# n_within: for every session, how many of the same subject's later sessions fall strictly
# inside the TIME_THRESH window that starts at that session.

for subject_id in df.subid.unique():

    # Scan dates of this subject, re-indexed 0..n-1 in the frame's current row order.
    is_subject = df.subid == subject_id
    scan_dates = df.loc[is_subject, "doscan"].reset_index(drop=True)

    for pos, scan_date in enumerate(scan_dates):
        # Later scans that happen before the window closes (both bounds exclusive).
        window_end = scan_date + TIME_THRESH
        followers = ((scan_dates > scan_date) & (scan_dates < window_end)).sum()

        # Position `pos` in the subject's rows corresponds to timepoint pos+1.
        df.loc[is_subject & (df.timepoint == pos + 1), "n_within"] = followers

# The column is created through .loc assignment, so cast it to integer once it is filled.
df["n_within"] = df["n_within"].astype(int)
df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude,delta_time,n_within
0,11399,3468,DAY,2010-06-29,1,2,414.0,34.500,2.0,2.0,2.0,TrioTim,-86,False,NaT,1
1,11399,3592,DAY,2010-07-29,2,2,415.0,34.583,2.0,2.0,2.0,TrioTim,-174,False,30 days,0
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False,NaT,1
3,11801,5200,FNDM,2011-06-10,2,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-66,False,4 days,0
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False,866 days,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,139272,8929,ONM,2014-06-16,4,6,237.0,19.750,2.0,2.0,2.0,TrioTim,-30,False,94 days,0
2447,139272,10040,PNC,2016-01-20,5,6,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False,583 days,1
2448,139272,10109,GRMPY,2016-04-07,6,6,259.0,21.583,2.0,2.0,2.0,Prisma,-48,False,78 days,0
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False,NaT,0


## Do the hard work...

In [443]:
# Greedy per-subject pass that flags sessions to drop by setting exclude = True.
# It relies on the n_within column: the number of a subject's later sessions that fall inside
# the minimum-spacing window (180 days) after a given session.
for sub in df.subid.unique():

    # Snapshot of this subject's sessions with a fresh 0..n-1 index. n_within, euler and
    # timepoint are read from the snapshot; the live exclude flag is always read from df.
    is_sub = df.subid == sub
    sessions = df[is_sub].reset_index(drop=True)
    print(f"Subject {sub} started with {len(sessions)} sessions...", end="")

    # If for first scan, n_within is ntimepoints-1, then all timepoints are too close --> exclude subject.
    first = sessions.iloc[0]
    if first.ntimepoints - first.n_within == 1:
        print(f"Excluding all sessions from subject {sub}!")
        df.loc[is_sub, "exclude"] = True                            # Indicate to exclude subject
        continue                                                    # Continue to next subject

    # Loop through sessions in timepoint order...
    for i in range(1, len(sessions)+1):

        # Get current session (live view, so exclusions made earlier in this loop are seen).
        ses = df[is_sub & (df.timepoint == i)]

        # If we already are excluding this session, just continue.
        # (.item() instead of Series.bool(), which is deprecated in recent pandas.)
        if bool(ses.exclude.item()):
            continue

        # Nothing follows within the window --> this session is kept as is.
        if ses.n_within.item() == 0:
            continue

        # Decision set --> set of sessions in which any one session can be kept without
        # altering the final session count. Rows are collected in a list and concatenated
        # once (DataFrame.append no longer exists in pandas 2.x).
        members = [ses]
        current = ses
        tp = ses.timepoint.item()
        next_ses = sessions[sessions.timepoint == tp+1]

        # While n_within is decrementing by 1, add following sessions to decision set.
        # The length check stops the walk after the subject's last session and avoids
        # calling .item() on an empty frame.
        while len(next_ses) > 0 and next_ses.n_within.item() == current.n_within.item() - 1:
            members.append(next_ses)                                # Add session to decision set
            tp += 1                                                 # Increment timepoint
            current = next_ses                                      # Advance current session
            next_ses = sessions[sessions.timepoint == tp+1]         # Look at the following session

        decision_set = pd.concat(members)

        # Now decide which to keep from decision set --> keep session with best (max) Euler!
        keep = decision_set.iloc[decision_set.euler.argmax()]

        # We should exclude all other timepoints from decision set, except for one we decided to keep.
        exclude_tps = set(decision_set.timepoint)
        exclude_tps.remove(keep.timepoint)
        # We should additionally exclude all timepoints within 180 days of the session we decided to keep.
        tps_within = [keep.timepoint + n for n in range(1, keep.n_within + 1)]
        exclude_tps = exclude_tps.union(tps_within)

        # Update exclude column in actual dataframe!
        df.loc[is_sub & df.timepoint.isin(list(exclude_tps)), "exclude"] = True

    print(f"...Subject {sub} now has {len(df[(df.subid == sub) & (df.exclude != True)])} sessions.")


Subject 11399 started with 2 sessions...Excluding all sessions from subject 11399!
Subject 11801 started with 3 sessions......Subject 11801 now has 2 sessions.
Subject 12073 started with 2 sessions......Subject 12073 now has 2 sessions.
Subject 12202 started with 3 sessions......Subject 12202 now has 3 sessions.
Subject 12835 started with 3 sessions...Excluding all sessions from subject 12835!
Subject 12913 started with 2 sessions...Excluding all sessions from subject 12913!
Subject 13190 started with 4 sessions......Subject 13190 now has 3 sessions.
Subject 13473 started with 5 sessions......Subject 13473 now has 3 sessions.
Subject 13550 started with 3 sessions...Excluding all sessions from subject 13550!
Subject 13831 started with 2 sessions......Subject 13831 now has 2 sessions.
Subject 14343 started with 4 sessions......Subject 14343 now has 2 sessions.
Subject 14828 started with 2 sessions......Subject 14828 now has 2 sessions.
Subject 15290 started with 3 sessions......Subject 1

In [444]:
# Summarise how many sessions and subjects survive the spacing filter.
is_kept = df["exclude"] == False
is_dropped = df["exclude"] == True

# A subject "remains" if at least one of its sessions is still included.
remaining_subs = df.loc[is_kept, "subid"].nunique(dropna=False)
excluded_subs = df["subid"].nunique(dropna=False) - remaining_subs

print("After filtering sessions such that all sessions are 180+ days apart:")
print(f"{is_dropped.sum()} additional sessions are excluded.")
print(f"{is_kept.sum()} sessions remain.")
print(f"{excluded_subs} additional subjects are excluded.")
print(f"{remaining_subs} subjects remain.")

After filtering sessions such that all sessions are 180+ days apart:
446 additional sessions are excluded.
1904 sessions remain.
84 additional subjects are excluded.
694 subjects remain.


In [445]:
# Keep only the sessions that were not flagged, and discard the two helper columns
# that were added for the spacing analysis.
df = df.loc[df["exclude"] == False].drop(columns=["n_within", "delta_time"])
df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude
2,11801,5145,DAY,2011-06-06,1,3,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False
4,11801,8591,NEFF,2013-10-23,3,3,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False
5,12073,5906,FNDM,2011-10-21,1,2,396.0,33.000,1.0,1.0,2.0,TrioTim,-90,False
6,12073,6752,FNDM,2012-04-19,2,2,402.0,33.500,1.0,1.0,2.0,TrioTim,-80,False
7,12202,3764,DAY,2010-09-03,1,3,371.0,30.917,2.0,2.0,2.0,TrioTim,-62,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2443,139272,8456,PNC,2013-08-30,1,6,227.0,18.917,2.0,2.0,2.0,TrioTim,-52,False
2445,139272,8795,NODRA,2014-03-14,3,6,234.0,19.500,2.0,2.0,2.0,TrioTim,-30,False
2447,139272,10040,PNC,2016-01-20,5,6,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False


In [448]:
# With the extra sessions gone, drop subjects left with fewer than 2 sessions and
# renumber timepoint / ntimepoints for everyone else.
df = filterBySessionCount(df, thresh=2)
df

Unnamed: 0,subid,sesid,acq,doscan,timepoint,ntimepoints,scanage_months,scanage_years,sex,race,ethnic,scanner,euler,exclude
2,11801,5145,DAY,2011-06-06,1,2,370.0,30.833,1.0,1.0,2.0,TrioTim,-58,False
4,11801,8591,NEFF,2013-10-23,2,2,399.0,33.250,1.0,1.0,2.0,TrioTim,-56,False
5,12073,5906,FNDM,2011-10-21,1,2,396.0,33.000,1.0,1.0,2.0,TrioTim,-90,False
6,12073,6752,FNDM,2012-04-19,2,2,402.0,33.500,1.0,1.0,2.0,TrioTim,-80,False
7,12202,3764,DAY,2010-09-03,1,3,371.0,30.917,2.0,2.0,2.0,TrioTim,-62,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2443,139272,8456,PNC,2013-08-30,1,3,227.0,18.917,2.0,2.0,2.0,TrioTim,-52,False
2445,139272,8795,NODRA,2014-03-14,2,3,234.0,19.500,2.0,2.0,2.0,TrioTim,-30,False
2447,139272,10040,PNC,2016-01-20,3,3,256.0,21.333,2.0,2.0,2.0,TrioTim,-44,False
2451,139553,8410,PNC,2013-08-23,1,2,107.0,8.917,2.0,2.0,2.0,TrioTim,-114,False


In [449]:
# Write the final session list to disk, leaving out the DataFrame index.
output_csv = "csv/demographics+exclusion_datafreeze-2021_euler-212_minspan-180.csv"
df.to_csv(output_csv, index=False)