In [1]:
import pandas as pd

In [2]:
# Matching table (one row per matched MR / clinical-data pair), produced by the
# matching step referred to above.
match = "./test_matching/test_matching_365_b_365_a.csv"
match_df = pd.read_csv(filepath_or_buffer=match)

# Directory: /vol/biomedic3/bglocker/brain/oasis3/rigid_to_mni/seg/fsl/meshes

# Set parameters

In [3]:
# CDR threshold shared by the two selection steps later in this notebook:
#   * a subject's first scan is kept only if its cdr is <= threshold;
#   * a subject is labelled 1 ("unhealthy") if the cdr of their final clinical
#     session is > threshold.
# NOTE: this is an absolute cut-off, not a relative deterioration. E.g. with
# threshold = 0.5 a subject must reach cdr = 1 to be classed as unhealthy. A
# relative rule (cdr 0 -> 0.5 counts as unhealthy while first scans up to 0.5
# are still admitted) would need a change to the workflow.
threshold = 0

In [4]:
# Minimum number of clinical sessions a subject must have to be kept.
min_n_sessions: int = 3

# Data List

In [5]:
# Keep only the identifier columns and the CDR score; take a copy so later
# steps never write back into match_df.
cols = ["ADRC_ADRCCLINICALDATA ID", "MR ID_MR", "Subject", "cdr"]
red_match_df = match_df.loc[:, cols].copy()

In [6]:
# Order the rows by MR session ID.
# NOTE(review): the IDs shown in the outputs look like <subject>_MR_d<padded day>,
# so a plain string sort gives subject-then-day order — confirm the day field is
# always zero-padded to the same width.
MR_sorted_df = red_match_df.sort_values(by='MR ID_MR', ascending=True).copy()

In [7]:
# Earliest MR row per subject (MR_sorted_df is already in MR-ID order).
#
# Whole rows are selected rather than using GroupBy.first(): first() returns the
# first *non-null value per column*, so when the earliest row has no cdr it
# pairs that row's MR ID with a cdr taken from a later session. Rows without a
# cdr are dropped first, which keeps the cdr chosen for every subject the same
# as before while guaranteeing the IDs come from the same row as that cdr.
# NOTE(review): this makes the result "earliest MR with a recorded cdr";
# subjects with no cdr at all are absent instead of carrying a NaN cdr.
earliest_MRs = (
    MR_sorted_df
    .dropna(subset=['Subject', 'cdr'])
    .drop_duplicates(subset='Subject', keep='first')
    .sort_values('Subject', kind='stable')  # same subject ordering groupby produced
    .set_index('Subject')
    .reset_index()                          # Subject as first column, fresh 0..n-1 index
)

In [8]:
# Keep the subjects whose first scan has a cdr at or below the threshold
first_scan_ok = earliest_MRs["cdr"].le(threshold)
normal_first_scans = earliest_MRs.loc[first_scan_ok].copy()
normal_first_scans

Unnamed: 0,Subject,ADRC_ADRCCLINICALDATA ID,MR ID_MR,cdr
0,OAS30001,OAS30001_ClinicalData_d0000,OAS30001_MR_d0129,0.0
1,OAS30002,OAS30002_ClinicalData_d0751,OAS30002_MR_d0653,0.0
2,OAS30003,OAS30003_ClinicalData_d2630,OAS30003_MR_d2682,0.0
3,OAS30004,OAS30004_ClinicalData_d1102,OAS30004_MR_d1101,0.0
4,OAS30005,OAS30005_ClinicalData_d0000,OAS30005_MR_d0143,0.0
...,...,...,...,...
1077,OAS31165,OAS31165_ClinicalData_d1072,OAS31165_MR_d1122,0.0
1078,OAS31166,OAS31166_ClinicalData_d0833,OAS31166_MR_d0947,0.0
1079,OAS31167,OAS31167_ClinicalData_d0000,OAS31167_MR_d0064,0.0
1080,OAS31168,OAS31168_ClinicalData_d0000,OAS31168_MR_d0148,0.0


# Get labels for IDs

In [9]:
# Independent copy of the reduced matching table (every matched row)
all_ids = red_match_df.copy(deep=True)

In [10]:
# Rows per subject, counted on the cdr column (so rows with a missing cdr are
# not counted). Result has two columns: Subject and Count.
id_counts = (
    all_ids
    .groupby('Subject')['cdr']
    .count()
    .reset_index(name='Count')
)
id_counts

Unnamed: 0,Subject,Count
0,OAS30001,8
1,OAS30002,3
2,OAS30003,4
3,OAS30004,3
4,OAS30005,4
...,...,...
1079,OAS31167,3
1080,OAS31168,4
1081,OAS31170,2
1082,OAS31171,1


In [11]:
# Subjects whose Count reaches the configured minimum number of sessions
enough_sessions = id_counts["Count"].ge(min_n_sessions)
id_more_than_ = id_counts.loc[enough_sessions].copy()

In [12]:
# Order the reduced matching rows by clinical-session ID (string sort)
clinic_sorted_df = red_match_df.sort_values(
    by='ADRC_ADRCCLINICALDATA ID', ascending=True
).copy()

In [13]:
# Latest clinical-session row per subject (clinic_sorted_df is already in
# clinical-ID order).
#
# Whole rows are selected rather than using GroupBy.last(): last() returns the
# last *non-null value per column*, so when the final row has no cdr it pairs
# that row's clinical ID with a cdr taken from an earlier session. Rows without
# a cdr are dropped first, which keeps the cdr chosen for every subject the same
# as before while guaranteeing the IDs come from the same row as that cdr.
# NOTE(review): this makes the result "latest session with a recorded cdr";
# subjects with no cdr at all are absent instead of carrying a NaN cdr.
latest_clinic = (
    clinic_sorted_df
    .dropna(subset=['Subject', 'cdr'])
    .drop_duplicates(subset='Subject', keep='last')
    .sort_values('Subject', kind='stable')  # same subject ordering groupby produced
    .set_index('Subject')
    .reset_index()                          # Subject as first column, fresh 0..n-1 index
)

In [14]:
# Restrict the latest-session rows to subjects listed in id_more_than_
ids_to_keep = id_more_than_["Subject"].copy()
keep_mask = latest_clinic["Subject"].isin(ids_to_keep)
latest_clinic_more_than = latest_clinic.loc[keep_mask].copy()

In [15]:
# Add a Label column derived from the cdr on each subject's latest-session row
subject_labels = latest_clinic_more_than.copy()

def label_(row):
    """Return 1 if ``row['cdr']`` is greater than ``threshold``, otherwise 0.

    Row-wise form of the labelling rule, kept so a single row can still be
    labelled; the Label column below uses the equivalent vectorised comparison.
    """
    return 1 if row['cdr'] > threshold else 0

# Vectorised equivalent of applying label_ to every row. A missing cdr compares
# False and therefore gets 0, exactly as label_ would return. Unlike
# DataFrame.apply(axis=1), this also produces a proper (empty) integer column
# when no subject survives the earlier filtering.
subject_labels["Label"] = (subject_labels["cdr"] > threshold).astype("int64")
subject_labels

Unnamed: 0,Subject,ADRC_ADRCCLINICALDATA ID,MR ID_MR,cdr,Label
0,OAS30001,OAS30001_ClinicalData_d3332,OAS30001_MR_d3132,0.0,0
1,OAS30002,OAS30002_ClinicalData_d2585,OAS30002_MR_d2345,0.0,0
2,OAS30003,OAS30003_ClinicalData_d3633,OAS30003_MR_d3731,0.0,0
3,OAS30004,OAS30004_ClinicalData_d3458,OAS30004_MR_d3457,0.0,0
4,OAS30005,OAS30005_ClinicalData_d3325,OAS30005_MR_d3453,0.0,0
...,...,...,...,...,...
1071,OAS31159,OAS31159_ClinicalData_d1701,OAS31159_MR_d1893,0.0,0
1076,OAS31164,OAS31164_ClinicalData_d1680,OAS31164_MR_d1678,0.0,0
1079,OAS31167,OAS31167_ClinicalData_d4565,OAS31167_MR_d4564,0.0,0
1080,OAS31168,OAS31168_ClinicalData_d2438,OAS31168_MR_d2526,0.0,0


In [16]:
# Show only the subjects labelled 1
subject_labels.loc[subject_labels["Label"].eq(1)]

Unnamed: 0,Subject,ADRC_ADRCCLINICALDATA ID,MR ID_MR,cdr,Label
6,OAS30007,OAS30007_ClinicalData_d2939,OAS30007_MR_d2722,0.5,1
11,OAS30012,OAS30012_ClinicalData_d0758,OAS30012_MR_d0581,0.5,1
25,OAS30027,OAS30027_ClinicalData_d2680,OAS30027_MR_d2394,0.5,1
27,OAS30029,OAS30029_ClinicalData_d1211,OAS30029_MR_d0893,1.0,1
33,OAS30035,OAS30035_ClinicalData_d4165,OAS30035_MR_d3850,0.5,1
...,...,...,...,...,...
1008,OAS31092,OAS31092_ClinicalData_d3336,OAS31092_MR_d3113,0.5,1
1025,OAS31111,OAS31111_ClinicalData_d3801,OAS31111_MR_d3618,0.5,1
1041,OAS31128,OAS31128_ClinicalData_d2679,OAS31128_MR_d2332,0.5,1
1051,OAS31139,OAS31139_ClinicalData_d1848,OAS31139_MR_d1568,0.5,1


# Final dataset merging

In [17]:
# Join each subject's final-session label with their first-scan row. The inner
# join keeps only subjects present in both tables, i.e. those that passed the
# session-count filter and whose first scan passed the cdr threshold.
#
# Columns are renamed by name (via explicit suffixes) instead of assigning a
# positional list, so a change in upstream column order cannot silently attach
# the wrong header to a column. validate= makes the merge fail loudly if either
# side ever holds more than one row per subject.
# NOTE: 'FIrst_Clinic_cdr' is spelled this way on purpose — it is the column
# name already written to the cached file, so it is kept for compatibility.
final_df = (
    pd.merge(
        subject_labels,
        normal_first_scans,
        on='Subject',
        how='inner',
        suffixes=('_final', '_first'),
        validate='one_to_one',
    )
    .drop(columns=['MR ID_MR_final'])
    .rename(columns={
        'ADRC_ADRCCLINICALDATA ID_final': 'Final_Clinic_ID',
        'cdr_final': 'Final_cdr',
        'ADRC_ADRCCLINICALDATA ID_first': 'First_Clinical_ID',
        'MR ID_MR_first': 'First_MR_ID',
        'cdr_first': 'FIrst_Clinic_cdr',
    })
)
# Fix the output column order explicitly; a missing column raises KeyError here.
final_df = final_df[["Subject", "Final_Clinic_ID", "Final_cdr", "Label",
                     "First_Clinical_ID", "First_MR_ID", "FIrst_Clinic_cdr"]]
final_df

Unnamed: 0,Subject,Final_Clinic_ID,Final_cdr,Label,First_Clinical_ID,First_MR_ID,FIrst_Clinic_cdr
0,OAS30001,OAS30001_ClinicalData_d3332,0.0,0,OAS30001_ClinicalData_d0000,OAS30001_MR_d0129,0.0
1,OAS30002,OAS30002_ClinicalData_d2585,0.0,0,OAS30002_ClinicalData_d0751,OAS30002_MR_d0653,0.0
2,OAS30003,OAS30003_ClinicalData_d3633,0.0,0,OAS30003_ClinicalData_d2630,OAS30003_MR_d2682,0.0
3,OAS30004,OAS30004_ClinicalData_d3458,0.0,0,OAS30004_ClinicalData_d1102,OAS30004_MR_d1101,0.0
4,OAS30005,OAS30005_ClinicalData_d3325,0.0,0,OAS30005_ClinicalData_d0000,OAS30005_MR_d0143,0.0
...,...,...,...,...,...,...,...
363,OAS31159,OAS31159_ClinicalData_d1701,0.0,0,OAS31159_ClinicalData_d0437,OAS31159_MR_d0074,0.0
364,OAS31164,OAS31164_ClinicalData_d1680,0.0,0,OAS31164_ClinicalData_d0000,OAS31164_MR_d0069,0.0
365,OAS31167,OAS31167_ClinicalData_d4565,0.0,0,OAS31167_ClinicalData_d0000,OAS31167_MR_d0064,0.0
366,OAS31168,OAS31168_ClinicalData_d2438,0.0,0,OAS31168_ClinicalData_d0000,OAS31168_MR_d0148,0.0


In [18]:
# Summary: number of subjects in the final table and how many carry Label == 1
n_total = len(final_df)
n_declining = final_df["Label"].sum()
print("Total entries: ", n_total)
print("Number of 'declining' patients: ", n_declining)

Total entries:  368
Number of 'declining' patients:  44


In [19]:
# Display the merged table once more before it is written to disk
final_df

Unnamed: 0,Subject,Final_Clinic_ID,Final_cdr,Label,First_Clinical_ID,First_MR_ID,FIrst_Clinic_cdr
0,OAS30001,OAS30001_ClinicalData_d3332,0.0,0,OAS30001_ClinicalData_d0000,OAS30001_MR_d0129,0.0
1,OAS30002,OAS30002_ClinicalData_d2585,0.0,0,OAS30002_ClinicalData_d0751,OAS30002_MR_d0653,0.0
2,OAS30003,OAS30003_ClinicalData_d3633,0.0,0,OAS30003_ClinicalData_d2630,OAS30003_MR_d2682,0.0
3,OAS30004,OAS30004_ClinicalData_d3458,0.0,0,OAS30004_ClinicalData_d1102,OAS30004_MR_d1101,0.0
4,OAS30005,OAS30005_ClinicalData_d3325,0.0,0,OAS30005_ClinicalData_d0000,OAS30005_MR_d0143,0.0
...,...,...,...,...,...,...,...
363,OAS31159,OAS31159_ClinicalData_d1701,0.0,0,OAS31159_ClinicalData_d0437,OAS31159_MR_d0074,0.0
364,OAS31164,OAS31164_ClinicalData_d1680,0.0,0,OAS31164_ClinicalData_d0000,OAS31164_MR_d0069,0.0
365,OAS31167,OAS31167_ClinicalData_d4565,0.0,0,OAS31167_ClinicalData_d0000,OAS31167_MR_d0064,0.0
366,OAS31168,OAS31168_ClinicalData_d2438,0.0,0,OAS31168_ClinicalData_d0000,OAS31168_MR_d0148,0.0


In [20]:
import pickle
from pathlib import Path

# Cache the final subject table as a pickle for later loading.
# The target directory is created first so a missing ./cached_files does not
# fail the notebook with FileNotFoundError after all the work above has run.
subjects_path = Path("./cached_files/oasis_test_subjects_file")
subjects_path.parent.mkdir(parents=True, exist_ok=True)
with subjects_path.open('wb') as subjects_file:
    pickle.dump(final_df, subjects_file)

In [None]:
# NOTE(review): `self` is not defined anywhere in the visible notebook, so this
# cell would raise NameError if run (it has no execution count). It looks like
# a snippet lifted from a class that reloads a pickled object — confirm, then
# adapt it to this notebook or remove it.
# Reminder: pickle.load must only be used on trusted files.
with open(self.flat_list_file, 'rb') as flat_list_file:
                self.flat_list = pickle.load(flat_list_file)