In [1]:
import numpy as np
import sklearn
import pandas as pd
import warnings
%reload_ext autoreload
%autoreload 2
%matplotlib inline
warnings.filterwarnings('ignore')

## Input Data

### Train target

In [2]:
train_target = pd.read_csv('../data/TADPOLE_TargetData_train.csv')
# normalize the format of DATE
train_target['Date'] = pd.to_datetime(train_target['Date'])

# Sorting data
train_target = train_target.sort_values(by=['PTID_Key'])
train_target = train_target.groupby(['PTID_Key']).apply(lambda x: x.sort_values(['Date'], ascending = True))
train_target = train_target.reset_index(drop=True)

train_target.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13,Ventricles_Norm,MMSE
0,2013-06-07,6,1.0,0.0,0.0,,,30.0
1,2013-10-16,6,,,,,,
2,2014-05-29,6,1.0,0.0,0.0,3.0,,30.0
3,2016-05-05,6,1.0,0.0,0.0,1.0,,29.0
4,2013-03-28,8,,,,,,


In [15]:
train_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2506 entries, 0 to 2505
Data columns (total 8 columns):
Date               2506 non-null datetime64[ns]
PTID_Key           2506 non-null int64
CN_Diag            1629 non-null float64
MCI_Diag           1629 non-null float64
AD_Diag            1629 non-null float64
ADAS13             1637 non-null float64
Ventricles_Norm    868 non-null float64
MMSE               1658 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 156.7 KB


In [11]:
train_target.isnull().sum()

Date                  0
PTID_Key              0
CN_Diag             877
MCI_Diag            877
AD_Diag             877
ADAS13              869
Ventricles_Norm    1638
MMSE                848
dtype: int64

## validation target

In [3]:
val_target = pd.read_csv('../data/TADPOLE_TargetData_validation.csv')
# normalize the format of DATE
val_target['Date'] = pd.to_datetime(val_target['Date'])

# Sorting data
val_target = val_target.sort_values(by=['PTID_Key'])
val_target = val_target.groupby(['PTID_Key']).apply(lambda x: x.sort_values(['Date'], ascending = True))
val_target = val_target.reset_index(drop=True)

val_target.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13,Ventricles_Norm,MMSE
0,2013-02-13,5,1.0,0.0,0.0,16.0,0.019279,28.0
1,2013-08-14,5,,,,,,
2,2014-02-20,5,0.0,1.0,0.0,23.0,,27.0
3,2015-03-05,5,0.0,0.0,1.0,29.0,,27.0
4,2016-04-28,5,0.0,0.0,1.0,26.0,,19.0


In [17]:
val_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867 entries, 0 to 866
Data columns (total 8 columns):
Date               867 non-null datetime64[ns]
PTID_Key           867 non-null int64
CN_Diag            569 non-null float64
MCI_Diag           569 non-null float64
AD_Diag            569 non-null float64
ADAS13             574 non-null float64
Ventricles_Norm    281 non-null float64
MMSE               582 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(1)
memory usage: 54.3 KB


In [13]:
val_target.isnull().sum()

Date                 0
PTID_Key             0
CN_Diag            298
MCI_Diag           298
AD_Diag            298
ADAS13             293
Ventricles_Norm    586
MMSE               285
dtype: int64

## Input data 

In [4]:
Input_Data = pd.read_csv('../data/TADPOLE_InputData.csv')

# normalize the format of DATE
Input_Data['EXAMDATE'] = pd.to_datetime(Input_Data['EXAMDATE'],errors='coerce')

bl_time = ['Month']
fixed_features = ['PTID_Key','EXAMDATE']
selected_features = ['CDRSB','ADAS11','RAVLT_immediate','Hippocampus','WholeBrain','Entorhinal','MidTemp','APOE4','AGE']
predicted_features = ['DX_bl','DX','ADAS13','Ventricles','MMSE']
df_data = Input_Data[fixed_features + selected_features + predicted_features + bl_time]

# Sorting data
df_data = df_data.sort_values(by=['PTID_Key'])

# Sorting the EXAMDATE because the EXAMDATE of one objective are disorderly
df_data = df_data.groupby(['PTID_Key']).apply(lambda x: x.sort_values(['EXAMDATE'], ascending = True))
df_data = df_data.reset_index(drop=True)

# df_data = df_data.dropna(axis=0)

# Check how many missing data and the type of data
print(df_data.head())
print('-'*50)
print(df_data.isnull().sum())
print('-'*50)
print(df_data.info())

   PTID_Key   EXAMDATE  CDRSB  ADAS11  RAVLT_immediate  Hippocampus  \
0       1.0 2010-12-10    1.5   14.00             28.0       6288.0   
1       1.0 2011-04-07    NaN     NaN              NaN       6314.0   
2       1.0 2011-09-08    3.0   17.00             26.0       6345.0   
3       2.0 2006-07-21    4.5   18.67             25.0       4951.0   
4       2.0 2007-01-16    5.0   19.33             20.0          NaN   

   WholeBrain  Entorhinal  MidTemp  APOE4   AGE DX_bl        DX  ADAS13  \
0    904429.0      4081.0  14310.0    1.0  81.6  EMCI       MCI   21.00   
1    909689.0      3635.0  14899.0    1.0  81.6  EMCI       NaN     NaN   
2    901352.0      3645.0  14891.0    1.0  81.6  EMCI       MCI   26.00   
3    828140.0      2275.0  11760.0    0.0  76.6    AD  Dementia   27.67   
4    828686.0         NaN      NaN    0.0  76.6    AD  Dementia   30.33   

   Ventricles  MMSE  Month  
0     30260.0  27.0    0.0  
1     30302.0   NaN    3.0  
2     31615.0  24.0    6.0  
3     

## Forming data structure

Use `PTID` from train_target to get individual's history examination record and make a time series training set including individual's history visit with history results and future results.

### training data

In [16]:
ID_train = np.unique(train_target.PTID_Key.values)
for ID in ID_train:
    idx_list = train_target[train_target['PTID_Key']==ID].index.values
    for idx in idx_list:
        train_target.loc[idx,'blTime'] = df_data[df_data['PTID_Key']==ID]['EXAMDATE'].values[0]
        train_target.loc[idx,'DX_bl'] = df_data[df_data['PTID_Key']==ID]['DX'].values[0]
#train_target['DX_bl'] = train_target['DX_bl'].astype(int)
# train_target.head()

In [17]:
train_target['Month'] = ((train_target.Date - train_target.blTime)/np.timedelta64(1, 'M'))
train_target['Month'] = np.rint(train_target['Month'])
train_target.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13,Ventricles_Norm,MMSE,blTime,DX_bl,Month
0,2013-06-07,6,1.0,0.0,0.0,,,30.0,2006-04-26,NL,85.0
1,2013-10-16,6,,,,,,,2006-04-26,NL,90.0
2,2014-05-29,6,1.0,0.0,0.0,3.0,,30.0,2006-04-26,NL,97.0
3,2016-05-05,6,1.0,0.0,0.0,1.0,,29.0,2006-04-26,NL,120.0
4,2013-03-28,8,,,,,,,2011-10-27,NL,17.0


In [202]:

for ID in ID_train:
    print(ID)
    last_visit = df_data[df_data['PTID_Key']==ID]['Month'].values.tolist()[-1]
    print(df_data[df_data['PTID_Key']==ID]['Month'].values.tolist())
    future_visit_last_predict = train_target[train_target['PTID_Key']==ID]['Month'].values.tolist()[-1]
    print(train_target[train_target['PTID_Key']==ID]['Month'].values.tolist())
    print('-'*50)
    

6
[0.0, 6.0, 12.0, 18.0, 24.0, 36.0, 36.0, 48.0, 54.0, 60.0, 66.0, 72.0, 78.0]
[85.0, 90.0, 97.0, 120.0]
--------------------------------------------------
8
[0.0, 3.0, 6.0, 12.0]
[17.0, 24.0, 30.0]
--------------------------------------------------
18
[0.0, 3.0, 6.0, 12.0, 18.0]
[25.0, 32.0, 37.0, 50.0]
--------------------------------------------------
21
[0.0, 3.0]
[6.0, 12.0, 18.0, 24.0, 36.0, 49.0]
--------------------------------------------------
22
[0.0, 3.0]
[6.0, 12.0, 18.0, 24.0, 37.0, 49.0]
--------------------------------------------------
25
[0.0]
[2.0, 6.0, 12.0]
--------------------------------------------------
26
[0.0, 3.0, 6.0, 12.0]
[20.0, 24.0, 30.0, 53.0]
--------------------------------------------------
32
[0.0, 3.0]
[6.0, 12.0, 18.0, 24.0, 39.0]
--------------------------------------------------
34
[0.0, 3.0, 6.0]
[12.0, 18.0, 24.0, 48.0]
--------------------------------------------------
36
[0.0]
[0.0, 6.0]
--------------------------------------------------
40

[0.0, 3.0, 6.0]
[12.0, 18.0, 24.0, 48.0]
--------------------------------------------------
422
[0.0, 3.0, 6.0]
[12.0, 34.0]
--------------------------------------------------
423
[0.0, 3.0, 6.0, 6.0, 18.0, 18.0, 30.0]
[34.0, 42.0, 48.0, 61.0]
--------------------------------------------------
424
[0.0, 3.0]
[6.0, 13.0, 18.0]
--------------------------------------------------
425
[0.0, 3.0]
[6.0, 12.0, 18.0, 24.0]
--------------------------------------------------
429
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 48.0, 54.0, 60.0, 66.0, 72.0]
[79.0, 90.0, 97.0, 112.0]
--------------------------------------------------
431
[0.0, 3.0, 6.0, 12.0]
[18.0, 24.0, 48.0]
--------------------------------------------------
434
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 48.0, 54.0, 60.0, 66.0, 72.0]
[78.0, 84.0, 105.0]
--------------------------------------------------
435
[0.0, 3.0, 6.0]
[12.0, 18.0, 24.0, 36.0, 48.0]
--------------------------------------------------
437
[0.0, 3.0, 6.0]
[13

[6.0, 12.0, 18.0, 24.0, 36.0, 48.0]
--------------------------------------------------
817
[0.0]
[0.0, 3.0, 6.0]
--------------------------------------------------
819
[0.0, 3.0]
[6.0, 12.0, 18.0]
--------------------------------------------------
825
[0.0, 6.0, 12.0, 18.0, 30.0, 36.0, 42.0, 54.0, 60.0, 72.0]
[76.0, 84.0, 90.0, 96.0, 113.0]
--------------------------------------------------
827
[0.0, 3.0, 6.0, 12.0]
[18.0, 24.0, 30.0, 48.0]
--------------------------------------------------
829
[0.0, 3.0, 6.0]
[12.0, 24.0]
--------------------------------------------------
830
[0.0, 3.0, 6.0]
[18.0, 19.0, 26.0, 49.0]
--------------------------------------------------
833
[0.0, 3.0, 3.0]
[12.0, 20.0, 24.0, 37.0, 52.0]
--------------------------------------------------
835
[0.0, 3.0, 6.0, 12.0, 18.0, 24.0]
[30.0, 36.0, 48.0, 62.0]
--------------------------------------------------
837
[0.0, 3.0, 6.0, 12.0]
[19.0, 24.0, 49.0]
--------------------------------------------------
838
[0.0, 3.

[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 54.0, 60.0, 66.0, 72.0, 72.0, 78.0]
[86.0, 90.0, 96.0, 102.0]
--------------------------------------------------
1198
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 48.0, 54.0, 60.0, 66.0, 72.0, 78.0]
[87.0, 93.0]
--------------------------------------------------
1200
[0.0]
[0.0, 2.0, 6.0, 12.0, 24.0]
--------------------------------------------------
1201
[0.0, 3.0]
[6.0, 12.0, 18.0]
--------------------------------------------------
1202
[0.0, 3.0]
[6.0, 12.0, 18.0, 25.0, 41.0]
--------------------------------------------------
1204
[0.0, 3.0, 6.0, 12.0]
[18.0, 24.0, 30.0, 49.0]
--------------------------------------------------
1208
[0.0]
[3.0, 6.0, 12.0, 18.0, 24.0, 38.0, 43.0]
--------------------------------------------------
1209
[0.0]
[3.0, 6.0, 12.0]
--------------------------------------------------
1212
[0.0]
[2.0, 6.0, 12.0, 25.0]
--------------------------------------------------
1213
[0.0, 3.0]
[6.0, 12.0, 18.0, 24.0, 36.0]


[0.0]
[0.0, 3.0, 7.0, 17.0]
--------------------------------------------------
1555
[0.0, 3.0, 6.0, 12.0, 18.0]
[33.0, 61.0]
--------------------------------------------------
1562
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 48.0, 60.0, 66.0]
[73.0, 79.0, 84.0, 88.0, 108.0]
--------------------------------------------------
1564
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 42.0, 48.0, 54.0, 60.0, 66.0, 72.0]
[78.0, 84.0, 90.0, 109.0]
--------------------------------------------------
1565
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 66.0, 78.0, 84.0]
[96.0, 110.0, 121.0]
--------------------------------------------------
1567
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 72.0]
[82.0]
--------------------------------------------------
1568
[0.0, 3.0, 6.0]
[16.0, 19.0, 25.0, 36.0]
--------------------------------------------------
1570
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 36.0, 48.0, 54.0, 60.0, 66.0, 72.0]
[78.0]
--------------------------------------------------
1573
[0.0, 6.0, 12.0, 18.0, 24.0, 30.0, 3

In [18]:
predict_step = []
for ID in ID_train:
    #print(ID)
    last_visit = df_data[df_data['PTID_Key']==ID]['Month'].values.tolist()[-1]
    #print(last_visit)
    future_visit_last_predict = train_target[train_target['PTID_Key']==ID]['Month'].values.tolist()[-1]
    future_visit_time = (future_visit_last_predict - last_visit)//6
    #print(future_visit_time)
    #print('-'*50)
    predict_step.append(future_visit_time)
    
df_predict_step = pd.DataFrame(predict_step)
df_predict_step.describe()

Unnamed: 0,0
count,655.0
mean,4.876336
std,2.121314
min,0.0
25%,3.0
50%,5.0
75%,7.0
max,9.0


### check for validation data

time step and future visit

In [173]:
ID_val = np.unique(val_target.PTID_Key.values)
for ID in ID_val:
    idx_list = val_target[val_target['PTID_Key']==ID].index.values
    for idx in idx_list:
        val_target.loc[idx,'blTime'] = df_data[df_data['PTID_Key']==ID]['EXAMDATE'].values[0]
        val_target.loc[idx,'DX_bl'] = df_data[df_data['PTID_Key']==ID]['DX'].values[0]
#train_target['DX_bl'] = train_target['DX_bl'].astype(int)

val_target['Month'] = ((val_target.Date - val_target.blTime)/np.timedelta64(1, 'M'))
val_target['Month'] = np.rint(val_target['Month'])
val_target.head()


predict_step = []
for ID in ID_val:
    #print(ID)
    last_visit = df_data[df_data['PTID_Key']==ID]['Month'].values.tolist()[-1]
    #print(last_visit)
    future_visit_last_predict = val_target[val_target['PTID_Key']==ID]['Month'].values.tolist()[-1]
    future_visit_time = (future_visit_last_predict - last_visit)//6
    #print(future_visit_time)
    #print('-'*50)
    predict_step.append(future_visit_time)
    
df_predict_step_val = pd.DataFrame(predict_step)
df_predict_step_val.describe()
#print(predict_step.count(max(predict_step)))

Unnamed: 0,0
count,218.0
mean,5.256881
std,2.096315
min,0.0
25%,4.0
50%,6.0
75%,7.0
max,9.0


### check for test data

In [197]:
test_target = pd.read_csv('../data/TADPOLE_PredictTargetData_test.csv')
# normalize the format of DATE
test_target['Date'] = pd.to_datetime(test_target['Date'])

# Sorting data
test_target = test_target.sort_values(by=['PTID_Key'])
test_target = test_target.groupby(['PTID_Key']).apply(lambda x: x.sort_values(['Date'], ascending = True))
test_target = test_target.reset_index(drop=True)

ID_test = np.unique(test_target.PTID_Key.values)
for ID in ID_test:
    idx_list = test_target[test_target['PTID_Key']==ID].index.values
    for idx in idx_list:
        test_target.loc[idx,'blTime'] = df_data[df_data['PTID_Key']==ID]['EXAMDATE'].values[0]
        test_target.loc[idx,'DX_bl'] = df_data[df_data['PTID_Key']==ID]['DX'].values[0]
#train_target['DX_bl'] = train_target['DX_bl'].astype(int)

test_target['Month'] = ((test_target.Date - test_target.blTime)/np.timedelta64(1, 'M'))
test_target['Month'] = np.rint(test_target['Month'])

predict_step = []
for ID in ID_test:
    #print(ID)
    last_visit = df_data[df_data['PTID_Key']==ID]['Month'].values.tolist()[-1]
    #print(df_data[df_data['PTID_Key']==ID]['Month'].values.tolist())
    future_visit_last_predict = test_target[test_target['PTID_Key']==ID]['Month'].values.tolist()[-1]
    future_visit_time = (future_visit_last_predict - last_visit)//6
    #print(test_target[test_target['PTID_Key']==ID]['Month'].values.tolist())
    #print(future_visit_time)
    #print('-'*50)
    predict_step.append(future_visit_time)
    
df_predict_step_test = pd.DataFrame(predict_step)
df_predict_step_test.describe()

Unnamed: 0,0
count,218.0
mean,4.807339
std,2.277832
min,0.0
25%,3.0
50%,6.0
75%,7.0
max,9.0


## preparing for training

In this part, first we normalize the time interval to be uniform as 6 months, and we use the nearest history visit data to fill in the training data and their corresponding label of the disease classification as well as the regression results for ADAS13, Ventricles and MMSE.

**Notice** For the time series data formation, we assume the time interval between two consecutive visit is **6 months**, and according to our problem, we try to predict the future **8 visits' results** based on current record. Here the future 8 visits comes as a hyper-parameters which we would tune by cross-validation.

**For the data missing, we also use the neareast value to fill in the NaN.

In [5]:
df_data.head()

Unnamed: 0,PTID_Key,EXAMDATE,CDRSB,ADAS11,RAVLT_immediate,Hippocampus,WholeBrain,Entorhinal,MidTemp,APOE4,AGE,DX_bl,DX,ADAS13,Ventricles,MMSE,Month
0,1.0,2010-12-10,1.5,14.0,28.0,6288.0,904429.0,4081.0,14310.0,1.0,81.6,EMCI,MCI,21.0,30260.0,27.0,0.0
1,1.0,2011-04-07,,,,6314.0,909689.0,3635.0,14899.0,1.0,81.6,EMCI,,,30302.0,,3.0
2,1.0,2011-09-08,3.0,17.0,26.0,6345.0,901352.0,3645.0,14891.0,1.0,81.6,EMCI,MCI,26.0,31615.0,24.0,6.0
3,2.0,2006-07-21,4.5,18.67,25.0,4951.0,828140.0,2275.0,11760.0,0.0,76.6,AD,Dementia,27.67,62890.0,25.0,0.0
4,2.0,2007-01-16,5.0,19.33,20.0,,828686.0,,,0.0,76.6,AD,Dementia,30.33,65800.0,24.0,6.0


In [14]:
Input_Data = pd.read_csv('../data/TADPOLE_InputData.csv')

# normalize the format of DATE
Input_Data['EXAMDATE'] = pd.to_datetime(Input_Data['EXAMDATE'],errors='coerce')

bl_time = ['Month']
fixed_features = ['PTID_Key','EXAMDATE']
selected_features = ['CDRSB','ADAS11','RAVLT_immediate','Hippocampus','WholeBrain','Entorhinal','MidTemp','APOE4','AGE', 'ICV']
predicted_features = ['DX_bl','DX','ADAS13','Ventricles','MMSE']
df_data = Input_Data[fixed_features + selected_features + predicted_features + bl_time]

# Sorting data
df_data = df_data.sort_values(by=['PTID_Key'])

# Sorting the EXAMDATE because the EXAMDATE of one objective are disorderly
df_data = df_data.groupby(['PTID_Key']).apply(lambda x: x.sort_values(['EXAMDATE'], ascending = True))
df_data = df_data.reset_index(drop=True)

# df_data = df_data.dropna(axis=0)

# Check how many missing data and the type of data
print(df_data.isnull().sum())
print(df_data.info())

PTID_Key              0
EXAMDATE              0
CDRSB              2427
ADAS11             2423
RAVLT_immediate    2474
Hippocampus        3355
WholeBrain         2418
Entorhinal         3412
MidTemp            3412
APOE4                12
AGE                   0
ICV                2290
DX_bl                 0
DX                 2399
ADAS13             2486
Ventricles         2566
MMSE               2412
Month                 0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8715 entries, 0 to 8714
Data columns (total 18 columns):
PTID_Key           8715 non-null float64
EXAMDATE           8715 non-null datetime64[ns]
CDRSB              6288 non-null float64
ADAS11             6292 non-null float64
RAVLT_immediate    6241 non-null float64
Hippocampus        5360 non-null float64
WholeBrain         6297 non-null float64
Entorhinal         5303 non-null float64
MidTemp            5303 non-null float64
APOE4              8703 non-null float64
AGE                8715 non-nul

In [23]:
train_target['Month'] = train_target['Month'].astype(int)
train_target.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13,Ventricles_Norm,MMSE,blTime,DX_bl,Month
0,2013-06-07,6,1.0,0.0,0.0,,,30.0,2006-04-26,NL,85
1,2013-10-16,6,,,,,,,2006-04-26,NL,90
2,2014-05-29,6,1.0,0.0,0.0,3.0,,30.0,2006-04-26,NL,97
3,2016-05-05,6,1.0,0.0,0.0,1.0,,29.0,2006-04-26,NL,120
4,2013-03-28,8,,,,,,,2011-10-27,NL,17


In [22]:
df_data['PTID_Key'] = df_data['PTID_Key'].astype(int)
df_data['Month'] = df_data['Month'].astype(int)
df_data.head()

Unnamed: 0,PTID_Key,EXAMDATE,CDRSB,ADAS11,RAVLT_immediate,Hippocampus,WholeBrain,Entorhinal,MidTemp,APOE4,AGE,ICV,DX_bl,DX,ADAS13,Ventricles,MMSE,Month
0,1,2010-12-10,1.5,14.0,28.0,6288.0,904429.0,4081.0,14310.0,1.0,81.6,1328070.0,EMCI,MCI,21.0,30260.0,27.0,0
1,1,2011-04-07,,,,6314.0,909689.0,3635.0,14899.0,1.0,81.6,1331790.0,EMCI,,,30302.0,,3
2,1,2011-09-08,3.0,17.0,26.0,6345.0,901352.0,3645.0,14891.0,1.0,81.6,1335120.0,EMCI,MCI,26.0,31615.0,24.0,6
3,2,2006-07-21,4.5,18.67,25.0,4951.0,828140.0,2275.0,11760.0,0.0,76.6,1413940.0,AD,Dementia,27.67,62890.0,25.0,0
4,2,2007-01-16,5.0,19.33,20.0,,828686.0,,,0.0,76.6,1415100.0,AD,Dementia,30.33,65800.0,24.0,6


In [247]:
df_data_re = df_data
train_target_re = train_target

# print("nrow of past visit:", df_data_re.shape[0], end = "\n\n")
# print("nrow of future visit:", train_target_re.shape[0], end = "\n\n")

time_interval = 6
for ID in ID_train:
    past_visit = df_data_re[df_data_re['PTID_Key']==ID]['Month'].values
    idx = df_data_re[df_data_re['PTID_Key']==ID].index.values
    
    # print(past_visit//time_interval)
    # print(idx)
    visit_time = 0
    index_delete = []
    index = 0
    month_record = past_visit//time_interval
    for record in month_record:
        if record == visit_time:
            visit_time += 1
        elif record < visit_time:
            # print(idx[index])
            index_delete.append(idx[index])
        elif record > visit_time:
            visit_time = record
            visit_time += 1
        index += 1
    
    df_data_re = df_data_re.drop(index_delete).reset_index(drop=True)
    
    future_visit = train_target_re[train_target_re['PTID_Key']==ID]['Month'].values
    idx = train_target_re[train_target_re['PTID_Key']==ID].index.values
    if future_visit[0]//time_interval <= visit_time-1:
        train_target_re.loc[(train_target_re['PTID_Key']==ID)
                            & (train_target_re['Month']//time_interval<=visit_time-1),
                            'Month'] = int(visit_time*time_interval)
    
    future_visit = train_target_re[train_target_re['PTID_Key']==ID]['Month'].values
    index_delete = []
    index = 0
    # print(future_visit//time_interval)
    for month_record in future_visit/time_interval:
        if month_record == visit_time:
            visit_time += 1
        elif month_record < visit_time:
            index_delete.append(idx[index])
        elif month_record > visit_time:
            visit_time = month_record
            visit_time += 1
        index += 1
    
    train_target_re = train_target_re.drop(index_delete).reset_index(drop=True)
    
print("nrow of past visit:", df_data_re.shape[0], end = "\n\n")
print("nrow of future visit:", train_target_re.shape[0], end = "\n\n")

nrow of past visit: 8298

nrow of future visit: 2158



In [272]:
df_data_re = df_data_re.fillna(method='ffill')
time_interval = 6
for ID in ID_train:
    past_visit = df_data_re[df_data_re['PTID_Key']==ID]['Month'].values
    
    # print('past:', past_visit//time_interval)
    visit_time = 0
    month_record = past_visit//time_interval
    for record in month_record:
        if record > visit_time:
            add_num = record - visit_time
            past_data = df_data_re.loc[(df_data_re['PTID_Key']==ID) & (df_data_re['Month']//time_interval==visit_time-1), :]
            past_data_index = past_data.index.values
            past_data = pd.DataFrame(np.repeat(past_data.values,add_num,axis=0))
            past_data.columns = df_data_re.columns
            past_data['Month'] = [i for i in (visit_time + np.arange(0,add_num))*time_interval]
            past_data.index = [i for i in past_data_index+np.arange(0,1,1/(add_num+1))[1:]]
            df_data_re = df_data_re.append(past_data, ignore_index=False)
            df_data_re = df_data_re.sort_index().reset_index(drop=True)
            visit_time = record
        visit_time += 1
    
    future_visit = train_target_re[train_target_re['PTID_Key']==ID]['Month'].values
    # print('past:', past_visit//time_interval, 'future:', future_visit//time_interval)
    month_record = future_visit//time_interval
    
    if month_record[0] > visit_time:
        past_data = train_target_re.loc[(train_target_re['PTID_Key']==ID), :].iloc[0:1]
        past_data['Month'] = visit_time*time_interval
        past_data['CN_Diag'] = np.nan
        past_data['MCI_Diag'] = np.nan
        past_data['AD_Diag'] = np.nan
        past_data['ADAS13'] = np.nan
        past_data['Ventricles_Norm'] = np.nan
        past_data['MMSE'] = np.nan
        past_data.index = past_data.index.values-0.1
        train_target_re = train_target_re.append(past_data, ignore_index=False).sort_index().reset_index(drop=True)
        
    future_visit = train_target_re[train_target_re['PTID_Key']==ID]['Month'].values
    month_record = future_visit//time_interval
    for record in month_record:
        if record > visit_time:
            add_num = record - visit_time
            past_data = train_target_re.loc[(train_target_re['PTID_Key']==ID) & (train_target_re['Month']//time_interval==visit_time-1), :]
            past_data_index = past_data.index.values
            past_data = pd.DataFrame(np.repeat(past_data.values,add_num,axis=0))
            past_data.columns = train_target_re.columns
            past_data['Month'] = [i for i in (visit_time + np.arange(0,add_num))*time_interval]
            past_data['CN_Diag'] = np.nan
            past_data['MCI_Diag'] = np.nan
            past_data['AD_Diag'] = np.nan
            past_data['ADAS13'] = np.nan
            past_data['Ventricles_Norm'] = np.nan
            past_data['MMSE'] = np.nan
            past_data.index = [i for i in past_data_index+np.arange(0,1,1/(add_num+1))[1:]]
            train_target_re = train_target_re.append(past_data, ignore_index=False)
            train_target_re = train_target_re.sort_index().reset_index(drop=True)
            visit_time = record
        visit_time += 1
    
    

print("nrow of past visit:", df_data_re.shape[0], end = "\n\n")
print("nrow of future visit:", train_target_re.shape[0], end = "\n\n")

nrow of past visit: 8546

nrow of future visit: 3242



In [281]:
## Encoding `DX_bl` and `DX`

print(np.unique(df_data['DX_bl'].values))
print('-'*50)
print(np.unique(df_data['DX'].astype(str).values))

DX_mapping = {"NL": 1, "MCI": 2, "Dementia": 3, "NL to MCI": 4, "MCI to Dementia": 5, "NL to Dementia":6, "MCI to NL":7,"Dementia to MCI":8}
df_data_re['DX'] = df_data['DX'].map(DX_mapping)

DXbl_mapping = {"AD": 2, "CN": 0, "EMCI": 1, "LMCI": 1, "SMC": 1, "NL": 0, "MCI": 1, "Dementia": 2, }
df_data_re['DX_bl'] = df_data['DX_bl'].map(DXbl_mapping)
train_target_re['DX_bl'] = train_target['DX_bl'].map(DXbl_mapping)

df_data_re.head()

['AD' 'CN' 'EMCI' 'LMCI' 'SMC']
--------------------------------------------------
['Dementia' 'Dementia to MCI' 'MCI' 'MCI to Dementia' 'MCI to NL' 'NL'
 'NL to Dementia' 'NL to MCI' 'nan']


Unnamed: 0,PTID_Key,EXAMDATE,CDRSB,ADAS11,RAVLT_immediate,Hippocampus,WholeBrain,Entorhinal,MidTemp,APOE4,AGE,ICV,DX_bl,DX,ADAS13,Ventricles,MMSE,Month
0,1,2010-12-10,1.5,14.0,28.0,0.422246,0.305082,0.529349,0.259515,1.0,81.6,0.44182,1,2.0,21.0,3.883123e-08,27.0,0
1,1,2011-04-07,1.5,14.0,28.0,0.4253,0.311367,0.449535,0.28391,1.0,81.6,0.444475,1,,21.0,3.843727e-08,27.0,3
2,1,2011-09-08,3.0,17.0,26.0,0.428941,0.301405,0.451324,0.283578,1.0,81.6,0.446851,1,2.0,26.0,3.969078e-08,24.0,6
3,2,2006-07-21,4.5,18.67,25.0,0.26521,0.213931,0.206156,0.153903,0.0,76.6,0.503096,2,3.0,27.67,6.252718e-08,25.0,0
4,2,2007-01-16,5.0,19.33,20.0,0.26521,0.214583,0.206156,0.153903,0.0,76.6,0.503924,2,3.0,30.33,6.52059e-08,24.0,6


In [277]:
df_data_re = df_data_re.fillna(method='ffill')

index_value1 = df_data_re[(df_data_re['DX']==1) | (df_data_re['DX']==7)].index.values
for idx in index_value1:
    df_data_re.loc[idx,'DX'] = 0

index_value2 = df_data_re[(df_data_re['DX']==2) | (df_data_re['DX']==4) | (df_data_re['DX']==8)].index.values
for idx in index_value2:
    df_data_re.loc[idx,'DX'] = 1

index_value3 = df_data_re[(df_data_re['DX']==3) | (df_data_re['DX']==5) | (df_data_re['DX']==6)].index.values
for idx in index_value3:
    df_data_re.loc[idx,'DX'] = 2
    
# transform the type of 'DX'
df_data_re['DX'] = df_data_re['DX'].astype(int)

df_data_re['PTID_Key'] = df_data_re['PTID_Key'].astype(int)

# normlize the Ventricles
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# scaler_result = scaler.fit_transform(df_data['Ventricles'].values.reshape(-1,1))
# df_data['Ventricles'] = scaler_result

df_data_re['Ventricles'] = df_data_re['Ventricles']/df_data_re['ICV']

df_data_re.head()

# Normalize feature
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_data_re['Hippocampus'] = scaler.fit_transform(df_data_re['Hippocampus'].values.reshape(-1,1))
df_data_re['WholeBrain'] = scaler.fit_transform(df_data_re['WholeBrain'].values.reshape(-1,1))
df_data_re['Entorhinal'] = scaler.fit_transform(df_data_re['Entorhinal'].values.reshape(-1,1))
df_data_re['MidTemp'] = scaler.fit_transform(df_data_re['MidTemp'].values.reshape(-1,1))
df_data_re['ICV'] = scaler.fit_transform(df_data_re['ICV'].values.reshape(-1,1))


# Check how many missing data and the type of data
print(df_data_re.isnull().sum())
df_data_re.head()

PTID_Key           0
EXAMDATE           0
CDRSB              0
ADAS11             0
RAVLT_immediate    0
Hippocampus        0
WholeBrain         0
Entorhinal         0
MidTemp            0
APOE4              0
AGE                0
ICV                0
DX_bl              0
DX                 0
ADAS13             0
Ventricles         0
MMSE               0
Month              0
dtype: int64


Unnamed: 0,PTID_Key,EXAMDATE,CDRSB,ADAS11,RAVLT_immediate,Hippocampus,WholeBrain,Entorhinal,MidTemp,APOE4,AGE,ICV,DX_bl,DX,ADAS13,Ventricles,MMSE,Month
0,1,2010-12-10,1.5,14.0,28.0,0.422246,0.305082,0.529349,0.259515,1.0,81.6,0.44182,1,0,21.0,3.883123e-08,27.0,0
1,1,2011-04-07,1.5,14.0,28.0,0.4253,0.311367,0.449535,0.28391,1.0,81.6,0.444475,1,0,21.0,3.843727e-08,27.0,3
2,1,2011-09-08,3.0,17.0,26.0,0.428941,0.301405,0.451324,0.283578,1.0,81.6,0.446851,1,0,26.0,3.969078e-08,24.0,6
3,2,2006-07-21,4.5,18.67,25.0,0.26521,0.213931,0.206156,0.153903,0.0,76.6,0.503096,2,0,27.67,6.252718e-08,25.0,0
4,2,2007-01-16,5.0,19.33,20.0,0.26521,0.214583,0.206156,0.153903,0.0,76.6,0.503924,2,0,30.33,6.52059e-08,24.0,6


### training_label

In [283]:
train_target_re.head()

Unnamed: 0,Date,PTID_Key,CN_Diag,MCI_Diag,AD_Diag,ADAS13,Ventricles_Norm,MMSE,blTime,DX_bl,Month
0,2013-06-07 00:00:00,6,1.0,0.0,0.0,,,30.0,2006-04-26 00:00:00,0.0,85
1,2013-06-07 00:00:00,6,,,,,,,2006-04-26 00:00:00,0.0,90
2,2014-05-29 00:00:00,6,1.0,0.0,0.0,3.0,,30.0,2006-04-26 00:00:00,0.0,97
3,2014-05-29 00:00:00,6,,,,,,,2006-04-26 00:00:00,0.0,102
4,2014-05-29 00:00:00,6,,,,,,,2006-04-26 00:00:00,0.0,108


In [286]:
index_value1 = train_target_re[(train_target_re['CN_Diag']==1) | (train_target_re['MCI_Diag']==0) | (train_target_re['AD_Diag']==0)].index.values
for idx in index_value1:
    train_target_re.loc[idx,'DX'] = 0

index_value2 = train_target_re[(train_target_re['CN_Diag']==0) | (train_target_re['MCI_Diag']==1) | (train_target_re['AD_Diag']==0)].index.values
for idx in index_value2:
    train_target_re.loc[idx,'DX'] = 1

index_value3 = train_target_re[(train_target_re['CN_Diag']==0) | (train_target_re['MCI_Diag']==0) | (train_target_re['AD_Diag']==1)].index.values
for idx in index_value3:
    train_target_re.loc[idx,'DX'] = 2
    
# transform the type of 'DX'
# train_target_re['DX'] = train_target_re['DX'].astype(int)

# train_target_re['PTID_Key'] = train_target_re['PTID_Key'].astype(int)

## Output csv for LSTM

In [284]:
df_data_re.to_csv('./input_data_lstm.csv',index=False)

In [289]:
# To begin with, we fill the training label with the nearest previous value
train_target_re = train_target_re.fillna(method='ffill')
print(train_target_re.isnull().sum())

train_target_re['DX'] = train_target_re['DX'].astype(int)
train_target_re['PTID_Key'] = train_target_re['PTID_Key'].astype(int)
train_target_re.to_csv('./train_target_lstm_pad.csv',index=False)

Date               0
PTID_Key           0
CN_Diag            0
MCI_Diag           0
AD_Diag            0
ADAS13             2
Ventricles_Norm    8
MMSE               0
blTime             0
DX_bl              0
Month              0
DX                 0
dtype: int64
