In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# All four PRO-ACT "all_forms" exports are read with the same parser settings:
# pipe-delimited, header on the first row, and feature_value / feature_unit
# forced to str so pandas does not infer a dtype for them.
_ALL_FORMS_READ_KWARGS = dict(
    sep='|',
    header=0,
    dtype={"feature_value": str, "feature_unit": str},
    low_memory=False,
)

all_forms_training1 = pd.read_csv('all_forms_PROACT_training.txt', **_ALL_FORMS_READ_KWARGS)
all_forms_training2 = pd.read_csv('all_forms_PROACT_training2.txt', **_ALL_FORMS_READ_KWARGS)
all_forms_leaderboard = pd.read_csv('all_forms_PROACT_leaderboard_full.txt', **_ALL_FORMS_READ_KWARGS)
all_forms_validation = pd.read_csv('all_forms_PROACT_validation_full.txt', **_ALL_FORMS_READ_KWARGS)

In [3]:
# Stack the four exports row-wise (original row labels are kept as-is) and
# report how many distinct SubjectID values the combined table contains.
all_forms = pd.concat(
    [
        all_forms_training1,
        all_forms_training2,
        all_forms_leaderboard,
        all_forms_validation,
    ]
)
all_forms_subject_ids = all_forms["SubjectID"].unique()
print("The number of subjects in all_forms data is", len(all_forms_subject_ids))

The number of subjects in all_forms data is 10723


In [4]:
# Load the target table and the four feature tables with default CSV settings.
# The files are read in the order listed, one per name on the left-hand side.
target, als_hx, demographics, fvc, vitals = (
    pd.read_csv(csv_name)
    for csv_name in (
        'ALSFRS_slope.csv',
        'als_hx.csv',
        'demographic.csv',
        'fvc_3mo_meta.csv',
        'vitals.csv',
    )
)

In [5]:
# Report how many distinct SubjectID values have a target row.
target_subject_ids = target["SubjectID"].unique()
print("The number of subjects in target data is", len(target_subject_ids))

The number of subjects in target data is 3096


Extracting Static Features

In [6]:
# Keep only the subject key and Age from the demographics table.
demographics = demographics.loc[:, ['SubjectID', 'Age']]

# Add diag_minus_onset = diag_delta - onset_delta, then narrow the history
# table to the subject key and the three delta columns.
als_hx = als_hx.assign(
    diag_minus_onset=als_hx['diag_delta'].sub(als_hx['onset_delta'])
)
als_hx = als_hx.loc[:, ['SubjectID', 'diag_delta', 'onset_delta', 'diag_minus_onset']]

In [7]:
# Outer-join every static table on SubjectID, starting from an empty frame
# that only carries the key column, so no subject present in any table is lost.
static_list = [demographics, als_hx]
static = pd.DataFrame(columns=['SubjectID'])
for static_table in static_list:
    static = static.merge(static_table, how='outer', on='SubjectID')

Extracting Time-resolved Features

In [8]:
# Keep the subject key plus the two FVC summary columns.
fvc = fvc.loc[:, ['SubjectID', 'fvc_slope', 'fvc_max']]

# Keep only vitals rows with feature_delta below 92; rows whose feature_delta
# is missing fail the comparison and are dropped as well.
# NOTE(review): 92 is presumably a ~3-month window in days - confirm the unit.
vitals = vitals[vitals['feature_delta'] < 92]

In [9]:
# --- Per-subject max / mean of the vital-sign columns -----------------------
# Columns are picked by position: 0 is used as the SubjectID group key and
# positions 2-5 are the aggregated measurements.
# NOTE(review): positions 2-5 are assumed to be diastolic BP, systolic BP,
# pulse and respiratory rate, in that order, as implied by the names assigned
# below - confirm against the header of vitals.csv.
vitals2 = vitals.iloc[:, [0,2,3,4,5]].groupby('SubjectID').agg(['max', 'mean'])
# Flatten the (measurement, statistic) column MultiIndex into plain names.
vitals2.columns = ['max_bp_d','mean_bp_d', 'max_bp_s', 'mean_bp_s',  'max_pulse','mean_pulse',   'max_resp_rate','mean_resp_rate']
vitals2 = vitals2[[ 'mean_bp_d', 'max_bp_s', 'max_resp_rate', 'max_pulse']]

# --- Systolic blood-pressure slope between first and last reading -----------
# GroupBy 'first'/'last' skip missing values independently per column, so a
# missing bp_systolic (or feature_delta) on a subject's first or last row would
# pair a pressure from one visit with a time from another. Dropping incomplete
# rows first guarantees each (time, pressure) pair comes from the same row.
# 'first'/'last' follow row order; this assumes the rows of each subject are
# already in chronological order - TODO confirm.
vitals3 = vitals[['SubjectID', 'feature_delta', 'bp_systolic']].dropna(
    subset=['feature_delta', 'bp_systolic']
)
vitals4 = vitals3.groupby('SubjectID').agg(['first', 'last'])

# Use the named (column, statistic) pairs instead of positional iloc indexing.
interval = vitals4[('feature_delta', 'last')] - vitals4[('feature_delta', 'first')]
bp_change = vitals4[('bp_systolic', 'last')] - vitals4[('bp_systolic', 'first')]

# A zero interval (single reading, or two readings at the same feature_delta)
# has no defined slope: mask it to NaN so the division cannot produce +/-inf.
vitals5 = (bp_change / interval.where(interval != 0)).to_frame('bp_s_slope')

# Both frames are indexed by SubjectID; merging on that index-level name keeps
# SubjectID as the index of the result. Subjects without a usable slope are
# retained by the outer join with a NaN bp_s_slope.
vitals_fin = vitals2.merge(vitals5, on='SubjectID', how='outer')

In [10]:
# Outer-join the time-resolved feature tables on SubjectID, starting from an
# empty frame that only carries the key column.
time_res_list = [vitals_fin, fvc]
time_res = pd.DataFrame(columns=['SubjectID'])
for time_res_table in time_res_list:
    time_res = time_res.merge(time_res_table, how='outer', on='SubjectID')

Merge all features

In [11]:
# Combine static and time-resolved features; the outer join keeps subjects
# that appear in only one of the two tables.
features = pd.merge(static, time_res, how='outer', on='SubjectID')

Selecting SubjectIDs in target data

In [12]:
# Distinct SubjectID values present in the target table, as a plain list.
extent = list(pd.unique(target["SubjectID"]))

In [13]:
# Keep only the feature rows whose SubjectID appears in the target table.
# A membership mask is used instead of formatting the ID list into a query
# string: str() of a list of NumPy scalars renders each element as
# "np.int64(...)" on NumPy >= 2, which the query parser cannot evaluate, and
# building and parsing a string with thousands of literals is needlessly slow.
features = features[features["SubjectID"].isin(extent)]

In [14]:
# Number of distinct subjects left after restricting to the target IDs.
features["SubjectID"].unique().size

3096

In [15]:
# Attach the target columns to the features; the outer join also keeps target
# subjects that have no feature row (their feature columns are NaN).
features_and_target_with_nans_full = pd.merge(
    features, target, how='outer', on='SubjectID'
)

In [16]:
# Persist the merged table; with the default settings the row index is written
# as the first, unnamed column of the CSV.
features_and_target_with_nans_full.to_csv(
    path_or_buf='features_and_target_with_nans_full.csv'
)

Train/Test Splitting

In [23]:
# Predictor columns (SubjectID is carried along so rows stay identifiable).
feature_columns = [
    'SubjectID',
    'Age',
    'diag_delta',
    'onset_delta',
    'diag_minus_onset',
    'mean_bp_d',
    'max_bp_s',
    'max_resp_rate',
    'max_pulse',
    'bp_s_slope',
    'fvc_slope',
    'fvc_max',
]
# Target column, again paired with the subject key.
target_columns = ['SubjectID', 'ALSFRS_slope']

x = features_and_target_with_nans_full.loc[:, feature_columns]
y = features_and_target_with_nans_full.loc[:, target_columns]

In [24]:
# 80/20 split of the rows of x and y (same rows go to the same side for both).
# random_state is fixed so that Restart & Run All reproduces the same
# partition; without it the CSVs written from these frames change on every
# execution and downstream results cannot be compared between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=42
)

In [25]:
# Write each split to its own CSV (row index included, as by default), in the
# order: x_train, x_test, y_train, y_test.
for split_frame, csv_name in (
    (x_train, 'x_train_with_nans.csv'),
    (x_test, 'x_test_with_nans.csv'),
    (y_train, 'y_train_with_nans.csv'),
    (y_test, 'y_test_with_nans.csv'),
):
    split_frame.to_csv(csv_name)