In [5]:
import pandas as pd
import numpy as np

# load the date from disk and extract the pid for the test set
df_X_train = pd.read_csv("train_features.csv")
df_y_train = pd.read_csv("train_labels.csv")
df_X_test = pd.read_csv("test_features.csv")
pid = df_X_test.pid.unique()

## Preprocess the data

### Deal with NaN and strange time values

In [6]:
# Convert all times s.t. they start at 1 and end with 12
def normalizeTime(row, dict):
    return int(row.Time) - dict[row.pid]

dict_tr = (df_X_train['Time'].groupby(df_X_train.pid).max()-12).to_dict()
dict_te = (df_X_test['Time'].groupby(df_X_test.pid).max()-12).to_dict()


df_X_train['Time'] = df_X_train.apply(lambda row: normalizeTime(row, dict_tr), axis=1)
df_X_test['Time'] = df_X_test.apply(lambda row: normalizeTime(row, dict_te), axis=1)

# For verification only
# df_X_train.groupby('Time').Time.describe()

In [27]:
# Introduce a binary label (0 or 1) for every collumn that marks if the value was NaN or not
df_X_train_mask = (~df_X_train.iloc[:,3:].isna()).astype(int)
df_X_test_mask = (~df_X_test.iloc[:,3:].isna()).astype(int)


def collapseMask(mask):
    mask = np.reshape(mask.values, (-1, 12, mask.shape[1]))
    mask = np.sum(mask, axis=1)
    return pd.DataFrame(mask)

# sum up per patient
df_X_train_cmask = collapseMask(df_X_train_mask)
df_X_test_cmask = collapseMask(df_X_test_mask)


       0   1   2   3   4   5   6   7   8   9   ...  23  24  25  26  27  28  \
0      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
1      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
2      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
3      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
4      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..   
12659  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
12660  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
12661  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
12662  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
12663  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   

       29  30  31  32  
0      12  12  12  12  
1      12  12  

In [8]:
# Replace all NaN values with the mean of the other values of the correspondig catetgory of the patient
# If no values are present for the coresspondig patient the median of all patients is taken
def fillNaN(series, medians):
    m = medians[series.name] if np.isnan(series.mean()) else series.mean()
    return series.fillna(m)

median_X_train = df_X_train.median()
median_X_test = df_X_test.median()

df_X_train = df_X_train.groupby("pid").transform(lambda x: fillNaN(x, median_X_train))
df_X_test = df_X_test.groupby("pid").transform(lambda x: fillNaN(x, median_X_test))

### Transform the data s.t. we only have one row per patient

In [9]:
# Concatenates all 12 rows of a patient into one row (throwing away the pid and keeping the age only once)

def transformDf(df):
    frames = [df[df.Time == 1].reset_index()['Age']]
    for i in range(1, 13):
        frames.append(df[df.Time == i].reset_index().iloc[:,3:]) # reset_index() introduces a new collumn
    return pd.concat(frames, axis=1)

df_X_train_con = transformDf(df_X_train)
df_X_test_con = transformDf(df_X_test)

In [10]:
# alternative approach were we take the mean for every patient instead of keeping every feature
def transformDfMean(df):
    age = df[df.Time == 1].reset_index()['Age']
    avg_df = df[df.Time == 1].reset_index().iloc[:,3:]
    
    for i in range(2, 13):
        avg_df = avg_df + df[df.Time == i].reset_index().iloc[:,3:] # reset_index() introduces a new collumn
    return pd.concat([age, avg_df/12], axis=1)

df_X_train_mean = transformDfMean(df_X_train)
df_X_test_mean = transformDfMean(df_X_test)

In [25]:
# Concatenate the mask with the other frames
dfc_X_train = pd.concat([df_X_train_con, df_X_train_cmask], axis = 1)
dfc_X_test = pd.concat([df_X_test_con, df_X_test_cmask], axis = 1)

dfm_X_train = pd.concat([df_X_train_mean, df_X_train_cmask], axis = 1)
dfm_X_test = pd.concat([df_X_test_mean, df_X_test_cmask], axis = 1)



       0   1   2   3   4   5   6   7   8   9   ...  23  24  25  26  27  28  \
0      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
1      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
2      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
3      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
4      12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..   
18990  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
18991  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
18992  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
18993  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   
18994  12  12  12  12  12  12  12  12  12  12  ...  12  12  12  12  12  12   

       29  30  31  32  
0      12  12  12  12  
1      12  12  

## Task 1

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

y = df_y_train.iloc[:, 1:11].values
X_train, X_test, y_train, y_test = train_test_split(dfm_X_train, y, test_size=0.01, random_state=42)

rfc1 = RandomForestClassifier(n_estimators = 1000, n_jobs = -1,verbose = 1, warm_start = True)
model = MultiOutputClassifier(estimator=rfc1, n_jobs=-1) 
model.fit(X_train, y_train)

y_pred1 = model.predict_proba(dfm_X_test)

# turn the output into a 2d array (originally for each parameter an 2d array is returned in a list wich contains for every entry
# the proabability p and 1-p)
y_pred1 = np.dstack(y_pred1)[:,1,:]

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    1.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    1.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elaps

In [13]:
# Check performance with test set
from sklearn.metrics import roc_auc_score

y_test_pred = np.dstack(model.predict_proba(X_test))[:,1,:]
print("Task 1 Model Score: " + str(model.score(X_train, y_train)))
print(f"Task 1 Score: {np.mean([roc_auc_score(y_test[:,i], y_test_pred[:,i]) for i in range(0, 10)])}")

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elaps

Task 1 Model Score: 1.0
Task 1 Score: 0.8491862523669071


## Task 2

In [14]:
y = df_y_train.iloc[:, 11]
X_train, X_test, y_train, y_test = train_test_split(dfm_X_train, y, test_size=0.01, random_state=42)

rfc2 = RandomForestClassifier(n_estimators=1000,n_jobs = -1,verbose = 1,warm_start = True)
rfc2.fit(X_train, y_train)
y_pred2 = rfc2.predict_proba(dfm_X_test)[:,1]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   10.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.4s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    1.8s finished


In [15]:
# Check performance with test set
print("Task 2 Model Score: " + str(rfc2.score(X_train, y_train)))
print(f"Task 2 Score: {roc_auc_score(y_test, rfc2.predict_proba(X_test)[:,1])}")

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.8s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.6s


Task 2 Model Score: 1.0
Task 2 Score: 0.7092115534738485


[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    2.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    0.0s finished


## Task 3

In [16]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import RidgeCV

y = df_y_train.iloc[:, 12:].values

X_train, X_test, y_train, y_test = train_test_split(dfc_X_train, y, test_size=0.01, random_state=42)
regr = MultiOutputRegressor(RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10]), n_jobs=-1).fit(X_train, y_train)

y_pred3 = regr.predict(dfc_X_test)

In [17]:
from sklearn.metrics import r2_score

# Check performance with test set
y_test_pred = regr.predict(X_test)
print("Task 3 Model Score: " + str(regr.score(X_train, y_train)))
print(f"Task 3 Score: {np.mean([0.5 + 0.5 * np.maximum(0, r2_score(y_test[:,i], y_test_pred[:,i])) for i in range(0,4)])}")

Task 3 Model Score: 0.5578456447200713
Task 3 Score: 0.7682390926247677


## Export the results

In [18]:
# Concantenates all predicitions and adds back the pid collumn
sol = pd.DataFrame(np.concatenate((pid[:,None], y_pred1, y_pred2[:,None], y_pred3), axis=1))
sol.set_axis(df_y_train.axes[1].to_list(), axis = 1, inplace=True)

sol.to_csv('prediction.zip', index=False, compression=dict(method='zip', archive_name='prediction.csv'))
sol.to_csv('pred_rounded.zip', index=False, float_format='%.3f', compression=dict(method='zip', archive_name='prediction.csv'))

In [19]:
sol.describe()

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
count,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0,12664.0
mean,15889.25,0.279098,0.153652,0.255924,0.252783,0.255774,0.220658,0.238171,0.250764,0.039782,0.07258,0.076175,18.826886,82.406274,96.968422,84.174583
std,9120.097064,0.298752,0.104997,0.167785,0.168773,0.168119,0.183982,0.132997,0.217872,0.064512,0.161095,0.063285,2.392175,10.190763,1.340477,12.122225
min,0.0,0.009,0.011,0.01,0.007,0.008,0.014,0.057,0.016,0.0,0.0,0.0,8.863635,43.451825,73.612398,41.117931
25%,7993.0,0.044,0.1,0.129,0.125,0.129,0.083,0.136,0.088,0.009,0.004,0.032,17.290126,75.071765,96.206004,75.526379
50%,15983.0,0.104,0.135,0.223,0.22,0.223,0.156,0.204,0.1635,0.021,0.018,0.056,18.566209,81.104275,97.14927,83.524809
75%,23773.75,0.543,0.172,0.344,0.34125,0.342,0.306,0.303,0.348,0.045,0.06,0.09925,20.097399,88.46694,97.94421,91.847775
max,31655.0,0.973,0.931,0.957,0.962,0.973,0.963,0.798,0.978,0.798,0.911,0.487,34.4186,131.928992,103.783469,151.960514


# Some useful extra functions

In [20]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_X_test.groupby('Age').Age.describe())

        count   mean  std    min    25%    50%    75%    max
Age                                                         
16.0     48.0   16.0  0.0   16.0   16.0   16.0   16.0   16.0
17.0     24.0   17.0  0.0   17.0   17.0   17.0   17.0   17.0
18.0    132.0   18.0  0.0   18.0   18.0   18.0   18.0   18.0
19.0    384.0   19.0  0.0   19.0   19.0   19.0   19.0   19.0
20.0    516.0   20.0  0.0   20.0   20.0   20.0   20.0   20.0
21.0    696.0   21.0  0.0   21.0   21.0   21.0   21.0   21.0
22.0    636.0   22.0  0.0   22.0   22.0   22.0   22.0   22.0
23.0    708.0   23.0  0.0   23.0   23.0   23.0   23.0   23.0
24.0    432.0   24.0  0.0   24.0   24.0   24.0   24.0   24.0
25.0    612.0   25.0  0.0   25.0   25.0   25.0   25.0   25.0
26.0    708.0   26.0  0.0   26.0   26.0   26.0   26.0   26.0
27.0    504.0   27.0  0.0   27.0   27.0   27.0   27.0   27.0
28.0    564.0   28.0  0.0   28.0   28.0   28.0   28.0   28.0
29.0    624.0   29.0  0.0   29.0   29.0   29.0   29.0   29.0
30.0    744.0   30.0  0.

In [21]:
t = pd.DataFrame()
t['#NaN'] = df_X_train.isna().sum().to_frame()
t['Percent of not Nan'] = (df_X_train.isna().sum() / df_X_train.shape[0] * 100).to_frame()
t

Unnamed: 0,#NaN,Percent of not Nan
Time,0,0.0
Age,0,0.0
EtCO2,0,0.0
PTT,0,0.0
BUN,0,0.0
Lactate,0,0.0
Temp,0,0.0
Hgb,0,0.0
HCO3,0,0.0
BaseExcess,0,0.0


In [22]:
# making sure that the pid can be safely dropped (has to be 0)
(df_X_train.pid[df_X_train.Time == 1].values - df_y_train.pid.values).sum()

AttributeError: 'DataFrame' object has no attribute 'pid'