<a href="https://colab.research.google.com/github/Scott-S-Lin/NTUT_PhD/blob/main/FIDDLE_hands_on.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download files

In [None]:
# Download formatted sample data, PhysioNet Challenge 2012
# https://physionet.org/content/challenge-2012/
!gdown -q --id '1LabZCw6Ryjv1wwT4dWbQTWPvsehM6WiX'
!gdown -q --id '16UlO3E9q5pDmn_K6wgAelAwg2IhsQJtX'
!gdown -q --id '1JcO7FYY5QAix2Ui-dAlK8rEq9XGu_9sC'
!mkdir -p physionet_2012_data

# Download FIDDLE and unzip
!rm -rf FIDDLE FIDDLE-master.zip 
!wget -q https://gitlab.eecs.umich.edu/mld3/FIDDLE/-/archive/v0.1.0/FIDDLE-v0.1.0.zip
!unzip -qq -j FIDDLE-v0.1.0.zip 'FIDDLE-v0.1.0/FIDDLE/*' -d FIDDLE/

# Update packages
!pip install -q -U pyyaml sparse scikit-learn
!pip install -q -U icd9cms icd10-cm

# DONE
!echo 'DONE!'

[K     |████████████████████████████████| 276kB 2.7MB/s 
[K     |████████████████████████████████| 81kB 7.3MB/s 
[K     |████████████████████████████████| 6.8MB 42.2MB/s 
[K     |████████████████████████████████| 3.1MB 39.2MB/s 
[K     |████████████████████████████████| 24.6MB 38.2MB/s 
[?25h  Building wheel for pyyaml (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 2.7MB/s 
[K     |████████████████████████████████| 675kB 18.0MB/s 
[?25hDONE!


In [None]:
import numpy as np
import pandas as pd
import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
df_input = pd.read_csv('./physionet_2012_input_data.csv')

# Input data

In [None]:
df_input.head(50)

Unnamed: 0,ID,t,variable_name,variable_value
0,132539,,Age,54.0
1,132539,,Gender,_0
2,132539,,ICUType,_4
3,132539,0.116667,GCS,15.0
4,132539,0.116667,HR,73.0
5,132539,0.116667,NIDiasABP,65.0
6,132539,0.116667,NIMAP,92.33
7,132539,0.116667,NISysABP,147.0
8,132539,0.116667,RespRate,19.0
9,132539,0.116667,Temp,35.1


# Run FIDDLE (takes ~1min)

In [None]:
!echo 'parallel = False' >> ./FIDDLE/config.py   # turns off parallelization

In [None]:
!python -m FIDDLE.run \
    --data_path='./physionet_2012_data/' \
    --input_fname='./physionet_2012_input_data.csv' \
    --population='./physionet_2012_pop.csv' \
    --T=48 \
    --dt=24 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1 \
    --binarize=no \
    --stats_functions 'min' 'max' 'mean' \
    --N=200

Input data file: ./physionet_2012_input_data.csv

Input arguments:
    T      = 48
    dt     = 24.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']
binarize = no

N = 200
L = 2


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 48]
2020-10-13 17:52:20,800: NumExpr defaulting to 2 threads.
Remove rare variables (<= 0.01)
Total variables     : 41
Rare variables      : 0
Remaining variables : 41
# rows (original)   : 81088
# rows (filtered)   : 81088

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./physionet_2012_data/value_types.csv

--------------------------------------------------------------------------------
*) Separate time-invariant and time-dependent
---------------------------------------------------------------------

# Train a model

In [None]:
X = sparse.load_npz('{data_path}/X.npz'.format(data_path='physionet_2012_data')).todense()
s = sparse.load_npz('{data_path}/s.npz'.format(data_path='physionet_2012_data')).todense()
y = pd.read_csv('physionet_2012_labels.csv')['In-hospital_death']

N,L,D = X.shape
_,d = s.shape

X_all = np.hstack([s, X.reshape((N,L*D))])
y_all = y[:N]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_all = scaler.fit_transform(X_all)

print(X_all.shape, y_all.shape)
Xtr, Xte, ytr, yte = train_test_split(X_all, y_all, stratify=y_all, random_state=1)

(200, 328) (200,)


In [None]:
clf = LogisticRegression()
clf.fit(Xtr, ytr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
score = metrics.roc_auc_score(yte, clf.decision_function(Xte))
print('Test AUROC score:', score)

Test AUROC score: 0.7272727272727273
