In [96]:
import pandas
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [97]:
FILENAME = '20230706_2e+07'

In [98]:
df = pandas.read_csv(f"../Data/Preprocessed/chartevents_{FILENAME}_labeled.csv", engine='python')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923360 entries, 0 to 923359
Data columns (total 9 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   hadm_id                                923360 non-null  int64  
 1   charttime                              923360 non-null  object 
 2   Heart Rate                             923360 non-null  float64
 3   Respiratory Rate                       923305 non-null  float64
 4   Non Invasive Blood Pressure systolic   919917 non-null  float64
 5   Non Invasive Blood Pressure diastolic  919917 non-null  float64
 6   O2 saturation pulseoxymetry            923282 non-null  float64
 7   Temperature Celsius                    922334 non-null  float64
 8   label                                  923360 non-null  bool   
dtypes: bool(1), float64(6), int64(1), object(1)
memory usage: 57.2+ MB


In [99]:
mark = ("Heart Rate", \
        "Respiratory Rate", \
        "Non Invasive Blood Pressure systolic", \
        "Non Invasive Blood Pressure diastolic", \
        "O2 saturation pulseoxymetry", \
        "Temperature Celsius")
df.dropna(how='any', inplace=True)
df_under, df_under_label = df[list(mark)], df.label
df_under_resample, df_under_label_resample = RandomUnderSampler(sampling_strategy='majority').fit_resample(df_under, df_under_label)
# df_under_resample = (df_under_resample - df_under_resample.mean()) / df_under_resample.std()

In [150]:
df_under_resample.sample(5)

Heart Rate                                88.8
Respiratory Rate                          23.2
Non Invasive Blood Pressure systolic     110.8
Non Invasive Blood Pressure diastolic     56.0
O2 saturation pulseoxymetry               87.8
Temperature Celsius                       36.6
dtype: float64

In [101]:
df_under_resample.mean()

Heart Rate                                89.386207
Respiratory Rate                          21.232759
Non Invasive Blood Pressure systolic     110.346552
Non Invasive Blood Pressure diastolic     61.881034
O2 saturation pulseoxymetry               93.824138
Temperature Celsius                       36.376724
dtype: float64

In [102]:
X_train, X_test, y_train, y_test = train_test_split(df_under_resample, df_under_label_resample, test_size=0.2, stratify=df_under_label_resample)

In [None]:
from sklearn import tree
model = tree.DecisionTreeClassifier(max_depth=3)
model = model.fit(X_train, y_train)
tree.plot_tree(model)

In [104]:
from graphviz import Source
features = list(mark)
dot_data = tree.export_graphviz(model, feature_names=features)
graph = Source(dot_data)
graph.render(view=True, format="pdf", filename=f"../Charts/tree{FILENAME}")

'../Charts/tree20230706_2e+07.pdf'

In [105]:
len(X_train), len(X_test)

(464, 116)

In [123]:
cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy').mean()

0.6463768115942028

In [122]:
model.score(X_test, y_test)

0.8362068965517241

In [151]:
scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=3), df_under_resample, df_under_label_resample, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())

[0.68103448 0.71551724 0.72413793 0.59482759 0.61206897]
0.6655172413793103
