In [2]:
import wfdb
import pandas as pd
import numpy as np
import os
from sktime.classification.kernel_based import RocketClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [3]:
data_abs_path = '/Users/victor/physionet.org/files/mimic-iv-ecg/1.0/files'

split_csv = pd.read_csv('data/MIMIC-IV-ECG-Ext-Electrolytes/few_shot_splits/128shots/split1/Calcium50893.csv')

split_csv = split_csv[split_csv['subject_id'] <= 12000000] # i haven't downloaded the full dataset yet

labels = pd.read_csv('data/MIMIC-IV-ECG-Ext-Electrolytes/mimiciv_ECGv1.1_hospV2.2_Calcium50893.csv')

train = []
test = []
val = []

for index, row in split_csv.iterrows():
    path = data_abs_path + '/p' + f"{row['subject_id']}"[:4] + '/p' + f"{row['subject_id']}"
    samples = os.listdir(path)
    
    for sample in samples: # for each sample (subject) in the split, take all their studies
        sample_path = os.path.join(path, sample)
        if os.path.isdir(sample_path):
            if int(sample[1:]) not in labels['study_id'].values:
                continue
            signal, fields = wfdb.rdsamp(sample_path + '/' + sample[1:])

            if row['split'] == 'train':
                train.append(signal)
            elif row['split'] == 'test':
                test.append(signal)
            elif row['split'] == 'val':
                val.append(signal)

In [4]:
reshaped_train = np.empty((len(train), 12), dtype=object)

for i in range(len(train)):
    for j in range(12):
        reshaped_train[i, j] = pd.Series(train[i][:][j]) # reshaping from (# of subjects, 5000, 12) to (# of subjects, 12) where each entry is a pd.series of length 5000

X_train = pd.DataFrame(reshaped_train)

null_counts = X_train.map(lambda x: x.isna().sum() if isinstance(x, pd.Series) else 0).sum().sum()

X_train = X_train.map(lambda x: x.fillna(0) if isinstance(x, pd.Series) else x) # there were NaNs in the data, which is odd, so I'm filling them with 0
print(f"Number of null values in X_test: {null_counts}")
X_train.shape

Number of null values in X_test: 0


(46, 12)

In [5]:
subject_ids = split_csv['subject_id']
filtered_labels = labels[labels['subject_id'].isin(split_csv[split_csv['split'] == 'train']['subject_id'])] # only take the classifier rows that are relevant by subject_id
duplicate_study_ids = filtered_labels[filtered_labels.duplicated(subset=['study_id'], keep=False)]
print(duplicate_study_ids)
filtered_labels = filtered_labels.drop_duplicates(subset=['study_id'])
y_train = filtered_labels['flag']
y_train = y_train.replace({'abnormal': 1, np.nan: 0}) # abnormal = 1, normal = 0
y_train.shape

Empty DataFrame
Columns: [subject_id, study_id, ecg_time, path, itemid, charttime, valuenum, valueuom, flag]
Index: []


  y_train = y_train.replace({'abnormal': 1, np.nan: 0}) # abnormal = 1, normal = 0


(46,)

In [6]:
reshaped_test = np.empty((len(test), 12), dtype=object)
for i in range(len(test)):
    for j in range(12):
        reshaped_test[i, j] = pd.Series(test[i][:][j])

X_test = pd.DataFrame(reshaped_test)

null_counts = X_test.map(lambda x: x.isna().sum() if isinstance(x, pd.Series) else 0).sum().sum()

X_test = X_test.map(lambda x: x.fillna(0) if isinstance(x, pd.Series) else x) # there were NaNs in the data, which is odd, so I'm filling them with 0
print(f"Number of null values in X_test: {null_counts}")
X_test.shape

Number of null values in X_test: 108


(3259, 12)

In [7]:
subject_ids = split_csv['subject_id']
filtered_labels = labels[labels['subject_id'].isin(split_csv[split_csv['split'] == 'test']['subject_id'])]
duplicate_study_ids = filtered_labels[filtered_labels.duplicated(subset=['study_id'], keep=False)]
print(duplicate_study_ids)
filtered_labels = filtered_labels.drop_duplicates(subset=['study_id']) # there are duplicates in the calcium data for the same study_id, which is odd
y_test = filtered_labels['flag']
y_test = y_test.replace({'abnormal': 1, np.nan: 0})
y_test.shape

Empty DataFrame
Columns: [subject_id, study_id, ecg_time, path, itemid, charttime, valuenum, valueuom, flag]
Index: []


  y_test = y_test.replace({'abnormal': 1, np.nan: 0})


(3259,)

In [8]:
classifier = RocketClassifier(rocket_transform='multirocket')
classifier.fit(X_train, y_train)

In [9]:
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

auroc = roc_auc_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"AUROC: {auroc}")
print(f"F1 Score: {f1}")

Accuracy: 0.7720159558146671
AUROC: 0.501176666310435
F1 Score: 0.03631647211413749


In [10]:
classifier_mini = RocketClassifier(rocket_transform='minirocket')
classifier_mini.fit(X_train, y_train)

In [11]:
y_pred_mini = classifier_mini.predict(X_test)

accuracy_mini = accuracy_score(y_test, y_pred_mini)

auroc_mini = roc_auc_score(y_test, y_pred_mini)

f1_mini = f1_score(y_test, y_pred_mini)

print(f"Accuracy: {accuracy_mini}")
print(f"AUROC: {auroc_mini}")
print(f"F1 Score: {f1_mini}")

Accuracy: 0.7750843817121816
AUROC: 0.5026356554130506
F1 Score: 0.034255599472990776
