In [1]:
import wfdb
import pandas as pd
import numpy as np
from pathos.multiprocessing import ProcessingPool as Pool
from sktime.classification.kernel_based import RocketClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [2]:
# root = '/Volumes/SanDisk SSD/physionet.org/files/mimic-iv-ecg/1.0/'

# split_csv = pd.read_csv(root + 'MIMIC-IV-ECG-Ext-Electrolytes/few_shot_splits/128shots/split1/Magnesium50960.csv')

# labels = pd.read_csv(root + 'MIMIC-IV-ECG-Ext-Electrolytes/mimiciv_ECGv1.1_hospV2.2_Magnesium50960.csv')

# train = []
# test = []
# val = []

# for index, row in split_csv.iterrows():
#     path = root + 'files/p' + f"{row['subject_id']}"[:4] + '/p' + f"{row['subject_id']}" + '/s' + f"{row['study_id']}"
    
#     signal, fields = wfdb.rdsamp(path + '/' + f"{row['study_id']}")

#     if row['split'] == 'train':
#         train.append(signal)
#     elif row['split'] == 'test':
#         test.append(signal)
#     elif row['split'] == 'val':
#         val.append(signal)

# len(train), len(val), len(test)


# faster version
def process_row(row):
    signal, fields = wfdb.rdsamp(row['path'] + '/' + str(row['study_id']))
    return row['split'], signal

root = '/Volumes/SanDisk SSD/physionet.org/files/mimic-iv-ecg/1.0/'

# Load the CSV files
split_csv = pd.read_csv(root + 'MIMIC-IV-ECG-Ext-Electrolytes/few_shot_splits/128shots/split1/Magnesium50960.csv')
labels = pd.read_csv(root + 'MIMIC-IV-ECG-Ext-Electrolytes/mimiciv_ECGv1.1_hospV2.2_Magnesium50960.csv')

# Precompute the paths
split_csv['path'] = root + 'files/p' + split_csv['subject_id'].astype(str).str[:4] + '/p' + split_csv['subject_id'].astype(str) + '/s' + split_csv['study_id'].astype(str)

# Initialize lists to store the signals
train = []
test = []
val = []

# Use pathos.multiprocessing to process rows in parallel
with Pool() as pool:
    results = pool.map(process_row, split_csv.to_dict('records'))

# Organize the results into train, test, and val lists
for split, signal in results:
    if split == 'train':
        train.append(signal)
    elif split == 'test':
        test.append(signal)
    elif split == 'val':
        val.append(signal)

len(train), len(val), len(test)

(128, 128, 10000)

In [3]:
reshaped_train = np.empty((len(train), 12), dtype=object)

for i in range(len(train)):
    for j in range(12):
        reshaped_train[i, j] = pd.Series(train[i][:][j]) # reshaping from (# of subjects, 5000, 12) to (# of subjects, 12) where each entry is a pd.series of length 5000

X_train = pd.DataFrame(reshaped_train)

null_counts = X_train.map(lambda x: x.isna().sum() if isinstance(x, pd.Series) else 0).sum().sum()

X_train = X_train.map(lambda x: x.fillna(0) if isinstance(x, pd.Series) else x) # there were NaNs in the data, which is odd, so I'm filling them with 0
print(f"Number of null values in X_train: {null_counts}")
X_train.shape

Number of null values in X_train: 12


(128, 12)

In [4]:
filtered_labels = labels[labels['study_id'].isin(split_csv[split_csv['split'] == 'train']['study_id'])]
y_train = filtered_labels['flag']
y_train = y_train.replace({'abnormal': 1, np.nan: 0}) # abnormal = 1, normal = 0
y_train.shape

  y_train = y_train.replace({'abnormal': 1, np.nan: 0}) # abnormal = 1, normal = 0


(128,)

In [5]:
reshaped_val = np.empty((len(val), 12), dtype=object)
for i in range(len(val)):
    for j in range(12):
        reshaped_val[i, j] = pd.Series(val[i][:][j])

X_val = pd.DataFrame(reshaped_val)

null_counts = X_val.map(lambda x: x.isna().sum() if isinstance(x, pd.Series) else 0).sum().sum()

X_val = X_val.map(lambda x: x.fillna(0) if isinstance(x, pd.Series) else x) # there were NaNs in the data, which is odd, so I'm filling them with 0
print(f"Number of null values in X_val: {null_counts}")
X_val.shape

Number of null values in X_val: 0


(128, 12)

In [6]:
filtered_labels = labels[labels['study_id'].isin(split_csv[split_csv['split'] == 'val']['study_id'])]
y_val = filtered_labels['flag']
y_val = y_val.replace({'abnormal': 1, np.nan: 0})
y_val.shape

  y_val = y_val.replace({'abnormal': 1, np.nan: 0})


(128,)

In [7]:
reshaped_test = np.empty((len(test), 12), dtype=object)
for i in range(len(test)):
    for j in range(12):
        reshaped_test[i, j] = pd.Series(test[i][:][j])

X_test = pd.DataFrame(reshaped_test)

null_counts = X_test.map(lambda x: x.isna().sum() if isinstance(x, pd.Series) else 0).sum().sum()

X_test = X_test.map(lambda x: x.fillna(0) if isinstance(x, pd.Series) else x) # there were NaNs in the data, which is odd, so I'm filling them with 0
print(f"Number of null values in X_test: {null_counts}")
X_test.shape

Number of null values in X_test: 309


(10000, 12)

In [8]:
filtered_labels = labels[labels['study_id'].isin(split_csv[split_csv['split'] == 'test']['study_id'])]
y_test = filtered_labels['flag']
y_test = y_test.replace({'abnormal': 1, np.nan: 0})
y_test.shape

  y_test = y_test.replace({'abnormal': 1, np.nan: 0})


(10000,)

In [9]:
for i in range(1, 100000, 5000):
    classifier = RocketClassifier(rocket_transform='minirocket', num_kernels=10000)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auroc = roc_auc_score(y_val, y_pred)
    print(f"Kernels: {i}, Accuracy: {accuracy:.5f}, F1 Score: {f1:.5f}, AUROC: {auroc:.5f}")
    
    if i == 1:
        scores_df = pd.DataFrame(columns=['Kernels', 'Accuracy', 'F1 Score', 'AUROC'])
    
    scores_df.loc[i] = [i, accuracy, f1, auroc]

Kernels: 1, Accuracy: 0.91406, F1 Score: 0.15385, AUROC: 0.54295
Kernels: 5001, Accuracy: 0.91406, F1 Score: 0.15385, AUROC: 0.54295
Kernels: 10001, Accuracy: 0.90625, F1 Score: 0.14286, AUROC: 0.53875
Kernels: 15001, Accuracy: 0.92188, F1 Score: 0.16667, AUROC: 0.54715
Kernels: 20001, Accuracy: 0.90625, F1 Score: 0.00000, AUROC: 0.48739
Kernels: 25001, Accuracy: 0.91406, F1 Score: 0.00000, AUROC: 0.49160
Kernels: 30001, Accuracy: 0.91406, F1 Score: 0.15385, AUROC: 0.54295
Kernels: 35001, Accuracy: 0.92188, F1 Score: 0.16667, AUROC: 0.54715
Kernels: 40001, Accuracy: 0.91406, F1 Score: 0.15385, AUROC: 0.54295
Kernels: 45001, Accuracy: 0.90625, F1 Score: 0.14286, AUROC: 0.53875
Kernels: 50001, Accuracy: 0.90625, F1 Score: 0.14286, AUROC: 0.53875
Kernels: 55001, Accuracy: 0.89062, F1 Score: 0.12500, AUROC: 0.53035
Kernels: 60001, Accuracy: 0.92188, F1 Score: 0.16667, AUROC: 0.54715
Kernels: 65001, Accuracy: 0.89062, F1 Score: 0.12500, AUROC: 0.53035
Kernels: 70001, Accuracy: 0.89844, F1 S

In [10]:
classifier = RocketClassifier(rocket_transform='minirocket', num_kernels=75001)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

auroc = roc_auc_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"AUROC: {auroc}")
print(f"F1 Score: {f1}")

Accuracy: 0.8598
AUROC: 0.5079499512205601
F1 Score: 0.0907911802853437
