In [4]:
from clinical_ts.timeseries_utils import *
from clinical_ts.ecg_utils import *
from pathlib import Path
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## PTB-XL

In [5]:
target_fs=100 # sampling rate (100 Hz or 500 Hz)
data_folder_ptb_xl = Path("/home/shared/physionet.org/files/ptb-xl/1.0.3")
target_folder_ptb_xl = Path("./processed_ptb_xl_fs"+str(target_fs)) 

In [6]:
# v1 rough thresholds 90
thresholds_v1 = {
    "age": [30, 40, 50, 60, 70, 80],
    "weight": [60, 80, 100],
    "height": [160, 170, 180],
}
# percentile thresholds 92
thresholds_v2 = {
    "age": [42, 53, 60, 67, 75, 89],
    "weight": [57, 65, 73, 82],
    "height": [157, 163, 169, 175],
}

# common practive thresholds 91
thresholds_v3 = {
    "age": [12, 17, 34, 54, 74],
    "weight": [50, 70, 90, 110],
    "height": [150, 159, 169, 179],
}

In [7]:
# Prepare the dataset
df_ptb_xl, lbl_itos_ptb_xl,  mean_ptb_xl, std_ptb_xl = prepare_data_ptb_xl(data_folder_ptb_xl, min_cnt=0, target_fs=target_fs, channels=12, channel_stoi=channel_stoi_default, target_folder=target_folder_ptb_xl, thresholds=thresholds_v1)

df_ptb_xl columns Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr'],
      dtype='object')
column_labels ['label_age', 'label_weight', 'label_height']
df_ptb_xl columns Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_l

  0%|          | 10/21799 [00:01<39:48,  9.12it/s] 

Age value counts:  label_age
[0]    6
[3]    3
[1]    1
[2]    1
Name: count, dtype: int64
Sex value counts:  label_sex
[1]    6
[0]    5
Name: count, dtype: int64
Weight value counts:  label_weight
[1]    5
[2]    3
[0]    3
Name: count, dtype: int64
Height value counts:  label_height
[1]    11
Name: count, dtype: int64
BMI value counts:  label_bmi
[1]    11
Name: count, dtype: int64
Diagnosis value counts:  label_diag
[NORM]    10
[IMI]      1
Name: count, dtype: int64
Form value counts:  label_form
[]         9
[LVOLT]    1
[ABQRS]    1
Name: count, dtype: int64
Rhythm value counts:  label_rhythm
[SR]       9
[SBRAD]    1
[SARRH]    1
Name: count, dtype: int64





In [8]:
print(lbl_itos_ptb_xl.keys())

dict_keys(['label_all', 'label_diag', 'label_form', 'label_rhythm', 'label_diag_subclass', 'label_diag_superclass', 'label_age', 'label_weight', 'label_height'])


In [9]:
#reformat everything as memmap for efficiency
reformat_as_memmap(df_ptb_xl, target_folder_ptb_xl/("memmap.npy"),data_folder=target_folder_ptb_xl,delete_npys=True)

100%|██████████| 11/11 [00:00<00:00, 93.51it/s]




Saved df to processed_ptb_xl_fs100/df_memmap.pkl


Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,label_diag_subclass_numeric,label_diag_superclass_numeric,label_age_numeric,label_weight_numeric,label_height_numeric,data,data_mean,data_std,data_length,data_original
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,[14],[3],[3],[1],[1],0,"[0.0025305834, 0.002238605, -0.001173855, 0.00...","[0.11038501, 0.085274525, 0.113332465, 0.21522...",1000,00001_hr.npy
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,[14],[3],[0],[1],[1],1,"[-0.0035542173, -0.0034309644, 0.0044596153, 0...","[0.1301214, 0.24356228, 0.21888414, 0.42209703...",1000,00002_hr.npy
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,[14],[3],[1],[1],[1],2,"[-0.0027685824, 0.0057316544, -0.00033551158, ...","[0.1212705, 0.14372103, 0.11626389, 0.14763561...",1000,00003_hr.npy
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,[14],[3],[0],[2],[1],3,"[-0.0011030551, 0.016957743, -0.0035870918, -0...","[0.13666143, 0.4285196, 0.18045565, 0.42638788...",1000,00004_hr.npy
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,[14],[3],[0],[1],[1],4,"[-0.0036089725, -0.0025976324, 0.001548249, 0....","[0.08276952, 0.23692654, 0.14219096, 0.3419675...",1000,00005_hr.npy
6,19005.0,18.0,1,,58.0,2.0,0.0,CS-12 E,1984-11-28 13:32:13,sinusrhythmus normales ekg,...,[14],[3],[0],[0],[1],5,"[0.00029942972, 0.0025413984, -0.0010380992, 0...","[0.13084264, 0.1785671, 0.39134002, 0.4245493,...",1000,00006_hr.npy
7,16193.0,54.0,0,,83.0,2.0,0.0,CS-12 E,1984-11-28 13:32:22,"sinusrhythmus linkstyp t abnormal, wahrscheinl...",...,[14],[3],[3],[2],[1],6,"[-0.0006319369, 0.00217862, -0.0015123894, 0.0...","[0.25505987, 0.11079597, 0.14884469, 0.2033381...",1000,00007_hr.npy
8,11275.0,48.0,0,,95.0,2.0,0.0,CS-12 E,1984-12-01 14:49:52,sinusrhythmus linkstyp qrs(t) abnormal infe...,...,[4],[2],[2],[2],[1],7,"[0.0024964563, 0.0043863133, 0.00013577427, -4...","[0.16869786, 0.33999082, 0.20372091, 0.4219237...",1000,00008_hr.npy
9,18792.0,55.0,0,,70.0,2.0,0.0,CS-12 E,1984-12-08 09:44:43,sinusrhythmus normales ekg,...,[14],[3],[3],[1],[1],8,"[-0.0032755015, -0.0052576396, 0.0016908003, -...","[0.08748247, 0.1136266, 0.17873618, 0.35639262...",1000,00009_hr.npy
10,9456.0,22.0,1,,56.0,2.0,0.0,CS-12 E,1984-12-12 14:12:46,sinusrhythmus normales ekg,...,[14],[3],[0],[0],[1],9,"[-0.002399387, 0.0029389348, -0.002406399, -0....","[0.09262304, 0.1526876, 0.12470802, 0.2480076,...",1000,00010_hr.npy


In [10]:
df_ptb_xl.columns

Index(['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr',
       'label_all', 'label_diag', 'label_form', 'label_rhythm',
       'label_diag_subclass', 'label_diag_superclass', 'bmi', 'label_bmi',
       'label_age', 'label_sex', 'label_weight', 'label_height', 'dataset',
       'label_all_numeric', 'label_diag_numeric', 'label_form_numeric',
       'label_rhythm_numeric', 'label_diag_subclass_numeric',
       'label_diag_superclass_numeric', 'label_age_numeric',
       'label_weight_numeric', 'label_height_numeric', 'data', 'data_mean',
       'data_std', 'data_length'],
      dtype='object')

In [11]:
# print the first dataline of the dataframe
df_ptb_xl.iloc[0]

patient_id                                                                 15709.0
age                                                                           56.0
sex                                                                              1
height                                                                         NaN
weight                                                                        63.0
nurse                                                                          2.0
site                                                                           0.0
device                                                                   CS-12   E
recording_date                                                 1984-11-09 09:17:34
report                                      sinusrhythmus periphere niederspannung
scp_codes                                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
heart_axis                                                                     NaN
infa

In [12]:
# check all the columns and the datatypes
df_ptb_xl.dtypes

patient_id                       float64
age                              float64
sex                                int64
height                           float64
weight                           float64
nurse                            float64
site                             float64
device                            object
recording_date                    object
report                            object
scp_codes                         object
heart_axis                        object
infarction_stadium1               object
infarction_stadium2               object
validated_by                     float64
second_opinion                      bool
initial_autogenerated_report        bool
validated_by_human                  bool
baseline_drift                    object
static_noise                      object
burst_noise                       object
electrodes_problems               object
extra_beats                       object
pacemaker                         object
strat_fold      

## PTB-XL dataloader

In [13]:
input_size = 1000  # Sample length

chunkify_train = False
chunk_length_train = input_size if chunkify_train else 0
stride_train = input_size
        
chunkify_valtest = False
chunk_length_valtest = input_size if chunkify_valtest else 0
stride_valtest = input_size

In [14]:
df_mapped, lbl_itos,  mean, std = load_dataset(target_folder_ptb_xl)
print("lbl_itos", lbl_itos.keys())
for key in lbl_itos:
    print(key, lbl_itos[key], len(lbl_itos[key]))
print(df_mapped.columns)

ds_mean = np.array([-0.00184586, -0.00130277,  0.00017031, -0.00091313, -0.00148835,  -0.00174687, -0.00077071, -0.00207407,  0.00054329,  0.00155546,  -0.00114379, -0.00035649])
ds_std = np.array([0.16401004, 0.1647168 , 0.23374124, 0.33767231, 0.33362807,  0.30583013, 0.2731171 , 0.27554379, 0.17128962, 0.14030828,   0.14606956, 0.14656108])

def multihot_encode(x, num_classes):
    res = np.zeros(num_classes,dtype=np.float32)
    for y in x:
        res[y]=1
    return res

# multiple labels
all_meta_labels = ['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'recording_date', 'report', 'scp_codes', 'heart_axis',
       'infarction_stadium1', 'infarction_stadium2', 'validated_by',
       'second_opinion', 'initial_autogenerated_report', 'validated_by_human',
       'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
       'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr',
       'label_all', 'label_diag', 'label_form', 'label_rhythm',
       'label_diag_subclass', 'label_diag_superclass', 'dataset',
       'label_all_numeric', 'label_diag_numeric', 'label_form_numeric',
       'label_rhythm_numeric', 'label_diag_subclass_numeric',
       'label_diag_superclass_numeric', 'data', 'data_mean', 'data_std',
       'data_length']

ptb_xl_label_demographcis = ["label_all", "label_age", "label_sex", "label_height", "label_weight"]
# Label all has 71 classes, label age has 7 classes, label sex has 2 classes, label height has 5 classes, label weight has 5 classes

df_mapped["label"] = df_mapped.apply(
    lambda row: np.concatenate([
        multihot_encode(row[label+"_numeric"], len(lbl_itos[label]))
        for label in ptb_xl_label_demographcis
    ]),
    axis=1
)

# print(df_mapped["label"])
print(len(df_mapped["label"].iloc[0]))
tfms_ptb_xl_cpc = ToTensor()
            
max_fold_id = df_mapped.strat_fold.max()
print(df_mapped["strat_fold"].value_counts())

df_train = df_mapped[df_mapped.strat_fold<max_fold_id-1]
df_val = df_mapped[df_mapped.strat_fold==max_fold_id-1]
df_test = df_mapped[df_mapped.strat_fold==max_fold_id]

lbl_itos dict_keys(['label_all', 'label_diag', 'label_form', 'label_rhythm', 'label_diag_subclass', 'label_diag_superclass', 'label_age', 'label_weight', 'label_height'])
label_all ['1AVB' '2AVB' '3AVB' 'ABQRS' 'AFIB' 'AFLT' 'ALMI' 'AMI' 'ANEUR' 'ASMI'
 'BIGU' 'CLBBB' 'CRBBB' 'DIG' 'EL' 'HVOLT' 'ILBBB' 'ILMI' 'IMI' 'INJAL'
 'INJAS' 'INJIL' 'INJIN' 'INJLA' 'INVT' 'IPLMI' 'IPMI' 'IRBBB' 'ISCAL'
 'ISCAN' 'ISCAS' 'ISCIL' 'ISCIN' 'ISCLA' 'ISC_' 'IVCD' 'LAFB' 'LAO/LAE'
 'LMI' 'LNGQT' 'LOWT' 'LPFB' 'LPR' 'LVH' 'LVOLT' 'NDT' 'NORM' 'NST_' 'NT_'
 'PAC' 'PACE' 'PMI' 'PRC(S)' 'PSVT' 'PVC' 'QWAVE' 'RAO/RAE' 'RVH' 'SARRH'
 'SBRAD' 'SEHYP' 'SR' 'STACH' 'STD_' 'STE_' 'SVARR' 'SVTAC' 'TAB_' 'TRIGU'
 'VCLVH' 'WPW'] 71
label_diag ['1AVB' '2AVB' '3AVB' 'ALMI' 'AMI' 'ANEUR' 'ASMI' 'CLBBB' 'CRBBB' 'DIG'
 'EL' 'ILBBB' 'ILMI' 'IMI' 'INJAL' 'INJAS' 'INJIL' 'INJIN' 'INJLA' 'IPLMI'
 'IPMI' 'IRBBB' 'ISCAL' 'ISCAN' 'ISCAS' 'ISCIL' 'ISCIN' 'ISCLA' 'ISC_'
 'IVCD' 'LAFB' 'LAO/LAE' 'LMI' 'LNGQT' 'LPFB' 'LVH' 'NDT' 'N

KeyError: 'label_sex_numeric'

In [12]:
df_train["label"].iloc[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0.], dtype=float32)

In [13]:
# Here are the PTB-XL dataloaders

ds_train=TimeseriesDatasetCrops(df_train,input_size,num_classes=len(lbl_itos),data_folder=target_folder_ptb_xl,chunk_length=chunk_length_train,min_chunk_length=input_size, stride=stride_train,transforms=tfms_ptb_xl_cpc,annotation=False,col_lbl ="label" ,memmap_filename=target_folder_ptb_xl/("memmap.npy"))
ds_val=TimeseriesDatasetCrops(df_val,input_size,num_classes=len(lbl_itos),data_folder=target_folder_ptb_xl,chunk_length=chunk_length_valtest,min_chunk_length=input_size, stride=stride_valtest,transforms=tfms_ptb_xl_cpc,annotation=False,col_lbl ="label",memmap_filename=target_folder_ptb_xl/("memmap.npy"))
ds_test=TimeseriesDatasetCrops(df_test,input_size,num_classes=len(lbl_itos),data_folder=target_folder_ptb_xl,chunk_length=chunk_length_valtest,min_chunk_length=input_size, stride=stride_valtest,transforms=tfms_ptb_xl_cpc,annotation=False,col_lbl ="label",memmap_filename=target_folder_ptb_xl/("memmap.npy"))

In [14]:
# Save splits into npy files with data and labels
train_data_npy = []
train_label_npy = []
for i in range(len(ds_train)):
    train_data_npy.append(ds_train[i].data)
    train_label_npy.append(ds_train[i].label)
train_data_npy = np.array(train_data_npy)
trai_label_npy = np.array(train_label_npy)

np.save(target_folder_ptb_xl/"ptbxl_train_data.npy", train_data_npy)
np.save(target_folder_ptb_xl/"ptbxl_train_labels.npy", train_label_npy)

val_data_npy = []
val_label_npy = []
for i in range(len(ds_val)):
    val_data_npy.append(ds_val[i].data)
    val_label_npy.append(ds_val[i].label)
val_data_npy = np.array(val_data_npy)
val_label_npy = np.array(val_label_npy)

np.save(target_folder_ptb_xl/"ptbxl_val_data.npy", val_data_npy)
np.save(target_folder_ptb_xl/"ptbxl_val_labels.npy", val_label_npy)


In [15]:
test_data_npy = []
test_label_npy = []
for i in range(len(ds_test)):
    test_data_npy.append(ds_test[i].data)
    test_label_npy.append(ds_test[i].label)
test_data_npy = np.array(test_data_npy)
test_label_npy = np.array(test_label_npy)

np.save(target_folder_ptb_xl/"ptbxl_test_data.npy", test_data_npy)
np.save(target_folder_ptb_xl/"ptbxl_test_labels.npy", test_label_npy)

In [16]:
# Load and check the shape of the saved npy files
train_data = np.load(target_folder_ptb_xl/"ptbxl_train_data.npy")
train_labels = np.load(target_folder_ptb_xl/"ptbxl_train_labels.npy")
print(train_data.shape)
print(train_labels.shape)

(17418, 12, 1000)
(17418, 92)


In [None]:
# Load the numpy data

val_data = np.load(target_folder_ptb_xl/"ptbxl_val_data.npy")


In [28]:
label_all = [
    '1AVB', '2AVB', '3AVB', 'ABQRS', 'AFIB', 'AFLT', 'ALMI', 'AMI', 'ANEUR', 'ASMI',
    'BIGU', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'HVOLT', 'ILBBB', 'ILMI', 'IMI', 'INJAL',
    'INJAS', 'INJIL', 'INJIN', 'INJLA', 'INVT', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL',
    'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE',
    'LMI', 'LNGQT', 'LOWT', 'LPFB', 'LPR', 'LVH', 'LVOLT', 'NDT', 'NORM', 'NST_',
    'NT_', 'PAC', 'PACE', 'PMI', 'PRC(S)', 'PSVT', 'PVC', 'QWAVE', 'RAO/RAE', 'RVH',
    'SARRH', 'SBRAD', 'SEHYP', 'SR', 'STACH', 'STD_', 'STE_', 'SVARR', 'SVTAC', 'TAB_',
    'TRIGU', 'VCLVH', 'WPW'
]
print("label_all", len(label_all))
label_diag = [
    '1AVB', '2AVB', '3AVB', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'CLBBB', 'CRBBB', 'DIG',
    'EL', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI',
    'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_',
    'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LPFB', 'LVH', 'NDT', 'NORM', 'NST_',
    'PMI', 'RAO/RAE', 'RVH', 'SEHYP', 'WPW'
]
print("label_diag", len(label_diag))
label_form = [
    'ABQRS', 'DIG', 'HVOLT', 'INVT', 'LNGQT', 'LOWT', 'LPR', 'LVOLT', 'NDT', 'NST_',
    'NT_', 'PAC', 'PRC(S)', 'PVC', 'QWAVE', 'STD_', 'STE_', 'TAB_', 'VCLVH'
]
print("label_form", len(label_form))
label_rhythm = [
    'AFIB', 'AFLT', 'BIGU', 'PACE', 'PSVT', 'SARRH', 'SBRAD', 'SR', 'STACH', 'SVARR',
    'SVTAC', 'TRIGU'
]
print("label_rhythm", len(label_rhythm))
label_diag_superclass = [
    'CD', 'HYP', 'MI', 'NORM', 'STTC'
]
print("label_diag_superclass", len(label_diag_superclass))
label_diag_subclass = [
    'AMI', 'CLBBB', 'CRBBB', 'ILBBB', 'IMI', 'IRBBB', 'ISCA', 'ISCI', 'ISC_', 'IVCD',
    'LAFB/LPFB', 'LAO/LAE', 'LMI', 'LVH', 'NORM', 'NST_', 'PMI', 'RAO/RAE', 'RVH',
    'SEHYP', 'STTC', 'WPW', '_AVB'
]
print("label_diag_subclass", len(label_diag_subclass))
# add and deduplicate the labels
label_agg = label_diag + label_form + label_rhythm
print("before deduplication", len(label_agg))
label_agg_de = list(set(label_agg))
print("after deduplication", len(label_agg_de))

# get the difference between the two lists
print(label_agg_de)
print(label_agg)

label_all 71
label_diag 44
label_form 19
label_rhythm 12
label_diag_superclass 5
label_diag_subclass 23
before deduplication 75
after deduplication 71
['CLBBB', 'ISCAL', 'SVARR', 'LVH', 'IMI', 'WPW', 'ISCIL', 'STE_', 'BIGU', 'ASMI', 'IVCD', 'ALMI', 'LMI', 'ABQRS', 'ISC_', 'PRC(S)', 'NDT', 'PVC', 'QWAVE', 'PACE', 'SBRAD', 'SARRH', 'SEHYP', 'IPLMI', 'LNGQT', 'NST_', 'PMI', 'LAO/LAE', 'INJIN', 'LOWT', 'INJAS', 'TRIGU', 'IRBBB', 'ISCLA', 'ISCAS', 'STD_', 'RVH', 'NT_', '1AVB', 'CRBBB', 'PAC', 'ILMI', 'LPFB', 'LVOLT', 'LAFB', 'LPR', 'AMI', 'ILBBB', 'DIG', 'ISCAN', 'PSVT', 'RAO/RAE', 'EL', 'SR', 'STACH', 'ANEUR', 'IPMI', 'NORM', 'INJIL', 'HVOLT', 'INVT', 'INJAL', 'VCLVH', 'AFLT', 'TAB_', 'SVTAC', 'AFIB', 'INJLA', 'ISCIN', '3AVB', '2AVB']
['1AVB', '2AVB', '3AVB', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB'

In [23]:
# get the difference fields between the two lists
print(label_all, len(label_all))
print(label_agg, len(label_agg))

for label in label_all:
    if label in label_agg:
        label_agg.remove(label)

print(label_agg, len(label_agg))


['1AVB', '2AVB', '3AVB', 'ABQRS', 'AFIB', 'AFLT', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'BIGU', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'HVOLT', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'INVT', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LOWT', 'LPFB', 'LPR', 'LVH', 'LVOLT', 'NDT', 'NORM', 'NST_', 'NT_', 'PAC', 'PACE', 'PMI', 'PRC(S)', 'PSVT', 'PVC', 'QWAVE', 'RAO/RAE', 'RVH', 'SARRH', 'SBRAD', 'SEHYP', 'SR', 'STACH', 'STD_', 'STE_', 'SVARR', 'SVTAC', 'TAB_', 'TRIGU', 'VCLVH', 'WPW'] 71
['1AVB', '2AVB', '3AVB', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LPFB', 'LVH', 'NDT', 'NORM', 'NST_', 'PMI', 'RAO/RAE', 'RVH', 'SEHYP', 'WPW', 'ABQRS', 'DIG', 'HVOLT', 'INVT', 'LNGQT'

In [36]:
print(label_agg)
print(label_all)
71 + 6 + 2 + 4 + 4 + 6 = 93

['WPW', 'ABQRS', 'IVCD', 'LNGQT', 'AFIB', 'LAFB', 'INVT', 'SVTAC', 'STD_', 'INJLA', 'INJIN', 'LAO/LAE', 'LPR', 'STE_', 'ALMI', 'NDT', 'LMI', 'NT_', 'QWAVE', 'INJAS', 'ILMI', 'BIGU', 'SVARR', 'ISCIL', 'ISCAL', 'AFLT', 'INJAL', 'PRC(S)', 'ANEUR', 'TRIGU', 'PACE', 'INJIL', 'TAB_', 'AMI', 'IRBBB', 'ISCLA', 'SBRAD', 'CLBBB', 'LOWT', 'SR', 'LPFB', 'DIG', 'HVOLT', 'EL', 'ISCAS', 'ASMI', 'PSVT', 'STACH', '2AVB', '3AVB', 'IMI', 'ISCAN', 'PMI', 'IPMI', 'RAO/RAE', 'LVOLT', 'PVC', 'CRBBB', 'NORM', 'VCLVH', 'LVH', 'ILBBB', 'ISC_', 'RVH', 'SEHYP', 'SARRH', 'NST_', 'PAC', 'IPLMI', '1AVB', 'ISCIN']
['1AVB', '2AVB', '3AVB', 'ABQRS', 'AFIB', 'AFLT', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'BIGU', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'HVOLT', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'INVT', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LOWT', 'LPFB', 'LPR', 'LVH', 'LVOLT', 'NDT', 'NORM', 'NST_', 'NT_', 'PA

In [29]:
column_to_dedup = ['DIG', 'LNGQT', 'NDT', 'NST_']
print(label_agg)
indexes = []
for column in column_to_dedup:
    if column in label_agg:
        # get the index of the column
        index = label_agg.index(column)
        indexes.append(index)

print(indexes)
# 44, 19, 12
# 40, 19, 12


['1AVB', '2AVB', '3AVB', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LPFB', 'LVH', 'NDT', 'NORM', 'NST_', 'PMI', 'RAO/RAE', 'RVH', 'SEHYP', 'WPW', 'ABQRS', 'DIG', 'HVOLT', 'INVT', 'LNGQT', 'LOWT', 'LPR', 'LVOLT', 'NDT', 'NST_', 'NT_', 'PAC', 'PRC(S)', 'PVC', 'QWAVE', 'STD_', 'STE_', 'TAB_', 'VCLVH', 'AFIB', 'AFLT', 'BIGU', 'PACE', 'PSVT', 'SARRH', 'SBRAD', 'SR', 'STACH', 'SVARR', 'SVTAC', 'TRIGU']
[9, 33, 36, 38]


In [34]:
# find for each field in the difference, which list is it in
for d in diff:
    if d in label_diag:
        print(d, "in label_diag")
    if d in label_form:
        print(d, "in label_form")
    if d in label_rhythm:
        print(d, "in label_rhythm")
    if d in label_diag_superclass:
        print(d, "in label_diag_superclass")
    if d in label_diag_subclass:
        print(d, "in label_diag_subclass")

In [18]:
['1AVB', '2AVB', '3AVB', 'ABQRS', 'AFIB', 'AFLT', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'BIGU', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'HVOLT', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'INVT', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LOWT', 'LPFB', 'LPR', 'LVH', 'LVOLT', 'NDT', 'NORM', 'NST_', 'NT_', 'PAC', 'PACE', 'PMI', 'PRC(S)', 'PSVT', 'PVC', 'QWAVE', 'RAO/RAE', 'RVH', 'SARRH', 'SBRAD', 'SEHYP', 'SR', 'STACH', 'STD_', 'STE_', 'SVARR', 'SVTAC', 'TAB_', 'TRIGU', 'VCLVH', 'WPW']
['1AVB', '2AVB', '3AVB', 'ALMI', 'AMI', 'ANEUR', 'ASMI', 'CLBBB', 'CRBBB', 'DIG', 'EL', 'ILBBB', 'ILMI', 'IMI', 'INJAL', 'INJAS', 'INJIL', 'INJIN', 'INJLA', 'IPLMI', 'IPMI', 'IRBBB', 'ISCAL', 'ISCAN', 'ISCAS', 'ISCIL', 'ISCIN', 'ISCLA', 'ISC_', 'IVCD', 'LAFB', 'LAO/LAE', 'LMI', 'LNGQT', 'LPFB', 'LVH', 'NDT', 'NORM', 'NST_', 'PMI', 'RAO/RAE', 'RVH', 'SEHYP', 'WPW', 'ABQRS', 'DIG', 'HVOLT', 'INVT', 'LNGQT', 'LOWT', 'LPR', 'LVOLT', 'NDT', 'NST_', 'NT_', 'PAC', 'PRC(S)', 'PVC', 'QWAVE', 'STD_', 'STE_', 'TAB_', 'VCLVH', 'AFIB', 'AFLT', 'BIGU', 'PACE', 'PSVT', 'SARRH', 'SBRAD', 'SR', 'STACH', 'SVARR', 'SVTAC', 'TRIGU']

['1AVB',
 '2AVB',
 '3AVB',
 'ALMI',
 'AMI',
 'ANEUR',
 'ASMI',
 'CLBBB',
 'CRBBB',
 'DIG',
 'EL',
 'ILBBB',
 'ILMI',
 'IMI',
 'INJAL',
 'INJAS',
 'INJIL',
 'INJIN',
 'INJLA',
 'IPLMI',
 'IPMI',
 'IRBBB',
 'ISCAL',
 'ISCAN',
 'ISCAS',
 'ISCIL',
 'ISCIN',
 'ISCLA',
 'ISC_',
 'IVCD',
 'LAFB',
 'LAO/LAE',
 'LMI',
 'LNGQT',
 'LPFB',
 'LVH',
 'NDT',
 'NORM',
 'NST_',
 'PMI',
 'RAO/RAE',
 'RVH',
 'SEHYP',
 'WPW',
 'ABQRS',
 'DIG',
 'HVOLT',
 'INVT',
 'LNGQT',
 'LOWT',
 'LPR',
 'LVOLT',
 'NDT',
 'NST_',
 'NT_',
 'PAC',
 'PRC(S)',
 'PVC',
 'QWAVE',
 'STD_',
 'STE_',
 'TAB_',
 'VCLVH',
 'AFIB',
 'AFLT',
 'BIGU',
 'PACE',
 'PSVT',
 'SARRH',
 'SBRAD',
 'SR',
 'STACH',
 'SVARR',
 'SVTAC',
 'TRIGU']