In [1]:
!pip install wfdb



In [2]:
import pandas as pd
import numpy as np
import wfdb
import ast

In [3]:
import os

In [4]:
def load_raw_data(df, sampling_rate, path):
    if sampling_rate == 100:
        data = [wfdb.rdsamp(os.path.join(path,f)) for f in df.filename_lr]
    else:
        data = [wfdb.rdsamp(os.path.join(path,f)) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

In [5]:
path='/kaggle/input/ecg-data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'
sampling_rate=100

# load and convert annotation data
Y = pd.read_csv(os.path.join(path,'ptbxl_database.csv'), index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))


In [6]:
Y.scp_codes 

ecg_id
1                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                             {'NORM': 80.0, 'SBRAD': 0.0}
3                               {'NORM': 100.0, 'SR': 0.0}
4                               {'NORM': 100.0, 'SR': 0.0}
5                               {'NORM': 100.0, 'SR': 0.0}
                               ...                        
21833    {'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...
21834             {'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}
21835                           {'ISCAS': 50.0, 'SR': 0.0}
21836                           {'NORM': 100.0, 'SR': 0.0}
21837                           {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21799, dtype: object

In [7]:
Y.scp_codes.shape

(21799,)

In [8]:
X = load_raw_data(Y, sampling_rate, path)


In [9]:
agg_df = pd.read_csv(os.path.join(path,'scp_statements.csv'), index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]


In [10]:
def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

In [11]:
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [125]:
test_fold = 10

X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass

X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass

In [126]:
X_train.shape

(19601, 1000, 12)

In [127]:
y_train.head

<bound method NDFrame.head of ecg_id
1        [NORM]
2        [NORM]
3        [NORM]
4        [NORM]
5        [NORM]
          ...  
21833    [STTC]
21834    [NORM]
21835    [STTC]
21836    [NORM]
21837    [NORM]
Name: diagnostic_superclass, Length: 19601, dtype: object>

In [128]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 19601 entries, 1 to 21837
Series name: diagnostic_superclass
Non-Null Count  Dtype 
--------------  ----- 
19601 non-null  object
dtypes: object(1)
memory usage: 306.3+ KB


In [129]:
np.unique(y_train).shape[0]


22

In [130]:
print(type(y_train))


<class 'pandas.core.series.Series'>


In [131]:
print(y_train.apply(type).value_counts())

diagnostic_superclass
<class 'list'>    19601
Name: count, dtype: int64


In [132]:
y_train.head()

ecg_id
1    [NORM]
2    [NORM]
3    [NORM]
4    [NORM]
5    [NORM]
Name: diagnostic_superclass, dtype: object

In [133]:
def has_two_unique_values(cell):
  # Change delimiter if needed
    return len(set(cell)) == 2


In [134]:
y_train[:]

ecg_id
1        [NORM]
2        [NORM]
3        [NORM]
4        [NORM]
5        [NORM]
          ...  
21833    [STTC]
21834    [NORM]
21835    [STTC]
21836    [NORM]
21837    [NORM]
Name: diagnostic_superclass, Length: 19601, dtype: object

In [135]:
result=y_train.apply(has_two_unique_values)

In [136]:
result.value_counts()

diagnostic_superclass
False    15931
True      3670
Name: count, dtype: int64

In [137]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train_encoded = pd.DataFrame(mlb.fit_transform(y_train), columns=mlb.classes_)


In [138]:
y_train_encoded.shape

(19601, 5)

In [139]:
y_train_encoded.head()

Unnamed: 0,CD,HYP,MI,NORM,STTC
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [140]:
type(y_train_encoded)

pandas.core.frame.DataFrame

In [141]:
def convert_columns_to_int(df, column_names):
    for column in column_names:
        df[column] = df[column].astype(int)
    return df


In [142]:
y_train = convert_columns_to_int(y_train_encoded, ['CD', 'HYP', 'MI','NORM','STTC'])


In [143]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19601 entries, 0 to 19600
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   CD      19601 non-null  int64
 1   HYP     19601 non-null  int64
 2   MI      19601 non-null  int64
 3   NORM    19601 non-null  int64
 4   STTC    19601 non-null  int64
dtypes: int64(5)
memory usage: 765.8 KB


In [116]:
# def merge_columns_to_list(df, column_names, new_column_name):
#     df[new_column_name] = df[column_names].apply(lambda row: list(row), axis=1)
#     return df


In [117]:
# y_train_merged = merge_columns_to_list(y_train, ['CD', 'HYP', 'MI','NORM','STTC'], 'merged_list')
# y_train_merged


Unnamed: 0,CD,HYP,MI,NORM,STTC,merged_list
0,0,0,0,1,0,"[0, 0, 0, 1, 0]"
1,0,0,0,1,0,"[0, 0, 0, 1, 0]"
2,0,0,0,1,0,"[0, 0, 0, 1, 0]"
3,0,0,0,1,0,"[0, 0, 0, 1, 0]"
4,0,0,0,1,0,"[0, 0, 0, 1, 0]"
...,...,...,...,...,...,...
19596,0,0,0,0,1,"[0, 0, 0, 0, 1]"
19597,0,0,0,1,0,"[0, 0, 0, 1, 0]"
19598,0,0,0,0,1,"[0, 0, 0, 0, 1]"
19599,0,0,0,1,0,"[0, 0, 0, 1, 0]"


In [118]:
# y_train_merged.drop(columns=['CD','HYP','MI','NORM','STTC'],inplace=True)

In [119]:
# y_train_merged.head()

Unnamed: 0,merged_list
0,"[0, 0, 0, 1, 0]"
1,"[0, 0, 0, 1, 0]"
2,"[0, 0, 0, 1, 0]"
3,"[0, 0, 0, 1, 0]"
4,"[0, 0, 0, 1, 0]"


In [120]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


model = Sequential()

model.add(LSTM(units=50, activation='tanh', return_sequences=True, input_shape=(1000, 12)))

model.add(LSTM(units=50, activation='tanh', return_sequences=False))

model.add(Dense(units=5, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


  super().__init__(**kwargs)


In [None]:
history = model.fit(
 X_train, 
 y_train,
   epochs=20,  # Number of epochs
    batch_size=32,  
    validation_split=0.2  )


Epoch 1/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 776ms/step - accuracy: 0.4185 - loss: 0.5481 - val_accuracy: 0.3469 - val_loss: 0.5255
Epoch 2/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 770ms/step - accuracy: 0.4215 - loss: 0.5110 - val_accuracy: 0.4239 - val_loss: 0.5265
Epoch 3/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m377s[0m 769ms/step - accuracy: 0.4663 - loss: 0.5006 - val_accuracy: 0.3553 - val_loss: 0.5587
Epoch 4/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 778ms/step - accuracy: 0.4341 - loss: 0.5364 - val_accuracy: 0.3553 - val_loss: 0.5571
Epoch 5/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 777ms/step - accuracy: 0.4321 - loss: 0.5368 - val_accuracy: 0.3718 - val_loss: 0.5471
Epoch 6/20
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 775ms/step - accuracy: 0.4406 - loss: 0.5370 - val_accuracy: 0.3578 - val_loss: 0.5539
Epoc