In [1]:
import pandas as pd
import numpy as np
import wfdb
import ast

In [2]:
import os

In [3]:
def load_raw_data(df, sampling_rate, path):
    if sampling_rate == 100:
        data = [wfdb.rdsamp(os.path.join(path,f)) for f in df.filename_lr]
    else:
        data = [wfdb.rdsamp(os.path.join(path,f)) for f in df.filename_hr]
    data = np.array([signal for signal, meta in data])
    return data

In [4]:
path='/kaggle/input/ecg-data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'
sampling_rate=100

# load and convert annotation data
Y = pd.read_csv(os.path.join(path,'ptbxl_database.csv'), index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))


In [5]:
Y.scp_codes 

ecg_id
1                 {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2                             {'NORM': 80.0, 'SBRAD': 0.0}
3                               {'NORM': 100.0, 'SR': 0.0}
4                               {'NORM': 100.0, 'SR': 0.0}
5                               {'NORM': 100.0, 'SR': 0.0}
                               ...                        
21833    {'NDT': 100.0, 'PVC': 100.0, 'VCLVH': 0.0, 'ST...
21834             {'NORM': 100.0, 'ABQRS': 0.0, 'SR': 0.0}
21835                           {'ISCAS': 50.0, 'SR': 0.0}
21836                           {'NORM': 100.0, 'SR': 0.0}
21837                           {'NORM': 100.0, 'SR': 0.0}
Name: scp_codes, Length: 21799, dtype: object

In [6]:
Y.scp_codes.shape

(21799,)

In [7]:
X = load_raw_data(Y, sampling_rate, path)


In [8]:
agg_df = pd.read_csv(os.path.join(path,'scp_statements.csv'), index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]


In [9]:
def aggregate_diagnostic(y_dic):
    tmp = []
    for key in y_dic.keys():
        if key in agg_df.index:
            tmp.append(agg_df.loc[key].diagnostic_class)
    return list(set(tmp))

In [10]:
Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_diagnostic)

In [11]:
test_fold = 10

X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[(Y.strat_fold != test_fold)].diagnostic_superclass

X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass

In [12]:
X_train.shape

(19601, 1000, 12)

In [13]:
y_train.head

<bound method NDFrame.head of ecg_id
1        [NORM]
2        [NORM]
3        [NORM]
4        [NORM]
5        [NORM]
          ...  
21833    [STTC]
21834    [NORM]
21835    [STTC]
21836    [NORM]
21837    [NORM]
Name: diagnostic_superclass, Length: 19601, dtype: object>

In [14]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 19601 entries, 1 to 21837
Series name: diagnostic_superclass
Non-Null Count  Dtype 
--------------  ----- 
19601 non-null  object
dtypes: object(1)
memory usage: 306.3+ KB


In [15]:
np.unique(y_train).shape[0]


22

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train_encoded = pd.DataFrame(mlb.fit_transform(y_train), columns=mlb.classes_)

In [17]:
y_train_encoded.shape

(19601, 5)

In [18]:
y_train_encoded.head()

Unnamed: 0,CD,HYP,MI,NORM,STTC
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [19]:
type(y_train_encoded)

pandas.core.frame.DataFrame

In [20]:
y_train_encoded['final_output']=y_train_encoded.apply(lambda x:''.join(x.astype(str)),axis=1)

In [21]:
y_train_encoded.head()

Unnamed: 0,CD,HYP,MI,NORM,STTC,final_output
0,0,0,0,1,0,10
1,0,0,0,1,0,10
2,0,0,0,1,0,10
3,0,0,0,1,0,10
4,0,0,0,1,0,10


In [22]:
y_train_encoded['final_output'].nunique()

22

In [23]:
y_train_encoded['final_output'].unique()

array(['00010', '00100', '00000', '00001', '01000', '10000', '00101',
       '11000', '10100', '10001', '01100', '01101', '01001', '10101',
       '10010', '11101', '11001', '00011', '11100', '10011', '01010',
       '11110'], dtype=object)

In [24]:
def adjust_classes(y_train):
    var=y_train
    if var == '00010':
        var=0
    elif var == '00100':
        var =1
    elif var== '00000':
        var=2
    elif var=='00001':
        var =3
    elif var =='01000':
        var=4
    elif var=='10000':
        var =5
    elif var =='00101':
        var =6
    elif var =='11000':
        var=7
    elif var =='10100':
        var=8
    elif var=='10001':
        var =9
    elif var =='01100':
        var =10
    elif var =='01101':
        var =11
    elif var=='01001':
        var=21
    elif var =='10101':
        var=12
    elif var =='10010':
        var=13
    elif var=='11101':
        var=14
    elif var=='11001':
        var=15
    elif var =='00011':
        var=16
    elif var=='11100':
        var=17
    elif var=='10011':
        var=18
    elif var =='01010':
        var=19
    elif var=='11110':
        var=20
    return var

In [25]:
y_train_encoded['final_output']=y_train_encoded['final_output'].apply(adjust_classes)

In [26]:
y_train_encoded

Unnamed: 0,CD,HYP,MI,NORM,STTC,final_output
0,0,0,0,1,0,0
1,0,0,0,1,0,0
2,0,0,0,1,0,0
3,0,0,0,1,0,0
4,0,0,0,1,0,0
...,...,...,...,...,...,...
19596,0,0,0,0,1,3
19597,0,0,0,1,0,0
19598,0,0,0,0,1,3
19599,0,0,0,1,0,0


In [27]:
y_train_encoded['final_output'].value_counts()

final_output
0     8157
1     2276
3     2158
5     1524
8     1164
21     708
6      538
4      479
9      433
2      371
13     362
11     320
7      273
12     202
15     186
10     166
14     140
17     112
16      24
18       5
19       2
20       1
Name: count, dtype: int64

In [28]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [29]:
x_train_normalization=X_train.reshape(-1,1000*12)

In [30]:
x_train=scaler.fit_transform(x_train_normalization)
x_train_normalized=x_train.reshape(19601,1000,12)

In [31]:
class_distribution_norm=y_train_encoded['NORM'].value_counts()
class_distribution_norm

NORM
0    11050
1     8551
Name: count, dtype: int64

In [32]:
class_distribution_cd=y_train_encoded['CD'].value_counts()
class_distribution_cd

CD
0    15199
1     4402
Name: count, dtype: int64

In [33]:
class_distribution_hyp=y_train_encoded['HYP'].value_counts()
class_distribution_hyp

HYP
0    17214
1     2387
Name: count, dtype: int64

In [34]:
class_distribution_mi=y_train_encoded['MI'].value_counts()
class_distribution_mi

MI
0    14682
1     4919
Name: count, dtype: int64

In [35]:
class_distribution_sttc=y_train_encoded['STTC'].value_counts()
class_distribution_sttc

STTC
0    14887
1     4714
Name: count, dtype: int64

In [36]:
indices_0 = np.where(y_train_encoded['NORM'] == 0)[0] 
indices_1 = np.where(y_train_encoded['NORM'] == 1)[0]
selected_indices_0=np.random.choice(indices_0, 4000, replace=False)
selected_indices_1=np.random.choice(indices_1, 4000, replace=False)
x_train_norm0=X_train[selected_indices_0]
x_train_norm1=X_train[selected_indices_1]
y_train_norm0=y_train_encoded.iloc[selected_indices_0,3]
y_train_norm1=y_train_encoded.iloc[selected_indices_1,3]
x_train= np.concatenate([x_train_norm0, x_train_norm1], axis=0)
y_train=np.concatenate([y_train_norm0, y_train_norm1], axis=0)

In [37]:
y_train.shape

(8000,)

In [38]:
x_train.shape

(8000, 1000, 12)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers, callbacks

def create_ecg_cnn_model(input_shape):
    model = models.Sequential()
    
    model.add(layers.Conv1D(filters=64, kernel_size=5, activation=None, input_shape=input_shape))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=0.1))  # Use LeakyReLU
    
    model.add(layers.Conv1D(filters=128, kernel_size=3, activation=None))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.MaxPooling1D(pool_size=2))
    
    model.add(layers.Conv1D(filters=256, kernel_size=3, activation=None))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.MaxPooling1D(pool_size=2))
    
    model.add(layers.Conv1D(filters=512, kernel_size=3, activation=None))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.MaxPooling1D(pool_size=2))
    
    model.add(layers.GlobalAveragePooling1D())
    
    model.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(layers.Dropout(0.5))
    
    model.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    model.add(layers.Dropout(0.5))
    
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

input_shape = (1000, 12)
model = create_ecg_cnn_model(input_shape)

model.summary()

early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

model.fit(x_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr])




Epoch 1/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 27ms/step - accuracy: 0.7559 - loss: 0.9054 - val_accuracy: 0.7430 - val_loss: 0.6963 - learning_rate: 0.0010
Epoch 2/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8196 - loss: 0.5996 - val_accuracy: 0.8094 - val_loss: 0.5368 - learning_rate: 0.0010
Epoch 3/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8392 - loss: 0.4949 - val_accuracy: 0.8305 - val_loss: 0.4785 - learning_rate: 0.0010
Epoch 4/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8528 - loss: 0.4404 - val_accuracy: 0.8492 - val_loss: 0.4180 - learning_rate: 0.0010
Epoch 5/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.8650 - loss: 0.3844 - val_accuracy: 0.8602 - val_loss: 0.3751 - learning_rate: 0.0010
Epoch 6/100
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x797c54f48550>

In [60]:
indices_0 = np.where(y_train_encoded['CD'] == 0)[0] 
indices_1 = np.where(y_train_encoded['CD'] == 1)[0]
selected_indices_0=np.random.choice(indices_0, 4000, replace=False)
selected_indices_1=np.random.choice(indices_1, 4000, replace=False)
x_train_norm0=X_train[selected_indices_0]
x_train_norm1=X_train[selected_indices_1]
y_train_norm0=y_train_encoded.iloc[selected_indices_0,3]
y_train_norm1=y_train_encoded.iloc[selected_indices_1,3]
x_train= np.concatenate([x_train_norm0, x_train_norm1], axis=0)
y_train=np.concatenate([y_train_norm0, y_train_norm1], axis=0)
x_train, val_data, y_train, val_labels = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Training the model
history = model.fit(
    x_train, y_train,
    epochs=100, 
    batch_size=32, 
    validation_data=(val_data, val_labels), 
    callbacks=[early_stopping]
)

Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.8941 - loss: 0.2766 - val_accuracy: 0.8913 - val_loss: 0.2611
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8922 - loss: 0.2579 - val_accuracy: 0.8931 - val_loss: 0.2550
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9041 - loss: 0.2530 - val_accuracy: 0.8913 - val_loss: 0.2602
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.8984 - loss: 0.2542 - val_accuracy: 0.8931 - val_loss: 0.2563
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8998 - loss: 0.2523 - val_accuracy: 0.8994 - val_loss: 0.2501
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8897 - loss: 0.2673 - val_accuracy: 0.8950 - val_loss: 0.2515
Epoch 7/100
[1m

In [61]:
indices_0 = np.where(y_train_encoded['STTC'] == 0)[0] 
indices_1 = np.where(y_train_encoded['STTC'] == 1)[0]
selected_indices_0=np.random.choice(indices_0, 4000, replace=False)
selected_indices_1=np.random.choice(indices_1, 4000, replace=False)
x_train_norm0=X_train[selected_indices_0]
x_train_norm1=X_train[selected_indices_1]
y_train_norm0=y_train_encoded.iloc[selected_indices_0,3]
y_train_norm1=y_train_encoded.iloc[selected_indices_1,3]
x_train= np.concatenate([x_train_norm0, x_train_norm1], axis=0)
y_train=np.concatenate([y_train_norm0, y_train_norm1], axis=0)
x_train, val_data, y_train, val_labels = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Training the model
history = model.fit(
    x_train, y_train,
    epochs=100, 
    batch_size=32, 
    validation_data=(val_data, val_labels), 
    callbacks=[early_stopping]
)

Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.8941 - loss: 0.2596 - val_accuracy: 0.8963 - val_loss: 0.2441
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9092 - loss: 0.2321 - val_accuracy: 0.9006 - val_loss: 0.2419
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9109 - loss: 0.2339 - val_accuracy: 0.8994 - val_loss: 0.2431
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9144 - loss: 0.2203 - val_accuracy: 0.9019 - val_loss: 0.2360
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9103 - loss: 0.2282 - val_accuracy: 0.8981 - val_loss: 0.2352
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9129 - loss: 0.2182 - val_accuracy: 0.8981 - val_loss: 0.2376
Epoch 7/100
[1m

In [62]:
indices_0 = np.where(y_train_encoded['MI'] == 0)[0] 
indices_1 = np.where(y_train_encoded['MI'] == 1)[0]
selected_indices_0=np.random.choice(indices_0, 4000, replace=False)
selected_indices_1=np.random.choice(indices_1, 4000, replace=False)
x_train_norm0=X_train[selected_indices_0]
x_train_norm1=X_train[selected_indices_1]
y_train_norm0=y_train_encoded.iloc[selected_indices_0,3]
y_train_norm1=y_train_encoded.iloc[selected_indices_1,3]
x_train= np.concatenate([x_train_norm0, x_train_norm1], axis=0)
y_train=np.concatenate([y_train_norm0, y_train_norm1], axis=0)
x_train, val_data, y_train, val_labels = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Training the model
history = model.fit(
    x_train, y_train,
    epochs=100, 
    batch_size=32, 
    validation_data=(val_data, val_labels), 
    callbacks=[early_stopping]
)

Epoch 1/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.8987 - loss: 0.2477 - val_accuracy: 0.9031 - val_loss: 0.2405
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.9037 - loss: 0.2366 - val_accuracy: 0.9031 - val_loss: 0.2367
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9060 - loss: 0.2324 - val_accuracy: 0.8956 - val_loss: 0.2413
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9093 - loss: 0.2299 - val_accuracy: 0.8850 - val_loss: 0.2494
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9181 - loss: 0.2217 - val_accuracy: 0.9006 - val_loss: 0.2373
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9112 - loss: 0.2172 - val_accuracy: 0.8956 - val_loss: 0.2385
Epoch 7/100
[1m

In [63]:
indices_0 = np.where(y_train_encoded['HYP'] == 0)[0] 
indices_1 = np.where(y_train_encoded['HYP'] == 1)[0]
selected_indices_0=np.random.choice(indices_0, 2000, replace=False)
selected_indices_1=np.random.choice(indices_1, 2000, replace=False)
x_train_norm0=X_train[selected_indices_0]
x_train_norm1=X_train[selected_indices_1]
y_train_norm0=y_train_encoded.iloc[selected_indices_0,3]
y_train_norm1=y_train_encoded.iloc[selected_indices_1,3]
x_train= np.concatenate([x_train_norm0, x_train_norm1], axis=0)
y_train=np.concatenate([y_train_norm0, y_train_norm1], axis=0)
x_train, val_data, y_train, val_labels = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Training the model
history = model.fit(
    x_train, y_train,
    epochs=100, 
    batch_size=32, 
    validation_data=(val_data, val_labels), 
    callbacks=[early_stopping]
)

Epoch 1/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8914 - loss: 0.2538 - val_accuracy: 0.8800 - val_loss: 0.2790
Epoch 2/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9166 - loss: 0.2153 - val_accuracy: 0.9025 - val_loss: 0.2360
Epoch 3/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9104 - loss: 0.2217 - val_accuracy: 0.9100 - val_loss: 0.2269
Epoch 4/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9180 - loss: 0.2060 - val_accuracy: 0.9100 - val_loss: 0.2284
Epoch 5/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9105 - loss: 0.2189 - val_accuracy: 0.9050 - val_loss: 0.2330
Epoch 6/100
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9155 - loss: 0.2003 - val_accuracy: 0.9038 - val_loss: 0.2321
Epoch 7/100
[1m