* Imports and definitions

In [1]:
import plotly.graph_objects as go
from scipy import *
import numpy as np
import pywt
import pandas as pd
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


samplingRate = 360  # Hz (given in the doc)

In [2]:
def draw(amplitudes, indices=None):
    if type(indices) == "list":  # if we have certain indices or not
        fig_cont = go.Figure()
        fig_cont.add_trace(
            go.Scatter(x=indices, y=amplitudes, mode="lines", name="Continuous Signal")
        )
    else:
        fig_cont = go.Figure()
        fig_cont.add_trace(
            go.Scatter(y=amplitudes, mode="lines", name="Continuous Signal")
        )
    fig_cont.update_layout(
        title="Continuous Signal",
        xaxis_title="Index",
        yaxis_title="Amplitude",
        width=1000,
        height=500,
    )

    fig_cont.show()  # For notebooks

In [3]:
# HB means HeartBeat
def readHB(fileName):
    signals = []
    with open(fileName) as l:
        line = l.readline()
        while line:
            signal = line.split("|")
            signals.append([float(value) for value in signal[:-1]])
            line = l.readline()
    return signals

In [4]:
dirPath = "/kaggle/input/ecg-signals/Data"

In [5]:
healthySignals = readHB(f"{dirPath}/Normal_Train.txt")

In [6]:
PVCSignals = readHB(f"{dirPath}/PVC_Train.txt")

## After some investigating , I noticed the following:
* healthySignals and PVCSignals each contain 200 signals
* Each signal has 300 points
* Max amplitude is 1.6692
* Min amplitude is -2.1939

In [7]:
draw(healthySignals[104])  # drawing a random signal to see what we are working with

In [8]:
draw(PVCSignals[180])

### There's a huge difference between the healthy and the PVC , hopefully the classification won't be hard

### Applying the Band Pass filter with cutoffs = [0.5,40]

In [9]:
b, a = signal.butter(
    N=4, Wn=[0.5, 40], btype="bandpass", fs=samplingRate
)  # getting the filter coofecients

In [10]:
filteredHealthy = [signal.lfilter(b, a, s) for s in healthySignals]
filteredPVC = [signal.lfilter(b, a, s) for s in PVCSignals]
filteredHealtyTest = [signal.lfilter(b, a, s) for s in readHB(f"{dirPath}/Normal_Test.txt")]
filteredPVCTest = [signal.lfilter(b, a, s) for s in readHB(f"{dirPath}/PVC_Test.txt")]

### The effect of filtering is very clear. Removed all unneeded noise

In [11]:
draw(filteredHealthy[104])

### Now will apply DWT and take the details coefficients because they contain the difference between normal and PVC signals

In [12]:
healthyDetails = [pywt.dwt(signal, "db4")[1] for signal in filteredHealthy]
PVCDetails = [pywt.dwt(signal, "db4")[1] for signal in filteredPVC]
healthyTestDetails = [pywt.dwt(signal, "db4")[1] for signal in filteredHealtyTest]
PVCTestDetails = [pywt.dwt(signal, "db4")[1] for signal in filteredPVCTest]

In [13]:
draw(healthyDetails[0])

In [14]:
draw(PVCDetails[0])

In [15]:
draw(healthyTestDetails[0])

### We can see that the difference between them is noticeable 

### We will use min-max scaling so the signal would range from [0,1]

In [16]:
def normalize(signal):
    return (signal - np.min(signal)) / (np.max(signal) - np.min(signal))

In [17]:
normalizedHealthy = [normalize(signal) for signal in healthyDetails]
normalizedPVC = [normalize(signal) for signal in PVCDetails]
normalizedHealthyTest = [normalize(signal) for signal in healthyTestDetails]
normalizedPVCTest = [normalize(signal) for signal in PVCTestDetails]

### A small comparison between processed signal vs raw signal

In [18]:
draw(PVCSignals[0])

In [19]:
draw(normalizedPVC[0])

* PVC : 1
* Healthy : 0

In [20]:
### Creating the DF and shuffling so its ready for training

In [21]:
data = normalizedHealthy + normalizedPVC
labels = list(np.zeros(200)) + list(np.ones(200))

df = pd.DataFrame({"x": data, "y": labels})
df = shuffle(df).reset_index(drop=True)

In [22]:
df

Unnamed: 0,x,y
0,"[0.5711671189355647, 0.5733331637514854, 0.571...",0.0
1,"[0.5468809245436146, 0.6457410851840105, 0.216...",1.0
2,"[0.4694308689854131, 0.3157035122797163, 1.0, ...",1.0
3,"[0.576817773449838, 0.5515553436457983, 0.6638...",0.0
4,"[0.5517256102384055, 0.5648281314692918, 0.498...",0.0
...,...,...
395,"[0.4550954160532016, 0.529602030652474, 0.2005...",1.0
396,"[0.5331833807041677, 0.5348793384344153, 0.534...",0.0
397,"[0.4638379780462974, 0.30198534955047845, 1.0,...",0.0
398,"[0.37333285511192127, 0.48618270022087695, 0.0...",1.0


In [23]:
### Creating the TestDF and shuffling so its ready for testing

In [24]:
testData = normalizedHealthyTest + normalizedPVCTest
testLabels = list(np.zeros(200)) + list(np.ones(200))

testDF = pd.DataFrame({"x": data, "y": labels})
testDF = shuffle(df).reset_index(drop=True)

### Start training

In [25]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(list(df["x"]),list(df["y"]))

### Predict data and Measuring accuracy

In [26]:
yPredict = knn.predict(list(testDF['x']))
yPredict

array([0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0.,
       1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1.

In [27]:
yTest = list(testDF['y'])
accuracy_score(yTest, yPredict)

0.9925

In [28]:
print(classification_report(yTest, yPredict))


              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       200
         1.0       1.00      0.98      0.99       200

    accuracy                           0.99       400
   macro avg       0.99      0.99      0.99       400
weighted avg       0.99      0.99      0.99       400

