In [12]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
# from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
import pywt
import biosppy

def bandpassFilter(data):
    nyq = 0.5 * 500
    low = 3 / nyq
    high = 12 / nyq
    b, a = signal.butter(3, [low, high], btype='band')
    filtered_data = signal.filtfilt(b, a, data)
    return filtered_data


def pan_tompkins(ecg_data, fs=500, rel_amplitude=0.5, min_rr=120, max_rr=200):
    diff_data = np.diff(ecg_data)
    squared_data = diff_data**2
    window_size = int(0.02 * fs)
    average_data = np.convolve(squared_data, np.ones(window_size) / window_size, mode='same')
    high_threshold = rel_amplitude * np.max(average_data)
    low_threshold = 0.5 * high_threshold
    peaks = []
    for i in range(len(ecg_data)):
        if average_data[i] > high_threshold:
            peaks.append(i)
        if len(peaks) > 1 and i - peaks[-2] < min_rr:
            peaks.pop()
        elif len(peaks) > 2 and i - peaks[-1] > max_rr:
            peaks.pop()

    return np.array(peaks)


def preProcessing(fileName):
    
    with open(fileName, 'rb') as file:
        data = np.fromfile(file, dtype='int16')

    #correcting the baseline of the data

    corrected_data = signal.detrend(data)

    # creating a butter-worth filter or 2nd order
    # these are the best frequencie for this data

    cutoff_freq = 10
    sample_freq = 500 #Hz

    b, a = signal.butter(2, cutoff_freq / (sample_freq / 2))

    #filtering the signal

    filtered_data = signal.filtfilt(b, a, corrected_data)

    # applying wavelet transform on the baseline corrected data and then ignoring the high frequency and low frequency components
    arr = pywt.wavedec(corrected_data, 'sym4', level=4)

    # arr[0] = np.zeros_like(arr[0])
    arr[1] = np.zeros_like(arr[1])
    # arr[2] = np.zeros_like(arr[2])
    # arr[3] = np.zeros_like(arr[3])
    arr[4] = np.zeros_like(arr[4])

    wavdec_filtered_signal = pywt.waverec(arr, 'sym4')

    final_signal = bandpassFilter(wavdec_filtered_signal)

    results = biosppy.signals.ecg.christov_segmenter(signal=final_signal, sampling_rate=500)

    rTemp = results['rpeaks']
    
    r_peaks = [rTemp[0]]
    
    i = 1
    while i<len(rTemp):
        if rTemp[i]-300>rTemp[i-1]:
            r_peaks.append(rTemp[i])
        i += 1
        
    r_interval = [0]*(len(r_peaks)-1)

    for i in range(1, len(r_peaks)):
        r_interval[i-1] = r_peaks[i] - r_peaks[i-1]
        
    
    return r_interval



def getDataFrame(filename):

    r_interval = preProcessing(filename)
    
    df = pd.DataFrame(columns=[f'x{i}' for i in range(15)])
    df.loc[len(df)] = r_interval[:15]
    return df


def trainModel():
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv('final_dataframe_cpy.csv')

    # Shuffle the DataFrame
    df = df.sample(frac=1).reset_index(drop=True)

    # Print the shuffled DataFrame
#     display(df.head())
    
    # Split the data into features (X) and labels (y)
    X = df.drop('y', axis=1)
    y = df['y']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    display(X_train)

    # Train different classifier like logistic regression, random forest, decision tree classifier so on using sk learn ml model APIs

#     model = LogisticRegression()
#     model = DecisionTreeClassifier()
#     model = GradientBoostingClassifier()
#     model = KNeighborsClassifier()
#     model = LGBMClassifier()
#     model = XGBClassifier()
    model = SGDClassifier()
    model.fit(X_train, y_train)

    # Evaluate the model
    # for classification we use accuracy and F1 score
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1SCore = f1_score(y_test, y_pred)

    print('Accuracy:', accuracy)
    print('F1Score:', f1SCore)

    return model



if __name__ == "__main__":
    
    
    model = trainModel()
    
    


      # plotting bradiacardia

      # plt.figure(figsize=(13, 3))
      # plt.plot(arr1[0])
      # plt.title("Bradycardia")
      # plt.xlabel('samples')
      # plt.ylabel('amplitude')
      # plt.scatter(arr1[1], arr1[0][arr1[1]], c='red')
      # plt.grid(True)
      # plt.ylim([-50, 150])
      # plt.tight_layout()

      # plt.show()



array([[-0.25243563, -0.1862904 , -0.39182564, ..., -0.0388762 ,
        -0.10048325, -0.31177791],
       [-0.14630885, -0.17047004, -0.38461413, ..., -0.6614262 ,
        -0.25237352, -0.07386656],
       [-1.22747546, -1.23834392, -0.89663169, ..., -1.39383795,
        -1.50323456, -1.39152938],
       ...,
       [-0.133043  ,  0.27249986,  0.35817191, ..., -0.99101149,
        -1.36921373, -1.58368854],
       [ 0.2251349 , -0.32867358,  0.47355615, ...,  0.76677673,
        -0.50254572,  0.40195612],
       [-0.48458797, -0.46314659, -0.21153777, ...,  0.66607011,
         0.48027366,  0.5483631 ]])

Accuracy: 0.6666666666666666
F1Score: 0.7272727272727274


In [14]:
# print(model.predict(getDataFrame("tachycardia.dat").values))

print(model.predict(getDataFrame("normal.dat").values))

print(model.predict(getDataFrame("tachycardia.dat").values))

# for i in range(1, 41):
#     filename = "Abnormal/"+str(i)+".dat"
#     print(model.predict(getDataFrame(filename).values))

[0]
[0]
