#Install Dependencies

In [None]:
!pip install h5py pandas numpy scikit-learn tensorflow keras imbalanced-learn

#Import Libraries

In [None]:
import h5py
import pandas as pd
import numpy as np
import os
import joblib
from scipy.signal import butter, filtfilt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Process original data

In [None]:
# Index of the HDF5 part processed by this run (valid indices: 0-17).
file_part_number = 0
excel_file_path = '/content/drive/MyDrive/DATASETS/exams.xlsx'
excel_data = pd.read_excel(excel_file_path)

# One path per HDF5 part: exams_part0.hdf5 ... exams_part17.hdf5
hdf5_file_paths = [
    f'/content/drive/MyDrive/DATASETS/exams_part{part_index}.hdf5'
    for part_index in range(18)
]

hdf5_file_path = hdf5_file_paths[file_part_number]

# File name without its directory (e.g., 'exams_part6.hdf5'); it is compared
# against the 'trace_file' column of the Excel sheet below.
part_name = hdf5_file_path.split('/')[-1]

# Load the exam ids and their tracings from the HDF5 file
with h5py.File(hdf5_file_path, 'r') as hdf:
    exam_id = hdf['exam_id'][:]
    tracings = hdf['tracings'][:]

# Keep only the Excel rows that belong to the current part
filtered_data = excel_data[excel_data['trace_file'] == part_name]

# One row per exam; list(tracings) stores one array per row in an object column
df_hdf5 = pd.DataFrame({
    'exam_id': exam_id,
    'tracings': list(tracings)
})

# Inner join (pandas default): only exam ids present in both sources survive
merged_data = pd.merge(df_hdf5, filtered_data, on='exam_id')

# Only one part is handled per run, so the merged frame is the combined frame.
# (Concatenating onto an empty DataFrame added nothing and relies on pandas'
# deprecated handling of empty entries in concat.)
combined_data = merged_data.reset_index(drop=True)

# Check the shape of the combined data
print(f"Shape of the combined dataset: {combined_data.shape}")

# Optionally, check the shape of the tracings for the first few entries.
# Bounded by the row count so a part with fewer than 5 merged rows does not
# raise IndexError.
for i in range(min(5, len(combined_data))):
    print(f"Shape of the tracings array for exam_id {combined_data['exam_id'].iloc[i]}: {combined_data['tracings'].iloc[i].shape}")

##Check that an ECG with all arrhythmia columns False is labeled normal, and remove rows that violate this condition.

In [None]:
# Arrhythmia label columns inspected by this cell
arrhythmia_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

# Rule being enforced: a row with no arrhythmia flagged must have
# 'normal_ecg' set. Rows breaking the rule are dropped.

# Rows whose arrhythmia flags sum to zero (no arrhythmia flagged)
arrhythmia_flag_totals = combined_data[arrhythmia_columns].sum(axis=1)
arrhythmia_false_condition = arrhythmia_flag_totals.eq(0)

# Rows where 'normal_ecg' equals False
normal_ecg_false_condition = combined_data['normal_ecg'].eq(False)

# A violation is a row with neither an arrhythmia nor the normal label
violation_condition = arrhythmia_false_condition & normal_ecg_false_condition

# Keep every row that is not a violation
cleaned_data = combined_data.loc[~violation_condition]

# Report how many rows were dropped and the resulting shape
rows_removed = combined_data.shape[0] - cleaned_data.shape[0]
print(f"Number of rows removed: {rows_removed}")
print(f"Shape of the cleaned dataset: {cleaned_data.shape}")

##Check that ecg does not have both an arrhythmia and the normal label, and remove ones that violate the condition

In [None]:
# Rule being enforced: a row with at least one arrhythmia flagged must not
# also carry the 'normal_ecg' label. Rows breaking the rule are dropped.

# Rows whose arrhythmia flags sum to more than zero
arrhythmia_flag_totals = cleaned_data[arrhythmia_columns].sum(axis=1)
arrhythmia_true_condition = arrhythmia_flag_totals.gt(0)

# Rows where 'normal_ecg' equals True
normal_ecg_true_condition = cleaned_data['normal_ecg'].eq(True)

# A violation is a row labeled both arrhythmic and normal
violation_condition = arrhythmia_true_condition & normal_ecg_true_condition

# Keep every row that is not a violation
final_cleaned_data = cleaned_data.loc[~violation_condition]

# Report how many rows were dropped and the resulting shape
rows_removed = cleaned_data.shape[0] - final_cleaned_data.shape[0]
print(f"Number of rows removed: {rows_removed}")
print(f"Shape of the final cleaned dataset: {final_cleaned_data.shape}")

##Visualize the balance of data

In [None]:
# Label columns to tally: the six arrhythmias plus the normal-ECG flag
categories = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']

# Column-wise sum = number of rows flagged in each category
category_counts = final_cleaned_data.loc[:, categories].sum(axis=0)

# Show the per-category totals
print("ECG counts for each category:")
print(category_counts)



##Drop unnecessary columns

In [None]:
# Columns that are removed here; the label columns and 'tracings' are kept
columns_to_drop = ['age', 'is_male', 'nn_predicted_age', 'patient_id', 'death', 'timey', 'trace_file', 'exam_id']
final_cleaned_data = final_cleaned_data.drop(columns_to_drop, axis=1)

# Preview the remaining columns
print(final_cleaned_data.head())

##Remove rows where multiple arrhythmias are indicated as true

In [None]:
# Arrhythmia label columns ('normal_ecg' is deliberately not included)
arrhythmia_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

# Per-row number of flagged arrhythmias; more than one means a multi-label row
arrhythmias_per_row = final_cleaned_data[arrhythmia_columns].sum(axis=1)
multiple_arrhythmias_mask = arrhythmias_per_row.gt(1)
num_multiple_arrhythmias = multiple_arrhythmias_mask.sum()

print(f"Number of rows with multiple arrhythmias: {num_multiple_arrhythmias}")

# Keep only rows with at most one arrhythmia flagged
final_cleaned_data = final_cleaned_data.loc[~multiple_arrhythmias_mask]

# Number of remaining rows whose 'normal_ecg' equals False
false_normal_ecg = final_cleaned_data['normal_ecg'].eq(False).sum()

print(f"Number of rows where 'normal_ecg' is False: {false_normal_ecg}")

##Check that columns are balanced after all the manipulation

In [None]:
# Arrhythmia label columns ('normal_ecg' is deliberately not included)
arrhythmia_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

# Grand total of flagged cells over all arrhythmia columns
arrhythmia_flags = final_cleaned_data[arrhythmia_columns].to_numpy()
total_arrhythmia_true_values = arrhythmia_flags.sum()

# Display the total
print(f"Total 'True' values across all arrhythmia columns: {total_arrhythmia_true_values}")

# Shuffle all rows reproducibly (fixed seed), then renumber the index from 0
shuffled_rows = final_cleaned_data.sample(frac=1, random_state=42)
final_cleaned_data = shuffled_rows.reset_index(drop=True)
print(final_cleaned_data.head())

##Save the files

In [None]:
destination_folder = '/content/drive/MyDrive/DATASETS/'  # Adjust to your Google Drive folder

# Output locations for this part, keyed by file_part_number
tracings_output_path = os.path.join(destination_folder, f'tracings_part_{file_part_number}.npy')
table_output_path = os.path.join(destination_folder, f'data_part_{file_part_number}.h5')

# The 'tracings' column holds one array per row, so it is written on its own
# with np.save instead of going into the HDF5 table.
np.save(tracings_output_path, final_cleaned_data['tracings'].values)

# Everything except 'tracings' goes into the HDF5 table under key 'df'
balanced_data = final_cleaned_data.drop(columns=['tracings'])
balanced_data.to_hdf(table_output_path, key='df', mode='w')

#Combine the individual parts into training and testing sets. The training set will be further divided into training and validation data later.

In [None]:
# Base path to your Google Drive dataset folder
base_path = '/content/drive/MyDrive/DATASETS/'

# Frames are collected here and concatenated once after the loop; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
part_frames = []

# Loop through parts 14 to 17 inclusive (range's upper bound is exclusive).
# NOTE: the loop variable reuses the notebook-wide name file_part_number, so
# after this cell that name holds the last part number loaded here.
for file_part_number in range(14, 18):
    # Define file names for the current part
    files = [f"tracings_part_{file_part_number}.npy", f'data_part_{file_part_number}.h5']
    tracings_file = base_path + files[0]
    data_file = base_path + files[1]

    # Load the current part's data
    print(f"Loading data for part {file_part_number}...")
    data = pd.read_hdf(data_file)
    # allow_pickle=True lets np.load read pickled object arrays; pickle can
    # execute code on load, so only point this at files you created yourself.
    tracings = np.load(tracings_file, allow_pickle=True)

    # Add the tracings back into the DataFrame (one array per row)
    data['tracings'] = list(tracings)

    # Queue the current part for the single concatenation below
    part_frames.append(data)

    print(f"Data for part {file_part_number} loaded and appended.")

# Stack all loaded parts into one DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(part_frames, ignore_index=True)

print("All parts loaded and combined successfully.")
print(combined_data.shape)

##Visualize the balance of the combined data

In [None]:
# Label columns to tally: the six arrhythmias plus the normal-ECG flag
categories = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']

# Column-wise sum = number of rows flagged in each category
category_counts = combined_data.loc[:, categories].sum(axis=0)

# Show the per-category totals
print("ECG counts for each category:")
print(category_counts)

# Shuffle all rows reproducibly (fixed seed), then renumber the index from 0
shuffled_rows = combined_data.sample(frac=1, random_state=42)
combined_data = shuffled_rows.reset_index(drop=True)
print(combined_data.head())

##Save the combined parts

In [None]:
destination_folder = '/content/drive/MyDrive/DATASETS/'  # Adjust to your Google Drive folder

# Output locations for the combined parts
tracings_output_path = os.path.join(destination_folder, 'tracings_part_14to17.npy')
table_output_path = os.path.join(destination_folder, 'data_part_14to17.h5')

# The 'tracings' column holds one array per row, so it is written on its own
# with np.save instead of going into the HDF5 table.
np.save(tracings_output_path, combined_data['tracings'].values)

# Everything except 'tracings' goes into the HDF5 table under key 'df'
balanced_data = combined_data.drop(columns=['tracings'])
balanced_data.to_hdf(table_output_path, key='df', mode='w')

#Balance and filter combined parts

##Rebalance columns so as not to overfit to normal_ecg

In [None]:
# [tracings array file, label table file] for the combined parts to load
files = ["tracings_part_0to13.npy", "data_part_0to13.h5"]
tracings_name, table_name = files

# Resolve both files inside the dataset folder
base_path = '/content/drive/MyDrive/DATASETS/'
tracings_file = os.path.join(base_path, tracings_name)
data_file = os.path.join(base_path, table_name)

# Read the label table, then re-attach one tracing array per row
data = pd.read_hdf(data_file)
tracings = np.load(tracings_file, allow_pickle=True)
data['tracings'] = list(tracings)

# Label columns to tally: the six arrhythmias plus the normal-ECG flag
categories = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']

# Column-wise sum = number of rows flagged in each category
category_counts = data.loc[:, categories].sum(axis=0)

# Show the per-category totals
print("ECG counts for each category:")
print(category_counts)

##Drop a random selection of normal ecgs and visualize new balance

In [None]:
# List of categories (arrhythmia columns + normal ECG)
categories = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']

# Separate normal_ecg entries from the rest
normal_ecg_entries = data[data['normal_ecg'] == True]
other_entries = data[data['normal_ecg'] == False]

# Randomly keep at most 4500 'normal_ecg' entries. DataFrame.sample raises
# ValueError when n exceeds the number of rows (replace=False), so the target
# is capped at what is available.
max_normal_ecgs = 4500
n_normal_to_keep = min(max_normal_ecgs, len(normal_ecg_entries))
normal_ecg_sampled = normal_ecg_entries.sample(n=n_normal_to_keep, random_state=42)

# Combine sampled normal_ecg entries with non-normal_ecg entries.
# The result is ordered (sampled normals first, then the rest), not shuffled.
balanced_data = pd.concat([normal_ecg_sampled, other_entries], ignore_index=True)

# Extract tracings and labels from the same frame so row i of X matches row i of y
X = np.array(list(balanced_data['tracings']))
y = balanced_data[categories].values

# Count the number of 'True' values in each category after rebalancing
category_counts = balanced_data[categories].sum()

# Display the count for each category
print("ECG counts for each category:")
print(category_counts)

##Apply a highpass filter to the data to get rid of baseline drift

In [None]:
# High-pass filter functions
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal.
        order: Filter order (default 5).

    Returns:
        The ``(b, a)`` numerator and denominator coefficient arrays
        returned by ``scipy.signal.butter``.
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency (fs / 2)
    nyquist = fs / 2.0
    return butter(order, cutoff / nyquist, btype='high', analog=False)

def highpass_filter(data, cutoff, fs, order=5):
    """Apply a zero-phase Butterworth high-pass filter to ``data``.

    Args:
        data: Array to filter; filtering runs along axis 0.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal.
        order: Order of the Butterworth design (default 5).

    Returns:
        The filtered array as returned by ``scipy.signal.filtfilt``.
    """
    numerator, denominator = butter_highpass(cutoff, fs, order=order)
    # filtfilt runs the filter forward and backward along axis 0.
    # Assumes rows are time samples and columns are leads - TODO confirm.
    return filtfilt(numerator, denominator, data, axis=0)

# Filter settings used for every tracing
cutoff_frequency = 1  # in Hz
sampling_frequency = 400  # in Hz
filter_order = 5

# Keyword arguments forwarded to highpass_filter for each tracing
filter_settings = {
    'cutoff': cutoff_frequency,
    'fs': sampling_frequency,
    'order': filter_order,
}

print("Applying high-pass filter to the data...")
# Filter each tracing independently, then stack the results into one array
X_filtered = np.array([highpass_filter(tracing, **filter_settings) for tracing in X])

##Save the balanced and filtered data

In [None]:
# Reassign filtered tracings to the balanced_data DataFrame
balanced_data['tracings'] = list(X_filtered)

# Save the filtered data
destination_folder = '/content/drive/MyDrive/RedoneDataOct/'  # Adjust as necessary
# np.save / to_hdf do not create missing directories
os.makedirs(destination_folder, exist_ok=True)

# Name the outputs after the inputs in `files` (set by the cell that loaded
# the data) so the saved files are labelled with the parts they were built
# from. The previous hard-coded '14to17' names did not match the '0to13'
# files loaded for balancing.
# NOTE(review): assumes `files` still holds the names loaded for balancing -
# re-run that cell first if other cells have reassigned it.
tracings_stem = os.path.splitext(files[0])[0]
table_stem = os.path.splitext(files[1])[0]

# Step 1: Save the multi-dimensional ECG tracings separately as NumPy arrays
np.save(os.path.join(destination_folder, f'{tracings_stem}_filtered.npy'), X_filtered)

# Step 2: Drop the 'tracings' column
balanced_data = balanced_data.drop(columns=['tracings'])

# Step 3: Save the rest of the DataFrame to HDF5
balanced_data.to_hdf(os.path.join(destination_folder, f'{table_stem}_filtered.h5'), key='df', mode='w')

#Split data into training and validation sets

In [None]:
# [filtered tracings file, filtered label table file]
files = ["tracings_part_0to13_filtered.npy", "data_part_0to13_filtered.h5"]
tracings_name, table_name = files

base_path = '/content/drive/MyDrive/RedoneDataOct/'

tracings_file = os.path.join(base_path, tracings_name)
data_file = os.path.join(base_path, table_name)

# Read the label table, then re-attach one tracing array per row
data = pd.read_hdf(data_file)
tracings = np.load(tracings_file, allow_pickle=True)
data['tracings'] = list(tracings)

# Label columns, in the order they appear in y
label_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']

# X: the tracings stacked into one array; y: the label columns as an array
X = np.array(list(data['tracings']))
y = data[label_columns].values

##Split, scale, and save the parts individually

In [None]:
# Step 1: Split the data
# Use train_test_split with stratification on the label rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Initialize the scaler
scaler = StandardScaler()

# Step 3: Fit the scaler ONCE on all training samples.
# Calling fit_transform per tracing refits the scaler on every recording, so
# the saved scaler and the test-set transform would only reflect the last
# training tracing. Flattening the leading axes gives StandardScaler a 2-D
# (rows, columns) view and one mean/std per column.
# NOTE(review): assumes X is (n_tracings, n_samples, n_leads) with equally
# shaped tracings - confirm against the data.
n_features = X_train.shape[-1]
scaler.fit(X_train.reshape(-1, n_features))
X_train_scaled = scaler.transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)

# Step 4: Apply the same fitted transformation to the test data
X_test_scaled = scaler.transform(X_test.reshape(-1, n_features)).reshape(X_test.shape)

# Define the destination folder and make sure it exists before writing
destination_folder = '/content/drive/MyDrive/RedoneDataOct/Processed/'
os.makedirs(destination_folder, exist_ok=True)

# Save the scaler
joblib.dump(scaler, os.path.join(destination_folder, 'Generalscaler.pkl'))

# Save X_train_scaled and X_test_scaled as NumPy arrays
np.save(destination_folder + 'X_train_scaled.npy', X_train_scaled)
np.save(destination_folder + 'X_test_scaled.npy', X_test_scaled)

# Save y_train and y_test
np.save(destination_folder + 'y_train.npy', y_train)
np.save(destination_folder + 'y_test.npy', y_test)