# Approach A Data Preprocessing step 1:
This notebook removes exams where every arrhythmia label is False and `normal_ecg` is also False.
It produces smaller `.hdf5` files and the corresponding CSV files with patient information.

In [3]:
import pandas as pd

# Read the complete exam metadata table.
csv_path = 'exams.csv'
df = pd.read_csv(csv_path)

# Label columns: the six arrhythmia flags, plus the identifier/bookkeeping columns kept in the export.
arrhythmia_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']
columns_to_keep = ['exam_id', *arrhythmia_columns, 'normal_ecg', 'trace_file']

# Keep only the rows whose trace_file is 'exams_partX.hdf5', restricted to the columns above.
# NOTE(review): 'X' looks like a placeholder for the part number - confirm before running.
df_filtered = df.loc[df['trace_file'] == 'exams_partX.hdf5', columns_to_keep]

# Row-wise masks: every arrhythmia flag is False / normal_ecg is False.
arrhythmias_all_false = df_filtered[arrhythmia_columns].eq(False).all(axis=1)
normal_ecg_false = df_filtered['normal_ecg'].eq(False)

# A row is dropped only when both masks hold, i.e. it is neither flagged with any
# arrhythmia nor marked as a normal ECG.
rows_to_drop = arrhythmias_all_false & normal_ecg_false
df_filtered = df_filtered.loc[~rows_to_drop]

# Write the remaining rows to a new CSV (without the DataFrame index).
df_filtered.to_csv('Part_X_cleaned_data.csv', index=False)

print("Filtered CSV file 'Part_X_cleaned_data.csv' has been created successfully.")


Filtered CSV file 'Part_3_cleaned_data.csv' has been created successfully.


The code below is an intermediate step that counts how many arrhythmia and normal-ECG labels remain after filtering.

In [4]:
import pandas as pd

# Re-read the cleaned CSV written by the filtering step.
cleaned_csv_path = 'Part_X_cleaned_data.csv'
df_cleaned = pd.read_csv(cleaned_csv_path)

# Columns holding the per-arrhythmia flags.
arrhythmia_columns = ['1dAVb', 'RBBB', 'LBBB', 'SB', 'ST', 'AF']

# Summing a True/False column counts its True entries.
num_normal_ecgs = df_cleaned['normal_ecg'].sum()

# Column-wise totals: one count per arrhythmia.
arrhythmia_counts = df_cleaned[arrhythmia_columns].sum()

# Report the totals.
print(f"Number of normal ECGs: {num_normal_ecgs}")
print("\nNumber of each arrhythmia:")
print(arrhythmia_counts)


Number of normal ECGs: 7670

Number of each arrhythmia:
1dAVb    336
RBBB     563
LBBB     347
SB       319
ST       453
AF       415
dtype: int64


In [5]:
import pandas as pd
import h5py
import numpy as np

# Load the cleaned CSV to get the exam_ids that must be kept.
cleaned_csv_path = 'Part_X_cleaned_data.csv'
df_cleaned = pd.read_csv(cleaned_csv_path)
# Kept under its own name so that `filtered_exam_ids` below only ever refers to
# the array that is written to the output file.
wanted_exam_ids = df_cleaned['exam_id'].to_numpy()

# Source HDF5 file and the new, smaller HDF5 file that receives the kept rows.
original_hdf5_path = 'exams_partX.hdf5'
filtered_hdf5_path = 'Part_X_filtered_data.hdf5'

with h5py.File(original_hdf5_path, 'r') as original_file, h5py.File(filtered_hdf5_path, 'w') as filtered_file:
    # `[:]` reads each dataset fully into memory as a NumPy array.
    original_exam_ids = original_file['exam_id'][:]
    original_tracings = original_file['tracings'][:]

    # Vectorised membership test: True where the stored exam_id appears in the cleaned CSV.
    keep_mask = np.isin(original_exam_ids, wanted_exam_ids)
    indices_to_keep = np.flatnonzero(keep_mask)

    # Select the matching rows; order follows the original HDF5 file, not the CSV.
    filtered_exam_ids = original_exam_ids[indices_to_keep]
    filtered_tracings = original_tracings[indices_to_keep, :, :]

    # Write both datasets to the new file under the same dataset names.
    filtered_file.create_dataset('exam_id', data=filtered_exam_ids)
    filtered_file.create_dataset('tracings', data=filtered_tracings)

# Surface any exam_ids listed in the CSV that were not present in the HDF5 file;
# otherwise the CSV and the HDF5 output would silently disagree.
missing_exam_ids = np.setdiff1d(wanted_exam_ids, filtered_exam_ids)
if missing_exam_ids.size:
    print(f"Warning: {missing_exam_ids.size} exam_id(s) from '{cleaned_csv_path}' were not found in '{original_hdf5_path}'.")

print(f"Filtered HDF5 file '{filtered_hdf5_path}' has been created successfully.")


Filtered HDF5 file 'Part_3_filtered_data.hdf5' has been created successfully.
