# CinC-2021 
## Train - Val - Test Split


In [1]:
import os
import random

import numpy as np
import wfdb
import pandas as pd

In [2]:
# Move the working directory one level up; the relative 'cinc-2021_data/...'
# paths used in the following cells are resolved from there.
# Note: every re-run of this cell moves up one more level.
os.chdir(os.pardir)

In [3]:
# The RECORDS index lists one path per line; each entry is later appended to
# 'cinc-2021_data/' and scanned as a directory.
records_file_path = 'cinc-2021_data/RECORDS'

with open(records_file_path, mode='r') as records_file:
    records_text = records_file.read()
record_paths = records_text.splitlines()

In [4]:
def get_files(directory):
    """Yield the extension-less path of every ``.hea`` file in *directory*.

    Args:
        directory: Folder to scan (not recursive). A trailing '/' is
            optional; one is inserted when missing.

    Yields:
        ``directory`` + '/' + file name without the ``.hea`` suffix, in
        sorted file-name order.
    """
    # The rest of the notebook splits record paths on '/', so keep that
    # separator instead of relying on the caller to supply it.
    prefix = directory if directory.endswith('/') else directory + '/'
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # seeded shuffle that consumes these paths reproducible across machines.
    for name in sorted(os.listdir(directory)):
        if name.endswith('.hea'):
            yield prefix + name[:-4]

In [5]:
# Gather the record paths of every directory listed in RECORDS, skipping
# directories whose path contains one of the excluded source names.
not_used_sources = {'ptb', 'st_petersburg_incart'}
all_files = []
for path in record_paths:
    path_components = set(path.split('/'))
    if not_used_sources.isdisjoint(path_components):
        all_files.extend(get_files('cinc-2021_data/' + path))

In [6]:
# Shuffle a copy rather than `all_files` itself: shuffling in place would
# start from an already-shuffled list when this cell is re-run, producing a
# different split each time despite the fixed seed.
random.seed(42)
shuffled_files = list(all_files)
random.shuffle(shuffled_files)

# Split into train (first 60%), val (next 20%), test (remaining 20%)
train_split = int(0.6 * len(shuffled_files))
val_split = int(0.8 * len(shuffled_files))

train_files = shuffled_files[:train_split]
val_files = shuffled_files[train_split:val_split]
test_files = shuffled_files[val_split:]

## Extract data for data visualization
TODO: check the Dx mapping

In [7]:
# Dx mapping: SNOMED CT code (as string) -> diagnosis abbreviation
# Mapping table source:
# https://www.kaggle.com/datasets/bjoernjostein/physionet-snomed-mappings
dx_mapping_df = pd.read_csv('cinc-2021_data/dx_mapping.csv')
abbreviation_by_code = pd.Series(
    dx_mapping_df['Abbreviation'].values,
    index=dx_mapping_df['SNOMED CT Code'].astype(str),
)
dx_mapping_dict = abbreviation_by_code.to_dict()

In [8]:
def extract_record_data_to_csv(record_path, filename):
    """Append one record's metadata as a row of a preprocessed CSV file.

    Reads the record with ``wfdb.rdrecord`` and writes
    ``record_name,source,age,sex,"dx",sig_len,fs``. ``age``, ``sex`` and
    ``dx`` come from the ``Age``/``Sex``/``Dx`` header comments and stay
    ``None`` when the comment is absent. ``dx`` is the list of abbreviations
    looked up in ``dx_mapping_dict``; codes without a mapping are dropped.

    Args:
        record_path: Record path without file extension. Its third
            '/'-separated component is stored as the source.
        filename: Name of the file under ``cinc-2021_data/preprocessed/``
            that the row is appended to.

    Returns:
        None. Any exception is caught and reported on stdout, so a failing
        record is skipped instead of aborting the caller's loop.
    """
    try:
        record = wfdb.rdrecord(record_path)

        # Header comments are parsed as "<Key>: <value>".
        age, sex, dx = None, None, None
        for comment in record.comments:
            if comment.startswith('Age'):
                age = comment.split(': ')[1]
            elif comment.startswith('Sex'):
                sex = comment.split(': ')[1]
                # `sex` is always a string at this point, so a pd.isna()
                # test can never match. Presumably a missing value arrives
                # as the text "NaN" -- compare against that instead.
                if sex.strip().lower() == 'nan':
                    sex = 'Unknown'
            elif comment.startswith('Dx'):
                # Strip each code so stray whitespace around the commas
                # does not make the dictionary lookup miss.
                dx_codes = [code.strip()
                            for code in comment.split(': ')[1].split(',')]
                dx = [dx_mapping_dict[code] for code in dx_codes
                      if code in dx_mapping_dict]

        record_name = record.record_name
        source = record_path.split('/')[2]
        fs = record.fs
        sig_len = record.sig_len

        with open(f'cinc-2021_data/preprocessed/{filename}', 'a') as f:
            # dx is quoted because its list repr contains commas.
            f.write(f'{record_name},{source},{age},{sex},"{dx}",{sig_len},{fs}\n')

    except Exception as e:
        print(f"Error processing {record_path}: {e}")

In [9]:
# save data to csv: start the file over with just the header row, then let
# each train and val record append its own row
filename = 'patient_data.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Source,Age,Sex,Diagnoses,Signal Length,Sample Frequency\n')

for record_path in [*train_files, *val_files]:
    extract_record_data_to_csv(record_path, filename)

## Extract ECG signal (X)
TODO: maybe also apply signal preprocessing here.

In [25]:
def extract_signal_to_csv(record_path, filename):
    """Append one record's signal as a single row of a preprocessed CSV file.

    The row is ``record_name,fs,signal``. The signal is serialized from
    ``record.p_signal``: values within a row are joined with ';' and the
    rows are joined with '|'.

    Args:
        record_path: Record path without file extension, as accepted by
            ``wfdb.rdrecord``.
        filename: Name of the file under ``cinc-2021_data/preprocessed/``
            that the row is appended to.

    Returns:
        None. Any exception is caught and reported on stdout.
    """
    try:
        record = wfdb.rdrecord(record_path)

        name = record.record_name
        sampling_rate = record.fs
        serialized_rows = (';'.join(map(str, row)) for row in record.p_signal)
        ecg_signal = '|'.join(serialized_rows)

        with open(f'cinc-2021_data/preprocessed/{filename}', 'a') as out:
            out.write(f"{name},{sampling_rate},{ecg_signal}\n")

    except Exception as e:
        print(f"Error processing {record_path}: {e}")

In [26]:
# save train data to csv: write the header row, then one row per train record
filename = 'X_train.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Sample Frequency,ECG Signal\n')

for record_path in train_files:
    extract_signal_to_csv(record_path, filename)

In [27]:
# save val data to csv: write the header row, then one row per val record
filename = 'X_val.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Sample Frequency,ECG Signal\n')

for record_path in val_files:
    extract_signal_to_csv(record_path, filename)

In [28]:
# save test data to csv: write the header row, then one row per test record
filename = 'X_test.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Sample Frequency,ECG Signal\n')

for record_path in test_files:
    extract_signal_to_csv(record_path, filename)

## Extract diagnoses (y)

In [29]:
def extract_diagnose_to_csv(record_path, filename):
    """Append one record's diagnosis codes as a row of a preprocessed CSV file.

    The row is ``record_name,"diagnoses"`` where ``diagnoses`` is the list of
    codes taken from the ``Dx`` header comment (``None`` when the record has
    no such comment).

    Args:
        record_path: Record path without file extension, as accepted by
            ``wfdb.rdrecord``.
        filename: Name of the file under ``cinc-2021_data/preprocessed/``
            that the row is appended to.

    Returns:
        None. Any exception is caught and reported on stdout, so a failing
        record is skipped instead of aborting the caller's loop.
    """
    try:
        record = wfdb.rdrecord(record_path)

        # The Dx comment is parsed as "Dx: <code>,<code>,..."
        diagnoses = None
        for comment in record.comments:
            if comment.startswith('Dx'):
                diagnoses = [code.strip()
                             for code in comment.split(': ')[1].split(',')]

        record_name = record.record_name

        with open(f'cinc-2021_data/preprocessed/{filename}', 'a') as f:
            # The list repr contains commas, so it must be quoted; otherwise
            # a record with several diagnoses yields more fields than the
            # two-column header declares.
            f.write(f'{record_name},"{diagnoses}"\n')

    except Exception as e:
        print(f"Error processing {record_path}: {e}")

In [30]:
# save train data to csv: write the header row, then one row per train record
filename = 'Y_train.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Diagnoses\n')

for record_path in train_files:
    extract_diagnose_to_csv(record_path, filename)

In [31]:
# save val data to csv: write the header row, then one row per val record
filename = 'Y_val.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Diagnoses\n')

for record_path in val_files:
    extract_diagnose_to_csv(record_path, filename)

In [32]:
# save test data to csv: write the header row, then one row per test record
filename = 'Y_test.csv'
with open(f'cinc-2021_data/preprocessed/{filename}', mode='w') as csv_file:
    csv_file.write('Record,Diagnoses\n')

for record_path in test_files:
    extract_diagnose_to_csv(record_path, filename)

In [68]:
# convert array to string: values within a row are joined with ';',
# rows are joined with '|'
# NOTE(review): `record` is not assigned at top level in the visible cells;
# presumably it is left over from an earlier interactive session -- confirm
# before re-running this cell.
a = '|'.join(';'.join(map(str, row)) for row in record.p_signal)

In [None]:
# convert string to array: split rows on '|' and the values of each row on
# ';' (the elements stay strings), then show the resulting shape
np.array([row_text.split(';') for row_text in a.split('|')]).shape