## Exploring Physionet 2022 Dataset

In [10]:
import pandas as pd
import numpy as np
import os

In [56]:
# Root directory of the PhysioNet 2022 dataset.
# Set the PHYSIO22_PATH environment variable to point at another copy of the data;
# when it is unset, the original local default below is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, which the plain string
# concatenations below (and in later cells) rely on.
physio22_path = os.path.join(
    os.environ.get('PHYSIO22_PATH', '/Users/donu/Desktop/S25/ELEC 594/datasets/physio22/'),
    '',
)
# Training annotation table and the per-split data directories.
physio_train_anno_path = physio22_path + 'training_data.csv'
train_path = physio22_path + 'training_data/'
val_path = physio22_path + 'validation_data/'
test_path = physio22_path + 'test_data/'

### 1. Exploring Training Data

In [6]:
# Load the training annotation table. Its row count is kept as `num_recordings`;
# the summary printed further down reports this number as the patient count.
physio22_data = pd.read_csv(physio_train_anno_path)
num_recordings = len(physio22_data)

In [17]:
# how many sounds are in the training dset
# (reuses `train_path` from the path configuration cell instead of rebuilding
# the directory string by hand, so the two can never drift apart)
training_sounds = [name for name in os.listdir(train_path) if name.endswith('.wav')]
print(f"{len(training_sounds)} wavs from {num_recordings} patients")

3163 wavs from 942 patients


In [91]:
# Preview the first rows of the training annotation table.
physio22_data.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,


In [None]:
# 'Age' holds categorical age-group labels rather than numeric ages;
# list the distinct labels that occur (missing entries show up as nan).
# neonate means < 28 days old
physio22_data["Age"].unique()

array(['Child', 'Adolescent', 'Infant', nan, 'Neonate'], dtype=object)

In [29]:
# Count the rows whose 'Age' is one of the four known age-group labels.
physio22_data["Age"].isin(['Child', 'Adolescent', 'Infant', 'Neonate']).sum()

868

In [None]:
# Boolean row masks on the 'Murmur' label. Rows carrying any other label
# (e.g. 'Unknown') end up in neither subset.
has_murmur = physio22_data['Murmur'].eq('Present')
no_murmur = physio22_data["Murmur"].eq('Absent')

physio22_normal = physio22_data.loc[no_murmur]
physio22_murmurs = physio22_data.loc[has_murmur]
# !impt: it's possible to use severity classification to further evaluate denoising performance

In [50]:
# Number of rows whose 'Outcome' is "Abnormal".
physio22_data["Outcome"].eq("Abnormal").sum()
# interesting: the number of abnormal outcomes far exceeds the number of murmurs
# could this be useful for detecting benign murmurs later?

456

In [39]:
# Preview the subset of rows where Murmur == 'Present'.
physio22_murmurs.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,
8,29045,AV+PV+TV+MV,Child,Female,88.0,12.5,False,Present,AV+MV+PV+TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
11,33151,AV+PV+TV+MV,Child,Female,141.0,30.9,False,Present,MV+TV,TV,...,Low,Harsh,,,,,,Abnormal,CC2015,


In [38]:
# Preview the subset of rows where Murmur == 'Absent'.
physio22_normal.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
5,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
6,23625,AV+PV+TV+MV,Child,Female,92.0,14.0,False,Absent,,,...,,,,,,,,Abnormal,CC2015,50379.0
7,24160,AV+PV+TV+MV,Child,Female,98.0,17.66,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
10,31737,AV+PV+TV+MV,Child,Female,90.0,14.4,False,Absent,,,...,,,,,,,,Abnormal,CC2015,


In [None]:
# working with the training data is pretty familiar, then
# there's still some work to be done for the test/val data though
# the preprocessing itself is familiar


In [55]:
# List every column available in the training annotation table.
physio22_data.columns

Index(['Patient ID', 'Recording locations:', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Outcome', 'Campaign', 'Additional ID'],
      dtype='object')

### 2. Creating Validation and Test Data

In [None]:
def extract_fields(directory,text_file_name):
    """Parse one annotation text file into a flat dict of its fields.

    The first line is split on whitespace. When it has exactly three tokens they
    are stored, in order, under ``ID``, ``Recording_Locations_Count`` and
    ``Sample_Rate``; otherwise those keys are left out.

    Every later line that starts with ``#`` and contains a colon is stored as a
    ``key: value`` pair, split on the first colon with both sides stripped.
    A value equal to ``nan`` (in any letter case) is stored as ``None``.
    All other lines are ignored.

    Args:
        directory: Folder that contains the file.
        text_file_name: Name of the file inside ``directory``.

    Returns:
        dict mapping field names to their string values (or ``None``).
    """
    header_keys = ("ID", "Recording_Locations_Count", "Sample_Rate")
    fields = {}
    file_path = os.path.join(directory, text_file_name)
    with open(file_path, "r") as handle:
        # The header line gets special treatment: three positional tokens.
        header_tokens = handle.readline().strip().split()
        if len(header_tokens) == len(header_keys):
            fields.update(zip(header_keys, header_tokens))

        # Remaining lines: only "#key: value" entries are kept.
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped.startswith("#"):
                continue
            name, colon, text = stripped[1:].partition(":")
            if not colon:
                continue
            text = text.strip()
            fields[name.strip()] = None if text.lower() == "nan" else text
    return fields

In [90]:
%%script false --no-raise-error 
# NOTE: the `%%script false` magic above keeps this cell from executing, so the
# annotation CSVs are not rewritten on every run. Remove that line to rebuild them.

# Collect the per-patient text files of each split here, so the cell no longer
# depends on `val_texts` / `test_texts` left over from an earlier kernel session.
# Assumes the annotation files are the `.txt` files in each split directory - TODO confirm.
val_texts = sorted(name for name in os.listdir(val_path) if name.endswith('.txt'))
test_texts = sorted(name for name in os.listdir(test_path) if name.endswith('.txt'))

# One dict per file -> one DataFrame row per file.
val_rows = [extract_fields(val_path,val_text) for val_text in val_texts]
test_rows = [extract_fields(test_path,test_text) for test_text in test_texts]

val_df = pd.DataFrame(val_rows)
test_df = pd.DataFrame(test_rows)

# Persist the tables next to the dataset, without the positional index column.
val_df.to_csv(physio22_path+'val_annotations.csv',index=False)
test_df.to_csv(physio22_path+'test_annotations.csv',index=False)

In [None]:
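# !impt: murmurs are located at certain valves (see the 'Murmur locations' column)
# ... so, only recordings from those valves should be marked as murmurs for CWL
# pseudocode:
#   if the patient has a murmur and the recording's valve (av, mv, pv, tv) is one of the murmur locations:
#       mark it as murmur for cwl
#   else:
#       do nothing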