## Exploring Physionet 2022 Dataset

In [160]:
import pandas as pd
import numpy as np
import os

In [None]:
# Root folder of the PhysioNet 2022 data. Defaults to the original local checkout;
# set the PHYSIO22_PATH environment variable to run against another copy.
# os.path.join(root, '') appends the trailing separator (if missing) that the
# string concatenations below and in later cells rely on.
physio22_path = os.path.join(
    os.environ.get(
        'PHYSIO22_PATH',
        '/Users/donu/Desktop/semesters/S25/ELEC 594/datasets/physio22/',
    ),
    '',
)
physio_train_anno_path = physio22_path + 'training_data.csv'  # training annotation table
train_path = physio22_path + 'training_data/'
val_path = physio22_path + 'validation_data/'
test_path = physio22_path + 'test_data/'

### 1. Exploring Training Data

In [328]:
# Display the resolved path of the training annotation CSV.
# NOTE(review): the stored output for this cell lacks the 'semesters/' component
# that the config cell uses, so it looks stale — re-run to refresh.
physio_train_anno_path

'/Users/donu/Desktop/S25/ELEC 594/datasets/physio22/training_data.csv'

In [330]:
# Load the training annotation table.
train_data = pd.read_csv(physio_train_anno_path)
# Row count of the table; the next cell reports it as the number of patients.
num_recordings = len(train_data.index)

In [163]:
# how many sounds are in the training dset
# Reuse the shared train_path constant instead of rebuilding the folder path by hand.
training_sounds = [name for name in os.listdir(train_path) if name.endswith('.wav')]
print(f"{len(training_sounds)} wavs from {num_recordings} patients")

3163 wavs from 942 patients


In [164]:
# Preview the first rows of the training annotations.
train_data.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,


In [165]:
# List the distinct age-group labels; missing ages show up as nan.
# neonate means < 28 days old
train_data["Age"].unique()

array(['Child', 'Adolescent', 'Infant', nan, 'Neonate'], dtype=object)

In [166]:
# Count the rows whose age group is one of the four named labels (i.e. not missing).
known_age_groups = ['Child', 'Adolescent', 'Infant', 'Neonate']
has_known_age = train_data["Age"].isin(known_age_groups)
sum(has_known_age)

868

In [167]:
def find_murmur_rows(x):
    """Return the comparison ``x['Murmur'] == 'Present'`` (a boolean mask for a DataFrame)."""
    return x['Murmur'] == 'Present'


def find_normal_rows(x):
    """Return the comparison ``x['Murmur'] == 'Absent'`` (a boolean mask for a DataFrame)."""
    return x['Murmur'] == 'Absent'


def find_unknown_rows(x):
    """Return the comparison ``x['Murmur'] == 'Unknown'`` (a boolean mask for a DataFrame)."""
    return x['Murmur'] == 'Unknown'

In [168]:
# Boolean row masks, one per murmur label.
has_murmur = find_murmur_rows(train_data)
no_murmur = find_normal_rows(train_data)
unknown_murmur = find_unknown_rows(train_data)

# Training annotations split by murmur label.
physio22_murmurs = train_data.loc[has_murmur]
physio22_normal = train_data.loc[no_murmur]
physio22_unknown = train_data.loc[unknown_murmur]
# !impt: it's possible to use severity classification to further evaluate denoising performance

In [180]:
# Report how many training rows are labelled murmur-present vs. murmur-absent.
print(f"Murmur counts {len(physio22_murmurs)}")
print(f"Normal counts {len(physio22_normal)}")

Murmur counts 179
Normal counts 695


In [169]:
# interesting how number of abnormal outcomes far exceeds number of murmurs
# can be useful for detecting benign murmurs later?
is_abnormal = train_data["Outcome"] == "Abnormal"
np.sum(is_abnormal)

456

In [170]:
# Preview the first rows labelled Murmur == 'Present'.
physio22_murmurs.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,
8,29045,AV+PV+TV+MV,Child,Female,88.0,12.5,False,Present,AV+MV+PV+TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
11,33151,AV+PV+TV+MV,Child,Female,141.0,30.9,False,Present,MV+TV,TV,...,Low,Harsh,,,,,,Abnormal,CC2015,


In [171]:
# Preview the first rows labelled Murmur == 'Absent'.
physio22_normal.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
5,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
6,23625,AV+PV+TV+MV,Child,Female,92.0,14.0,False,Absent,,,...,,,,,,,,Abnormal,CC2015,50379.0
7,24160,AV+PV+TV+MV,Child,Female,98.0,17.66,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
10,31737,AV+PV+TV+MV,Child,Female,90.0,14.4,False,Absent,,,...,,,,,,,,Abnormal,CC2015,


In [None]:
# the training data is pretty familiar to work with, then
# there's still some work to be done for the test/val data, though
# the preprocessing itself is familiar


In [None]:
# List the annotation column names.
# NOTE(review): the stored output shows 'Recording locations:' (trailing colon), while
# later cells index 'Recording locations' — confirm which header the CSV really has.
train_data.columns

Index(['Patient ID', 'Recording locations:', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Outcome', 'Campaign', 'Additional ID'],
      dtype='object')

### 2. Creating Validation and Test Data

In [346]:
def _parse_metadata_line(line):
    """Split a ``#Key: value`` line into ``(key, value)``.

    Returns None when the line does not start with ``#`` or has no ``:``.
    A value of ``nan`` (any casing) is returned as None.
    """
    line = line.strip()
    if not line.startswith("#") or ":" not in line:
        return None
    key, value = line[1:].split(":", 1)
    value = value.strip()
    return key.strip(), (None if value.lower() == "nan" else value)


def extract_fields(directory,text_file_name):
    """Parse one patient description text file into a flat dict.

    Layout the parser relies on:
      * first line: three whitespace-separated tokens, the first being the patient ID;
      * then one line per recording, starting with a valve code (AV, PV, TV or MV);
      * then ``#Key: value`` metadata lines, the first of which starts with ``#Age``.

    Args:
        directory: folder containing the text file.
        text_file_name: name of the file inside ``directory``.

    Returns:
        dict with "Patient ID" (only if the first line has exactly three tokens),
        "Recording locations" (valve codes joined with "+", "" if none were found)
        and one entry per metadata line. Metadata values of "nan" become None,
        including the value on the ``#Age`` line.

    Raises:
        OSError: if the file cannot be opened.
    """
    data = {}
    valve_codes = []
    # Stays None when the file has no "#Age" line; the valve loop then consumes
    # the whole file and there is simply no metadata to parse.
    first_meta_line = None
    with open(os.path.join(directory, text_file_name), "r") as f:
        # special treatment for the first line
        parts = f.readline().split()
        if len(parts) == 3:
            data["Patient ID"] = parts[0]

        # valve lines: read until the metadata section begins
        for line in f:
            line = line.strip()
            if line.startswith("#Age"):
                # Keep this line: it is the first metadata entry.
                first_meta_line = line
                break
            if line[:2] in ("AV", "PV", "TV", "MV"):
                valve_codes.append(line[:2])

        # Stored before the metadata so the key order of the dict is unchanged.
        data["Recording locations"] = "+".join(valve_codes)

        # The "#Age" line and every remaining line go through the same parser.
        meta_lines = [] if first_meta_line is None else [first_meta_line, *f]
        for line in meta_lines:
            parsed = _parse_metadata_line(line)
            if parsed is not None:
                key, value = parsed
                data[key] = value
    return data

In [372]:
%%script false --no-raise-error 
# os.listdir returns names in arbitrary order; sort so the saved CSVs have a
# reproducible row order across runs and machines.
val_texts = sorted(f for f in os.listdir(val_path) if f.endswith(".txt"))
test_texts = sorted(f for f in os.listdir(test_path) if f.endswith(".txt"))

# One dict of parsed fields per patient text file.
val_rows = [extract_fields(val_path,val_text) for val_text in val_texts]
test_rows = [extract_fields(test_path,test_text) for test_text in test_texts]

val_df = pd.DataFrame(val_rows)
test_df = pd.DataFrame(test_rows)

# Persist the tables next to the dataset so later cells can reload them.
val_df.to_csv(physio22_path+'val_annotations.csv',index=False)
test_df.to_csv(physio22_path+'test_annotations.csv',index=False)

In [None]:
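# !impt: murmurs are located at specific valves
# ... so, only recordings from those valves should be marked as murmurs for CWL
# if the patient has a murmur and the recording's valve (AV, MV, PV, TV) is among the murmur locations:
# # mark it as a murmur for CWL
# else: do nothing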

### 3. Mapping Recordings to Murmurs

In [348]:
# load csvs
val_df = pd.read_csv(physio22_path+'val_annotations.csv')
test_df = pd.read_csv(physio22_path+'test_annotations.csv')

In [252]:
# do all murmurs have corresponding locations?
val_murmurs = val_df[find_murmur_rows(val_df)]

#val_murmurs["Murmur locations"]
# ^these all have murmur locations

In [350]:
# also check for test
test_murmurs = test_df[find_murmur_rows(test_df)]

print(f"""# of murmur recordings vs # annotated murmur recordings: 
      {len(test_murmurs)}, {sum(test_murmurs['Murmur locations'].notnull())}""") # nice

print("# of systolic, diastolic strength annotations:")
print(sum(test_murmurs["Systolic murmur grading"].notnull()))
print(sum(test_murmurs["Diastolic murmur grading"].notnull()))

# of murmur recordings vs # annotated murmur recordings: 
      99, 99
# of systolic, diastolic strength annotations:
99
4


In [351]:
# for training data
sum(physio22_murmurs["Systolic murmur grading"].notnull())
# ^ 178. there are 179 recordings with murmurs. so nearly all murmurs are systolic
sum(physio22_murmurs["Diastolic murmur grading"].notnull())

5

In [352]:
def find_murmur_strength(pandas_row):
  """Collapse a row's systolic/diastolic murmur gradings into one 0-3 score.

  Args:
    pandas_row: mapping-like row (e.g. from ``DataFrame.apply(..., axis=1)``)
      with "Systolic murmur grading" and "Diastolic murmur grading" entries.

  Returns:
    The larger of the two mapped strengths: 1 ~= weak, 2 ~= moderate,
    3 ~= strong, or 0 when neither grading is a string (e.g. NaN/None).

  Raises:
    KeyError: if a grading is a string that is not in the mapping tables.
  """
  systolic_strength = pandas_row["Systolic murmur grading"]
  diastolic_strength = pandas_row["Diastolic murmur grading"]
  # The grade -> strength buckets are an ad-hoc choice by the notebook author.
  # 1 ~= weak, 2 ~= moderate, 3~=strong
  systolic_map = {"I/VI":1, "II/VI":1, "III/VI":2, "IV/VI": 2, "V/VI":3, "VI/VI": 3}
  diastolic_map = {"I/IV":1, "II/IV":2, "III/IV":2, "IV/IV":3}
  murmur_strengths = [0,0]

  # isinstance (not type(...) == str) so str subclasses such as numpy.str_ are
  # graded instead of silently falling through to 0.
  if isinstance(systolic_strength, str):
    murmur_strengths[0] = systolic_map[systolic_strength]
  if isinstance(diastolic_strength, str):
    murmur_strengths[1] = diastolic_map[diastolic_strength]

  return max(murmur_strengths)

#test_df.apply(find_murmur_strength,axis=1)
#train_data.apply(find_murmur_strength,axis=1)
# ^ those work

In [None]:
# 1. iterate through ['train','test','val']
# # maps to folders
# # 
# for each. 
# extract ID, and split Recording locations
# for each recording location
# # search for matches in folder
# # if murmur locations is defined and the recording location is in it
# # run the strength function and assign
# # other

# wait, it would be soooo much more useful to create another metadata file here!!!! do that next. 

In [373]:
%%script false --no-raise-error 
# the below cell is irrelevant 

# much progress
# # got familiar with physionet 22
# # created metadata for val/test
# # mechanism to identify murmur strength
# # data exploration

# ok, how to make the metadata files
# filename matches
# murmur presence (bool)
# might as well retain things like outcome, height, weight, age, pregnancy status, murmur
# if so, murmur location

results = []  # NOTE(review): never filled in this cell
# Rows whose murmur label is not 'Unknown'.
# NOTE(review): these reduced frames are built, but the loop below iterates the
# unfiltered frames in path_dfs — confirm which was intended.
reduced_train_data = train_data[~find_unknown_rows(train_data)]
reduced_val_data = val_df[~find_unknown_rows(val_df)]
reduced_test_data = test_df[~find_unknown_rows(test_df)]
# simple setup
path_maps = {'train':train_path,'val': val_path,'test': test_path}
path_dfs = {'train':train_data, 'val': val_df, 'test': test_df}

# basically, just find all matches. store as list or something. then, find segmentation file for each.
# # ^ actually no, kind of pointless since so deterministic
for dset in ['train','val','test']:
  dset_path = path_maps[dset]
  dset_df = path_dfs[dset]

  files = os.listdir(dset_path)
  # Group the folder listing by the text before the first "_" once, instead of
  # rescanning every file name for every patient row.
  files_by_pid = {}
  for file_name in files:
    files_by_pid.setdefault(file_name.split("_")[0], []).append(file_name)

  # BIG PROBLEM
  for _,row in dset_df.iterrows():
    pid = str(row["Patient ID"])

    file_matches = files_by_pid.get(pid, [])
    # ok, now what
    # want to see if murmurs are present
    recording_locations = row["Recording locations"]
    # A missing value arrives as NaN (a float), which has no .split().
    all_valves = set(recording_locations.split("+")) if isinstance(recording_locations, str) else set()
    murmur_valves = set()

    if isinstance(row["Murmur locations"], str):
      murmur_valves = set(row["Murmur locations"].split("+"))
      # the thing is, you'll want to split train/val/test s.t. no patient occurs in more than one.
      # you can do that kind of easily though.
    print(file_matches,murmur_valves)
    # almost no preprocessing to be done then, tbh. should merge train, val and test metadata though.
# want all data sources equally represented in train/val/test
# anyway just want to merge data now
    
  

In [334]:
#!impt: it will be quite hard to maintain on a per-valve level whether there's a murmur or not
# also, doing so would make it much harder to segment train/val/test data.
# what i will do, is say that if a murmur is in one valve, that it's in all valves too.
# this is theoretically pretty false, although for the physionet data it's mostly the case
# implicitly this will also train the model to be wary of deleting murmurs. interesting.
location_columns = ["Recording locations","Murmur locations","Systolic murmur grading"]
has_murmur_location = train_data["Murmur locations"].notna()
train_data.loc[has_murmur_location, location_columns].head()

Unnamed: 0,Recording locations,Murmur locations,Systolic murmur grading
1,AV+PV+TV+MV,AV+MV+PV+TV,III/VI
3,AV+PV+TV+MV,TV,I/VI
4,AV+PV+TV+MV,AV+MV+PV+TV,II/VI
8,AV+PV+TV+MV,AV+MV+PV+TV,II/VI
11,AV+PV+TV+MV,MV+TV,I/VI


In [344]:
# Compare the column sets: first what only val has, then what only train has.
val_columns = set(val_df.columns)
train_columns = set(train_data.columns)
print(val_columns.difference(train_columns))
print(train_columns.difference(val_df.columns))
# ok so basically just get rid of recording_locations_count and sample_rate

{'Recording_Locations_Count', 'Sample_Rate'}
set()


{'Sample_Rate', 'Most audible location', 'Recording_Locations_Count', 'Recording locations', 'Diastolic murmur timing', 'Pregnancy status', 'Systolic murmur shape', 'Systolic murmur grading', 'Diastolic murmur pitch', 'Age', 'Sex', 'Diastolic murmur shape', 'Height', 'Systolic murmur pitch', 'Diastolic murmur grading', 'Systolic murmur timing', 'Murmur locations', 'Murmur', 'Systolic murmur quality', 'Weight', 'Patient ID', 'Campaign', 'Diastolic murmur quality', 'Additional ID', 'Outcome'}


In [None]:
# Stack the three splits into one table; ignore_index gives a fresh 0..n-1 index
# instead of three overlapping per-split indexes.
all_data = pd.concat([train_data,val_df,test_df], ignore_index=True)
print(len(all_data['Patient ID'].unique()), len(all_data))
# The two numbers above must match; enforce it so a duplicated patient stops
# the notebook before the merged file is written.
assert all_data['Patient ID'].is_unique, "a Patient ID appears more than once in the merged annotations"
# NOTE(review): val/test carry 'Recording_Locations_Count' and 'Sample_Rate', which the
# training table lacks; they are kept here (empty for training rows) — drop them if unwanted.
# now simply save as CSV and leave
all_data.to_csv(physio22_path+'all_annotations.csv',index=False)

1568 1568
