In [1]:
# Goal: figure out how to load the physionet16 dataset and interact with its data while my model spends an hour training

In [2]:
from scipy.io import wavfile
from scipy import signal
from time import time
import random
import librosa
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
# Root directory of the physio16 dataset. Set the PHYSIO16_PATH environment
# variable to point somewhere else; the default is the original local path.
physio16_path = os.environ.get(
    'PHYSIO16_PATH',
    '/Users/donu/Desktop/S25/ELEC 594/datasets/physio16/',
)
# Paths are built by plain string concatenation, so guarantee a trailing slash.
if not physio16_path.endswith('/'):
    physio16_path += '/'
# CSV holding the per-recording annotations for the training set.
physio_anno_path = physio16_path + 'annotations/Online_Appendix_training_set.csv'

In [20]:
# Read the annotation table into a DataFrame and remember how many rows
# (one per recording) it contains.
physio16_data = pd.read_csv(physio_anno_path)
num_recordings = len(physio16_data)

In [19]:
# Preview the annotation table; the rendered output is truncated to the
# first/last rows and columns (shown as "..." in the display).
physio16_data

Unnamed: 0,Challenge record name,Database,Original record name,Diagnosis,Class (-1=normal 1=abnormal),# Beat (automated algorithm),# Beats requiring hand correction,Gender,Age (year),Height (m),...,# Recording in each subject,Transducer site on body,Recording state,Murmurs (2=None 3=Weak 4=Strong 6=Unclear),"Murmur Location (2=Sys,3=Dia,4=Both)",Arrhythmia (2=None 3=One arrhythmic beat 4=Medium Arrhytmia 5=Severe Arrhytmia 6=Unclear 7=Two arrhythmic beats),Respiration noise (2=None 3=Insignificant 4=Weak Resp. Noise<2 s 5=Weak Resp. Noise<3 s 6=Weak Resp. Noise <4 s 7=Weak Resp. Noise >4 s 8=Powerfull Resp. Noise<2 s 9=Powerfull Resp. Noise<3 s 10=Powerfull Resp. noise <4 s 11=Powerfull Resp. Noise >4 s 12=Dont know 13=Weak Resp. Noise<1 s 14=Resp. Noise<1 s 15=Powerfull Resp. noise <1 s 16=Weak Rec. Noise<1 s 17=Powerfull Rec. Noise<1 s),Ambient noise (2=None 3=Insignificant 4=Weak Rec. BG<2 s 5=Weak Rec. BG<3 s 6=Weak Rec. BG <4 s 7=Weak Rec. BG >4 s 8=BG. Noise<2 s 9=BG. Noise<3 s 10= BG. noise <4 s 11=BG. Noise >4 s 12=Powerfull BG. Noise<2 s 13=Powerfull BG. Noise<3 s 14=Powerfull BG. noise <4 s 15=Powerfull BG. Noise >4 s 16=Dont know 17=Weak BG. Noise<1 s 18=BG.Noise<1 s 19=Powerfull BG. Noise<1 s),Recording noise (2=None 3=Insignificant 4=Beg. Rec. Noise 5=End Rec. Noise 6=Beg-End Reco Noise 7=Weak Rec. Noise<2 s 8=Weak Rec. Noise<3 s 9=Weak Rec. Noise <4 s 10=Weak Rec. Noise >4 s 11=Powerfull Rec. Noise<2 s 12=Powerfull Rec. Noise<3 s 13=Powerfull Rec. noise <4 s 14=Powerfull Rec. Noise >4 s 15=Spikes 16=Dont know 17=Weak Rec. Noise<1 s 18=Powerfull Rec. Noise<1 s),Abdominal sounds (2=None 3=Insignificant 4=Sto. Noise<2 s 5=Sto. Noise<3 s 6=Sto. Noise <4 s 7=Sto. Noise >4 s 8=Dont know 9=Sto. Noise<1 s)
0,a0001,training-a,C45S1,MVP,1,36,0,,,,...,,Left of parasternum,,,,,,,,
1,a0002,training-a,C19S3,MVP,1,27,7,,,,...,,Parasternum when sit,,,,,,,,
2,a0003,training-a,C23S0,MVP,1,25,0,,,,...,,Left of parasternum,,,,,,,,
3,a0004,training-a,C35S18,Benign,1,35,0,,,,...,,Left of parasternum,,,,,,,,
4,a0005,training-a,C63S20,Benign,1,52,0,,,,...,,Parasternum when squat,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3148,f0110,training-f,a110,Normal,-1,46,0,F,27.0,,...,,,,,,,,,,
3149,f0111,training-f,a111,Normal,-1,51,0,F,58.0,,...,,,,,,,,,,
3150,f0112,training-f,a112,Normal,-1,21,21,M,71.0,,...,,,,,,,,,,
3151,f0113,training-f,a68-1,Normal,-1,32,15,M,26.0,,...,,,,,,,,,,


### Data exploration

In [5]:
# Exact column headers from the annotation CSV; each header embeds the legend
# for its numeric codes.
normal_str = 'Class (-1=normal 1=abnormal)'          # normal/abnormal label
murmur_str = 'Murmur Location (2=Sys,3=Dia,4=Both)'  # murmur location code

In [6]:
# Inspect the Python type of the last entry in the array of unique
# murmur-location values (displays numpy.float64).
type(np.unique(physio16_data[murmur_str])[-1])

numpy.float64

In [7]:
# Number of rows whose murmur-location field is filled in (non-NaN).
physio16_data[murmur_str].notna().sum()

490

In [8]:
# (rows, columns) of the annotation table.
physio16_data.shape

(3153, 26)

In [9]:
# Count the recordings whose class label is 1 (abnormal).
physio16_data[normal_str].eq(1).sum()  # 665 abnormal

665

In [10]:
# Percent of recordings labelled abnormal. Divide by the table's actual row
# count instead of a hard-coded 3153 so the figure stays correct if the
# annotation file changes.
100 * (physio16_data[normal_str] == 1).sum() / len(physio16_data)  # percent abnormal

21.091024421186173

In [11]:
# Keep only the rows whose murmur-location field is annotated (non-NaN).
just_murmurs = physio16_data.dropna(subset=[murmur_str])

In [12]:
# Fraction of the murmur-annotated rows that are labelled abnormal (label == 1),
# relative to the rows with a non-zero label.
just_murmurs[normal_str].eq(1).sum() / just_murmurs[normal_str].ne(0).sum()
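# ~21% abnormal, essentially the same as the overall abnormal rate, so a non-NaN
# murmur-location entry looks basically uncorrelated with heart abnormalities in this dataset.
# this looks very strange, but a non-NaN entry does not imply a murmur is present:
# restricting to location codes > 1 (further down) gives 100% abnormal.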

0.21224489795918366

In [None]:
# Distinct values of the 'Database' column.
# Note: a path is formed as physio16_path + this value + the row's 'Challenge record name' value.
pd.unique(physio16_data['Database'])

array(['training-a', 'training-b', 'training-c', 'training-d',
       'training-e', 'training-f'], dtype=object)

In [36]:
# Display the column header: it spells out what the numeric codes mean.
# Goal of the next cells: find the rows where a murmur is actually present.
murmur_str # this tells you what the numbers mean. want to find where murmurs are

'Murmur Location (2=Sys,3=Dia,4=Both)'

In [54]:
# Boolean mask: True where the murmur-location code is greater than 1,
# i.e. one of the codes listed in the column legend (2, 3 or 4). This works.
murmur_locs = physio16_data[murmur_str].gt(1)

In [55]:
# Rows selected by the mask: recordings that definitely contain a murmur.
murmur_present = physio16_data[murmur_locs]

In [None]:
# Share of the murmur-present recordings that carry the abnormal label (== 1),
# relative to the rows with a non-zero label.
murmur_present[normal_str].eq(1).sum() / murmur_present[normal_str].ne(0).sum()
# As expected, every one of them is abnormal.

1.0