In [1]:
import librosa as lr
from IPython.display import Audio
import numpy as np
import pandas as pd
import os
import seaborn as sns

In [2]:
# Load patient id numbers and corresponding diseases
patient_data = pd.read_csv(r"C:\Users\Anil Sharma\Documents\archive\Respiratory_Sound_Database\Respiratory_Sound_Database\patient_diagnosis.csv", names = ['pid', 'disease'])
patient_data.head()

Unnamed: 0,pid,disease
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI


In [3]:
# Load text files containing patient annotations for each sound recording 
path = r'C:\Users\Anil Sharma\Documents\archive\Respiratory_Sound_Database\Respiratory_Sound_Database\audio_and_txt_files'
file_names = [s.split(',')[0] for s in os.listdir(path) if '.txt' in s] # Put all file names in list

files_data=[]
for file in file_names:
    file_data = pd.read_csv(path + '\\' + file, sep = '\t',names=['start','end','crackles','wheezes'])
    name_data = file.split('_')
    file_data['pid'] = name_data[0]
    file_data['mode'] = name_data[-2]
    file_data['filename'] = file
    file_data['recording_location'] = name_data[-3]
    file_data['recording_device'] = name_data[4].split('.')[0]
    files_data.append(file_data)
files_df=pd.concat(files_data)
files_df.reset_index()
files_df

Unnamed: 0,start,end,crackles,wheezes,pid,mode,filename,recording_location,recording_device
0,0.036,0.579,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron
1,0.579,2.450,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron
2,2.450,3.893,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron
3,3.893,5.793,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron
4,5.793,7.521,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron
...,...,...,...,...,...,...,...,...,...
6,11.721,13.693,1,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE
7,13.693,15.536,0,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE
8,15.536,17.493,0,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE
9,17.493,19.436,1,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE


In [16]:
# Merge patient disease with annotation dataframe
files_df['pid'] = files_df['pid'].astype('int32')
patient_data['pid'] = patient_data['pid'].astype('int32')
df = pd.merge(files_df, patient_data, on = 'pid')
df

Unnamed: 0,start,end,crackles,wheezes,pid,mode,filename,recording_location,recording_device,disease
0,0.036,0.579,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron,URTI
1,0.579,2.450,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron,URTI
2,2.450,3.893,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron,URTI
3,3.893,5.793,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron,URTI
4,5.793,7.521,0,0,101,sc,101_1b1_Al_sc_Meditron.txt,Al,Meditron,URTI
...,...,...,...,...,...,...,...,...,...,...
6893,11.721,13.693,1,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE,Pneumonia
6894,13.693,15.536,0,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE,Pneumonia
6895,15.536,17.493,0,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE,Pneumonia
6896,17.493,19.436,1,0,226,sc,226_1b1_Pl_sc_LittC2SE.txt,Pl,LittC2SE,Pneumonia


In [31]:
# Export dataframe
df.to_csv(r'C:\Users\Anil Sharma\Documents\dataframe.csv')

In [25]:
# Gives me number of crackles, wheezes, both symptoms, and no labels in all dataset
no_label_list = []
crack_list = []
wheeze_list = []
both_sym_list = []
filename_list = []
for i in range(len(df)):
    no_labels = len(df[(df['crackles'] == 0) & (df['wheezes'] == 0)].index)
    n_crackles = len(df[(df['crackles'] == 1) & (df['wheezes'] == 0)].index)
    n_wheezes = len(df[(df['crackles'] == 0) & (df['wheezes'] == 1)].index)
    both_sym = len(df[(df['crackles'] == 1) & (df['wheezes'] == 1)].index)


In [32]:
print(no_labels, n_crackles, n_wheezes, both_sym)

3642 1864 886 506
