In [1]:
import numpy as np 
import pandas as pd 
import librosa.display
import librosa as lb
import soundfile as sf
import os

# **Load labels data from file**

In [2]:
patient_diagnosis_df = pd.read_csv('/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/patient_diagnosis.csv',names=['pid','disease'])
patient_diagnosis_df.head()

Unnamed: 0,pid,disease
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI


# **Load demographic information**

In [3]:
demographic_info_f = "/kaggle/input/respiratory-sound-database/demographic_info.txt"
with open(demographic_info_f) as f:
    f.readline()
    lines = [line.strip().split() for line in f.readlines()]

demographic_info_df = pd.DataFrame(lines, columns=["pid", "age", "sex", "adult_bmi", "child_weight", "child_height"]).replace('NA', 0)

In [4]:
demographic_info_df['pid'] = demographic_info_df['pid'].astype(int)
demographic_info_df

Unnamed: 0,pid,age,sex,adult_bmi,child_weight,child_height
0,101,3,F,0,19,99
1,102,0.75,F,0,9.8,73
2,103,70,F,33,0,0
3,104,70,F,28.47,0,0
4,105,7,F,0,32,135
...,...,...,...,...,...,...
121,222,60,M,0,0,0
122,223,0,0,0,0,0
123,224,10,F,0,32.3,143
124,225,0.83,M,0,7.8,74


# **Load all .txt files information**

In [5]:
path='/kaggle/input/respiratory-sound-database/Respiratory_Sound_Database/Respiratory_Sound_Database/audio_and_txt_files/'
files=[s.split('.')[0] for s in os.listdir(path) if '.txt' in s]

# **Create Investigation and Patient classes**

In [6]:
INVESTIGATION_LOCS = ['Al', 'Ar', 'Pl', 'Pr', 'Ll', 'Lr', 'Tc']

class Investigation:
    def __init__(self, location):
        self.location = location
        self.is_investigated = False
        self.crackle = 0
        self.wheeze = 0
        self.investigation_counter = 0
        
class Patient:
    def __init__(self, id):
        self.id = id
        self.investigations = [Investigation(loc) for loc in INVESTIGATION_LOCS]
    
    def is_loc_investigated(self, loc):
        return get_investigation(loc).is_investigated
    
    def get_investigation(self, loc):
        return [investigation for investigation in self.investigations if investigation.location == loc][0]


In [7]:
rec_info = []

for i in range(101, 227):
    
    patient = Patient(i)
    patient_files = [f for f in files if f.startswith(str(i))]
    
    
    for file_name in patient_files:
        location = file_name[8:10]
        data = pd.read_csv(path + file_name + '.txt',sep='\t',names=['start','end','crackles','wheezels'])
        
        inv = patient.get_investigation(location)
        
        if inv.is_investigated:
            inv.crackle += np.sum(data['crackles']) / len(data['crackles'])
            inv.wheeze += np.sum(data['wheezels']) / len(data['wheezels'])
        else:
            inv.crackle = np.sum(data['crackles']) / len(data['crackles'])
            inv.wheeze = np.sum(data['wheezels']) / len(data['wheezels'])
            inv.is_investigated = True
    
    def get_patient_data():
        patient_data = [i]
        
        for loc in INVESTIGATION_LOCS:
            inv = patient.get_investigation(loc)
            patient_data.append(inv.is_investigated)
            patient_data.append(inv.crackle)
            patient_data.append(inv.wheeze)
        
        return patient_data
            

    rec_info.append(get_patient_data())

cols = ['pid']

for loc in INVESTIGATION_LOCS:
    cols.append('is_' + loc)
    cols.append(loc + '_crackles')
    cols.append(loc + '_wheezes')

rec_info_df = pd.DataFrame(rec_info, 
                           columns=cols)
rec_info_df.head(20)

Unnamed: 0,pid,is_Al,Al_crackles,Al_wheezes,is_Ar,Ar_crackles,Ar_wheezes,is_Pl,Pl_crackles,Pl_wheezes,...,Pr_wheezes,is_Ll,Ll_crackles,Ll_wheezes,is_Lr,Lr_crackles,Lr_wheezes,is_Tc,Tc_crackles,Tc_wheezes
0,101,True,0.0,0.0,False,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0
1,102,False,0.0,0.0,True,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0
2,103,False,0.0,0.0,True,0.0,0.666667,False,0.0,0.0,...,0.0,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0
3,104,True,0.0,0.0,True,0.0,0.714286,True,0.0,0.0,...,0.0,True,0.142857,0.0,True,0.0,0.0,False,0.0,0.0
4,105,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,False,0.0,0.0,True,0.0,0.0
5,106,False,0.0,0.0,False,0.0,0.0,True,0.888889,0.0,...,0.777778,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0
6,107,True,3.625,0.583333,True,2.819444,2.125,True,1.375,0.0,...,1.055556,True,3.75,0.0,True,3.194444,0.666667,True,1.166667,0.888889
7,108,True,0.0,0.0,False,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0
8,109,True,0.0,0.0,True,0.0,0.0,True,0.0,0.0,...,0.0,True,0.875,0.0,True,1.0,0.0,False,0.0,0.0
9,110,True,0.857143,0.428571,False,0.0,0.0,False,0.0,0.0,...,0.0,True,0.0,0.428571,True,1.0,0.0,False,0.0,0.0


# **Marge and save datasets**

In [8]:
rec_info_df = pd.merge(rec_info_df, demographic_info_df, left_on='pid', right_on='pid')
rec_info_df = pd.merge(rec_info_df, patient_diagnosis_df, left_on='pid', right_on='pid')
rec_info_df.to_csv('train_data.csv')
rec_info_df.head()

Unnamed: 0,pid,is_Al,Al_crackles,Al_wheezes,is_Ar,Ar_crackles,Ar_wheezes,is_Pl,Pl_crackles,Pl_wheezes,...,Lr_wheezes,is_Tc,Tc_crackles,Tc_wheezes,age,sex,adult_bmi,child_weight,child_height,disease
0,101,True,0.0,0.0,False,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,3.0,F,0.0,19.0,99,URTI
1,102,False,0.0,0.0,True,0.0,0.0,False,0.0,0.0,...,0.0,False,0.0,0.0,0.75,F,0.0,9.8,73,Healthy
2,103,False,0.0,0.0,True,0.0,0.666667,False,0.0,0.0,...,0.0,False,0.0,0.0,70.0,F,33.0,0.0,0,Asthma
3,104,True,0.0,0.0,True,0.0,0.714286,True,0.0,0.0,...,0.0,False,0.0,0.0,70.0,F,28.47,0.0,0,COPD
4,105,False,0.0,0.0,False,0.0,0.0,False,0.0,0.0,...,0.0,True,0.0,0.0,7.0,F,0.0,32.0,135,URTI


# **Get data from files name**

In [9]:
def get_filename_info(file):
    return file.split('_')

In [10]:
files_data = []
for file in files:
    data = pd.read_csv(path + file + '.txt',sep='\t',names=['start','end','crackles','weezels'])
    name_data = get_filename_info(file)
    data['pid'] = name_data[0]
    data['mode'] = name_data[-2]
    data['location'] = name_data[-3]
    data['equipment'] = name_data[-1]
    data['filename'] = file
    files_data.append(data)
files_df=pd.concat(files_data, ignore_index=True)
files_df.reset_index()
files_df.head(15)


Unnamed: 0,start,end,crackles,weezels,pid,mode,location,equipment,filename
0,0.036,2.436,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
1,2.436,5.25,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
2,5.25,8.422,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
3,8.422,11.222,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
4,11.222,13.807,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
5,13.807,17.122,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
6,17.122,19.979,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron
7,3.464,5.868,0,0,172,mc,Ar,AKGC417L,172_1b4_Ar_mc_AKGC417L
8,5.868,8.473,1,1,172,mc,Ar,AKGC417L,172_1b4_Ar_mc_AKGC417L
9,8.473,11.027,0,1,172,mc,Ar,AKGC417L,172_1b4_Ar_mc_AKGC417L


# **Sound data preprocessing**

In [11]:
def get_pure_sample(raw_data,start,end,sr=22050):
    max_ind = len(raw_data) 
    start_ind = min(int(start * sr), max_ind)
    end_ind = min(int(end * sr), max_ind)
    return raw_data[start_ind: end_ind]

In [12]:
os.makedirs('processed_audio_files')


# **Add new files names and save data**

In [13]:
i, c = 0, 0
filename_list = []
for index, row in files_df.iterrows():
    maxLen = 6
    start = row['start']
    end = row['end']
    filename = row['filename']
    
    if end - start > maxLen:
        end = start + maxLen
    
    audio_file_loc = path + filename + '.wav'
    
    if index > 0:
        if files_df.iloc[index-1]['filename'] == filename:
            i += 1
        else:
            i = 0
    filename= filename + '_' + str(i) + '.wav'
    
    save_path='processed_audio_files/' + filename
    c += 1
    filename_list.append(filename)
    audioArr, sampleRate = lb.load(audio_file_loc)
    pureSample = get_pure_sample(audioArr,start,end,sampleRate)
    
    reqLen = 6 * sampleRate
    padded_data = lb.util.pad_center(pureSample, reqLen)
    
    sf.write(file=save_path,data=padded_data,samplerate=sampleRate)
print('Total Files Processed: ',c)

Total Files Processed:  6898


In [14]:
files_df['filename_new'] = filename_list
files_df.to_csv('files_info.csv')
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,location,equipment,filename,filename_new
0,0.036,2.436,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_0.wav
1,2.436,5.25,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_1.wav
2,5.25,8.422,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_2.wav
3,8.422,11.222,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_3.wav
4,11.222,13.807,0,0,168,sc,Al,Meditron,168_1b1_Al_sc_Meditron,168_1b1_Al_sc_Meditron_4.wav


In [15]:
patient_diagnosis_df['disease'].value_counts()

COPD              64
Healthy           26
URTI              14
Bronchiectasis     7
Pneumonia          6
Bronchiolitis      6
LRTI               2
Asthma             1
Name: disease, dtype: int64