RESPIRATORY SOUND DATABASE: DATA PREPARATION AND FEATURE EXTRACTION

In [None]:
#DATASET LINK: https://bhichallenge.med.auth.gr/ICBHI_2017_Challenge
#LABELS LINK: https://bhichallenge.med.auth.gr/sites/default/files/ICBHI_final_database/ICBHI_Challenge_diagnosis.txt

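#COPY THE LABELS INTO A FILE NAMED labels.csv, KEEPING THEM TAB-SEPARATED (ONE "PATIENT_ID<TAB>DIAGNOSIS" ROW PER PATIENT, NO HEADER ROW)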

#PREPARED DATASET LINK: https://drive.google.com/file/d/16SPfcsTIFESHvkZCbCjYsjE5A8d7eRod/view?usp=sharing

In [1]:
import pandas as pd

In [6]:
# Load the patient -> diagnosis table. The file is tab-separated and the column
# names are supplied here rather than read from the file.
label_columns = ["Patient_ID", "Diagnosis"]
labels = pd.read_csv("labels.csv", sep='\t', names=label_columns)
labels

Unnamed: 0,Patient_ID,Diagnosis
0,101,URTI
1,102,Healthy
2,103,Asthma
3,104,COPD
4,105,URTI
...,...,...
121,222,COPD
122,223,COPD
123,224,Healthy
124,225,Healthy


In [7]:
# Folder (relative to the notebook) that is scanned for the .wav recordings.
audio_files_path = "ICBHI_final_database/"

In [10]:
# Start from an empty table with one column for the recording's file name and
# one for its diagnosis; both columns are filled in by a later cell.
datadf_columns = ["filename", "diagnosis"]
datadf = pd.DataFrame(columns=datadf_columns)
datadf

Unnamed: 0,filename,diagnosis


In [11]:
import os
import wave

filenames_list = []
diagnosis_list = []
directory = audio_files_path  # Replace with the actual directory path

# Build the Patient_ID -> Diagnosis lookup once instead of filtering the labels
# frame for every file. setdefault keeps the first row for a given ID, which is
# what the previous `.values[0]` lookup returned.
diagnosis_by_patient = {}
for known_id, known_diagnosis in zip(labels["Patient_ID"], labels["Diagnosis"]):
    diagnosis_by_patient.setdefault(known_id, known_diagnosis)

# os.listdir makes no promise about ordering; sort so the resulting rows are in
# the same order on every run and every machine.
files = sorted(os.listdir(directory))

# Pair every .wav recording with the diagnosis of the patient it belongs to.
# Only the file name is needed here, so the audio itself is not opened.
for file in files:
    if not file.endswith('.wav'):
        continue

    # The first three characters of the file name are the numeric patient ID.
    patient_id = int(file[:3])
    if patient_id not in diagnosis_by_patient:
        raise KeyError(
            f"No diagnosis found in labels for patient {patient_id} (file {file!r})"
        )

    filenames_list.append(file)
    diagnosis_list.append(diagnosis_by_patient[patient_id])

In [14]:
# Attach the collected file names and their diagnoses as the two columns.
datadf = datadf.assign(filename=filenames_list, diagnosis=diagnosis_list)

In [15]:
# Preview the first rows of the filename/diagnosis table.
datadf.head()

Unnamed: 0,filename,diagnosis
0,101_1b1_Al_sc_Meditron.wav,URTI
1,101_1b1_Pr_sc_Meditron.wav,URTI
2,102_1b1_Ar_sc_Meditron.wav,Healthy
3,103_2b2_Ar_mc_LittC2SE.wav,Asthma
4,104_1b1_Al_sc_Litt3200.wav,COPD


In [16]:
# Number of recordings per diagnosis.
datadf["diagnosis"].value_counts()

COPD              793
Pneumonia          37
Healthy            35
URTI               23
Bronchiectasis     16
Bronchiolitis      13
LRTI                2
Asthma              1
Name: diagnosis, dtype: int64

In [None]:
'''
RESPONSE FROM ChatGPT
#Prompt: Order the following health conditions into healthy, mild, moderate and severe:
#COPD, Pneumonia, Healthy, URTI, Bronchiectasis, Bronchiolitis, LRTI, Asthma

Based on a general understanding, here's how you can order the given health conditions 
into healthy, mild, moderate, and severe categories:

Healthy:
- Healthy

Mild:
- URTI (Upper Respiratory Tract Infection)
- Bronchiolitis

Moderate:
- COPD (Chronic Obstructive Pulmonary Disease)
- Bronchiectasis
- Asthma

Severe:
- Pneumonia
- LRTI (Lower Respiratory Tract Infection)

Please note that this categorization is based on a general perception and may not 
encompass all possible scenarios. The severity of these conditions can vary depending 
on individual cases and other factors. It's always advisable to consult medical professionals 
for accurate assessment and categorization of health conditions.
'''


In [17]:
# Collapse the eight diagnoses into four severity buckets, using the grouping
# quoted from ChatGPT earlier in the notebook.
diagnosis_label_map = {
    "Healthy": "healthy",
    "URTI": "mild",
    "Bronchiolitis": "mild",
    "COPD": "moderate",
    "Bronchiectasis": "moderate",
    "Asthma": "moderate",
    "Pneumonia": "severe",
    "LRTI": "severe",
}
# Only the "diagnosis" column is rewritten; values missing from the map stay as they are.
datadf = datadf.replace(to_replace={"diagnosis": diagnosis_label_map})

In [18]:
# Number of recordings per severity bucket after the relabelling.
datadf["diagnosis"].value_counts()

moderate    810
severe       39
mild         36
healthy      35
Name: diagnosis, dtype: int64

In [19]:
import pandas as pd
import os
import numpy as np
import IPython.display as ipd
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Attention
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
def features_extractor(file_name):
    """Summarise one audio file as a set of time-averaged librosa features.

    The file is loaded with ``librosa.load`` using its default arguments, six
    feature matrices are computed from the signal, and each matrix is reduced
    to a 1-D vector by averaging along its second axis (assumed to be the
    frame axis, as in librosa's usual feature layout).

    Parameters
    ----------
    file_name : path of the audio file to load.

    Returns
    -------
    tuple of six numpy arrays, in this order:
        MFCCs (``n_mfcc=16``), mel spectrogram (``n_mels=10``),
        zero-crossing rate, spectral centroid, spectral rolloff, chroma.
    """

    def _frame_mean(feature_matrix):
        # Transpose-then-reduce is kept exactly as before so the numbers match.
        return np.mean(feature_matrix.T, axis=0)

    signal, rate = librosa.load(file_name)

    mfcc_mean = _frame_mean(
        librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=16)
    )

    mel_mean = _frame_mean(
        librosa.feature.melspectrogram(
            y=signal, sr=rate, n_fft=2048, hop_length=512, n_mels=10
        )
    )

    zcr_mean = _frame_mean(librosa.feature.zero_crossing_rate(signal))

    centroid_mean = _frame_mean(
        librosa.feature.spectral_centroid(y=signal, sr=rate)
    )

    rolloff_mean = _frame_mean(
        librosa.feature.spectral_rolloff(y=signal, sr=rate)
    )

    # Chroma is computed from the magnitude of the short-time Fourier transform.
    magnitude = np.abs(librosa.stft(signal))
    chroma_mean = _frame_mean(
        librosa.feature.chroma_stft(S=magnitude, sr=rate)
    )

    return (
        mfcc_mean,
        mel_mean,
        zcr_mean,
        centroid_mean,
        rolloff_mean,
        chroma_mean,
    )

In [21]:
import numpy as np
from tqdm import tqdm
# Build one flat row of 30 feature values plus the class label per recording.
extracted_features = []
# iterrows() is a generator without a length, so tqdm is told the row count
# explicitly; otherwise it can only show a running counter with no bar or ETA.
for index_num, row in tqdm(datadf.iterrows(), total=len(datadf)):
    file_name = os.path.join(audio_files_path, str(row["filename"]))
    final_class_labels = row["diagnosis"]
    # NOTE: `sr` is the averaged spectral rolloff returned by features_extractor,
    # not a sample rate.
    mfcc, melspec, zcr, sc, sr, chroma = features_extractor(file_name)
    # Row layout: 16 MFCC means, 10 mel-band means, then the first element of
    # each remaining vector (for chroma that means only its first entry is kept),
    # and finally the label. The slices pin the row width to that layout.
    extracted_features.append([
        *mfcc[:16],
        *melspec[:10],
        zcr[0],
        sc[0],
        sr[0],
        chroma[0],
        final_class_labels,
    ])

  return pitch_tuning(
920it [04:23,  3.49it/s]


In [22]:
### Turn the list of extracted rows into a pandas DataFrame
# Column names follow the order in which each row was assembled:
# 16 MFCC means, 10 mel-band means, four single-value features, then the label.
feature_columns = (
    [f"mfcc{number}" for number in range(1, 17)]
    + [f"melspec{number}" for number in range(1, 11)]
    + ["zcr", "sc", "sr", "chroma", "severity"]
)
extracted_features_df = pd.DataFrame(extracted_features, columns=feature_columns)
extracted_features_df.head(10)

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,melspec6,melspec7,melspec8,melspec9,melspec10,zcr,sc,sr,chroma,severity
0,-528.425232,104.517365,69.907585,42.572124,31.213381,27.49058,22.207005,16.5511,15.276698,16.680298,...,7.689418e-06,3.043499e-06,1.521822e-06,8.912642e-07,5.816044e-07,0.002117,129.340184,102.857267,0.806715,mild
1,-582.5047,95.394691,57.40218,31.435291,28.148777,33.527466,31.569719,22.792305,16.643297,15.632527,...,1.274524e-06,7.732711e-07,4.714158e-07,3.268165e-07,2.486452e-07,0.001885,156.085273,81.761223,0.817237,mild
2,-596.806091,116.009735,60.592537,25.570189,26.233389,38.501705,36.034607,18.649155,3.548372,0.805531,...,4.175917e-07,2.307056e-07,1.557648e-07,1.265123e-07,1.06912e-07,0.002278,192.965319,138.804226,0.81663,healthy
3,-418.433228,68.255913,48.02573,33.063881,27.97485,26.455481,22.683329,18.157923,15.907789,15.104877,...,6.502951e-05,3.438663e-05,1.98222e-05,1.149959e-05,6.471815e-06,0.00111,140.01002,43.75337,0.82527,moderate
4,-441.498169,100.994278,74.064339,43.248154,19.710049,7.843437,5.497488,8.012251,11.732348,14.880522,...,2.643959e-08,1.510869e-09,3.641445e-10,1.135635e-10,4.673208e-11,0.006757,115.155966,221.921254,0.723837,moderate
5,-443.037079,106.827316,74.783844,40.041946,16.149172,6.407521,5.622064,7.306179,8.732014,10.394088,...,6.836962e-09,3.81366e-10,1.347897e-10,6.364875e-11,3.798238e-11,0.007559,140.613149,287.220102,0.779298,moderate
6,-477.403412,67.018906,54.495525,39.22147,26.022156,17.418203,13.355596,12.385002,12.951957,13.97243,...,3.131104e-08,2.092449e-09,5.866057e-10,2.236975e-10,1.142512e-10,0.003819,83.400168,154.560246,0.706874,moderate
7,-424.292725,59.958569,50.331985,38.341972,27.44766,19.618916,15.312139,14.078797,14.962395,16.677263,...,1.571585e-07,1.354139e-08,4.377539e-09,1.920584e-09,1.093174e-09,0.0025,62.584413,110.257975,0.768407,moderate
8,-432.466248,93.294693,72.102501,46.873489,25.506245,11.539235,4.865261,4.025266,7.170238,11.816373,...,3.355049e-08,1.097161e-09,3.39967e-10,1.445412e-10,8.05501e-11,0.006088,114.484782,221.874728,0.722068,moderate
9,-456.2453,86.772209,66.549385,43.219105,24.172537,11.905,5.613225,4.08299,6.394207,10.865976,...,2.982193e-07,2.864167e-08,1.036415e-08,4.956832e-09,2.980234e-09,0.004972,104.790436,194.873381,0.798426,moderate


In [23]:
# Write the feature table to disk without the DataFrame index column.
output_csv_name = "respiratory_sound_db_database.csv"
extracted_features_df.to_csv(output_csv_name, index=False)