#Audio Preprocessing

# importing .txt files and libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
import os

# Preprocessing .txt files

In [2]:
import os
path='D:\\Downloads\\Lung-Disease-Audio-Classification\\ICBHI_final_database\\'
files=[s.split('.')[0] for s in os.listdir(path) if '.txt' in s]
files[:5]

['103_2b2_Ar_mc_LittC2SE',
 '105_1b1_Tc_sc_Meditron',
 '106_2b1_Pl_mc_LittC2SE',
 '106_2b1_Pr_mc_LittC2SE',
 '107_2b3_Al_mc_AKGC417L']

In [3]:
#splitting filename after every underscore
def getFilenameInfo(file):
    return file.split('_')

In [4]:
getFilenameInfo('160_1b3_Al_mc_AKGC417L')

['160', '1b3', 'Al', 'mc', 'AKGC417L']

#### Creating a dataframe with columns containing starting time, end time presence or weezels ,Patient ID ,mode of collection and file name of the audio

In [5]:
files_data=[]
for file in files:
    data=pd.read_csv(path + file + '.txt',sep='\t',names=['start','end','crackles','weezels'])
    name_data=getFilenameInfo(file)
    data['pid']=name_data[0]
    data['mode']=name_data[-2]
    data['filename']=file
    files_data.append(data)
files_df=pd.concat(files_data)
files_df.reset_index()
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,0.364,3.25,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
1,3.25,6.636,0.0,0.0,103,mc,103_2b2_Ar_mc_LittC2SE
2,6.636,11.179,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
3,11.179,14.25,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
4,14.25,16.993,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE


#### Processing the files_df dataframe

In [6]:
files_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6207 entries, 0 to 5
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   start     6207 non-null   object 
 1   end       6110 non-null   float64
 2   crackles  6110 non-null   float64
 3   weezels   6110 non-null   float64
 4   pid       6207 non-null   object 
 5   mode      6207 non-null   object 
 6   filename  6207 non-null   object 
dtypes: float64(3), object(4)
memory usage: 387.9+ KB


In [7]:
files_df[pd.isnull(files_df).any(axis=1)]

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,'101_1b1_Al_sc_AKGC417L',,,,filename,filename,filename_differences
1,'101_1b1_Pr_sc_AKGC417L',,,,filename,filename,filename_differences
2,'102_1b1_Ar_sc_AKGC417L',,,,filename,filename,filename_differences
3,'105_1b1_Tc_sc_LittC2SE',,,,filename,filename,filename_differences
4,'108_1b1_Al_sc_LittC2SE',,,,filename,filename,filename_differences
...,...,...,...,...,...,...,...
1,"Patient number (101,102,...,226)",,,,filename,filename,filename_format
2,Recording index,,,,filename,filename,filename_format
3,"Chest location (Trachea (Tc), {Anterior (A), P...",,,,filename,filename,filename_format
4,Acquisition mode (sequential/single channel (s...,,,,filename,filename,filename_format


In [8]:
#removing all Nan values
files_df.dropna(inplace=True)

In [9]:
files_df['start'] = files_df['start'].astype(float)

In [10]:
files_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6110 entries, 0 to 10
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   start     6110 non-null   float64
 1   end       6110 non-null   float64
 2   crackles  6110 non-null   float64
 3   weezels   6110 non-null   float64
 4   pid       6110 non-null   object 
 5   mode      6110 non-null   object 
 6   filename  6110 non-null   object 
dtypes: float64(4), object(3)
memory usage: 381.9+ KB


In [11]:
files_df.head()

Unnamed: 0,start,end,crackles,weezels,pid,mode,filename
0,0.364,3.25,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
1,3.25,6.636,0.0,0.0,103,mc,103_2b2_Ar_mc_LittC2SE
2,6.636,11.179,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
3,11.179,14.25,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE
4,14.25,16.993,0.0,1.0,103,mc,103_2b2_Ar_mc_LittC2SE


In [24]:
# Saving the DataFrame to a CSV in a specified directory on your local device
files_df.to_csv('files_df.csv', index = False)
!cp files_df.csv "D:\\Downloads\\Lung-Disease-Audio-Classification\\"

'cp' is not recognized as an internal or external command,
operable program or batch file.


#Audio Preprocessing

In [23]:
os.mkdir('D:\\Downloads\\Lung-Disease-Audio-Classification\\processed_audio_files')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'D:\\Downloads\\Lung-Disease-Audio-Classification\\processed_audio_files'

In [17]:
#Takes a numpy array and spilts its using start and end args raw_data=numpy array of audio sample start=time end=time sr=sampling_rate mode=mono/stereo
    
def getPureSample(raw_data,start,end,sr=22050):
    
    max_ind = len(raw_data) 
    start_ind = min(int(start * sr), max_ind)
    end_ind = min(int(end * sr), max_ind)
    return raw_data[start_ind: end_ind]

In [18]:
for index,row in files_df.iterrows():
    print("Index ->",index)
    print("Data->\n",row)
    break

Index -> 0
Data->
 start                        0.364
end                           3.25
crackles                       0.0
weezels                        1.0
pid                            103
mode                            mc
filename    103_2b2_Ar_mc_LittC2SE
Name: 0, dtype: object


#### Preprocess all the audios in the dataset using a for loop.
First we store start and end values of audios and then access the audio themselves using their filenames.
<p>Then we change the lengths of audios if they are shorter or longer then a particular length(6 seconds). Shorter audios are padded and longer audios are clipped.</p>
<p> The audios are then saved in new folder called preprocessed audios where they will be used for training on a neural network</p>

In [21]:
import librosa as lb
import soundfile as sf
i,c=0,0
for index,row in files_df.iterrows():
    maxLen=6
    start=row['start']
    end=row['end']
    filename=row['filename']
    
    #If len > maxLen , change it to maxLen
    if end-start>maxLen:
        end=start+maxLen
    
    audio_file_loc=path + filename + '.wav'
    
    if index > 0:
        #check if more cycles exits for same patient if so then add i to change filename
        if files_df.iloc[index-1]['filename']==filename:
            i+=1
        else:
            i=0
    filename= filename + '.wav'
    
    save_path='D:\\Downloads\\Lung-Disease-Audio-Classification\\processed_audio_files\\' + filename
    c+=1
    
    audioArr,sampleRate=lb.load(audio_file_loc)
    pureSample=getPureSample(audioArr,start,end,sampleRate)
    
    
    reqLen = 6 * sampleRate
    padded_data = lb.util.pad_center(pureSample, size = reqLen)
    # if len(pureSample) < reqLen:
    #     pad_width = (reqLen - len(pureSample)) // 2
    #     padded_data = np.pad(pureSample, (pad_width, reqLen - len(pureSample) - pad_width), mode='constant')
    # else:
    #     padded_data = pureSample  # No padding needed if already at required length
    
    
    sf.write(file=save_path,data=padded_data,samplerate=sampleRate)
print('Total Files Processed: ',c)

Total Files Processed:  6110


In total we processed 6110 files.