In [1]:
import pandas as pd
import os
import IPython.display as ipd
import librosa as lr
import soundfile as sf
# Show long cell values (e.g. full file paths) without truncating them
pd.set_option("display.max_colwidth", 1000)


In [2]:
# Load the breath-cycle table (one row per cycle: start/end times plus crackle/wheeze flags)
data_dir = os.path.join(os.getcwd(), 'data')
breath_cycles_csv = os.path.join(data_dir, 'breath_cycles.csv')
time_indices_df = pd.read_csv(breath_cycles_csv, index_col=0)
time_indices_df


Unnamed: 0,Start,End,Crackles,Wheezes,duration,start_index,end_index,start_index_int,end_index_int,y_segment,source
0,0.036,0.579,0,0,0.543,793.80,12766.95,793,12766,[-0.04873171 -0.04849143 -0.04823481 ... -0.08675144 -0.08687682\n -0.08693124],101_1b1_Al_sc_Meditron
1,0.579,2.450,0,0,1.871,12766.95,54022.50,12766,54022,[-0.08709788 -0.08723557 -0.08731082 ... -0.02195927 -0.02181599\n -0.02169704],101_1b1_Al_sc_Meditron
2,2.450,3.893,0,0,1.443,54022.50,85840.65,54022,85840,[-0.02180955 -0.02179321 -0.02166934 ... -0.12314763 -0.12324389\n -0.12332696],101_1b1_Al_sc_Meditron
3,3.893,5.793,0,0,1.900,85840.65,127735.65,85840,127735,[-0.12341598 -0.12354963 -0.12356759 ... 0.04257121 0.04275629\n 0.04287489],101_1b1_Al_sc_Meditron
4,5.793,7.521,0,0,1.728,127735.65,165838.05,127735,165838,[ 0.04297349 0.04315756 0.04330058 ... -0.0639165 -0.06329159\n -0.06283387],101_1b1_Al_sc_Meditron
...,...,...,...,...,...,...,...,...,...,...,...
6,11.721,13.693,1,0,1.972,258448.05,301930.65,258448,301930,[-0.04730458 -0.04737562 -0.04761335 ... 0.03817879 0.03843744\n 0.03883443],226_1b1_Pl_sc_LittC2SE
7,13.693,15.536,0,0,1.843,301930.65,342568.80,301930,342568,[0.03921689 0.03942744 0.03974621 ... 0.02950672 0.02939099 0.02952257],226_1b1_Pl_sc_LittC2SE
8,15.536,17.493,0,0,1.957,342568.80,385720.65,342568,385720,[0.02966898 0.02972315 0.02984862 ... 0.00599799 0.00647474 0.00691334],226_1b1_Pl_sc_LittC2SE
9,17.493,19.436,1,0,1.943,385720.65,428563.80,385720,428563,[0.00744156 0.00786917 0.00829501 ... 0.16827123 0.1684439 0.16842398],226_1b1_Pl_sc_LittC2SE


In [3]:
# Group the (start, end) times of every breath cycle by the audio file they belong to:
# {path to .wav file: [(start, end), ...]} with the tuples kept in row order
audio_dir = os.path.join(data_dir, 'pre_processed_audio/audio_4kHz_filtered')
split_times_dict = {}
for _, cycle in time_indices_df.iterrows():
    wav_path = os.path.join(audio_dir, cycle["source"] + '.wav')
    # setdefault creates the list the first time a file is seen
    split_times_dict.setdefault(wav_path, []).append((cycle["Start"], cycle["End"]))

print(split_times_dict)


{'/home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/audio_4kHz_filtered/101_1b1_Al_sc_Meditron.wav': [(0.036, 0.579), (0.579, 2.45), (2.45, 3.893), (3.893, 5.793), (5.793, 7.521), (7.521, 9.279), (9.279, 11.15), (11.15, 13.036), (13.036, 14.721), (14.721, 16.707), (16.707, 18.507), (18.507, 19.964)], '/home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/audio_4kHz_filtered/101_1b1_Pr_sc_Meditron.wav': [(0.036, 1.264), (1.264, 3.422), (3.422, 5.55), (5.55, 7.436), (7.436, 9.221), (9.221, 11.264), (11.264, 13.264), (13.264, 15.179), (15.179, 17.207), (17.207, 19.179), (19.179, 19.936)], '/home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/audio_4kHz_filtered/102_1b1_Ar_sc_Meditron.wav': [(0.264, 1.736), (1.736, 3.293), (3.293, 5.307), (5.307, 6.636), (6.636, 8.036), (8.036, 9.607), (9.607, 11.036), (11.036, 13.036), (13.036, 14.664),

In [4]:
# Create a function that will take each audio file and split it according to start/end index
def split_audio_file(filename, split_times):
    """Cut one audio file into chunks and write each chunk as its own WAV file.

    Parameters
    ----------
    filename : str
        Path of the audio file to split.
    split_times : iterable of (start_time, end_time)
        Chunk boundaries in seconds. Each pair is converted to sample indices
        by multiplying with the file's sampling rate and truncating to int.

    Returns
    -------
    list of str
        Paths of the written chunk files, in the same order as ``split_times``.
        Chunk ``i`` is named ``<input file stem>_chunk<i>.wav`` and is stored in
        ``<data_dir>/pre_processed_audio/chunks``.
    """
    out_path = os.path.join(data_dir, 'pre_processed_audio/chunks')
    # sf.write does not create missing directories, so make sure the target exists
    os.makedirs(out_path, exist_ok=True)

    # sr=None keeps the file's own sampling rate instead of resampling
    audio, sr = lr.load(filename, sr=None)

    # Take the stem with os.path so it works with any path separator and with
    # stems that themselves contain dots
    base_name = os.path.splitext(os.path.basename(filename))[0]

    chunk_names = []
    for i, (start_time, end_time) in enumerate(split_times):
        chunk = audio[int(start_time * sr):int(end_time * sr)]
        chunk_filename = os.path.join(out_path, f"{base_name}_chunk{i}.wav")
        sf.write(chunk_filename, chunk, sr)
        chunk_names.append(chunk_filename)
    return chunk_names

# Split every audio file in the dictionary; each entry of the result is the list of
# chunk paths written for one file, in the dictionary's iteration order
all_chunk_names = [
    split_audio_file(wav_path, cycle_times)
    for wav_path, cycle_times in split_times_dict.items()
]


In [5]:
# Flatten the per-file lists of chunk paths into a single list
final_chunk = [chunk_path for file_chunks in all_chunk_names for chunk_path in file_chunks]
print(len(final_chunk))

# Fail early, with a readable message, if chunks and breath-cycle rows disagree
assert len(final_chunk) == len(time_indices_df), (
    f"{len(final_chunk)} chunks were written for {len(time_indices_df)} breath cycles")

# final_chunk is ordered file by file (files in order of first appearance in the table).
# That equals the row order only when all rows of a file are contiguous, so work out
# which row each chunk was cut from instead of relying on it.
file_codes = pd.Series(pd.factorize(time_indices_df['source'])[0])
# A stable sort by file code lists row positions in exactly the order of final_chunk
row_positions = file_codes.sort_values(kind='mergesort').index
time_indices_df['chunk_names'] = (
    pd.Series(final_chunk, index=row_positions).sort_index().tolist())

6898


In [7]:
# Save the table, now including the chunk file paths, next to the original CSV
updated_csv = os.path.join(data_dir, 'breath_cycles_updated.csv')
time_indices_df.to_csv(updated_csv)
time_indices_df['chunk_names']

0      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/chunks/101_1b1_Al_sc_Meditron_chunk0.wav
1      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/chunks/101_1b1_Al_sc_Meditron_chunk1.wav
2      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/chunks/101_1b1_Al_sc_Meditron_chunk2.wav
3      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/chunks/101_1b1_Al_sc_Meditron_chunk3.wav
4      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/data/pre_processed_audio/chunks/101_1b1_Al_sc_Meditron_chunk4.wav
                                                                            ...                                                                        
6      /home/raghav1881/archive/Respiratory_Sound_Database/Respiratory_Sound_Database/da