In [1]:
from scipy.io import wavfile
import matplotlib.pyplot as plt
from IPython.display import Audio
import numpy as np
from scipy.fft import rfft, rfftfreq
from scipy.signal import butter, lfilter, medfilt, cheby1, sosfilt
from scipy.stats import median_abs_deviation
import pandas as pdf
import math
%matplotlib inline

data_path = "/home/collins/Desktop/projects/baymax/data/Respiratory_Sound_Database/"
audio_file_path = f"{data_path}audio_and_txt_files"

# patient_number = "122" # Those with pneumonia in the dataset are 122, 135, 140, 191, 219 and 226
# sound_location = ""

# Colors
plot_colors = {
    "Tc": "#32F2F5",
    "Al": "#FA9A0A",
    "Ll": "#F53532",
    "Pl": "#2AC126",
    "Ar": "#0A6AFA",
    "Pr": "#BD26C1",
    "Lr": "#32F2F5"
}

def remove_spikes(data):
    # Calculate the median absolute deviation (MAD) of the signal
    # mad = math.floor(median_abs_deviation(data))

    # Determine the window size based on the MAD
    # ws = mad * 10
    
    # Ensure window size is odd
    ws = 501 # ws if ws % 2 else ws + 1
    
    # print(ws)
    
    filtered_signal = medfilt(data)
    
#     b, a = butter(3, [50/(0.5*sampling_rate), 2500/(0.5*sampling_rate)], 'band')    
#     data = lfilter(b, a, data)

    return filtered_signal

# High pass filter function to remove heart sounds
def filterNoise(sampling_rate, data):
    # Band pass filter with cutoff frequencies
    # as a fraction of nquist frequency (1/2 the sampling rate)
    b, a = butter(3, [15/(0.5*sampling_rate), 1700/(0.5*sampling_rate)], 'band')
    # b, a = cheby1(3, [15/(0.5*sampling_rate), 1700/(0.5*sampling_rate)], 'band')
    # data = sosfilt(b, a, data)
    data = lfilter(b, a, data)
    # Median filter to remove random spikes
    # window_size = 5 # sampling_rate if sampling_rate % 2 else sampling_rate + 1
    # data = medfilt(data, window_size)
    # data = remove_spikes(data)
    
    # wavfile.write("output_audio/example.wav", sampling_rate, filtered.astype(np.int16))
    return data


def fourierTransform():
    # Fourier transform for Frequency domain
    # plt.subplot(1,2,2)
    plt.xlabel("Frequency")
    plt.ylabel("Power")
    plt.title(f"Frequency domain of {diagnosis} patient {patient_number}")
    
    for z in files:
        file_name = files[z]
        if file_name:
            sr, data = wavfile.read(f"{audio_file_path}/{file_name}")
            number_of_samples = data.shape[0]
            yf = rfft(data)
            # yf_normalized = np.abs(yf) / np.max(np.abs(yf))
            xf = rfftfreq(number_of_samples, 1/sr)
            plt.plot(xf, np.abs(yf), label=z)


    plt.legend()
    plt.savefig(f"waveforms/{patient_number}_{diagnosis}_freqdom.png")
    plt.close()

    return


def plotSoundWaveform(patient_number, files, diagnosis, chest_location=''):
    # Time domain
    fig, axs = plt.subplots(2,1, figsize=(10, 6))

    axs[0].set_xlabel("Time [s]")
    axs[1].set_xlabel("Time [s]")
    axs[0].set_ylabel("Amplitude")
    axs[1].set_ylabel("Amplitude")
    axs[0].set_title(f"Original {chest_location} waveform")
    axs[1].set_title(f"Band pass filtered {chest_location} waveform")
    

    for y in files:
        file_name = files[y]
        if file_name:
            sr, data = wavfile.read(f"{audio_file_path}/{file_name}")
            time = np.arange(data.shape[0])/sr

            axs[0].plot(time, data, label=y, color=plot_colors[y])
            data_filtered = filterNoise(sr, data) # Remove cardiac sounds
            axs[1].plot(time, data_filtered, label=y, color=plot_colors[y])

            # data_normalized = np.abs(data) / np.max(np.abs(data))

    axs[0].legend()
    axs[1].legend()
    
    fig.tight_layout()
    plt.savefig(f"waveforms/{patient_number}_{diagnosis}_timedom{chest_location}.png")
    plt.close()

    return

In [2]:
# Prepare data - List with dicts of format
# {"patient_number": {"Al": {"annotation":{""}}, "Tc": "", "Ar": "", "": ""}}
import csv
import os


demographics_file = f"{data_path}/demographic_info.csv"
events_path = f"{data_path}events"
diagnosis_file = f"{data_path}patient_diagnosis.csv"


df = pdf.read_csv(diagnosis_file, header=None, names=["patient_no", "diagnosis"])
df = df[df['diagnosis'].isin(["Healthy", "Pneumonia"])] # Only deal with pneumonia and healthy


df2 = pdf.read_csv(demographics_file, header=None, names=["patient_no", "age", "sex", "adult_bmi", "child_weight", "child_height"])
df3 = pdf.merge(df, df2, on="patient_no")

In [3]:
# Add columns for each chest location
df3["Tc"] = None
df3["Al"] = None
df3["Pl"] = None
df3["Ll"] = None
df3["Ar"] = None
df3["Pr"] = None
df3["Lr"] = None

In [4]:
for d in os.listdir(audio_file_path):
    p = d.split("_")

    if p[4].split(".")[1] == "wav":
        df3.loc[df3['patient_no'] == int(p[0]), p[2]] = d

In [5]:
# for _, row in df3.iterrows():
#     plotSoundWaveform(
#         row['patient_no'],
#         {
#          "Tc": row['Tc'], "Al": row['Al'],
#          "Pl": row['Pl'], "Ll": row['Ll'],
#          "Ar": row['Ar'], "Pr": row['Pr'],
#          "Lr": row['Lr']
#         },
#         row['diagnosis']
#     )

In [6]:
# Plots for single chest locations
# for _, row in df3.iterrows():
#     for c_loc in ['Tc', 'Pl', 'Pr', 'Ll', 'Lr', 'Al', 'Ar']:
#         if row[c_loc]:
#             plotSoundWaveform(
#                 row['patient_no'],
#                 {
#                     c_loc: row[c_loc]
#                 },
#                 row['diagnosis'],
#                 c_loc
#             )

In [7]:
# Pneumonia patient and healthy patient samples
# for _, pp in df3.loc[df3["patient_no"].isin([135,159])].iterrows():
#     plotSoundWaveform(
#         pp['patient_no'],
#         {
#          "Tc": pp['Tc'], "Al": pp['Al'],
#          "Pl": pp['Pl'], "Ll": pp['Ll'],
#          "Ar": pp['Ar'], "Pr": pp['Pr'],
#          "Lr": pp['Lr']
#         },
#         pp['diagnosis']
#     )

In [8]:
# Plots for single chest locations
# for _, pp in df3.loc[df3["patient_no"].isin([135,159])].iterrows():
#     plotSoundWaveform(
#         pp['patient_no'],
#         {
#             "Ar": pp['Ar']
#         },
#         pp['diagnosis']
#     )

In [9]:
df3.loc[df3.diagnosis == "Healthy", 'diagnosis'] = 0
df3.loc[df3.diagnosis == "Pneumonia", 'diagnosis'] = 1
# df3.loc[df3.age, 'age'] *= 12
# df3['age'] = df3.apply(lambda row: row.age * 12, axis = 1)
# df3["age"] = pdf.to_numeric(df3["age"])

# Age groups
df3.loc[df3.age <= 5, 'age'] = 1
df3.loc[(df3.age > 5) & (df3.age < 65), 'age'] = 2
df3.loc[df3.age >= 65, 'age'] = 3

df3

# df3.loc[df3.age >= 18] = 'G3'
# df3.loc[(df3['age'] > 5) & (df3['age'] < 18)] = 'G2'

# df3

Unnamed: 0,patient_no,diagnosis,age,sex,adult_bmi,child_weight,child_height,Tc,Al,Pl,Ll,Ar,Pr,Lr
0,102,0,1.0,F,,9.8,73.0,,,,,102_1b1_Ar_sc_Meditron.wav,,
1,121,0,2.0,F,,65.0,170.0,121_1p1_Tc_sc_Meditron.wav,,,,,,
2,122,1,3.0,M,33.0,,,122_2b1_Tc_mc_LittC2SE.wav,122_2b1_Al_mc_LittC2SE.wav,,,122_2b2_Ar_mc_LittC2SE.wav,,
3,123,0,1.0,M,,25.0,125.0,,123_1b1_Al_sc_Meditron.wav,,,,,
4,125,0,2.0,M,,62.0,170.0,125_1b1_Tc_sc_Meditron.wav,,,,,,
5,126,0,1.0,F,,10.18,80.0,,126_1b1_Al_sc_Meditron.wav,,,,,
6,127,0,1.0,M,,12.6,98.0,,,,,127_1b1_Ar_sc_Meditron.wav,,
7,135,1,3.0,M,21.0,,,135_2b3_Tc_mc_LittC2SE.wav,135_2b1_Al_mc_LittC2SE.wav,135_2b2_Pl_mc_LittC2SE.wav,,135_2b3_Ar_mc_LittC2SE.wav,135_2b3_Pr_mc_LittC2SE.wav,
8,136,0,1.0,M,,16.2,110.0,,,,,136_1b1_Ar_sc_Meditron.wav,,
9,140,1,3.0,F,23.0,,,140_2b3_Tc_mc_LittC2SE.wav,,,140_2b2_Ll_mc_LittC2SE.wav,,,


In [10]:
# Example sound with a wheeze

In [11]:
# Example sound with a crackle

In [12]:
# Example sound with both wheeze and crackle