In [None]:
# Import & Install Dependencies
# !pip install tensorflow tensorflow-gpu tensorflow-io matplotlib

In [None]:
# Load dependencies!
import os

import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio

<div class="alert alert-block alert-info">
Data Loading Function!
</div>

In [None]:
# One hand-picked example recording per diagnosis class, all stored under Data/Archive_2.
_ARCHIVE_DIR = os.path.join('Data', 'Archive_2')

Asthma_File = os.path.join(_ARCHIVE_DIR, 'Asthma', '103_2b2_Ar_mc_LittC2SE.wav')
Bronchiectasis_File = os.path.join(_ARCHIVE_DIR, 'Bronchiectasis', '111_1b2_Tc_sc_Meditron.wav')
Bronchiolitis_File = os.path.join(_ARCHIVE_DIR, 'Bronchiolitis', '149_1b1_Al_sc_Meditron.wav')
COPD_File = os.path.join(_ARCHIVE_DIR, 'COPD_Amended', '104_1b1_Ar_sc_Litt3200.wav')
Healthy_File = os.path.join(_ARCHIVE_DIR, 'Healthy', '102_1b1_Ar_sc_Meditron.wav')
LRTI_File = os.path.join(_ARCHIVE_DIR, 'LRTI', '108_1b1_Al_sc_Meditron.wav')
Pneumonia_File = os.path.join(_ARCHIVE_DIR, 'Pneumonia', '122_2b1_Al_mc_LittC2SE.wav')
URTI_File = os.path.join(_ARCHIVE_DIR, 'URTI', '101_1b1_Al_sc_Meditron.wav')

In [None]:
def load_wav(filename):
    """Read a WAV file and return it as a single-channel waveform at 16 kHz.

    Args:
        filename: Path of the .wav file to read.

    Returns:
        A 1-D tensor containing the waveform resampled to 16000 Hz.
    """
    target_rate = 16000
    raw_bytes = tf.io.read_file(filename)
    # desired_channels=1 asks the decoder for one channel; the result still
    # carries a trailing channel axis, which is squeezed away below.
    audio, source_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(audio, axis=-1)
    # Resample from whatever rate the file was recorded at (read from the file header).
    source_rate = tf.cast(source_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=source_rate, rate_out=target_rate)

In [None]:
# Decode the example recording of every class, in the same order as the path definitions.
(
    asth_wave,
    bronchsis_wave,
    bronchtis_wave,
    COPD_wave,
    health_wave,
    LRTI_wave,
    Pneu_wave,
    URTI_wave,
) = (
    load_wav(example_path)
    for example_path in (
        Asthma_File,
        Bronchiectasis_File,
        Bronchiolitis_File,
        COPD_File,
        Healthy_File,
        LRTI_File,
        Pneumonia_File,
        URTI_File,
    )
)

In [None]:
# Overlay a few example waveforms. Each line is labelled so the legend tells
# the classes apart; uncomment a line to add that class to the comparison.
plt.plot(Pneu_wave, label='Pneumonia')
plt.plot(asth_wave, label='Asthma')
# plt.plot(bronchsis_wave, label='Bronchiectasis')
# plt.plot(bronchtis_wave, label='Bronchiolitis')
plt.plot(COPD_wave, label='COPD')
# plt.plot(health_wave, label='Healthy')
# plt.plot(LRTI_wave, label='LRTI')
# plt.plot(URTI_wave, label='URTI')
plt.title('Example waveforms (resampled to 16 kHz)')
plt.xlabel('Sample index')
plt.ylabel('Amplitude')
plt.legend()
plt.show()

Wave to Spectrogram!

In [None]:
# Directory holding the recordings of each diagnosis class.
_ARCHIVE_DIR = os.path.join('Data', 'Archive_2')

Asthma = os.path.join(_ARCHIVE_DIR, 'Asthma')
Bronchiectasis = os.path.join(_ARCHIVE_DIR, 'Bronchiectasis')
Bronchiolitis = os.path.join(_ARCHIVE_DIR, 'Bronchiolitis')
COPD = os.path.join(_ARCHIVE_DIR, 'COPD_Amended')
Healthy = os.path.join(_ARCHIVE_DIR, 'Healthy')
LRTI = os.path.join(_ARCHIVE_DIR, 'LRTI')
Pneumonia = os.path.join(_ARCHIVE_DIR, 'Pneumonia')
URTI = os.path.join(_ARCHIVE_DIR, 'URTI')

In [None]:
# TensorFlow datasets of file paths: one per class, matching every .wav file
# in that class directory.
# The glob is built with os.path.join rather than a literal '\*.wav': the
# backslash form only works on Windows and '\*' is an invalid escape sequence.
AsthmaTFDS = tf.data.Dataset.list_files(os.path.join(Asthma, '*.wav'))
BronchiectasisTFDS = tf.data.Dataset.list_files(os.path.join(Bronchiectasis, '*.wav'))
BronchiolitisTFDS = tf.data.Dataset.list_files(os.path.join(Bronchiolitis, '*.wav'))
COPDTFDS = tf.data.Dataset.list_files(os.path.join(COPD, '*.wav'))
HealthyTFDS = tf.data.Dataset.list_files(os.path.join(Healthy, '*.wav'))
LRTITFDS = tf.data.Dataset.list_files(os.path.join(LRTI, '*.wav'))
PneumoniaTFDS = tf.data.Dataset.list_files(os.path.join(Pneumonia, '*.wav'))
URTITFDS = tf.data.Dataset.list_files(os.path.join(URTI, '*.wav'))

In [None]:
# Labels are one-hot encoded so the model can tell the classes apart when
# classifying recordings.
num_classes = 8


def one_hot_label_maker(audio, label):
    """Turn an integer class label into a one-hot vector of length num_classes.

    Args:
        audio: First element of the dataset pair; returned unchanged.
        label: Integer class index.

    Returns:
        Tuple (audio, one_hot_label).
    """
    return audio, tf.one_hot(label, depth=num_classes)

In [None]:
# Was a debug piece of code to print outputs so it was known what was going in
# def debug(a, b):
    # print(a)
    # print(b)

In [None]:
def _label_files(file_ds, class_id):
    """Pair every file path in file_ds with the integer label class_id."""
    # tf.fill is documented to take a 1-D shape, hence the one-element list.
    labels = tf.data.Dataset.from_tensor_slices(tf.fill([len(file_ds)], class_id))
    return tf.data.Dataset.zip((file_ds, labels))


# Class ids 0-7; these must stay in sync with the names returned by get_disease().
Asthma_Samples = _label_files(AsthmaTFDS, 0)
Bronchiectasis_Samples = _label_files(BronchiectasisTFDS, 1)
Bronchiolitis_Samples = _label_files(BronchiolitisTFDS, 2)
COPD_Samples = _label_files(COPDTFDS, 3)
Healthy_Samples = _label_files(HealthyTFDS, 4)
LRTI_Samples = _label_files(LRTITFDS, 5)
Pneumonia_Samples = _label_files(PneumoniaTFDS, 6)
URTI_Samples = _label_files(URTITFDS, 7)

# Chain the per-class datasets into one, printing the running total after each
# class is appended so the per-class contribution is visible.
all_data = Asthma_Samples
for class_samples in (
    Bronchiectasis_Samples,
    Bronchiolitis_Samples,
    COPD_Samples,
    Healthy_Samples,
    LRTI_Samples,
    Pneumonia_Samples,
    URTI_Samples,
):
    all_data = all_data.concatenate(class_samples)
    print(len(all_data))

In [None]:
# Replace each integer class label with its one-hot vector (see one_hot_label_maker).
# NOTE: this rebinds all_data, so re-running the cell applies the mapping a second time.
all_data = all_data.map(one_hot_label_maker)

In [None]:
# Was debugging to know the format of what was being passed, so the one_hot_label_maker function could be created correctly.
# all_data = all_data.map(debug)

In [None]:
# Print every (file path, one-hot label) pair once, in shuffled order.
# Iterating the shuffled iterator directly avoids walking all_data a second
# time just to count elements, and avoids the non-standard .next() call.
data_iterator = all_data.shuffle(500).as_numpy_iterator()
for item in data_iterator:
    print(item)

Preprocessing and Function Building

In [None]:
def preprocess(file_path, label):
    """Load one recording and convert it into a fixed-size magnitude spectrogram.

    The waveform is cut to at most 320000 samples (20 s at the 16 kHz rate
    produced by load_wav); shorter clips are zero-padded at the front so
    every example has the same length.

    Args:
        file_path: Path of the .wav file.
        label: Label for the recording; returned unchanged.

    Returns:
        Tuple (spectrogram, label), where spectrogram has axes
        (frames, frequency bins, 1).
    """
    clip_len = 320000
    samples = load_wav(file_path)[:clip_len]
    # Zeros go in front of the audio, so short clips end up right-aligned.
    front_pad = tf.zeros([clip_len] - tf.shape(samples), dtype=tf.float32)
    fixed_len = tf.concat([front_pad, samples], 0)
    # Keep only the magnitude of the complex STFT output.
    magnitude = tf.abs(tf.signal.stft(fixed_len, frame_length=256, frame_step=128))
    # Trailing axis of size 1 acts as the channel axis for the Conv2D layers.
    return tf.expand_dims(magnitude, axis=2), label

In [None]:
# Scratch cell: uncomment exactly one line to load that example recording and display it.
# The trailing numbers are presumably the sample counts observed for each file — TODO confirm.
wav = None
# wav = load_wav(Asthma_File) # 320000
# wav = load_wav(Bronchiectasis_File) # 320000
# wav = load_wav(Bronchiolitis_File) # 320000
# wav = load_wav(COPD_File) # 253696
# wav = load_wav(Healthy_File) # 320000
# wav = load_wav(LRTI_File) # 320000
# wav = load_wav(Pneumonia_File) # 320000
# wav = load_wav(URTI_File) # 320000
wav

In [None]:
# Draw one random (file path, integer label) pair from the Healthy class for a manual check.
filepath, label = next(Healthy_Samples.shuffle(buffer_size=10000).as_numpy_iterator())

In [None]:
# type(label)

In [None]:
# Run the single (filepath, label) example through preprocess to get its spectrogram.
spectrogram, label = preprocess(filepath, label)

In [None]:
# Display the spectrogram tensor returned by preprocess.
spectrogram

In [None]:
# Show the spectrogram as an image. preprocess returns (frames, bins, 1);
# transposing reverses the axes and [0] drops the leading size-1 axis,
# leaving frequency bins down the rows and frames across the columns.
plt.figure(figsize=(20, 30))
spectrogram_image = tf.transpose(spectrogram)[0]
plt.imshow(spectrogram_image)
plt.show()

In [None]:
# Inspect the Python type of the label that came back from preprocess.
type(label)

Training and Testing Partitions!

In [None]:
# TensorFlow data pipeline: spectrograms -> cache -> shuffle -> batches of 16.
# reshuffle_each_iteration=False is essential: the train/test split below is
# made with take()/skip() on this dataset, and with the default (reshuffle on
# every iteration) each epoch would draw a different split, leaking test
# samples into training.
all_data = (
    all_data
    .map(preprocess)
    .cache()
    .shuffle(buffer_size=1000, reshuffle_each_iteration=False)
    .batch(16)
    .prefetch(8)
)

In [None]:
# Size of the pipeline; all_data has been batched by this point, so this counts batches.
len(all_data)

In [None]:
# Testing and Training Partitions (80% / 20%, split by batch because all_data
# is already batched). The sizes are derived from the dataset length instead
# of being hard-coded; for a 16-batch dataset this reproduces the previous
# 13 / 3 split.
total_batches = len(all_data)
train_batches = round(0.8 * total_batches)
train = all_data.take(train_batches)
test = all_data.skip(train_batches)

In [None]:
# Pull the first training batch as NumPy arrays to sanity-check shapes and labels.
samples, labels = next(train.as_numpy_iterator())

In [None]:
# Shape of one batch of spectrograms drawn from the training split.
samples.shape

In [None]:
# Labels belonging to the same training batch.
labels

Building the model!!

In [None]:
# Import dependencies
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten

In [None]:
# CNN classifier: two 3x3 convolutions over the (2499, 129, 1) spectrogram,
# flattened into a dense layer, ending in an 8-way softmax (one unit per class).
model = Sequential([
    Conv2D(16, (3, 3), activation='relu', input_shape=(2499, 129, 1)),
    Conv2D(16, (3, 3), activation='relu'),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(8, activation='softmax'),
])

In [None]:
# Labels are one-hot and the output is a softmax, so accuracy must be
# CategoricalAccuracy (argmax comparison). The plain Accuracy metric checks
# element-wise equality between probabilities and labels and is therefore
# meaningless here.
# Explicit names keep the hist.history keys stable ('recall', 'precision',
# 'accuracy') even if this cell is re-run in the same kernel.
model.compile(
    'Adam',
    loss='categorical_crossentropy',
    metrics=[
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
    ],
)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

Training Time!!!

In [None]:
# Train for 16 epochs on the training split, using the test split as validation data.
hist = model.fit(train, epochs=16, validation_data=test)
# Goal: loss should decrease while recall and precision increase.
# TODO: add G-mean and F1 metrics.

In [None]:
# Metric values recorded by model.fit, keyed by metric name.
hist.history

In [None]:
# Training vs. validation loss per epoch; labelled so the curves can be told apart.
plt.title('Loss')
plt.plot(hist.history['loss'], 'tab:purple', label='train')
plt.plot(hist.history['val_loss'], 'deeppink', label='validation')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation precision per epoch; labelled so the curves can be told apart.
plt.title('Precision')
plt.plot(hist.history['precision'], 'tab:purple', label='train')
plt.plot(hist.history['val_precision'], 'deeppink', label='validation')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Training vs. validation recall per epoch; labelled so the curves can be told apart.
plt.title('Recall')
plt.plot(hist.history['recall'], 'tab:purple', label='train')
plt.plot(hist.history['val_recall'], 'deeppink', label='validation')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Take the first batch of the test split as NumPy arrays for a manual prediction check.
x_test, y_test = next(test.as_numpy_iterator())

In [None]:
# Shape of the label array for the test batch.
y_test.shape

In [None]:
# Model output for the test batch; the final layer is a softmax over the 8 classes.
yhat = model.predict(x_test)

In [None]:
# Display the raw prediction array.
yhat

In [None]:
def get_disease(label_classification):
    """Translate a numeric class index into its diagnosis name.

    Args:
        label_classification: Class index, compared with == against 0..7.

    Returns:
        The diagnosis name for indices 0-7, otherwise "Unknown".
    """
    disease_names = (
        "Asthma",
        "Bronchiectasis",
        "Bronchiolitis",
        "COPD",
        "Healthy",
        "LRTI",
        "Pneumonia",
        "URTI",
    )
    # Scan with == rather than indexing so negative or non-integer inputs
    # fall through to "Unknown" instead of wrapping around or raising.
    for class_index, disease in enumerate(disease_names):
        if label_classification == class_index:
            return disease
    return "Unknown"

In [None]:
# Predicted class = index of the largest softmax probability across ALL eight
# columns. Slicing off column 0 and adding 1 afterwards (as before) meant
# class 0 (Asthma) could never be reported.
max_indices = np.argmax(yhat, axis=1)

group_label = max_indices

for i, label in enumerate(group_label):
    print(f"Clip {i} is predicted as being {get_disease(label)}")

In [None]:
# True class = position of the 1 in each one-hot label row, taken across ALL
# eight columns. Slicing off column 0 and adding 1 afterwards (as before)
# reported Asthma clips (class 0) as Bronchiectasis.
max_indices_control = np.argmax(y_test, axis=1)

group_label_control = max_indices_control

for i, label in enumerate(group_label_control):
    print(f"Clip {i} is {get_disease(label)}")

In [None]:
# Saving the model
# Saved relative to the notebook rather than to an absolute, user-specific
# path, so the cell works on any machine. The explicit .h5 extension selects
# the HDF5 format; newer Keras releases reject a path with no extension.
# NOTE: saving previously failed because of library-version problems.
model_path = os.path.join('Models', 'respiratory_sound_cnn.h5')
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [None]:
# cross validation - statistical significance
# Ensure erroneous samples (or small groups) are represented

In [None]:
# Save machine learning file
# Read me file
# Images - appendix
# Screen capture - Talk through code

In [None]:
# Randomly select a subset of the COPD audio and use that instead of the full (much larger) COPD set