# Cough detection
## Authors
- [quirinoc](https://github.com/quirinoc)
- Cabello
- Karol
- Guillermo
- Daniela

## Based on 
### [Music Genre Classification using Hidden Markov Models - Salih Boutadghart](https://blog.goodaudience.com/music-genre-classification-using-hidden-markov-models-4a7f14eb0fd4)

## Prepare modules

In [None]:
! pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path as path

from python_speech_features import mfcc, logfbank
from scipy.io import wavfile
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from random import randint, shuffle

import itertools
import os
import glob

## EDA

In [None]:
# Read one example cough recording and derive two feature matrices from it:
# MFCCs and log filter-bank features, both with the library's default settings.
example_wav = "data/cough/15435__acclivity__goatsinthedust.wav"
sampling_freq, audio = wavfile.read(example_wav)
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

In [None]:
# Report the dimensions of both feature matrices: axis 0 counts the analysis
# windows, axis 1 is the number of features computed for each window.
mfcc_shape = mfcc_features.shape
print('\nMFCC:\nNumber of windows =', mfcc_shape[0])
print('Length of each feature =', mfcc_shape[1])

filterbank_shape = filterbank_features.shape
print('\nFilter bank:\nNumber of windows =', filterbank_shape[0])
print('Length of each feature =', filterbank_shape[1])

In [None]:
from random import randint

In [None]:
import glob
import os.path as path
from random import randint
categories = ['cough', 'no_cough']

# One stacked panel per category. plt.matshow() opens a brand-new figure on
# every call, which would leave this figure empty, so draw on explicit Axes.
figure, axes = plt.subplots(len(categories), 1,
                            figsize=(20, 3 * len(categories)),
                            squeeze=False)
for ax, category in zip(axes[:, 0], categories):
    example_data_path = path.join('data', category)
    file_paths = glob.glob(path.join(example_data_path, '*.wav'))
    if not file_paths:
        # randint(0, -1) would raise on an empty folder; report it and move on.
        print(f'No .wav files found in {example_data_path}')
        ax.set_axis_off()
        continue

    # Pick one random recording of this category.
    sampling_freq, audio = wavfile.read(file_paths[randint(0, len(file_paths) - 1)])
    mfcc_features = mfcc(audio, sampling_freq, nfft=1024)

    # Transpose so windows run along the x axis; show at most the first 300.
    ax.matshow(mfcc_features.T[:, :300], aspect='auto')
    ax.set_title(category, fontsize=20)

figure.tight_layout()
plt.show()

In [None]:
input_folder = 'data'
categories = ['cough','no_cough']

# category -> trainer object; populated later by the model-creation cell.
hmm_models = {}

# category -> list of MFCC matrices, one matrix per readable .wav file.
mfcc_data_split = {category : [] for category in categories}
mfcc_data = {}

# Parse the input directory
for category in categories:
    # Each category lives in its own subfolder of the input folder.
    subfolder = os.path.join(input_folder, category)
    if not os.path.isdir(subfolder):
        print(f'Invalid dir: {subfolder} does not exists!')
        break

    # Extract features from every .wav file of this category; the split into
    # training and testing sets happens in a later cell.
    for filepath in glob.glob(path.join(subfolder, '*.wav')):
        try:
            sampling_freq, audio = wavfile.read(filepath)
        except Exception as err:
            # Best effort: skip files that cannot be decoded, but say so
            # instead of hiding the problem (and let Ctrl-C through).
            print(f'Skipping unreadable file {filepath}: {err}')
            continue

        # Extract MFCC features.
        # NOTE(review): the EDA plotting cell passes nfft=1024 while this call
        # uses the library default - confirm which setting is intended.
        mfcc_features = mfcc(audio, sampling_freq)

        mfcc_data_split[category].append(mfcc_features)

In [None]:
# Split into Training / Testing
# Fraction of the files of each category that goes into the training set.
split_per = 0.7
# category -> all frames stacked into one 2-D array (rows are windows).
train_data = {}
test_data = {}
# category -> list of per-file MFCC matrices.
train_data_split = {}
test_data_split = {}
for category in mfcc_data_split:
    data = mfcc_data_split[category]
    # Shuffle whole files (in place) so the split is random at file level.
    shuffle(data)
    n = int(len(data) * split_per)
    train, test = data[:n], data[n:]
    if not train or not test:
        # np.concatenate would fail on an empty list with a cryptic message.
        raise ValueError(
            f'Not enough files in category {category!r} to split '
            f'({len(data)} found, {len(train)} train / {len(test)} test)'
        )
    train_data_split[category] = train
    test_data_split[category] = test
    train_data[category] = np.concatenate(train, axis=0)
    test_data[category] = np.concatenate(test, axis=0)

In [None]:
class HMMTrainer(object):
    """Small wrapper that builds, fits and scores an hmmlearn Gaussian HMM.

    Args:
        model_name: Kind of model to build; only 'GaussianHMM' is supported.
        n_components: Forwarded to ``hmm.GaussianHMM(n_components=...)``.
        cov_type: Forwarded as ``covariance_type``.
        n_iter: Forwarded as ``n_iter``.

    Raises:
        TypeError: If ``model_name`` is not 'GaussianHMM'.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        if self.model_name == 'GaussianHMM':
            self.model = hmm.GaussianHMM(
                n_components=self.n_components,
                covariance_type=self.cov_type,
                n_iter=self.n_iter,
            )
        else:
            raise TypeError('Invalid model type')

    def train(self, X):
        """Fit the wrapped model on the feature matrix ``X``."""
        # Silence NumPy floating-point warnings only while fitting, instead of
        # changing the process-wide error settings for good.
        with np.errstate(all='ignore'):
            self.model.fit(X)

    def get_score(self, input_data):
        """Run the model on ``input_data`` and return ``self.model.score(...)``."""
        with np.errstate(all='ignore'):
            return self.model.score(input_data)

In [None]:
# Create models: one untrained trainer (n_components=10) per class, stored in
# the shared hmm_models dictionary under the class name.
for category in ('cough', 'no_cough'):
    hmm_models[category] = hmm_trainer = HMMTrainer(n_components=10)
    

In [None]:
# Fit each class model on (at most) 10000 randomly chosen frames of its class.
for category in ['cough', 'no_cough']:
    # train_data[category] is a 2-D ndarray. random.shuffle() swaps rows via
    # views, which duplicates some rows and loses others; NumPy's shuffle
    # permutes the rows along axis 0 correctly and in place.
    # NOTE(review): shuffling individual frames discards their temporal order
    # before fitting a sequence model - confirm this is intended.
    np.random.shuffle(train_data[category])
    hmm_models[category].train(train_data[category][:10000])

In [None]:
def get_pred(models, data):
    """Return the key of the model that scores ``data`` highest.

    Args:
        models: Mapping of category name to an object exposing
            ``get_score(data)``.
        data: Input forwarded unchanged to every model's ``get_score``.

    Returns:
        The category whose score is strictly greater than all scores seen
        before it (so the first one wins a tie), or ``None`` when no score
        exceeds negative infinity (e.g. ``models`` is empty).
    """
    best_category, best_score = None, float('-inf')
    for name, model in models.items():
        candidate = model.get_score(data)
        if candidate > best_score:
            best_category, best_score = name, candidate
    return best_category

In [None]:
def test_category(models, data, label):
    """Check, item by item, whether the predicted category equals ``label``.

    Args:
        models: Mapping passed through to ``get_pred``.
        data: Iterable of inputs; each one is classified separately.
        label: Expected category for every item in ``data``.

    Returns:
        A list of booleans, one per item, ``True`` where the prediction
        matched ``label``.
    """
    return [get_pred(models, sample) == label for sample in data]

## Testing

In [None]:
input_folder = 'data/'
categories = ['cough','no_cough']
real_labels = []
pred_labels = []
# Classify every held-out file and record the true and the predicted label.
for category in categories:
    for mfcc_features in test_data_split[category]:
        pred = get_pred(hmm_models, mfcc_features)
        real_labels.append(category)
        pred_labels.append(pred)

# Count the correct predictions per class and the number of evaluated files.
cough_c = 0
no_cough_c = 0
total = 0
for pred, real in zip(pred_labels, real_labels):
    if pred == real:
        if real == 'cough':
            cough_c += 1
        elif real == 'no_cough':
            no_cough_c += 1
    total += 1

# Fraction of correctly classified files: cough, no_cough, then overall.
print(cough_c / len(test_data_split['cough']))
print(no_cough_c / len(test_data_split['no_cough']))
# The original used an undefined name `c` and the length of a label string.
print((cough_c + no_cough_c) / total)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        classes: Tick labels used for both axes.
        normalize: When true, divide each row by its sum before plotting and
            print values with two decimals; otherwise print them as integers.
        title: Title of the plot.
        cmap: Colormap passed to ``plt.imshow``.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        as_float = cm.astype('float')
        # Keep the row sums as a column so each row is divided by its own sum.
        cm = as_float / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black.
    half_max = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[row, col]
        label = format(value, cell_format)
        text_color = "white" if value > half_max else "black"
        plt.text(col, row, label,
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
classes = ["cough","no_cough"]
# Pass the labels explicitly so the rows/columns of the matrix are guaranteed
# to be in the same order as the tick labels drawn below, and the matrix stays
# 2x2 even if one class never occurs in the evaluated labels.
cm = confusion_matrix(real_labels, pred_labels, labels=classes)
print(cm)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cm, classes=classes, normalize=True,
                      title='Normalized confusion matrix')

plt.show()