# Cough detection
## Authors
- [quirinoc](https://github.com/quirinoc)
- Cabello
- Karol
- Guillermo
- Daniela

## Based on 
### [Music Genre Classification using Hidden Markov Models - Salih Boutadghart](https://blog.goodaudience.com/music-genre-classification-using-hidden-markov-models-4a7f14eb0fd4)

In [None]:
! pip install -r requirements.txt --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os.path as path

from python_speech_features import mfcc, logfbank
from scipy.io import wavfile
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix
from random import randint

import itertools
import os
import glob

In [None]:
# Load one example cough recording and derive two feature matrices from it;
# the following cell prints the shape of each.
sampling_freq, audio = wavfile.read("data/cough/15435__acclivity__goatsinthedust.wav")
# MFCC features of the clip, using the library's default parameters.
mfcc_features = mfcc(audio, sampling_freq)
# Log filter-bank features of the same clip, also with default parameters.
filterbank_features = logfbank(audio, sampling_freq)

In [None]:
# Report the shape of both feature matrices: axis 0 is printed as the number
# of windows, axis 1 as the length of each feature vector.
print ('\nMFCC:\nNumber of windows =', mfcc_features.shape[0])
print ('Length of each feature =', mfcc_features.shape[1])
print ('\nFilter bank:\nNumber of windows =', filterbank_features.shape[0])
print ('Length of each feature =', filterbank_features.shape[1])

In [None]:
from random import randint

In [None]:
import glob
import os.path as path
from random import randint
# Preview: draw the MFCC matrix (first 300 frames) of one randomly chosen
# recording from each category.
categories = ['cough', 'no_cough']
# NOTE(review): plt.matshow opens its own figure by default, so this figure
# probably ends up holding only empty axes - confirm and drop if unintended.
figure = plt.figure(figsize=(20,3))
for category in categories:
    example_data_path = path.join('data', category)
    file_paths = glob.glob(path.join(example_data_path, '*.wav'))
    if not file_paths:
        # randint(0, -1) raises ValueError on an empty range, so a folder
        # without recordings is reported and skipped instead of crashing.
        print(f'No .wav files found in {example_data_path}, skipping')
        continue
    sampling_freq, audio = wavfile.read(file_paths[randint(0, len(file_paths) - 1)])
    # nfft=1024 is passed explicitly here (the other cells use the default).
    mfcc_features = mfcc(audio, sampling_freq, nfft=1024)
    plt.yscale('linear')
    # Transpose so that time runs along the x axis; keep at most 300 frames.
    plt.matshow((mfcc_features.T)[:,:300])
    plt.text(150, -10, category, horizontalalignment='center', fontsize=20)

plt.yscale('linear')
plt.show()

In [None]:
class HMMTrainer(object):
    """Small wrapper that owns a single hmmlearn model plus fit/score helpers.

    Only ``model_name='GaussianHMM'`` is supported; any other value makes the
    constructor raise ``TypeError``.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        """Store the hyper-parameters and build the underlying model.

        Args:
            model_name: Kind of model to build; must be ``'GaussianHMM'``.
            n_components: Forwarded to ``hmm.GaussianHMM`` as ``n_components``.
            cov_type: Forwarded to ``hmm.GaussianHMM`` as ``covariance_type``.
            n_iter: Forwarded to ``hmm.GaussianHMM`` as ``n_iter``.

        Raises:
            TypeError: If ``model_name`` is not ``'GaussianHMM'``.
        """
        self.model_name, self.n_components = model_name, n_components
        self.cov_type, self.n_iter = cov_type, n_iter
        # Every result of ``fit`` is collected here by ``train``.
        self.models = []

        if self.model_name != 'GaussianHMM':
            raise TypeError('Invalid model type')

        self.model = hmm.GaussianHMM(
            n_components=self.n_components,
            covariance_type=self.cov_type,
            n_iter=self.n_iter,
        )

    def train(self, X):
        """Fit the wrapped model on ``X`` and record the result in ``self.models``."""
        # Silences NumPy floating-point warnings; the setting is not restored
        # afterwards, so it stays in effect after this call returns.
        np.seterr(all='ignore')
        fitted = self.model.fit(X)
        self.models.append(fitted)

    def get_score(self, input_data):
        """Return whatever the wrapped model's ``score`` gives for ``input_data``."""
        return self.model.score(input_data)

In [None]:
# Load every recording of each category and stack the MFCC frames of all its
# files into one matrix per category (rows = frames), stored in ``mfcc_data``.
input_folder = 'data'
categories = ['cough','no_cough']

# Filled by the training cell: category name -> trained HMMTrainer.
hmm_models = {}

# category name -> stacked MFCC frames of all readable files in that folder.
mfcc_data = {}

# Parse the input directory
for category in categories:
    # Get the name of the subfolder
    subfolder = os.path.join(input_folder, category)
    if not os.path.isdir(subfolder):
        # Report the missing folder but still load the remaining categories.
        print(f'Invalid dir: {subfolder} does not exists!')
        continue

    # Collect per-file feature matrices and concatenate once at the end;
    # growing an array with np.append inside the loop copies it every time.
    feature_chunks = []
    # Iterate through all audio files of the category (no file is held out).
    for filepath in glob.glob(os.path.join(subfolder, '*.wav')):
        try:
            sampling_freq, audio = wavfile.read(filepath)
        except Exception as err:
            # Best effort: skip unreadable files, but say so, and do not
            # swallow KeyboardInterrupt/SystemExit like a bare except would.
            print(f'Skipping unreadable file {filepath}: {err}')
            continue
        # Extract MFCC features
        mfcc_features = mfcc(audio, sampling_freq)
        feature_chunks.append(mfcc_features)

    # Keep the empty-array result when no file could be read.
    if feature_chunks:
        X = np.concatenate(feature_chunks, axis=0)
    else:
        X = np.array([])

    # Add data
    mfcc_data[category] = X

In [None]:
# Build and fit one HMMTrainer per category, using only the first 100 rows of
# that category's stacked MFCC matrix.
for category in ['cough', 'no_cough']:
    hmm_trainer = hmm_models[category] = HMMTrainer(n_components=10)
    hmm_trainer.train(mfcc_data[category][:100, :])

In [None]:
def wav_to_mfcc(path):
    """Read the WAV file at ``path`` and return its MFCC feature matrix.

    Args:
        path: Location of the WAV file to read.

    Returns:
        The array produced by ``mfcc(audio, sampling_freq)`` for that file.
    """
    # Read from the argument, not from the notebook-global ``filepath`` that
    # the data-loading loop leaves behind.
    sampling_freq, audio = wavfile.read(path)
    return mfcc(audio, sampling_freq)

In [None]:
def test_category(models, data, label):
    """Classify ``data`` once and report whether the prediction equals ``label``.

    Args:
        models: Mapping handed to ``get_pred`` unchanged.
        data: Feature matrix handed to ``get_pred`` unchanged.
        label: Expected category for ``data``.

    Returns:
        A one-element list holding the result of ``prediction == label``.
    """
    predicted = get_pred(models, data)
    return [predicted == label]

In [None]:
def get_pred(models, data):
    """Return the key of the model that scores ``data`` highest.

    Each model's ``get_score(data)`` is printed next to its key, and the
    winning key is printed before it is returned. Ties keep the earlier key.
    ``None`` is returned when ``models`` is empty or no score exceeds
    negative infinity.
    """
    best_category, best_score = None, float('-inf')
    for name, model in models.items():
        current = model.get_score(data)
        print(current, name)
        # Only a strictly greater score replaces the current best.
        if not current > best_score:
            continue
        best_score, best_category = current, name
    print(best_category)
    return best_category

## Testing

In [None]:
# Classify the first 100 rows of each category's MFCC matrix as one sequence
# and record whether the predicted category matches the expected one.
# NOTE(review): these are the same first 100 rows the models were fitted on in
# the training cell, so this is not a held-out evaluation.
cough_prediction = test_category(hmm_models, mfcc_data['cough'][:100,:], 'cough')
no_cough_prediction = test_category(hmm_models, mfcc_data['no_cough'][:100,:], 'no_cough')

In [None]:
# Fraction of correct predictions per category.
# NOTE(review): test_category returns a single-element list, so each value
# printed here is either 0.0 or 1.0.
print('Accuracy for cough', sum(cough_prediction) / len(cough_prediction))
print('Accuracy for no_cough', sum(no_cough_prediction) / len(no_cough_prediction))

In [None]:
# Classify the recordings of every category file by file, collecting the true
# label in ``real_labels`` and the predicted label in ``pred_labels`` so that
# the two lists stay aligned index by index.
input_folder = 'data/'
real_labels = []
categories = ['cough','no_cough']
pred_labels = []
for category in categories:
    # Use the loop variable; the name ``dirname`` is not defined in this notebook.
    subfolder = os.path.join(input_folder, category)
    if not os.path.isdir(subfolder):
        continue
    # The folder name is the ground-truth label.
    label_real = category

    wav_files = [x for x in os.listdir(subfolder) if x.endswith('.wav')]
    # NOTE(review): [:-1] drops the last listed file; os.listdir order is
    # unspecified, so which file is dropped is arbitrary - confirm intent.
    for filename in wav_files[:-1]:
        filepath = os.path.join(subfolder, filename)
        try:
            sampling_freq, audio = wavfile.read(filepath)
        except Exception as err:
            # Skip unreadable files without recording a label for them.
            print(f'Skipping unreadable file {filepath}: {err}')
            continue
        mfcc_features = mfcc(audio, sampling_freq)

        # Pick the category whose model gives the highest score.
        max_score = float('-inf')
        output_label = None
        for label, hmm_model in hmm_models.items():
            score = hmm_model.get_score(mfcc_features)
            if score > max_score:
                max_score = score
                output_label = label

        # Append both labels together, once per successfully scored file.
        real_labels.append(label_real)
        pred_labels.append(output_label)