In [1]:
import os
import numpy as np
import librosa

from collections import namedtuple
from sklearn.mixture import GaussianMixture

# 0. Very brief introduction

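This notebook presents a classic GMM-based voice recognition (speaker identification) method: one Gaussian mixture model is trained per speaker on MFCC features, and test audio is assigned to the speaker whose model scores it highest.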

# 1. Definitions

## 1.1 Convenient datatypes

We use the three named tuples defined below, because every object (a model, a list of paths, a score) has to carry its label during training and testing.
All three could essentially be replaced by a single named tuple such as

```python
LabeledObject = namedtuple('LabeledObject', ['label', 'object'])
```

but I found the more verbose notation to be more intuitive when reading the code.

In [12]:
# Pairs a label with the model trained for that label.
LabeledModel = namedtuple('LabeledModel', 'label model')

# Pairs a label with the list of file paths belonging to that label.
LabeledPaths = namedtuple('LabeledPaths', 'label paths')

# Pairs a label with the score obtained by that label's model.
LabeledResult = namedtuple('LabeledResult', 'label score')

## 1.2 Paths file parsing

This function parses a given file of the form

```
label1-metadata1/subfolder1/sample-01.wav
label1-metadata2/subfolder2/sample-02.wav
label1-metadata3/subfolder3/sample-03.wav

label2-metadata1/subfolder1/sample-01.wav
label2-metadata2/subfolder2/sample-02.wav
label2-metadata3/subfolder3/sample-03.wav

...
```

into a list of `LabeledPaths`, one entry per group of lines.
Blank lines act as separators between groups; the label of a group is the part of its first line that precedes the first `-`.

In [3]:
def get_data_paths(paths_file: str, prefix: str='',
                   name_stop_symb: str='-') -> list[LabeledPaths]:
    """Parse a paths file into a list of LabeledPaths (label, filepaths).

    The file holds one path per line. A run of consecutive non-blank lines
    forms one group; groups are separated by one or more blank (or
    whitespace-only) lines. Blank lines at the start of the file and repeated
    blank lines between groups are skipped rather than ending the parse.

    Args:
        paths_file: Path of the text file to parse.
        prefix: Joined in front of every listed path via os.path.join.
        name_stop_symb: Separator used to derive a label: the label of a
            group is the part of the group's first line that precedes the
            first occurrence of this string.

    Returns:
        One LabeledPaths per group, in file order. Two groups that happen to
        yield the same label are kept as two separate entries. An empty list
        is returned when the file contains no paths.
    """

    # To be a list of LabeledPaths.
    training_set = []

    # Group currently being filled; None while we are between groups.
    current = None
    with open(paths_file) as file:
        for raw_line in file:
            line = raw_line.rstrip()

            # A blank line only closes the current group (if any). Iterating
            # over the file, instead of stopping at the first empty read,
            # keeps the parse going across extra blank lines.
            if not line:
                current = None
                continue

            if current is None:
                # First path of a new group: the part of the path before
                # `name_stop_symb' becomes the label.
                label = line.split(name_stop_symb)[0]
                current = LabeledPaths(label, [])
                training_set.append(current)

            current.paths.append(os.path.join(prefix, line))

    return training_set

## 1.3 Feature extraction

For feature extraction we use the very convenient Librosa library, which provides one-liners for extracting MFCCs and their differences (deltas) of any order.
Before extracting features from an audio clip we first normalize it, to bring all the clips to the same level.

`get_features_path` extracts features from each of the given files, stacks them into a single matrix and returns it.

In [4]:
def get_features(utterance: np.ndarray, sr: int, n_mfccs: int=20):
    """Get features from a given librosa-returned audio array.

    Args:
        utterance: Audio samples, as returned by librosa.load.
        sr: Sampling rate of `utterance`.
        n_mfccs: Number of MFCCs to compute.

    Returns:
        Array with one row per analysis frame and n_mfccs * 3 columns:
        the MFCCs, their 1st-order and their 2nd-order differences,
        concatenated horizontally.
    """

    # `y` and `sr` are passed by keyword: librosa >= 0.10 accepts them as
    # keyword-only arguments, so the positional form raises TypeError there.
    # The keyword form works on older releases as well.
    mfccs  = librosa.feature.mfcc(y=utterance, sr=sr, n_mfcc=n_mfccs)

    # Both differences are taken from the MFCCs themselves (order=1 is the
    # librosa default, order=2 is requested explicitly).
    mfccs1 = librosa.feature.delta(mfccs)
    mfccs2 = librosa.feature.delta(mfccs, order=2)

    # Librosa returns features in shape (n_mfcc, ...). We want mfccs to
    # be "columns" in our dataset => we transpose them.
    features = np.hstack((mfccs.T, mfccs1.T, mfccs2.T))
    return features

def get_features_path(filepaths: list[str], n_mfccs: int=20) -> np.ndarray:
    """Get features from given files. Uses get_features for feature extraction.

    Every file is loaded with librosa.load (default settings), normalized
    with librosa.util.normalize and passed to get_features; the per-file
    feature matrices are stacked vertically in the order of `filepaths`.

    Args:
        filepaths: Paths of the audio files to process.
        n_mfccs: Number of MFCCs per frame, forwarded to get_features.

    Returns:
        Array with n_mfccs * 3 columns holding the rows of all files.
        For an empty `filepaths` the result has shape (0, n_mfccs * 3).
    """

    # Start from an empty float64 block so that (a) an empty `filepaths`
    # still yields a (0, n_mfccs * 3) array and (b) the result dtype is the
    # same as if rows had been appended to such an array file by file.
    blocks = [np.empty((0, n_mfccs * 3))]
    for path in filepaths:
        utt, sr = librosa.load(path)
        utt = librosa.util.normalize(utt)
        blocks.append(get_features(utt, sr, n_mfccs=n_mfccs))

    # Stack once at the end: stacking inside the loop would re-copy all
    # previously collected rows for every additional file.
    return np.vstack(blocks)

## 1.4 Model construction

In [5]:
def get_models(examples: list[LabeledPaths], n_components: int=16,
               random_state=None) -> list[LabeledModel]:
    """Create and train one GMM per entry of a list of LabeledPaths.

    Args:
        examples: LabeledPaths (label, filepaths), e.g. as created by
            get_data_paths. All files of one entry are used to train that
            entry's model.
        n_components: Number of mixture components of every GMM.
        random_state: Forwarded to GaussianMixture. Pass an int to make the
            (otherwise randomly initialised) training reproducible; the
            default None keeps scikit-learn's unseeded behaviour.

    Returns:
        Labeled models of type (label, model), in the order of `examples`.
        Labels are preserved from the corresponding LabeledPaths.
    """

    # To be a list of labeled models.
    labeled_models = []
    for example in examples:

        # Get features from files (all files of this label stacked together).
        features = get_features_path(example.paths)

        # Create and train GMM. Diagonal covariances, best of 3 restarts.
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type='diag',
                              n_init=3,
                              max_iter=200,
                              random_state=random_state)
        gmm.fit(features)

        # Add generated model to the list.
        labeled_models.append(LabeledModel(example.label, gmm))

    return labeled_models

def test_models(test_examples:  list[LabeledPaths],
                labeled_models: list[LabeledModel]) -> float:
    """Test models and return accuracy.

    For every test entry the features of ALL its files are extracted and
    scored as one block against every model; the label of the model with the
    highest score is the prediction for that entry.

    Args:
        test_examples: LabeledPaths (label, filepaths) to classify.
        labeled_models: LabeledModel (label, model) candidates; each model
            must provide a score(features) method.

    Returns:
        Fraction of test entries whose predicted label equals their own.

    Raises:
        ValueError: If `test_examples` or `labeled_models` is empty.
    """

    # Fail early and explicitly: with no test entries the accuracy is
    # undefined, and with no models there is nothing to pick a label from.
    # Checking up front also avoids extracting features for nothing.
    if len(test_examples) == 0:
        raise ValueError('test_examples is empty: accuracy is undefined')
    if len(labeled_models) == 0:
        raise ValueError('labeled_models is empty: no model to score against')

    # To be a list of results: Trues and Falses.
    results = []
    for test_example in test_examples:

        # Extract features for given speakers' filepaths.
        features = get_features_path(test_example.paths)

        # Get results for given speaker on all models in labeled_models.
        labeled_results = [
            LabeledResult(labeled_model.label,
                          labeled_model.model.score(features))
            for labeled_model in labeled_models
        ]

        # Get index of a highest score and make corresponding label a prediction.
        scores_only = np.array([lr.score for lr in labeled_results], dtype=float)
        index = np.argmax(scores_only)
        label_pred = labeled_results[index].label

        # Check if prediction is correct and store result in results.
        results.append(label_pred == test_example.label)

    return sum(results) / len(results)

# 2. Training

The dataset is taken from [here](https://appliedmachinelearning.blog/2017/11/14/spoken-speaker-identification-based-on-gaussian-mixture-models-python-implementation/) and contains 10 utterances from each of 34 speakers.
The training and testing sets each use 5 utterances per speaker.

My folder setup looks like this:

```
classic_gmm_vr.ipynb
├── dataset_vr
│   ├── label1-metadata1/wav/sample-01.wav
│   ├── label1-metadata1/wav/sample-02.wav
│   │ ...
│   ├── label1-metadata1/wav/sample-10.wav
│   │
│   │ ...
│   │
│   ├── label2-metadata1/wav/sample-01.wav
│   ├── label2-metadata1/wav/sample-02.wav
│   │ ...
│   ├── label2-metadata1/wav/sample-10.wav
└── 
```

which I didn't change from original dataset.

In [6]:
# Root folder of the dataset; used as the prefix for every listed path.
train_dir = './dataset_vr'
# Text file listing the training clips (blank-line-separated groups, one per label).
train_paths_file = './train_data_paths.txt'

In [7]:
# Parse the training list into LabeledPaths, with every path rooted at train_dir.
training_set = get_data_paths(train_paths_file, prefix=train_dir)

In [8]:
# Train one GMM per label; this loads and featurizes every training clip.
models = get_models(training_set)

# 3. Testing

In [9]:
# Text file listing the test clips, in the same format as the training list.
test_paths_file = './test_data_paths.txt'

In [10]:
# The test paths are resolved against the same dataset folder as the training paths.
testing_set = get_data_paths(test_paths_file, prefix=train_dir)

In [11]:
# Accuracy: fraction of test entries whose best-scoring model carries their own label.
test_models(testing_set, models)

1.0

# 4. Very brief result discussion

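All the data was apparently recorded under identical conditions, so it is no surprise that we get an accuracy of 1.0. Note also that all test utterances of a speaker are scored together as a single block, so the accuracy is computed over one decision per speaker rather than one per utterance.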