# DTW alignment on the basis of energy

(Suggested by results from the MFCC tests in the vowel-discri git repository.)

In [None]:
%matplotlib inline

# generic imports
import numpy as np
import os.path as path

# for feature extraction/storage
import soundfile
import scipy.fftpack
import scipy.signal as sig
from librosa.core.spectrum import power_to_db, stft
from librosa import filters
import h5features

# results analysis / plots
from scone_phobia import apply_analysis
from scone_phobia.analyses.avg_error import avg_error
import scone_phobia.metadata.add_metadata as add_metadata
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%load_ext Cython

In [None]:
def read_wav(wavefile, onset=None, offset=None):
    """Read a wav file and keep only the samples between `onset` and `offset`.

    Each sample is assigned the time of its centre, i.e. sample ``k`` sits at
    ``(k + 0.5) / fs`` seconds. A sample is kept when that time lies in the
    closed interval ``[onset, offset]``.

    Parameters
    ----------
    wavefile : str
        Path of the audio file, handed to ``soundfile.read``.
    onset : float, optional
        Start of the selection in seconds. Defaults to 0.
    offset : float, optional
        End of the selection in seconds. Defaults to the duration of the file.

    Returns
    -------
    (samples, rate)
        The selected samples and the sampling rate exactly as reported by
        ``soundfile.read``.
    """
    samples, rate = soundfile.read(wavefile)
    rate_hz = float(rate)
    n_samples = len(samples)
    start = 0 if onset is None else onset
    stop = n_samples/rate_hz if offset is None else offset
    # Centre time of every sample: half a sampling period after its start.
    centres = 1/(2*rate_hz) + np.arange(n_samples)/rate_hz
    keep = (centres >= start) & (centres <= stop)
    return samples[keep], rate


def extract_mfcc(wav_folder, segments_file, out_file, **kwargs):
    """Compute MFCCs for every segment of a segments file and store them in
    h5features format.

    Parameters
    ----------
    wav_folder : str
        Folder containing the wav files named in the segments file.
    segments_file : str
        Text file with one segment per line, as four whitespace-separated
        fields: ``<segment-id> <wav-filename> <onset> <offset>``. Onset and
        offset are parsed as floats and passed to `read_wav` (seconds).
        Blank lines are ignored. If a segment id appears several times, the
        last occurrence wins.
    out_file : str
        Path of the h5features file to write; features go to the
        '/features' group.
    **kwargs
        Forwarded unchanged to ``mfcc.mfcc``.

    Returns
    -------
    None
        The features are only written to `out_file`.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four fields, or if its
        onset/offset cannot be parsed as floats.
    AssertionError
        If a wav file is not sampled at 16 kHz.

    Notes
    -----
    NOTE(review): `mfcc` is not imported in the visible cells of this
    notebook -- confirm where it comes from before a Restart & Run All.
    """
    # Parse the segments file into {segment-id: (wav, onset, offset)}.
    segments = {}
    with open(segments_file, 'r') as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing empty line):
                # nothing to parse, and unpacking it would raise ValueError.
                continue
            seg, wav, onset, offset = fields
            segments[seg] = wav, float(onset), float(offset)
    utts, feats, times = [], [], []
    n_segments = len(segments)
    for i, (seg, (wav, onset, offset)) in enumerate(segments.items()):
        if i % 100 == 0:
            print("Done {} / {}".format(i, n_segments))
        wavefile = path.join(wav_folder, wav)
        data, fs = read_wav(wavefile, onset=onset, offset=offset)
        assert fs == 16000, "expected 16 kHz audio, got {}".format(fs)
        coefs = mfcc.mfcc(data, **kwargs)
        # Stored transposed; assumes mfcc.mfcc returns (n_coefs, n_frames),
        # consistent with shape[1] being used as the frame count below
        # -- TODO confirm.
        feats.append(coefs.T)
        utts.append(seg)
        # One timestamp per frame: first at 0.0125 s, then every 0.01 s
        # (presumably the centre of a 25 ms window with a 10 ms hop --
        # verify against the mfcc.mfcc defaults).
        times.append(0.0125 + np.arange(coefs.shape[1])*0.01)
    data = h5features.Data(utts, times, feats, check=True)
    with h5features.Writer(out_file) as fh:
        fh.write(data, '/features')

In [None]:
# Read the sample recording './bras.wav' (samples in `data`, sampling rate in
# `fs`) and check that it is sampled at 16 kHz, the only rate accepted by
# extract_mfcc.
# NOTE(review): `data` and `fs` are not used by any later cell visible here,
# and `data` is reassigned by the feature-extraction loop further down.
data, fs = soundfile.read('./bras.wav')
assert fs == 16000

## Test ABX discriminability on WSJ corpus

This requires some of the material from the "Early phonetic learning without phonetic categories" paper.


### 1. Extract the features of interest and store them in h5features format to allow testing ABX phone discriminability.

In [None]:
# Each condition is a (zeroth_coef, cep_mean_norm) pair forwarded to
# extract_mfcc, which passes both on to mfcc.mfcc as keyword arguments.
conditions = [('energy', True),  ('energy', False),
              ('remove', False), ('remove', True),
              (None, False), (None, True)]
# NOTE(review): absolute, machine-specific path -- the cell only runs on a
# machine with this exact directory layout (wavs/, segments.txt, mfcc/).
root = '/Users/admin/Documents/PhD/Data/GPJ_match_WSJ_test/'
# NOTE(review): conditions[1:] skips the first condition ('energy', True), so a
# fresh Restart & Run All never writes mfcc_energy_True.features. This looks
# like a leftover from resuming an interrupted run -- confirm.
for zeroth, norm in conditions[1:]:
    print(zeroth)
    print(norm)
    # One output file per condition: <root>mfcc/mfcc_<zeroth>_<norm>.features
    # NOTE(review): extract_mfcc has no return statement, so `data` is bound to
    # None here and overwrites any earlier `data` variable.
    data = extract_mfcc(root + 'wavs/',
                        root + 'segments.txt',
                        root + 'mfcc/mfcc_{}_{}.features'.format(zeroth, norm),
                        zeroth_coef=zeroth, cep_mean_norm=norm)

## 2. Run WSJ discrimination on features (done on a remote cluster)

## 3. Plot results

In [None]:
# Loading (or computing if it's the first time) avg_error analysis results with full resamples
# NOTE(review): the line above mentions "full resamples" but the call below
# passes resampling=False -- confirm which of the two is intended.

# NOTE(review): absolute, machine-specific path to the folder of scores.
mp_folder = '/Users/admin/Documents/PhD/Data/vowel_discri/mp_scores'

# Apply the avg_error analysis to the scores in mp_folder, with
# add_metadata.language_register supplying the metadata; the result is kept in
# df_avg for the plot in the next cell.
analysis = avg_error
df_avg = apply_analysis(analysis, mp_folder,
                        add_metadata=add_metadata.language_register,
                        resampling=False)

In [None]:
# Bar plot of the 'error' column of df_avg: one group of bars per
# 'contrast type', one colour per 'model type'.
sns.catplot(data=df_avg, kind='bar', y='error', hue='model type', x='contrast type')

Conclusion, for within-speaker phone discrimination:

- Cepstral mean normalization does not have much effect overall (it tends to make consonants a bit harder to discriminate and vowels a bit easier) -> do not use it.
- Log-energy: big effect; the unscaled zeroth-order MFCC appears to work best (surprisingly?).
    - Does this have to do with DTW aligning the signals based on their energy profile, because of the scale imbalance between coefficients?
        - How come there is this imbalance in the first place? Isn't this supposed to be a PCA?
            - Maybe log-energy synchronization (DTW or otherwise) followed by cosine distance on the aligned signals without the energy would work very well? (Or, more generally, on signals deconvolved from pitch + prosody contours.)
- Short-term: take normalized DTW with basic MFCC (no log-energy and no removal of the first coefficient) and no cepstral mean normalization.