In [None]:
# Root directory of the perceptual-tuning data. The trailing slash matters:
# the corpora directory is later derived by plain string concatenation
# (base_root + 'corpora').
base_root = '/scratch1/users/thomas/perceptual-tuning-data/'

# Warnings

This all assumes that wave filenames uniquely identify speakers and vice-versa (i.e. one wavefile per speaker).

I'm not sure the ordering of speakers in the subcorpora (or even in the original corpus) makes sense. This is clearly not a problem for DPGMM, but it might be a factor for other types of models. I'm not sure if we want to define meaningful orderings independently of models and then do model comparison using those, or if we should make ad hoc decisions on ordering for each model. The former makes more sense if we are seeing this as modeling a temporally ordered environment.

Also we kept even very very short segments. The procedure for generating the training data from the segments file should be robust to that.


# Methods

How to generate the appropriate smaller segments.txt from the initial one?

The retained solution is to measure the total duration for each speaker and to divide it into *j* equal-duration parts, keeping the initial ordering of utterances. This yields *j* independent datasets, with a subset-superset structure taking the form of a tree. The reason for doing this is to get smaller datasets where, as much as possible, the only difference with the large dataset is the duration. To keep all other factors constant, we make sure of having the same speakers and the same relative duration per speaker, and of having temporally ordered datasets.

One thing we are not keeping constant is having fully-formed sentences. For small *j*'s this shouldn't have a large effect, as most sentences will remain fully formed. As *j* increases, however, this will eventually result in having less than one full utterance per speaker. For now, we just ignore this issue. I think the proper way to address it, if it ever becomes important, is to study the effect of the number of speakers, and more generally of the distribution of speech amount per speaker, independently of duration at the same time as we study the effect of total duration, using in particular a small number of speakers with large amounts of data.

Choice of *j*'s. Let's start with dividing sets by 10, 100 and 1000. The training data is either ~10h total from 20 different speakers (for spontaneous speech) or ~20h total from ~100 speakers (for read speech). This will result respectively in ~1h, 6min, 36s total with on average 3min, 18s and 1.8s per speaker, and in ~2h, 12min, 72s total with on average 72s, 7.2s and 0.72s per speaker. Then we can add more data-points wherever it seems interesting.

To start with, let's use only min(j, 10) of the subsets. If j>10, we use the first subset (in temporal order) that is included in one of the j=10 subsets. With this approach, we can later add any *j* such that 10 divides *j* and keep the nice inclusion relationships. If we need smaller *j*'s, we should look at j=5 and j=2 to keep inclusions nice.

Once we have the segments files for subcorpora, we generate .mat training files with appropriately selected subsets of the total data using the vad_file argument of h5f2mat.py.

In [None]:
%matplotlib inline
import io
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os.path as path

In [None]:
# This part is pretty generic and might form the basis for an independent module

# Function to load base segments.txt

def read_segs(vad_file):
    """
    Load a segments file into a DataFrame with one row per utterance.

    Each non-blank line must contain exactly four whitespace-separated
    fields: ``<utt_id> <wav_file> <start> <stop>``. Blank lines are skipped
    and any run of whitespace is accepted as a separator.

    Parameters
    ----------
    vad_file : str
        Path to the UTF-8 encoded segments file.

    Returns
    -------
    pandas.DataFrame
        One row per line, in file order, with the values 'utt', 'spk',
        'start', 'stop' and 'duration' (computed as stop - start).
        'spk' is the wav file field minus its last four characters
        (this assumes a 4-character extension such as ".wav").

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four fields, or if its
        start/stop fields cannot be parsed as floats.
    """
    utts = {'utt': [], 'spk': [], 'start': [], 'stop': [], 'duration': []}
    with io.open(vad_file, 'r', encoding='utf-8') as fh:
        for line_no, line in enumerate(fh, 1):
            # split() with no argument tolerates tabs, repeated spaces and
            # trailing newlines, and yields [] for blank lines.
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 4:
                raise ValueError(
                    u"{}: line {}: expected 4 fields, got {}".format(
                        vad_file, line_no, len(fields)))
            utt_id, wav_id, start, stop = fields
            start, stop = float(start), float(stop)
            utts['utt'].append(utt_id)
            # Speakers are identified by the wav file name without its
            # extension (one wav file per speaker is assumed).
            utts['spk'].append(wav_id[:-4])
            utts['start'].append(start)
            utts['stop'].append(stop)
            utts['duration'].append(stop - start)
    return pd.DataFrame(utts)


# Functions to generate subcorpora segments files

def get_subset_bounds(utts, speakers, nb_subsets):
    """
    Compute, for every (speaker, subset) pair, the slice of that speaker's
    cumulated speech duration that belongs to the subset.

    The total duration of each speaker (sum of the 'duration' column of
    `utts` over the rows of that speaker) is cut into `nb_subsets`
    consecutive parts of equal length.

    Returns a dict mapping (spk, subset_id) to {'start': ..., 'stop': ...},
    both expressed as an amount of speech counted from the speaker's first
    utterance, in the same unit as the 'duration' column.
    """
    totals = dict(utts.groupby('spk')['duration'].sum())
    n_parts = float(nb_subsets)
    bounds = {}
    for spk in speakers:
        total = totals[spk]
        bounds.update({
            (spk, k): {'start': k*total/n_parts,
                       'stop': (k+1)*total/n_parts}
            for k in range(nb_subsets)
        })
    return bounds


def get_spk_subset_df(spk_df, start, stop):
    """
    Extract from one speaker's utterances the part lying between two
    cumulated amounts of speech.

    Parameters
    ----------
    spk_df : pandas.DataFrame
        Utterances of a single speaker with at least the columns 'start',
        'stop' and 'spk_amount_so_far' (cumulated duration up to and
        including the row; must be non-decreasing for the search to work).
    start, stop : float
        Bounds of the wanted window, as cumulated amounts of speech.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows overlapping the window, where the 'start' of the
        first row and the 'stop' of the last row have been moved so that
        only speech inside the window is kept. The other columns (including
        'duration' and 'spk_amount_so_far') are left as they were. The frame
        is empty when no row overlaps the window. `spk_df` is not modified.
    """
    # Search on the raw array: np.searchsorted then returns a scalar for a
    # scalar needle whatever the pandas version (searching the Series itself
    # returned an array in old pandas and a scalar in recent ones).
    amount_so_far = spk_df['spk_amount_so_far'].values
    # First row whose cumulated amount is strictly higher than `start`.
    first_row = int(np.searchsorted(amount_so_far, start, side='right'))
    # First row whose cumulated amount is higher than or equal to `stop`
    # (can be the same row as `first_row`).
    last_row = int(np.searchsorted(amount_so_far, stop, side='left'))
    # iloc is positional, which matches the searchsorted results;
    # copy so that the bounds can be edited without side effects.
    subset = spk_df.iloc[first_row:last_row + 1].copy()
    if subset.empty:
        # Nothing lies inside the window (e.g. `start` is not below the
        # speaker's total amount): there is no bound to adjust.
        return subset
    start_col = subset.columns.get_loc('start')
    stop_col = subset.columns.get_loc('stop')
    amount_col = subset.columns.get_loc('spk_amount_so_far')
    # Drop from the first row the speech located before `start`:
    # new start = row stop - (amount at end of row - desired start).
    subset.iat[0, start_col] = (subset.iat[0, stop_col]
                                - (subset.iat[0, amount_col] - start))
    # Drop from the last row the speech located after `stop`:
    # new stop = row stop - (amount at end of row - desired stop).
    subset.iat[-1, stop_col] = (subset.iat[-1, stop_col]
                                - (subset.iat[-1, amount_col] - stop))
    return subset


def get_subset_dfs(utts, speakers, subset_bounds, nb_subsets):
    """
    Generate an appropriate dataframe for each subset

    Parameters
    ----------
    utts : pandas.DataFrame
        All utterances, with at least the columns 'spk' and 'duration'
        plus whatever get_spk_subset_df needs ('start', 'stop').
        Utterances are assumed to be sorted in order of occurrence in
        time for each speaker. The frame is not modified.
    speakers : list
        Speakers to include, in the order in which their utterances
        should appear in each subset dataframe.
    subset_bounds : dict
        Maps (spk, subset_id) to a dict with 'start' and 'stop' entries.
    nb_subsets : int
        Number of subsets; subset ids are 0 .. nb_subsets - 1.

    Returns
    -------
    dict
        Maps each subset id to the concatenation, in `speakers` order, of
        the per-speaker dataframes returned by get_spk_subset_df.
    """
    # We use a column containing the sum duration of all sentences
    # by the same speaker so far (assuming utterances to be sorted
    # in the order of occurrence in time for each speaker). The current
    # sentence is included.
    # assign() returns a new frame, so the caller's dataframe is untouched.
    utts = utts.assign(
        spk_amount_so_far=utts.groupby('spk')['duration'].cumsum())
    # Split by speaker once; the split does not depend on the subset.
    spk_dfs = {spk: spk_df for spk, spk_df in utts.groupby('spk')}
    dfs = {}
    for subset_id in range(nb_subsets):
        parts = []
        # Only the requested speakers are processed, in the requested order.
        for spk in speakers:
            bounds = subset_bounds[spk, subset_id]
            parts.append(get_spk_subset_df(spk_dfs[spk],
                                           bounds['start'],
                                           bounds['stop']))
        dfs[subset_id] = pd.concat(parts)
    return dfs


def save_subset_segfiles(subset_dfs, out_dir, res_id='segments'):
    """
    Write one segments file per subset dataframe into out_dir.

    Files are named <res_id>__<N>subsets__subset<k>.txt where N is the
    number of dataframes in subset_dfs and k is the subset id plus one.
    Each row of a dataframe becomes a line "<utt> <spk>.wav <start> <stop>".
    """
    total = len(subset_dfs)
    line_template = u"{} {}.wav {} {}\n"
    for idx, frame in subset_dfs.items():
        basename = res_id + '__{}subsets__subset{}.txt'.format(total, 1+idx)
        target = path.join(out_dir, basename)
        # Only these four columns end up in the file, in this order.
        rows = frame[['utt', 'spk', 'start', 'stop']].itertuples(index=False,
                                                                 name=None)
        with io.open(target, 'w', encoding='utf-8') as fh:
            for utt, spk, start, stop in rows:
                fh.write(line_template.format(utt, spk, start, stop))


def generate_subcorpora(utts, nb_subsets, out_dir, res_id='segments'):
    """
    Entry point: cut the corpus described by utts into nb_subsets parts and
    write one segments file per part into out_dir.

    The work is delegated to get_subset_bounds (time boundaries per speaker
    and subset), get_subset_dfs (one dataframe per subset) and
    save_subset_segfiles (writing the files, named after res_id).
    """
    # Speaker order is the order in which groupby yields the groups; it may
    # differ from the order of the original segments file (order does not
    # matter for GMM training at least).
    speakers = [name for name, _ in utts.groupby('spk')]
    bounds = get_subset_bounds(utts, speakers, nb_subsets)
    save_subset_segfiles(
        get_subset_dfs(utts, speakers, bounds, nb_subsets),
        out_dir,
        res_id,
    )


In [None]:
# This generates the subcorpora for our current setup.

def load_segs(root, corpus):
    """
    Load the training segments file of one of the four known corpora.

    The file read is <root>/<corpus>/<cocorpus>_matched_data_train/segments.txt
    where <cocorpus> is the corpus paired with `corpus` (WSJ <-> GPJ,
    CSJ <-> BUC).

    Parameters
    ----------
    root : str
        Directory containing one sub-directory per corpus.
    corpus : str
        One of 'WSJ', 'GPJ', 'CSJ' or 'BUC'.

    Returns
    -------
    The result of read_segs on the selected segments file.

    Raises
    ------
    ValueError
        If `corpus` is not one of the known corpora.
    """
    cocorpus_of = {'WSJ': 'GPJ', 'GPJ': 'WSJ', 'CSJ': 'BUC', 'BUC': 'CSJ'}
    if corpus not in cocorpus_of:
        # Explicit error instead of an assert, which is stripped under -O.
        raise ValueError(
            "unknown corpus {!r}, expected one of {}".format(
                corpus, sorted(cocorpus_of)))
    # Fill in the template before joining so that braces in `root`
    # cannot interfere with the formatting.
    rel_path = '{}/{}_matched_data_train/segments.txt'.format(
        corpus, cocorpus_of[corpus])
    vad_file = path.join(root, rel_path)
    return read_segs(vad_file)

# get j subset segments files for each j
# Numbers of equal-duration parts each corpus is divided into.
js = [10, 100, 1000]
corpora = ['WSJ', 'BUC', 'GPJ', 'CSJ']
# Directory expected to contain one sub-directory per corpus.
root = base_root + 'corpora'

for j in js:
    for corpus in corpora:
        # Segments files for this corpus go to <root>/<corpus>/subcorpora.
        # NOTE(review): nothing here creates that directory; presumably it
        # must already exist -- confirm.
        out_dir = path.join(root, corpus, 'subcorpora')
        # load target segments file
        # (it is read again from disk for every value of j)
        utts = load_segs(root, corpus)
        generate_subcorpora(utts, j, out_dir)

In [None]:
# Results checks: distribution of number of sentences per speaker
# For each corpus, print the smallest number of utterances any speaker has
# and draw a histogram of utterance counts; all corpora share one figure.

# original corpus
plt.figure()
for corpus in ['WSJ', 'BUC', 'GPJ', 'CSJ']:
    utts = load_segs(root, corpus)
    # number of rows (utterances) per speaker
    s = utts.groupby('spk').size()
    print(np.min(s))
    h = plt.hist(s, label=corpus)
l = plt.legend()

# a subcorpus
# Same check on one of the generated segments files; the file name is
# rebuilt with the naming pattern used by save_subset_segfiles.
nb_subsets = 10
subset_id = 7  # 1-indexed here
res_id = 'segments'
plt.figure()
for corpus in ['WSJ', 'BUC', 'GPJ', 'CSJ']:
    out_dir = path.join(root, corpus, 'subcorpora')
    seg_file = path.join(out_dir, res_id + '__{}subsets__subset{}.txt'.format(nb_subsets, subset_id))
    utts = read_segs(seg_file)
    # number of rows (utterances) per speaker
    s = utts.groupby('spk').size()
    print(np.min(s))
    h = plt.hist(s, label=corpus)
l = plt.legend()

In [None]:
# Results checks: distribution of number of 10ms frames per speaker, approximately
# For each corpus, print the smallest per-speaker value and draw a histogram
# of the per-speaker values; all corpora share one figure.

# original corpus
plt.figure()
for corpus in ['WSJ', 'BUC', 'GPJ', 'CSJ']:
    utts = load_segs(root, corpus)
    # total duration per speaker times 100, i.e. a count of 10ms frames
    # if durations are in seconds
    f = utts.groupby('spk')['duration'].sum()*100
    print(np.min(f))
    h = plt.hist(f, label=corpus)
l = plt.legend()

# One subcorpus
# Same check on one of the generated segments files; the file name is
# rebuilt with the naming pattern used by save_subset_segfiles.
nb_subsets = 1000
subset_id = 7  # 1-indexed here
res_id = 'segments'
plt.figure()
for corpus in ['WSJ', 'BUC', 'GPJ', 'CSJ']:
    out_dir = path.join(root, corpus, 'subcorpora')
    seg_file = path.join(out_dir, res_id + '__{}subsets__subset{}.txt'.format(nb_subsets, subset_id))
    utts = read_segs(seg_file)
    # durations are recomputed by read_segs from the start/stop written in the file
    f = utts.groupby('spk')['duration'].sum()*100
    print(np.min(f))
    h = plt.hist(f, label=corpus)
l = plt.legend()