In [4]:
import re
import numpy as np
from os import path, makedirs
from collections import Counter

class Config(object):
    """Set up model for debugging.

    A plain namespace of class-level settings.  Within this notebook only
    ``feature_dim`` is read (by ``Dataset.__init__``); every other attribute
    is unused by the visible code and is presumably consumed by training code
    elsewhere -- TODO confirm.

    Note that the two ``makedirs`` calls sit in the class body, so the log and
    checkpoint directories are created as soon as this class statement is
    executed, not when ``Config()`` is instantiated.
    """

    # Paths to train/dev feature lists.  Not read anywhere in this notebook
    # (the cells below load .npy files instead).
    trainfile = "../kaldi/data/len6-50frames-count2/train/mfcc.scp"
    devfile = "../kaldi/data/len6-50frames-count2/dev/mfcc.scp"
    # batch_size / max_same / max_diff share their names with the parameters
    # of Dataset.batch; presumably they are forwarded there -- verify against
    # the caller.
    batch_size = 32
    current_epoch = 0
    num_epochs = 1
    # Read by Dataset.__init__; used as the size of the last axis of the
    # padded batches built in Dataset.pad_features.
    feature_dim = 39
    # NOTE(review): the model/optimizer settings below are not referenced by
    # any code visible here; their meaning is inferred from the names only.
    num_layers = 3
    hidden_size = 256
    bidirectional = True
    keep_prob = 0.7
    margin = 0.5
    max_same = 1
    max_diff = 5
    lr = 0.001
    mom = 0.9
    # Directories created below when the class body runs.
    logdir = "../logs/hystest2"
    ckptdir = "../ckpts/hystest2"
    log_interval = 10
    ckpt = None
    debugmode = True

    # Executed at class-definition time; exist_ok=True makes re-running the
    # cell harmless when the directories already exist.
    makedirs(logdir, exist_ok=True)
    makedirs(ckptdir, exist_ok=True)


class Dataset(object):
    """Hold one partition of variable-length acoustic features and batch it.

    Every example is a sequence of frames (one row per frame, ``feature_dim``
    columns) with a label string whose text before the first underscore is
    the word identity.  ``data``, ``ids`` and ``labels`` are kept
    index-aligned at all times, including after :meth:`shuffle`.

    Attributes:
        is_train: True when ``partition == "train"``.
        feature_dim: Number of feature columns, taken from ``config``.
        labels: ``np.ndarray`` of the label strings.
        feature_mean: Scalar mean over every value in the partition (float).
        data: 1-D object array; element ``i`` is example ``i`` minus
            ``feature_mean``.  An object array is used so that sequences of
            different lengths can be stored and fancy-indexed.
        ids: ``int32`` array with the integer word id of each example.
        id_counts: ``Counter`` mapping word id -> number of examples.
        num_classes: Number of distinct words.
        num_examples: Number of examples.
    """

    def __init__(self, dataset, partition, config):
        """Build the partition.

        Args:
            dataset: Mapping ``partition -> {'labels': ..., 'data': ...}``.
            partition: Key into ``dataset``; ``"train"`` enables the
                same/different sampling in :meth:`batch`.
            config: Object exposing ``feature_dim``.
        """
        self.is_train = (partition == "train")

        self.feature_dim = config.feature_dim

        labels, data = dataset[partition]['labels'], dataset[partition]['data']
        self.labels = np.array(labels)

        # The word is everything before the first underscore of the label.
        words = [re.split("_", x)[0] for x in labels]
        uwords = np.unique(words)

        word2id = {word: index for index, word in enumerate(uwords)}
        ids = [word2id[w] for w in words]

        # Accumulate in float64 and keep the mean as a Python float: the
        # result is then independent of NumPy's scalar promotion rules and
        # subtracting it below does not change the dtype of the features.
        total, count = 0.0, 0
        for x in data:
            x = np.asarray(x)
            total += float(np.sum(x, dtype=np.float64))
            count += int(x.size)
        self.feature_mean = total / count if count else 0.0

        # np.array() on sequences of different lengths raises ValueError on
        # current NumPy, so fill an object array element by element.
        centered = np.empty(len(data), dtype=object)
        for i, x in enumerate(data):
            centered[i] = np.asarray(x) - self.feature_mean
        self.data = centered

        self.ids = np.array(ids, dtype=np.int32)
        self.id_counts = Counter(ids)

        self.num_classes = len(self.id_counts)
        self.num_examples = len(self.ids)

    def shuffle(self):
        """Shuffle data, ids and labels with one shared permutation."""
        order = np.random.permutation(self.num_examples)
        self.data = self.data[order]
        self.ids = self.ids[order]
        # Labels must follow the same permutation, otherwise pad_features2 /
        # batch_for_evaluation return labels that belong to other examples.
        self.labels = self.labels[order]

    def pad_features(self, indices):
        """Pad acoustic features to max length sequence.

        Args:
            indices: Integer positions of the examples to gather (range,
                list or array).

        Returns:
            Tuple ``(padded, lens, ids)``: a zero-padded float array of shape
            ``(len(indices), max(lens), feature_dim)``, the ``int32`` frame
            count of each example, and the word ids of the examples.
        """
        idx = np.asarray(indices, dtype=np.intp)
        sequences = self.data[idx]
        lens = np.array([len(seq) for seq in sequences], dtype=np.int32)
        longest = int(lens.max()) if len(lens) else 0
        padded = np.zeros((len(idx), longest, self.feature_dim))
        for row, (seq, n_frames) in enumerate(zip(sequences, lens)):
            padded[row, :n_frames] = seq

        return padded, lens, self.ids[idx]

    def _batch_indices(self, batch_size):
        """Yield consecutive index arrays of at most ``batch_size`` items."""
        for start in range(0, self.num_examples, batch_size):
            yield np.arange(start, min(start + batch_size, self.num_examples))

    def _sample_same_and_diff(self, max_same, max_diff):
        """Draw same-word and different-word partners for every example.

        Returns:
            Tuple ``(same, diff)``.  ``same`` is a list with, per example, an
            index array of up to ``max_same`` *other* examples of the same
            word (empty when the word occurs once).  ``diff`` is an ``int32``
            array of shape ``(num_examples, max_diff)`` holding indices of
            examples of other words.

        Raises:
            ValueError: if different-word partners are requested but the
                partition contains fewer than two words.
        """
        id_list = self.ids.tolist()

        # One pass to group example positions by word id (ascending order,
        # like np.where), instead of rescanning all ids for every example.
        members = {}
        for position, word_id in enumerate(id_list):
            members.setdefault(word_id, []).append(position)
        members = {word_id: np.asarray(positions, dtype=np.intp)
                   for word_id, positions in members.items()}

        # Kept as a list: the per-example arrays may differ in length.
        same = []
        for position, word_id in enumerate(id_list):
            peers = members[word_id]
            same.append(np.random.permutation(peers[peers != position])[:max_same])

        diff = np.zeros((self.num_examples, max_diff), dtype=np.int32)
        if diff.size:
            if self.num_classes < 2:
                raise ValueError(
                    "cannot sample different-word examples from a partition "
                    "with fewer than two words")
            # Draw from num_classes - 1 slots and skip over the example's own
            # id, which gives a uniform choice among the *other* words.
            diff_ids = np.random.randint(0, self.num_classes - 1,
                                         (self.num_examples, max_diff))
            diff_ids[diff_ids >= self.ids.reshape(-1, 1)] += 1

            for word_id, count in self.id_counts.items():
                hits = np.where(diff_ids == word_id)
                picks = np.random.randint(0, count, len(hits[0]))
                diff[hits] = members[word_id][picks]

        return same, diff

    def batch(self, batch_size, max_same=1, max_diff=1):
        """Shuffle, then yield batches.

        For a non-train partition each item is the ``pad_features`` tuple
        ``(padded, lens, ids)``.

        For the train partition each item is
        ``(data, lens, same_partition, diff_partition)`` where ``data`` /
        ``lens`` cover, in this order, the ``b`` anchors of the batch, their
        same-word partners and ``max_diff`` different-word partners per
        anchor.  ``same_partition`` assigns anchor ``i`` the segment id ``i``,
        its same-word partners ``b + i`` and the different-word examples
        ``2 * b + k``; ``diff_partition`` assigns each different-word example
        the position of its anchor.  Both are float arrays.

        Args:
            batch_size: Maximum number of anchors per batch.
            max_same: Maximum number of same-word partners per anchor.
            max_diff: Number of different-word partners per anchor.
        """
        self.shuffle()

        if not self.is_train:
            # Evaluation batches need no partners, so skip the sampling.
            for batch_idx in self._batch_indices(batch_size):
                yield self.pad_features(batch_idx)
            return

        same, diff = self._sample_same_and_diff(max_same, max_diff)

        for batch_idx in self._batch_indices(batch_size):
            b = len(batch_idx)
            same_chunks = [same[i] for i in batch_idx]

            same_partition = np.concatenate(
                [np.arange(b)]  # same segment ids for anchors
                + [(b + pos) * np.ones(len(chunk))  # ... for same examples
                   for pos, chunk in enumerate(same_chunks)]
                + [(2 * b) + np.arange(max_diff * b)]  # ... for diff examples
            )

            # diff segment ids for diff examples
            diff_partition = np.concatenate([i * np.ones(max_diff) for i in range(b)])

            gathered = np.concatenate(
                [batch_idx] + same_chunks + [diff[batch_idx].ravel()])

            data, lens, _ = self.pad_features(gathered)
            yield data, lens, same_partition, diff_partition

    def pad_features2(self, indices):
        """Pad like :meth:`pad_features` and also return the labels.

        Returns:
            Tuple ``(padded, lens, ids, labels)``.
        """
        padded, lens, ids = self.pad_features(indices)
        return padded, lens, ids, self.labels[np.asarray(indices, dtype=np.intp)]

    def batch_for_evaluation(self, batch_size):
        """Yield ``pad_features2`` tuples in the current order (no shuffle)."""
        for batch_idx in self._batch_indices(batch_size):
            yield self.pad_features2(batch_idx)

In [6]:
# 'swbd_rest.npy' holds a pickled Python object: it is unwrapped with .item()
# and then indexed by partition name inside Dataset.  np.load has defaulted
# to allow_pickle=False since NumPy 1.16.3 and raises ValueError for such
# files, so the flag must be passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
full_dataset = np.load('swbd_rest.npy', allow_pickle=True).item()
config = Config()
train_data = Dataset(full_dataset, partition="train", config=config)
dev_data = Dataset(full_dataset, partition="dev", config=config)

In [8]:
# 'swbd.npy' holds a pickled Python object: it is unwrapped with .item() and
# then indexed by partition name inside Dataset.  np.load has defaulted to
# allow_pickle=False since NumPy 1.16.3 and raises ValueError for such files,
# so the flag must be passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
full_dataset2 = np.load('swbd.npy', allow_pickle=True).item()
config = Config()
train_data2 = Dataset(full_dataset2, partition="train", config=config)
dev_data2 = Dataset(full_dataset2, partition="dev", config=config)

In [7]:
# Label strings of the 'swbd_rest.npy' training partition; the text before
# the first underscore is what Dataset uses as the word identity.
train_data.labels

array(['abandoned_sw02459-B_032843-032904',
       'abandoned_sw02459-B_033098-033148',
       'ability_sw02314-B_035114-035166', ...,
       'concerning_sw03620-A_024773-024838',
       'concert_sw02020-A_031375-031435',
       'concert_sw02656-B_024745-024804'], dtype='<U42')

In [30]:
# Word of each 'swbd_rest.npy' training label (text before the first
# underscore), followed by the sorted set of distinct words.
words = [label.split("_")[0] for label in train_data.labels]
uwords = np.unique(words)

In [31]:
# Distinct words of train_data.labels, as computed in the previous cell.
print(uwords)

['7-eleven' 'abandoned' 'aberdeen' ... "you're" 'younger' 'yourself']


In [32]:
# Same word extraction for the 'swbd.npy' training labels.
# NOTE(review): this rebinds `words` and `uwords` from the train_data cell, so
# their value depends on which of the two cells ran last.
words = [label.split("_")[0] for label in train_data2.labels]
uwords = np.unique(words)
print(uwords)

['abandoned' 'ability' 'absolute' ... "you're" 'younger' 'yourself']


In [10]:
# Label strings of the 'swbd.npy' training partition, for comparison with
# train_data.labels.
train_data2.labels

array(['abandoned_sw02459-B_032843-032904',
       'abandoned_sw02459-B_033098-033148',
       'ability_sw02314-B_035114-035166', ...,
       'yourself_sw04371-A_022966-023027',
       'yourself_sw04376-A_021817-021874',
       'yourself_sw04911-B_005572-005624'], dtype='<U42')

In [12]:
# feature_dim is copied from Config.feature_dim in Dataset.__init__; it is not
# derived from the loaded data.
train_data2.feature_dim

39

In [17]:
# First 'swbd_rest.npy' training example after Dataset's mean subtraction.
# Compare with train_data2.data[0]: both partitions start with the same label
# and the printed values differ by a constant of roughly 6e-4, presumably
# because each Dataset subtracts its own feature_mean -- TODO confirm.
train_data.data[0]

array([[ 0.94147736,  0.5845694 , -0.9574372 , ...,  0.03074487,
        -0.01902566, -0.02155841],
       [ 1.1195636 ,  0.65759546, -1.3011527 , ..., -0.04385631,
        -0.18893848,  0.05795404],
       [ 1.0602016 ,  0.5845694 , -1.523557  , ..., -0.05877131,
        -0.26356038,  0.04199879],
       ...,
       [ 0.04746871,  2.0204134 ,  0.24192266, ...,  0.11575785,
         0.02330487, -0.08532371],
       [-0.05688924,  1.8847027 ,  0.7680174 , ...,  0.10428689,
         0.08478573, -0.00295285],
       [ 0.19530897,  0.45068842,  0.4666956 , ...,  0.00918745,
         0.08473071,  0.12002893]], dtype=float32)

In [19]:
# First 'swbd.npy' training example after Dataset's mean subtraction (see the
# train_data.data[0] cell for the comparison).
train_data2.data[0]

array([[ 0.94207066,  0.5851627 , -0.9568439 , ...,  0.03133819,
        -0.01843234, -0.02096509],
       [ 1.120157  ,  0.65818876, -1.3005593 , ..., -0.04326299,
        -0.18834515,  0.05854736],
       [ 1.0607951 ,  0.5851627 , -1.5229635 , ..., -0.05817799,
        -0.26296708,  0.04259211],
       ...,
       [ 0.04806203,  2.0210068 ,  0.242516  , ...,  0.11635117,
         0.02389819, -0.08473039],
       [-0.05629592,  1.8852961 ,  0.7686107 , ...,  0.10488021,
         0.08537905, -0.00235953],
       [ 0.1959023 ,  0.45128173,  0.4672889 , ...,  0.00978077,
         0.08532403,  0.12062225]], dtype=float32)

In [23]:
# Number of training examples in 'swbd.npy' (len(ids), set in Dataset.__init__).
train_data2.num_examples

9971

In [27]:
# ids holds one entry per label, so this is the number of training examples
# in 'swbd_rest.npy'.
train_data.ids.shape

(10653,)

In [28]:
# Same check for 'swbd.npy'; matches train_data2.num_examples.
train_data2.ids.shape

(9971,)