Steps:
1. Read the TensorFlow Datasets (TFDS) data into NumPy arrays
2. Convert the audio to float and resample it if necessary
3. Convert the audio to embeddings
4. Train and evaluate a scikit-learn model

In [None]:
# Name of the TensorFlow Datasets (TFDS) dataset; passed to `tfds.load` and
# `tfds.builder` in later cells.
tfds_dataset_name = 'savee'  #@param
# Sample rate the audio is expected to have; audio whose dataset sample rate
# differs from this value is resampled in a later cell.
REQUIRED_SAMPLE_RATE_ = 16000

In [None]:
# Read the data into numpy arrays.
import collections
# Record for one dataset split: the per-example audio, labels and speaker IDs.
SingleSplit = collections.namedtuple(
    'SingleSplit', ['audio', 'labels', 'speaker_id'])
# Record grouping the three dataset splits.
Data = collections.namedtuple(
    'Data', ['train', 'validation', 'test'])

import tensorflow.compat.v2 as tf
# Switch TensorFlow to v2 behavior and fail fast if eager execution is not on.
tf.enable_v2_behavior()
assert tf.executing_eagerly()
import tensorflow_datasets as tfds
def _dat_from_split(split):
  """Load one split of the TFDS dataset fully into memory.

  Args:
    split: Name of the split to load; forwarded to `tfds.load`.

  Returns:
    An `(audio, labels, speaker_id)` triple in dataset iteration order.
    `audio` is a tuple with one entry per example, `labels` is a 1-D int16
    array and `speaker_id` is a 1-D array.
  """
  import numpy as np

  examples = tfds.as_numpy(tfds.load(tfds_dataset_name, split=split))
  # Transpose the per-example triples into three parallel sequences.
  audio, raw_labels, raw_speaker_ids = zip(
      *[(ex['audio'], ex['label'], ex['speaker_id']) for ex in examples])

  labels = np.array(raw_labels, dtype=np.int16)
  speaker_id = np.array(raw_speaker_ids)

  # Every example must contribute exactly one scalar label and speaker ID.
  num_examples = len(audio)
  assert num_examples == labels.size == speaker_id.size
  assert labels.ndim == speaker_id.ndim == 1
  print(f'Finished {split}')
  return audio, labels, speaker_id

# Load the splits in order (train, validation, test) and bundle them into a
# single `Data` record of `SingleSplit`s.
all_data = Data(*(
    SingleSplit(*_dat_from_split(split_name))
    for split_name in ('train', 'validation', 'test')))

In [None]:
# Make the audio floats, and resample the audio if necessary.
import collections
import librosa
import numpy as np
# Record holding the float-converted audio for each of the three splits.
FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])

# Sample rate of the dataset's 'audio' feature, read from the TFDS metadata.
sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate
def _int_to_float(audio_int16, split_name):
  """Convert integer audio to float32 at `REQUIRED_SAMPLE_RATE_`.

  Each clip is cast to float32 and divided by the int16 maximum. If the
  dataset's `sample_rate` differs from `REQUIRED_SAMPLE_RATE_`, the clip is
  then resampled to `REQUIRED_SAMPLE_RATE_` with librosa.

  Args:
    audio_int16: Sequence of numpy arrays of audio samples, one per clip.
    split_name: Name of the split; used only in the progress messages.

  Returns:
    A list with one converted (and, if needed, resampled) array per clip, in
    input order.
  """
  # Loop invariants: the scale factor and whether resampling is needed at all.
  int16_max = np.iinfo(np.int16).max
  needs_resampling = sample_rate != REQUIRED_SAMPLE_RATE_

  float_audio_16k = []
  for i, samples in enumerate(audio_int16):
    float_audio = samples.astype(np.float32) / int16_max
    if needs_resampling:
      # Resample to the same constant the guard checks, so changing
      # REQUIRED_SAMPLE_RATE_ cannot leave the audio at a stale rate.
      float_audio = librosa.core.resample(
          float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,
          res_type='kaiser_best')
    float_audio_16k.append(float_audio)
    if i % 50 == 0:
      print(f'Finished resampling {i} / {len(audio_int16)} for {split_name}')
  print(f'Finished {split_name}')
  return float_audio_16k


# Convert the audio of every split in order (train, validation, test).
float_audio_16k = FloatData(*(
    _int_to_float(getattr(all_data, split_name).audio, split_name)
    for split_name in ('train', 'validation', 'test')))

In [None]:
# TF-Hub handle of the embedding model; passed to `hub.load` in a later cell.
tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1'  #@param
# Key used to pick the embedding out of the model's output.
output_key = 'embedding'  #@param

In [None]:
# Convert the audio to embeddings. Preaverage the embeddings across time.
import tensorflow_hub as hub
# Load the embedding model named by `tfhub_model_name` from TF-Hub.
model = hub.load(tfhub_model_name)

In [None]:
import collections
# Record holding the embedding matrix computed for each of the three splits.
Embeddings = collections.namedtuple(
    'Embeddings', ['train', 'validation', 'test'])

def _calc_embeddings(cur_float_audio, split_name):
  """Compute one time-averaged embedding per audio clip.

  Each clip is passed to the loaded TF-Hub `model` together with
  `REQUIRED_SAMPLE_RATE_`. The 2-D output selected by `output_key` is averaged
  over its first axis to give a single 1-D vector per clip.

  Args:
    cur_float_audio: Sequence of float audio clips, one array per clip.
    split_name: Name of the split; used only in the progress messages.

  Returns:
    A float32 numpy array stacking the per-clip averaged embeddings in input
    order.
  """
  # Pass the rate the audio was prepared at rather than a separate literal, so
  # this stays in sync with the resampling step.
  sample_rate_tensor = tf.constant(REQUIRED_SAMPLE_RATE_, tf.int32)

  cur_embeddings = []
  for i, float_samples in enumerate(cur_float_audio):
    tf_out = model(tf.constant(float_samples, tf.float32), sample_rate_tensor)
    embedding_2d = tf_out[output_key]
    assert embedding_2d.ndim == 2
    # Collapse the first axis so every clip yields a fixed-size vector.
    embedding_1d = np.mean(embedding_2d, axis=0)
    cur_embeddings.append(embedding_1d)
    if i % 50 == 0:
      print(f'Finished embedding {i} / {len(cur_float_audio)} for {split_name}')
  print(f'Finished {split_name}')
  cur_embeddings = np.array(cur_embeddings, dtype=np.float32)
  return cur_embeddings

# Embed every split in order (train, validation, test).
embeddings = Embeddings(*(
    _calc_embeddings(getattr(float_audio_16k, split_name), split_name)
    for split_name in ('train', 'validation', 'test')))

# All splits must share the same embedding dimensionality.
assert (embeddings.train.shape[1]
        == embeddings.validation.shape[1]
        == embeddings.test.shape[1])
# Each split needs exactly one embedding per label and per speaker ID.
assert all(
    getattr(embeddings, split_name).shape[0]
    == getattr(all_data, split_name).labels.shape[0]
    == getattr(all_data, split_name).speaker_id.shape[0]
    for split_name in ('train', 'validation', 'test'))
# No split may contain NaN embeddings.
assert not any(
    np.isnan(split_embeddings).any() for split_embeddings in embeddings)

In [None]:
# Key selecting which classifier configuration `get_sklearn_model` builds.
model_name = 'LogisticRegression_balanced'  #@param

In [None]:
from sklearn import linear_model

def get_sklearn_model(model_name):
  """Build a fresh, unfitted sklearn classifier for `model_name`.

  Args:
    model_name: Either 'LogisticRegression' or 'LogisticRegression_balanced'.

  Returns:
    A new `linear_model.LogisticRegression` instance configured for the name.

  Raises:
    KeyError: If `model_name` is not one of the supported names.
  """
  # Per-model keyword overrides on top of the shared configuration below.
  overrides = {
      'LogisticRegression': {},
      'LogisticRegression_balanced': {'class_weight': 'balanced'},
  }[model_name]
  return linear_model.LogisticRegression(
      C=1e5, solver='lbfgs', multi_class='multinomial', **overrides)

def _speaker_normalization(embedding_np, speaker_id_np):
  """Normalize embedding features by per-speaker statistics."""
  all_speaker_ids = np.unique(speaker_id_np)
  for speaker in all_speaker_ids:
    cur_i = speaker_id_np == speaker
    embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)
    stds = embedding_np[cur_i].std(axis=0)
    stds[stds == 0] = 1
    embedding_np[cur_i] /= stds

  return embedding_np

# Train: build the classifier selected by `model_name` and fit it on the
# speaker-normalized training embeddings.
d = get_sklearn_model(model_name)
normalized_train = _speaker_normalization(
    embeddings.train, all_data.train.speaker_id)
d.fit(normalized_train, all_data.train.labels)

# Eval: score on the validation split. The split is normalized with its own
# per-speaker statistics, not those of the training split.
normalized_validation = _speaker_normalization(
    embeddings.validation, all_data.validation.speaker_id)
eval_score = d.score(normalized_validation, all_data.validation.labels)
print(f'{model_name} eval score: {eval_score}')

# Test: score on the test split, again normalized with its own per-speaker
# statistics.
normalized_test = _speaker_normalization(
    embeddings.test, all_data.test.speaker_id)
test_score = d.score(normalized_test, all_data.test.labels)
print(f'{model_name} test score: {test_score}')