## Setup

In [1]:
!pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m297.0/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━

In [2]:
import json, datasets, torchaudio, torch, os
import plotly.express as px

In [3]:
# Connect to Drive repository: mount Google Drive at /content/drive so the
# dataset folders stored there can be read and written like local files.
# force_remount=True is passed so the mount is redone if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Project folder on the mounted Drive. The path is relative, so it only
# resolves while the working directory is the parent of `drive/`
# (NOTE(review): assumes the notebook runs from /content — confirm).
PROJECT_DIR = 'drive/MyDrive/MLMI2'

# Raw inputs (zipped WAVs, JSON manifests) and processed outputs
RAW_DATASET_DIR = f'{PROJECT_DIR}/raw_data'
PROC_DATASET_DIR = f'{PROJECT_DIR}/proc_data'

# Local folder that holds the extracted WAV files
WAV_DIR = 'wav'

In [5]:
# Unzip TRAIN.zip and TEST.zip
!unzip -q {RAW_DATASET_DIR}/wav/TRAIN.zip -d ./wav/
!unzip -q {RAW_DATASET_DIR}/wav/TEST.zip -d ./wav/

## Load dataset

In [6]:
def load_dataset(path):
  ''' Read a JSON manifest and turn it into a `datasets.Dataset`.

  The JSON file maps each sample ID to a dict of features that includes a
  'wav' path. Every record gets its ID stored under 'sample_id', and its
  'wav' path is rewritten to WAV_DIR followed by the last four components
  of the original path.
  '''
  with open(path, 'r') as manifest_file:
    manifest = json.load(manifest_file)

  # One record per sample: original features + ID + local WAV path
  records = []
  for sample_id, features in manifest.items():
    wav_tail = '/'.join(features['wav'].split('/')[-4:])
    records.append({
        **features,
        'sample_id': sample_id,
        'wav': f'{WAV_DIR}/' + wav_tail,
    })

  return datasets.Dataset.from_list(records)

In [7]:
# Load the three splits; each split name maps to the stem of its JSON
# manifest (the validation split is stored as dev.json)
manifest_stems = {'train': 'train', 'valid': 'dev', 'test': 'test'}
dataset = datasets.DatasetDict({
    split_name: load_dataset(f'{RAW_DATASET_DIR}/json/{stem}.json')
    for split_name, stem in manifest_stems.items()
})

In [8]:
# Display the splits with their columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id'],
        num_rows: 3696
    })
    valid: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id'],
        num_rows: 400
    })
    test: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id'],
        num_rows: 944
    })
})

In [9]:
# Inspect the first training sample to check the record layout
dataset['train'][0]

{'wav': 'wav/TRAIN/DR1/FVFB0/SX222.WAV',
 'duration': 2.5344375,
 'spk_id': 'FVFB0',
 'phn': 'sil y aa w ih z sil k ih m ah sil w ah th sil p ae th sil l aa sil jh ih sil k l ih sil g z ae m sil p l sil s sil',
 'wrd': 'you always come up with pathological examples',
 'ground_truth_phn_ends': '2320 3007 3634 4344 5200 6280 6790 7490 7905 8533 9880 11006 11240 11808 12257 13120 13738 15540 16457 16800 18339 20428 20840 21505 22230 23330 23680 25148 26151 26650 26903 27947 30560 31080 32167 32668 33960 35090 38490 40480',
 'sample_id': 'FVFB0_SX222.WAV'}

## Get output vocabulary (unique phones)

In [10]:
# Count how often each phone occurs in the training transcripts.
# '_' is seeded with a zero count so it is the first key of the dict
# (NOTE(review): presumably the blank/padding symbol — confirm with the model code).
phn_counts = {'_': 0}
for transcript in dataset['train']['phn']:
  for phone in transcript.split(' '):
    phn_counts[phone] = phn_counts.get(phone, 0) + 1

In [11]:
# Create a vocabulary file: one symbol per line, in the order the symbols
# were first inserted into phn_counts
output_vocabulary = list(phn_counts.keys())

# Make sure the output folder exists so a fresh Drive does not raise
# FileNotFoundError on the first run
os.makedirs(PROC_DATASET_DIR, exist_ok=True)
with open(f'{PROC_DATASET_DIR}/vocab_39.txt', 'w') as f:
  f.write('\n'.join(output_vocabulary))

In [34]:
# Plot phone frequencies: one bar per phone, bar height = its count
phone_names = list(phn_counts)
phone_totals = [phn_counts[name] for name in phone_names]
fig = px.histogram(x=phone_names, y=phone_totals)

# Titles, bar spacing and figure size
phone_plot_layout = dict(
    title_text='Phone Frequencies in Training data',
    title_x=0.5,
    xaxis_title_text='Phones',
    yaxis_title_text='Frequency',
    bargap=0.2,
    width=1100,
    height=700,
)
fig.update_layout(**phone_plot_layout)

fig.show()

In [31]:
# Get the distribution of speaker groups (DR1..DR8) in the training data
speaker_groups = {k: 0 for k in ['DR1', 'DR2', 'DR3', 'DR4', 'DR5', 'DR6', 'DR7', 'DR8']}

# Only the 'wav' column is needed, so iterate over that column instead of
# materializing every full sample dict
for wav_path in dataset['train']['wav']:
  # Paths have the form <WAV_DIR>/TRAIN/<group>/<speaker>/<file>, so the group
  # is the third component from the end; this avoids hard-coding the prefix
  group_id = wav_path.split('/')[-3]
  speaker_groups[group_id] += 1

print(speaker_groups)

{'DR1': 304, 'DR2': 608, 'DR3': 608, 'DR4': 544, 'DR5': 560, 'DR6': 280, 'DR7': 616, 'DR8': 176}


In [35]:
# Plot the number of training samples per speaker group
group_names = list(speaker_groups)
group_totals = [speaker_groups[name] for name in group_names]
fig = px.histogram(x=group_names, y=group_totals)

# Titles, bar spacing and figure size
group_plot_layout = dict(
    title_text='Number of Samples per Speaker Group in Training data',
    title_x=0.5,
    xaxis_title_text='Speaker Group',
    yaxis_title_text='Number of Samples',
    bargap=0.2,
    width=800,
    height=500,
)
fig.update_layout(**group_plot_layout)

fig.show()

## Tokenize outputs

In [None]:
# Transform phone transcripts into target token IDs
def get_tokens(sample):
  ''' Add a 'tokens' entry holding the vocabulary index of each phone in 'phn'.

  A phone that is missing from output_vocabulary raises ValueError.
  '''
  sample['tokens'] = [
      output_vocabulary.index(phone) for phone in sample['phn'].split(' ')
  ]
  return sample

tokenized_dataset = dataset.map(get_tokens)

Map:   0%|          | 0/3696 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

In [None]:
# Check that every split now carries the added 'tokens' column
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id', 'tokens'],
        num_rows: 3696
    })
    valid: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id', 'tokens'],
        num_rows: 400
    })
    test: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'ground_truth_phn_ends', 'sample_id', 'tokens'],
        num_rows: 944
    })
})

## Extract MFCC features from WAV samples

In [None]:
def extract_features(sample):
  ''' Extract normalized acoustic features from a WAV file and save them.

  NOTE: the "mfcc" naming is kept so the 'mfcc_path' column and the
  'mfcc_features/' file layout stay unchanged, but the features are computed
  with `torchaudio.compliance.kaldi.fbank`, i.e. filterbank features rather
  than MFCCs.

  Args:
    sample: dict with a 'wav' file path and a 'sample_id'.

  Returns:
    The same dict with 'mfcc_path' set to the path of the saved tensor.
    The 'mfcc_features/' folder must already exist.
  '''
  # Load in WAV audio; only the first returned item (the waveform) is kept
  wav = torchaudio.load(sample['wav'])[0]

  # Filterbank features with torchaudio's default parameters
  # NOTE(review): the defaults are documented as 25 ms frames, 10 ms shift and
  # a 16 kHz sample rate — confirm the WAV files are 16 kHz
  mfcc = torchaudio.compliance.kaldi.fbank(wav)

  # Normalize each feature dimension over dim 0 of this utterance
  mfcc_mean = torch.mean(mfcc, dim=0, keepdim=True)
  mfcc_std = torch.std(mfcc, dim=0, keepdim=True)
  # Clamp the std so a constant feature dimension (std == 0) normalizes to 0
  # instead of producing NaN/inf; non-degenerate values are unaffected
  mfcc_norm = (mfcc - mfcc_mean) / torch.clamp(mfcc_std, min=1e-8)

  # Save normalized features locally and record where they were written
  mfcc_save_path = f'mfcc_features/{sample["sample_id"]}.pt'
  torch.save(mfcc_norm, mfcc_save_path)
  sample['mfcc_path'] = mfcc_save_path

  return sample

In [None]:
# Run feature extraction over every split, one sample at a time.
# The output folder used by extract_features is created first; the
# 'ground_truth_phn_ends' column is dropped from the result.
features_dir = 'mfcc_features'
os.makedirs(features_dir, exist_ok=True)

mfcc_dataset = tokenized_dataset.map(
    extract_features,
    batched=False,
    remove_columns=['ground_truth_phn_ends'],
)

Map:   0%|          | 0/3696 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/944 [00:00<?, ? examples/s]

In [None]:
# Check the columns: 'mfcc_path' added, 'ground_truth_phn_ends' removed
mfcc_dataset

DatasetDict({
    train: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'sample_id', 'tokens', 'mfcc_path'],
        num_rows: 3696
    })
    valid: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'sample_id', 'tokens', 'mfcc_path'],
        num_rows: 400
    })
    test: Dataset({
        features: ['wav', 'duration', 'spk_id', 'phn', 'wrd', 'sample_id', 'tokens', 'mfcc_path'],
        num_rows: 944
    })
})

In [None]:
# Persist the processed DatasetDict in the processed-data folder on Drive
mfcc_dataset_dir = f'{PROC_DATASET_DIR}/mfcc_dataset'
mfcc_dataset.save_to_disk(mfcc_dataset_dir)

Saving the dataset (0/1 shards):   0%|          | 0/3696 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/944 [00:00<?, ? examples/s]

In [None]:
# Save extracted feature to disk
!zip -r --quiet {PROC_DATASET_DIR}/mfcc_features/mfcc_features.zip mfcc_features/

## Save processed datasets as json

In [None]:
def dataset_to_json(dataset, json_path):
  ''' Dump a dataset to a JSON file as a list of row dicts.

  Args:
    dataset: object exposing `to_list()` (e.g. a `datasets.Dataset`).
    json_path: destination file; missing parent folders are created.
  '''
  # Create the parent folder if needed; a bare filename has no parent to create
  parent_dir = os.path.dirname(json_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  data_as_list = dataset.to_list()
  with open(json_path, 'w') as f:
    json.dump(data_as_list, f, indent=2)

In [None]:
# Write one JSON file per split, named after the split
for split_name in ('train', 'valid', 'test'):
  dataset_to_json(
      mfcc_dataset[split_name],
      json_path=f'{PROC_DATASET_DIR}/json/{split_name}.json',
  )