In [3]:
import torchaudio

# Open the LibriTTS "train-clean-100" subset rooted at ./data, with downloading
# enabled. Later cells index this object (`data[i]`) to get one utterance tuple.
data = torchaudio.datasets.LIBRITTS(
  root="data",
  url="train-clean-100",
  download=True,
)

In [15]:
%pip install pickle5

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25ldone
[?25h  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-macosx_11_0_arm64.whl size=119409 sha256=219ca2c1afd1b63ff5c64d8d86331c6f4db6af1dca9b67b2356b2f78dd8358b8
  Stored in directory: /Users/pulljosh/Library/Caches/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11
Note: you may need to restart the kernel to use updated packages.


# Generate training set (examples 0 - 3999 from dataset)

This will create a file called `prepared-data/dataset_4000.pkl` which contains the first 4000 examples from the dataset.

In [4]:
import os
import pickle
import scipy
import scipy.signal  # load the submodule explicitly; `import scipy` alone may not expose scipy.signal

# Make sure the output directory exists, otherwise open(..., "wb") below
# raises FileNotFoundError on a fresh checkout.
os.makedirs("prepared-data", exist_ok=True)

prepared_data = []

# Walk examples 0-3999 of `data` (the dataset object created in an earlier cell)
# in 4 chunks of 1000.
for n in range(4):
  for i in range(1000):
    item = data[i + n * 1000]
    waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id = item
    # squeeze() drops size-1 dimensions before the waveform is handed to SciPy.
    # Only the spectrogram itself is stored; frequencies/times are not saved.
    frequencies, times, spectrogram = scipy.signal.spectrogram(waveform.squeeze(), sample_rate)

    prepared_data.append({
      "waveform": waveform,
      "spectrogram": spectrogram,
      "sample_rate": sample_rate,
      "original_text": original_text,
      "normalized_text": normalized_text,
      "speaker_id": speaker_id,
      "chapter_id": chapter_id,
      "utterance_id": utterance_id
    })

  # The whole cumulative list is re-written after every chunk, so this acts as a
  # checkpoint: if a later chunk fails, the file still holds the examples
  # gathered so far. After the last chunk it holds all 4000.
  with open("prepared-data/dataset_4000.pkl", "wb") as f:
    print(f"Dumping {len(prepared_data)} examples")
    pickle.dump(prepared_data, f)


Dumping 1000 examples
Dumping 2000 examples
Dumping 3000 examples
Dumping 4000 examples


# Generate test set (examples 5000 - 5999 from dataset)

This will create a file called `prepared-data/dataset_1000.pkl` which contains 1000 examples from the dataset (indices 5000-5999).

I am using examples 5000-5999 rather than 4000-4999 because preparing 4000-4999 failed with an error saying the file was too big. My assumption (not verified) is that there are a few very large audio files in that range.

In [9]:
# Imported here as well so this cell does not depend on the training-set cell
# having been run first in the same kernel.
import os
import pickle
import scipy.signal

# Make sure the output directory exists, otherwise open(..., "wb") below
# raises FileNotFoundError on a fresh checkout.
os.makedirs("prepared-data", exist_ok=True)

prepared_data = []

for i in range(1000):
  # Offset 5000 (examples 5000-5999) instead of 4000-4999: an attempt with
  # 4000-4999 failed with a "file too big" error, presumably because of a few
  # very large audio files in that range (not verified).
  item = data[i + 5000]
  waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id = item
  # squeeze() drops size-1 dimensions before the waveform is handed to SciPy.
  # Only the spectrogram itself is stored; frequencies/times are not saved.
  frequencies, times, spectrogram = scipy.signal.spectrogram(waveform.squeeze(), sample_rate)

  prepared_data.append({
    "waveform": waveform,
    "spectrogram": spectrogram,
    "sample_rate": sample_rate,
    "original_text": original_text,
    "normalized_text": normalized_text,
    "speaker_id": speaker_id,
    "chapter_id": chapter_id,
    "utterance_id": utterance_id
  })

# Written once, after all 1000 examples have been collected.
with open("prepared-data/dataset_1000.pkl", "wb") as f:
  print(f"Dumping {len(prepared_data)} examples")
  pickle.dump(prepared_data, f)

Dumping 1000 examples


# Example: How to load the training/testing sets for use later

In [5]:
import pickle

# pickle.load can execute arbitrary code, so only use it on files produced by
# this notebook (or another trusted source).

# Test split: the list of example dicts stored in dataset_1000.pkl.
with open("prepared-data/dataset_1000.pkl", "rb") as test_file:
  testing_data = pickle.load(test_file)

print(len(testing_data))

# Training split: the list of example dicts stored in dataset_4000.pkl.
with open("prepared-data/dataset_4000.pkl", "rb") as train_file:
  training_data = pickle.load(train_file)

print(len(training_data))

1000
4000


In [6]:
# Preview the first test example. Each example is a dict with the keys
# waveform, spectrogram, sample_rate, original_text, normalized_text,
# speaker_id, chapter_id and utterance_id.
testing_data[0]

# The spectrogram and normalized_text entries are probably the most useful ones;
# the remaining fields are kept alongside them in case they are needed.

{'waveform': tensor([[-0.0021, -0.0020, -0.0018,  ..., -0.0026, -0.0024, -0.0023]]),
 'spectrogram': array([[6.1617322e-11, 2.4870866e-11, 1.4738427e-10, ..., 1.6959106e-11,
         3.0114894e-10, 6.8760858e-10],
        [2.0799680e-08, 1.8790917e-08, 1.8172292e-08, ..., 2.4452904e-08,
         3.2318614e-08, 2.7027603e-08],
        [4.7673461e-09, 2.5217244e-09, 6.8299362e-09, ..., 3.9812305e-09,
         9.4678745e-09, 8.0733180e-09],
        ...,
        [6.7064058e-15, 8.9862013e-15, 3.6173813e-14, ..., 5.9794421e-14,
         3.0979890e-13, 8.9270564e-15],
        [1.0402141e-14, 1.3616231e-15, 7.5033363e-14, ..., 1.1059050e-14,
         8.6787863e-14, 2.0568611e-14],
        [1.6667581e-14, 1.2333496e-13, 4.6033733e-15, ..., 4.9778886e-14,
         1.7889984e-15, 5.8257808e-14]], dtype=float32),
 'sample_rate': 24000,
 'original_text': "The one was saying to the other as the weary youth lay down, 'Is there anything the least wonderful or remarkable about this neighbourhood?'",
 