In [1]:
import sys
sys.path.append('..')
import torch
import torchtext
from torchtext.datasets import WikiText103
from torchaudio.datasets import LIBRISPEECH
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import IPython.display as ipd
import matplotlib.pyplot as plt
import math
from data.unimodal import get_librispeech_dataset, get_raw_librispeech_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the training split of the WikiText-103 dataset through torchtext.
train = WikiText103(split='train')

In [None]:
# Wrap the training split in an iterator so samples can be pulled one at a time.
train_iter = iter(train)

In [None]:
# Display the next sample; note that every re-run of this cell advances the iterator.
next(train_iter)

In [None]:
# Load the LibriSpeech "train-clean-100" subset rooted at ./data via torchaudio.
# download=True requests a download (presumably skipped when the files already exist — verify).
librispeech = LIBRISPEECH(root="./data", url="train-clean-100", download=True)

In [None]:
# Number of items in the loaded LibriSpeech subset.
len(librispeech)

In [None]:
# Peek at the first dataset item only: unpack its six fields and print one per line.
# The unpacked names (waveform, sample_rate, ...) stay in scope for the cells that follow.
for sample in librispeech:
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = sample
    print(
        f"Waveform: {waveform}",
        f"Sample rate: {sample_rate}",
        f"Utterance: {utterance}",
        f"Speaker ID: {speaker_id}",
        f"Chapter ID: {chapter_id}",
        f"Utterance ID: {utterance_id}",
        sep="\n",
    )
    # Process the audio and metadata as needed
    break  # Remove or modify this line as needed to process more data

In [None]:
# Mel-spectrogram transform built with torchaudio's default settings (no arguments passed).
mel_spectrogram = T.MelSpectrogram()

In [None]:
# Apply the transform to the waveform taken from the first LibriSpeech item.
r=mel_spectrogram(waveform)

In [None]:
# Shape of the raw waveform tensor (presumably (channels, samples) — TODO confirm).
waveform.shape

In [None]:
# Shape of the mel spectrogram (presumably (channels, n_mels, frames) — TODO confirm).
r.shape

In [None]:
# Log2-compress the spectrogram (the 1e-6 offset avoids log(0)), drop singleton
# dimensions, and render the result as a grayscale image.
log_mel = (r + 1e-6).log2().squeeze().numpy()
plt.imshow(log_mel, cmap='gray')

In [None]:
def simple_time_mask(mel_spectrogram, mask_percentage=0.05):
    """Return a copy of a spectrogram with one random contiguous span of time frames zeroed.

    The time axis is taken to be the last dimension. The span length is
    ``int(num_time_steps * mask_percentage)`` and its start is drawn uniformly
    from every position at which the span still fits inside the time axis.

    Args:
        mel_spectrogram: Tensor whose last dimension indexes time frames.
        mask_percentage: Fraction of the time axis to zero out, within [0, 1].

    Returns:
        A new tensor with the same shape as the input; the input is not modified.

    Raises:
        ValueError: If ``mask_percentage`` lies outside [0, 1].
    """
    if not 0.0 <= mask_percentage <= 1.0:
        raise ValueError(
            f"mask_percentage must be within [0, 1], got {mask_percentage}"
        )

    masked = mel_spectrogram.clone()
    num_time_steps = masked.shape[-1]
    mask_length = int(num_time_steps * mask_percentage)

    # torch.randint's upper bound is exclusive, so the +1 makes the last valid
    # start (num_time_steps - mask_length) reachable. Without it the final frame
    # could never be masked, and a full-length mask or an empty time axis raised
    # because the sampling range was empty.
    start = torch.randint(0, num_time_steps - mask_length + 1, (1,)).item()
    masked[..., start:start + mask_length] = 0  # Masking with zeros
    return masked

In [None]:
# Mask a random time span of the spectrogram (default 5%) and plot it on the
# same log2 grayscale scale as the unmasked spectrogram.
r_masked = simple_time_mask(r)
log_mel_masked = (r_masked + 1e-6).log2().squeeze().numpy()
plt.imshow(log_mel_masked, cmap='gray')

In [None]:
# Same experiment using torchaudio's built-in time masking instead of simple_time_mask.
# `s` is a second default MelSpectrogram transform, configured like `mel_spectrogram`.
s=T.MelSpectrogram()
# NOTE(review): time_mask_param=100_000 far exceeds any plausible frame count; per the
# torchaudio docs the p=0.1 cap should then bound the mask at 10% of the time axis — confirm.
# NOTE(review): iid_masks is documented to affect only 4D (batched) input — confirm it
# has any effect on the unbatched spectrogram used here.
ma=T.TimeMasking(time_mask_param=100_000, iid_masks=True, p=0.1)

In [None]:
# Compute the mel spectrogram, apply torchaudio's TimeMasking, then plot it on the
# same log2 grayscale scale as the earlier figures.
time_masked_log_mel = (ma(s(waveform)) + 1e-6).log2().squeeze().numpy()
plt.imshow(time_masked_log_mel, cmap='gray')

In [None]:
# Play the original utterance. Use the sample rate returned with the waveform by the
# dataset rather than a hard-coded 16 kHz, so playback stays correct for other audio.
ipd.display(ipd.Audio(waveform, rate=sample_rate))

In [None]:
# Zero out one random contiguous span of the raw waveform and listen to the result.
mask_fraction = 0.2  # fraction of the samples to silence
waveform_length = waveform.shape[-1]
mask_length = int(waveform_length * mask_fraction)
# torch.randint's upper bound is exclusive; the +1 lets the mask reach the final sample.
start = torch.randint(0, waveform_length - mask_length + 1, (1,)).item()
waveform_masked = waveform.clone()  # keep the original waveform intact
waveform_masked[..., start : start + mask_length] = 0
# Play back at the rate reported by the dataset instead of a hard-coded 16 kHz.
ipd.display(ipd.Audio(waveform_masked, rate=sample_rate))

In [None]:
# Build the project's LibriSpeech pipeline: scaled spectrograms with 30% masking,
# served in shuffled batches of 8.
# NOTE(review): this rebinds `librispeech`, which earlier held the raw LIBRISPEECH
# dataset; the result is presumably a DataLoader-like iterable — confirm.
librispeech = get_librispeech_dataset(
    spectrogram=True,
    scale=True,
    dataset="train-clean-100",
    mask_percentage=0.3,
    consequitive=True,
    batch_size=8,
    shuffle=True,
    num_workers=1,
)

In [None]:
# Iterator over the batches produced by get_librispeech_dataset.
librispeech_iter = iter(librispeech)

In [None]:
# Pull 10 batches from the iterator; only the last (x, y) pair is kept.
# Re-running this cell consumes 10 more batches from the same iterator.
for _ in range(10):
    x, y = next(librispeech_iter)

In [None]:
# Shape of the last batch drawn (first dimension is presumably the batch size of 8 — confirm).
x.shape

In [None]:
# Show the first 8 items of batch `x` stacked vertically: one image per row in a
# single column, with axis ticks and frames hidden.
rows, cols = 8, 1
figure = plt.figure(figsize=(8, 8))
for idx in range(rows * cols):
    ax = figure.add_subplot(rows, cols, idx + 1)  # subplot positions are 1-based
    ax.axis("off")
    ax.imshow(x[idx])
plt.show()

In [2]:
# Build the project's raw (non-spectrogram) LibriSpeech loader: shuffled batches of 8.
librispeech_raw = get_raw_librispeech_dataset(dataset="train-clean-100", batch_size=8, shuffle=True, num_workers=1)

In [3]:
# Iterator over raw-waveform batches. Creating it already emits "Collate ..." lines in
# the cell output (presumably debug prints from the project's collate function, run by
# worker prefetching — confirm and consider removing them).
librispeech_raw_iter = iter(librispeech_raw)

Collate torch.Size([187920])
Collate torch.Size([161200])


In [4]:
# Fetch the first raw-waveform batch.
batch = next(librispeech_raw_iter)

Collate torch.Size([223440])


In [8]:
# Inspect the batch shape; the recorded output shows a 2D tensor with 8 rows
# (presumably waveforms padded to a common length by the collate function — confirm).
batch.shape

torch.Size([8, 258880])