In [3]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram
from transformers import PreTrainedTokenizerFast
from tacotron2_model import Tacotron2 
import torchaudio

In [12]:
# Define custom dataset
class EnglishSpeechDataset(Dataset):
    def __init__(self, csv_file, tokenizer, mel_transform):
        self.data = pd.read_csv(csv_file)
        self.audio_paths = self.data["english_audio"].values
        self.texts = self.data["english_text"].values
        self.tokenizer = tokenizer
        self.mel_transform = mel_transform

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        text = self.texts[idx]

        # Tokenize text
        text_ids = self.tokenizer(text, return_tensors="pt").input_ids.squeeze(0)

        # Load audio and compute Mel spectrogram
        waveform, sample_rate = torchaudio.load(audio_path)
        mel_spec = self.mel_transform(waveform).squeeze(0)

        return text_ids, mel_spec

In [13]:
# Collate function to handle batching
def collate_fn(batch):
    text_batch, mel_batch = zip(*batch)

    # Pad text and mel spectrogram sequences
    text_lens = torch.tensor([len(text) for text in text_batch])
    mel_lens = torch.tensor([mel.size(-1) for mel in mel_batch])  # Sequence lengths (time dimension)

    text_padded = torch.nn.utils.rnn.pad_sequence(text_batch, batch_first=True, padding_value=0)
    
    # Check mel feature consistency (n_mels)
    max_n_mels = max(mel.size(0) for mel in mel_batch)
    mel_padded = []
    for mel in mel_batch:
        if mel.size(0) < max_n_mels:
            pad = torch.zeros((max_n_mels - mel.size(0), mel.size(1)))  # Pad frequency bins
            mel = torch.cat([mel, pad], dim=0)
        mel_padded.append(mel)
    mel_padded = torch.nn.utils.rnn.pad_sequence(mel_padded, batch_first=True, padding_value=0)

    return text_padded, text_lens, mel_padded, mel_lens


In [14]:
# Model training loop
def train_tacotron2(csv_path, model, tokenizer, epochs=10, batch_size=16, learning_rate=1e-4):
    # Prepare dataset and data loader
    mel_transform = MelSpectrogram(sample_rate=22050, n_fft=1024, hop_length=256, n_mels=80)
    dataset = EnglishSpeechDataset(csv_path, tokenizer, mel_transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Define optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for batch in dataloader:
            text_padded, text_lens, mel_padded, mel_lens = batch

            # Move data to device
            text_padded = text_padded.to(model.device)
            mel_padded = mel_padded.to(model.device)

            # Forward pass
            optimizer.zero_grad()
            mel_pred, mel_postnet, _ = model(text_padded, text_lens, mel_padded, mel_lens)

            # Compute loss
            loss = criterion(mel_pred, mel_padded) + criterion(mel_postnet, mel_padded)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

In [15]:
# Main script
if __name__ == "__main__":
    csv_path = "dataset.csv"  # Path to your CSV file

    # Load pre-trained tokenizer
    tokenizer = PreTrainedTokenizerFast.from_pretrained("t5-small")

    # Initialize Tacotron2 model
    model = Tacotron2()
    model.to(torch.device("cpu"))

    # Train the model
    train_tacotron2(csv_path, model, tokenizer, epochs=10, batch_size=16, learning_rate=1e-4)

    # Save the model
    torch.save(model.state_dict(), "tacotron2_model.pth")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


RuntimeError: The size of tensor a (877) must match the size of tensor b (1489) at non-singleton dimension 1