In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import librosa
import pandas as pd
import numpy as np
from tacotron2.model import Tacotron2
from tacotron2.hparams import create_hparams
from tacotron2.loss_function import Tacotron2Loss
from tacotron2.data_utils import TextMelCollate, TextMelLoader
from torch.nn.utils.rnn import pad_sequence

In [43]:
def tokenize_and_pad(text_list, tokenizer, pad_token=0):
    tokenized = [torch.tensor(tokenizer(text), dtype=torch.long) for text in text_list]
    padded = pad_sequence(tokenized, batch_first=True, padding_value=pad_token)
    return padded

In [44]:
# Dummy tokenizer function (replace with actual tokenizer as needed)
def dummy_tokenizer(text):
    # text_ecoded = TextMelLoader(text)
    return [ord(char) for char in text] 

In [45]:
# Function to extract prosodic features from Nepali raw audio
def extract_prosody_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    pitch = librosa.pyin(y, fmin=50, fmax=300, sr=sr)[0]  # F0
    energy = librosa.feature.rms(y=y)[0]  # Root mean square energy
    duration = len(y) / sr  # Duration in seconds
    return {
        "pitch": pitch / np.max(pitch),
        "energy": energy / np.max(energy),
        "duration": duration,
    }

In [46]:
# Custom Dataset to include prosody embeddings from Nepali audio
class ProsodyTextMelDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract Nepali audio path, English text, and English audio path from CSV
        nepali_audio_path = self.data.iloc[idx]["nepali_audio"]
        english_text = self.data.iloc[idx]["english_text"]
        english_audio_path = self.data.iloc[idx]["english_audio"]

        # Load mel spectrogram for English audio
        y_english, sr_english = librosa.load(english_audio_path, sr=None)
        mel = librosa.feature.melspectrogram(y=y_english, sr=sr_english)
        mel = torch.tensor(mel, dtype=torch.float32)

        # Extract prosodic features from Nepali audio
        prosody_features = extract_prosody_features(nepali_audio_path)
        prosody_embedding = torch.tensor([
            np.mean(prosody_features["pitch"]),
            np.mean(prosody_features["energy"]),
            prosody_features["duration"]
        ], dtype=torch.float32)

        return english_text, mel, prosody_embedding

In [47]:
# Modified Tacotron2 model to accept prosody embeddings
class Tacotron2WithProsody(Tacotron2):
    def __init__(self, hparams):
        super(Tacotron2WithProsody, self).__init__(hparams)
        # Adding a prosody conditioning layer
        self.prosody_embedding_layer = nn.Linear(3, hparams.decoder_rnn_dim)

    def parse_batch(self, batch):
        text_padded, input_lengths, mel_padded, gate_padded, output_lengths, prosody = batch
        return (
            (text_padded, input_lengths, mel_padded, output_lengths, prosody),
            (mel_padded, gate_padded)
        )

    def forward(self, inputs):
        text_inputs, text_lengths, mels, output_lengths, prosody = inputs

        # Original Tacotron2 forward pass
        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, text_lengths)

        # Condition decoder on prosody embeddings
        prosody_embedding = self.prosody_embedding_layer(prosody)
        decoder_inputs = self.decoder.prenet(mels[:, :-1, :])
        decoder_inputs = decoder_inputs + prosody_embedding.unsqueeze(1)

        mel_outputs, gate_outputs, alignments = self.decoder(
            decoder_inputs, encoder_outputs, memory_lengths=text_lengths
        )
        mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs

        return mel_outputs, mel_outputs_postnet, gate_outputs, alignments


In [48]:
def train_tacotron2_with_prosody(model, dataset, hparams, checkpoint_path, epochs=50):
    # Initialize DataLoader and optimizer
    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    dataloader = DataLoader(dataset, batch_size=hparams.batch_size, shuffle=True, collate_fn=collate_fn)
    optimizer = optim.Adam(model.parameters(), lr=hparams.learning_rate)
    criterion = Tacotron2Loss()

    # Move model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for batch in dataloader:
            # Parse batch and move to device
            inputs, targets = model.parse_batch(batch)
            inputs = tuple(i.to(device) for i in inputs)
            targets = tuple(t.to(device) for t in targets)

            # Forward pass
            optimizer.zero_grad()
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = model(inputs)

            # Compute loss
            loss = criterion((mel_outputs, mel_outputs_postnet, gate_outputs), targets)
            epoch_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(dataloader)}")

        # Save checkpoint
        torch.save({'state_dict': model.state_dict()}, checkpoint_path)


In [49]:
# def create_hparams():
#     hparams = {
#         'batch_size': 16,
#         'n_frames_per_step': 1,
#         'learning_rate': 1e-3,
#         'decoder_rnn_dim': 1024,
#         'max_wav_value': 32768.0,
#         'sampling_rate': 22050,
#         'mel_fmin': 0.0,
#         'mel_fmax': 8000.0,
#     }
#     return hparams

In [50]:
# Example usage
if __name__ == "__main__":
    # Hyperparameters
    hparams = create_hparams()

    # Load dataset from CSV
    csv_file = "dataset.csv"  # CSV file containing Nepali audio, English text, and English audio paths
    dataset = ProsodyTextMelDataset(csv_file)

    # Initialize model
    model = Tacotron2WithProsody(hparams)

    # Train model
    train_tacotron2_with_prosody(model, dataset, hparams, "tacotron2_with_prosody_checkpoint.pth")


AttributeError: 'str' object has no attribute 'size'

In [21]:
df = pd.read_csv('./dataset.csv')

In [22]:
df['english_audio'] = df['english_audio'].apply(lambda x: "./audio/" + x)

In [23]:
df['nepali_audio'] = df['nepali_audio'].apply(lambda x: "./audio/" + x)

In [24]:
df.head()

Unnamed: 0,nepali_audio,english_text,english_audio
0,./audio/PRB_Nep_01_Bhojpur_01m4a,Hello friends Today we are embarking on a jour...,./audio/PRB_Eng_01_Bhojpur_01m4a
1,./audio/PRB_Nep_01_Bhojpur_02m4a,This district is located in Province No 1 of N...,./audio/PRB_Eng_01_Bhojpur_02m4a
2,./audio/PRB_Nep_01_Bhojpur_03m4a,The natural beauty of Bhojpur is captivating u...,./audio/PRB_Eng_01_Bhojpur_03m4a
3,./audio/PRB_Nep_01_Bhojpur_04m4a,We plan to visit some of the famous temples here,./audio/PRB_Eng_01_Bhojpur_04m4a
4,./audio/PRB_Nep_01_Bhojpur_05m4a,Our first stop an ancient temple in Bhojpur Th...,./audio/PRB_Eng_01_Bhojpur_05m4a


In [40]:
df.to_csv('dataset.csv', index=False)