<a href="https://colab.research.google.com/github/SamJ70/BYOP/blob/main/umeed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchaudio transformers librosa tqdm numpy
!pip install phonemizer pydub


Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-1.2.1-py3-none-any.whl.metadata (1.1 kB)
Collecting clldutils>=1.7.3 (from segments->phonemizer)
  Downloading clldutils-3.24.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.5.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting colorlog (from clldutils>=1.7.3->segments->phonemizer)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting bibtexparser>=2.0.0b4 (from clldutils>=1.7.3->segments->phonemizer)
  Downloading bibtexparser-2.0.0b8-py3-none-any.whl.

In [None]:
import os
import librosa
import numpy as np
from phonemizer import phonemize
from tqdm import tqdm
import pickle
# Paths
# Both locations live under /content/drive, so this cell assumes Google Drive
# is already mounted there (no mount call is visible in this notebook - TODO confirm).
# Corpus root: every sub-directory is treated as one subject by the loop below.
dataset_path = "/content/drive/MyDrive/nus-smc-corpus_48"
# Directory that receives the pickled preprocessing result.
output_path = "/content/drive/MyDrive"
os.makedirs(output_path, exist_ok=True)

# Helper functions
def preprocess_audio(audio_file, sr=22050, n_fft=2048, hop_length=512, n_mels=80):
    """Load an audio file and return its log-scaled (dB) Mel spectrogram.

    Args:
        audio_file: Path of the audio file to load.
        sr: Sample rate the audio is resampled to while loading.
        n_fft: FFT size used for the underlying STFT.
        hop_length: Number of samples between successive frames.
        n_mels: Number of Mel bands.

    Returns:
        The Mel power spectrogram converted to decibels with
        ``librosa.power_to_db``. With the defaults this is the same result the
        previous hard-coded implementation produced.
    """
    waveform, sample_rate = librosa.load(audio_file, sr=sr)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    # Anything that inverts this spectrogram back to audio must undo the dB
    # scaling first and reuse the same sr / n_fft / hop_length.
    return librosa.power_to_db(mel_power)

def preprocess_annotations(annotation_file):
    """Read a time-aligned annotation file and return its phoneme labels.

    Each usable line is whitespace-separated and carries the phoneme label in
    its third column (index 2). Lines with fewer than three fields, such as
    blank or trailing lines, are skipped instead of raising ``IndexError``.

    Args:
        annotation_file: Path of the annotation text file.

    Returns:
        All phoneme labels, in file order, joined by single spaces. An empty
        string if the file contains no usable line.
    """
    labels = []
    with open(annotation_file, "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                # Blank or malformed line: there is no phoneme column to read.
                continue
            labels.append(fields[2])
    return " ".join(labels)

# Process dataset
# Every sub-directory of dataset_path is one subject; only its "sing" folder is
# used. Listings are sorted so the resulting dataset order is reproducible
# (os.listdir returns entries in arbitrary order).
data = []
subjects = sorted(
    d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))
)

for subject in tqdm(subjects):
    subject_path = os.path.join(dataset_path, subject)
    sing_folder = os.path.join(subject_path, "sing")

    if not os.path.exists(sing_folder):
        print(f"Skipping {subject}: 'sing' folder not found.")
        continue

    for audio_file in sorted(os.listdir(sing_folder)):
        if not audio_file.endswith(".wav"):
            continue

        audio_file_path = os.path.join(sing_folder, audio_file)
        # Swap only the final extension. str.replace(".wav", ".txt") would also
        # rewrite a ".wav" that appears earlier in the file name.
        annotation_file_path = os.path.splitext(audio_file_path)[0] + ".txt"

        if not os.path.exists(annotation_file_path):
            print(f"Skipping {audio_file}: No corresponding annotation file.")
            continue

        # Preprocess audio and annotations
        mel_spec = preprocess_audio(audio_file_path)
        phonemes = preprocess_annotations(annotation_file_path)

        # Stored as (mel_spec, phonemes); MelodyDataset unpacks in this order.
        data.append((mel_spec, phonemes))

# Save processed data
output_file = os.path.join(output_path, "preprocessed_data.pkl")
with open(output_file, "wb") as f:
    pickle.dump(data, f)

print(f"Preprocessing complete! Data saved at {output_file}.")

In [None]:
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
class MelodyDataset(Dataset):
    """Dataset over preprocessed ``(mel_spectrogram, phonemes)`` records.

    Records are stored mel-first, but items are returned text-first as
    ``(phonemes, mel_tensor)``.
    """

    def __init__(self, data):
        # Sequence of (mel_spectrogram, phonemes) records; kept by reference.
        self.data = data

    def __len__(self):
        """Return the number of stored records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(phoneme_string, float32_mel_tensor)`` for record ``idx``."""
        mel_values, phoneme_text = self.data[idx]
        phoneme_text = str(phoneme_text)  # the text side is always a plain string
        mel_array = np.array(mel_values, dtype=np.float32)
        mel_tensor = torch.tensor(mel_array)
        return phoneme_text, mel_tensor

# Pad variable-length tensors in a batch
def collate_fn(batch):
    """Collate ``(phonemes, mel)`` items into one batch.

    Every mel tensor is right-padded with zeros along dimension 1 up to the
    longest item in the batch, then all of them are stacked.

    Returns:
        ``(phoneme_list, stacked_mels)``.
    """
    texts, mels = zip(*batch)
    longest = max(mel.shape[1] for mel in mels)

    padded = []
    for mel in mels:
        missing = longest - mel.shape[1]
        padded.append(
            torch.nn.functional.pad(mel, (0, missing), mode="constant", value=0)
        )

    return list(texts), torch.stack(padded)

# Load preprocessed data
# NOTE: pickle.load can execute arbitrary code. Only load a file that was
# written by this notebook's own preprocessing cell.
with open("/content/drive/MyDrive/preprocessed_data.pkl", "rb") as f:
    data = pickle.load(f)

# Create Dataset and DataLoader with padding.
# Each record in `data` is (mel_spectrogram, phonemes); MelodyDataset returns
# them as (phonemes, mel_tensor) and collate_fn pads the mels per batch.
dataset = MelodyDataset(data)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)


In [None]:
import torch.nn as nn
from transformers import MBartForConditionalGeneration, MBartTokenizer

# Load the pretrained MBart checkpoint and its tokenizer from the Hugging Face Hub.
# Both objects are module-level globals that the TacotronWrapper class reads.
model_name = "facebook/mbart-large-cc25"
tokenizer = MBartTokenizer.from_pretrained(model_name)
text_to_melody_model = MBartForConditionalGeneration.from_pretrained(model_name)

# The TacotronWrapper that wraps this model is defined in the next cell.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
class TacotronWrapper(nn.Module):
    """Wrap a seq2seq text model so it can be called on a list of phoneme strings.

    NOTE(review): ``forward`` returns the wrapped model's ``logits``. For the
    MBart checkpoint loaded in this notebook the printed shape is
    ``[1, 869, 250027]``, i.e. (batch, tokens, vocabulary) - not an 80-band mel
    spectrogram. The variable name below is kept from the original code.

    Args:
        text_model: Model to wrap. Defaults to the notebook-level
            ``text_to_melody_model``.
        text_tokenizer: Tokenizer matching ``text_model``. Defaults to the
            notebook-level ``tokenizer``.
        max_time_steps: Upper bound applied to dimension 2 of the logits.
    """

    def __init__(self, text_model=None, text_tokenizer=None, max_time_steps=8000):
        super().__init__()
        # Fall back to the notebook globals only when nothing is injected, so
        # TacotronWrapper() keeps working exactly as before.
        self.text_to_melody = text_to_melody_model if text_model is None else text_model
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.max_time_steps = max_time_steps

    def forward(self, phonemes):
        """Tokenize ``phonemes`` and return the (possibly truncated) model logits.

        Args:
            phonemes: List of phoneme strings, one per batch item.

        Returns:
            ``outputs.logits`` of the wrapped model, cut to at most
            ``max_time_steps`` entries along dimension 2.

        Raises:
            ValueError: If ``phonemes`` is not a list, or if a token id is not
                smaller than the tokenizer's vocabulary size.
        """
        if not isinstance(phonemes, list):  # Ensure phonemes are a list of strings
            raise ValueError("Phonemes must be a list of strings.")

        # Tokenize on the same device the wrapped model lives on.
        device = next(self.text_to_melody.parameters()).device
        tokens = self.tokenizer(
            phonemes, return_tensors="pt", padding=True, truncation=True
        ).to(device)
        tokens['input_ids'] = tokens['input_ids'].to(torch.int64)

        # Fail loudly on ids the embedding table cannot index. Once this check
        # passes, clamping to vocab_size - 1 is a no-op, so it was removed.
        vocab_size = self.tokenizer.vocab_size
        max_index = tokens['input_ids'].max()
        if max_index >= vocab_size:
            raise ValueError(f"Token index out of range: {max_index} exceeds vocab size {vocab_size}")

        # Forward pass through the model
        outputs = self.text_to_melody(**tokens)
        mel_spectrogram = outputs.logits

        # Truncate dimension 2 (optional, for large outputs)
        if mel_spectrogram.shape[2] > self.max_time_steps:
            mel_spectrogram = mel_spectrogram[:, :, :self.max_time_steps]

        return mel_spectrogram


In [None]:
import soundfile as sf
import numpy as np
import librosa

def griffinlim(mel_spectrogram, sr, iterations=50, hop_length=256, win_length=1024):
    """Estimate a waveform from a magnitude spectrogram by iterating STFT/ISTFT.

    Despite the parameter name, the input is fed straight into
    ``librosa.istft``, so it is treated as a linear-frequency STFT magnitude
    whose second-to-last axis holds ``1 + n_fft // 2`` frequency bins. A mel
    spectrogram has to be mapped back to linear frequency before it is passed in.

    The FFT size is derived from the input and used for both ``istft`` and
    ``stft``. Previously ``stft`` used librosa's default ``n_fft=2048``, so the
    loop only worked for 1025-bin input. For that input the behaviour is unchanged.

    Args:
        mel_spectrogram: Magnitude spectrogram, frequency bins on axis -2.
        sr: Sample rate. Accepted for interface compatibility; not used.
        iterations: Number of phase-refinement iterations.
        hop_length: Hop length used for every STFT/ISTFT call.
        win_length: Window length; capped at the derived FFT size.

    Returns:
        The reconstructed time-domain signal.

    Raises:
        ValueError: If the input has fewer than two dimensions or fewer than
            two frequency bins.
    """
    magnitude = np.asarray(mel_spectrogram)
    if magnitude.ndim < 2 or magnitude.shape[-2] < 2:
        raise ValueError("Expected a spectrogram with at least 2 frequency bins on axis -2.")

    # istft infers this same value from the number of rows; stft must be told.
    n_fft = 2 * (magnitude.shape[-2] - 1)
    # A window longer than the FFT size is rejected by librosa.
    window_size = min(win_length, n_fft)

    # Start from uniformly random phase.
    phase = np.random.uniform(-np.pi, np.pi, magnitude.shape)
    reconstructed = librosa.istft(
        magnitude * np.exp(1j * phase), hop_length=hop_length, win_length=window_size
    )
    for _ in range(iterations):
        # Keep the known magnitude, adopt the phase of the current estimate.
        phase = np.angle(
            librosa.stft(
                reconstructed, n_fft=n_fft, hop_length=hop_length, win_length=window_size
            )
        )
        reconstructed = librosa.istft(
            magnitude * np.exp(1j * phase), hop_length=hop_length, win_length=window_size
        )
    return reconstructed

def generate_audio_from_mel(mel_spectrogram, output_path="output.wav", sr=22050,
                            n_fft=2048, hop_length=512, n_iter=50):
    """Invert a log-mel spectrogram to audio and write it to ``output_path``.

    The previous version passed the transposed mel directly to the notebook's
    ``griffinlim`` helper, which runs ``librosa.istft`` on its input and so
    needs a linear-frequency STFT magnitude, not mel bands. This version undoes
    the dB scaling and lets librosa map mel -> STFT -> waveform.

    Args:
        mel_spectrogram: Array laid out as (frames, n_mels); it is transposed
            to (n_mels, frames) before inversion, as before. Assumed to be on
            the dB scale produced by ``preprocess_audio`` (``power_to_db``) -
            TODO confirm for model outputs.
        output_path: Destination audio file.
        sr: Sample rate for inversion and for the written file.
        n_fft: FFT size; default matches ``preprocess_audio``.
        hop_length: Hop length; default matches ``preprocess_audio``.
        n_iter: Number of Griffin-Lim iterations.
    """
    mel_db = mel_spectrogram.T  # -> (n_mels, frames)
    mel_power = librosa.db_to_power(mel_db)
    audio = librosa.feature.inverse.mel_to_audio(
        mel_power, sr=sr, n_fft=n_fft, hop_length=hop_length, n_iter=n_iter
    )
    sf.write(output_path, audio, samplerate=sr)
    print(f"Generated audio saved at {output_path}")

In [None]:
# NOTE(review): `accumulate` is not referenced in any visible cell.
from itertools import accumulate
from tqdm import tqdm
# TacotronWrapper() takes no arguments here; the class reads the MBart model
# and tokenizer from notebook-level globals defined in an earlier cell.
tacotron_model = TacotronWrapper()
# `torch` is used on the next line before this cell's own `import torch`; this
# works only because an earlier cell already imported it.
optimizer = torch.optim.Adam(tacotron_model.parameters(), lr=1e-4)
# Import necessary libraries
# NOTE(review): `GradScaler` and `autocast` are imported but the visible code
# builds its scaler through `torch.amp.GradScaler()` and never enters autocast.
from torch.cuda.amp import GradScaler, autocast
import torch.nn.functional as F
import torch


accumulation_steps = 4  # Number of steps to accumulate gradients before updating weights
# Define custom loss: STFT Loss (Optional but recommended)
def stft_loss(pred, target):
    """L1 distance between the STFTs of two channel-averaged signals.

    Both inputs are averaged over dimension 1, which yields one 1-D sequence
    per batch item; that sequence is then transformed with ``torch.stft``
    (n_fft=1024, hop_length=256, win_length=1024).

    The STFT is requested as a complex tensor and viewed as real
    (``[..., 2]``), giving the same values the deprecated
    ``return_complex=False`` produced. The rectangular window that
    ``torch.stft`` falls back to when no window is given is now explicit, so
    loss values are unchanged.

    Args:
        pred: Predicted tensor; dimension 1 is averaged away.
        target: Reference tensor with the same layout as ``pred``.

    Returns:
        Scalar tensor holding the mean absolute difference of the two spectra.
    """

    def _spectrum(signal):
        # Collapse dimension 1, then drop a singleton axis at position 1 if present.
        sequence = signal.mean(dim=1).squeeze(1)
        rect_window = torch.ones(1024, dtype=sequence.dtype, device=sequence.device)
        spec = torch.stft(
            sequence,
            n_fft=1024,
            hop_length=256,
            win_length=1024,
            window=rect_window,
            return_complex=True,
        )
        return torch.view_as_real(spec)

    return F.l1_loss(_spectrum(pred), _spectrum(target))

# Initialize AMP GradScaler
# NOTE(review): no autocast context is entered below, so the scaler only
# rescales the loss before backward().
scaler = torch.amp.GradScaler()  # Updated per deprecation warning

NUM_EPOCHS = 10  # Adjust epochs as needed
N_MELS = 80      # Number of mel bands in the target spectrograms

# Projection layers keyed by the size of dimension 1 of the model output.
# They are created once per distinct size and registered with the optimizer.
# Previously a fresh, randomly initialised layer was built on every step and
# never optimised, so the loss was computed through random weights.
# NOTE(review): per the inspection cell the model output is
# (batch, tokens, vocabulary), so this projects across the token axis; the
# architecture should be revisited.
projection_layers = {}

# Training loop
for epoch in range(NUM_EPOCHS):
    epoch_loss = 0.0
    tacotron_model.train()
    # Gradients are cleared only here and after each optimizer step. Clearing
    # them at the top of every step discarded all but the last micro-batch of
    # each accumulation window.
    optimizer.zero_grad()

    for step, (phonemes, mel_spec) in enumerate(tqdm(dataloader)):
        # Forward pass
        generated_mel = tacotron_model(phonemes)
        # Keep the target on the same device as the prediction.
        mel_spec = mel_spec.to(generated_mel.device)

        # Project generated output to match the target's N_MELS channels
        in_width = generated_mel.size(1)
        if in_width != N_MELS:
            if in_width not in projection_layers:
                new_layer = torch.nn.Linear(in_width, N_MELS).to(generated_mel.device)
                projection_layers[in_width] = new_layer
                optimizer.add_param_group({"params": list(new_layer.parameters())})
            projection_layer = projection_layers[in_width]
            projected_mel = projection_layer(generated_mel.transpose(1, 2)).transpose(1, 2)
        else:
            projected_mel = generated_mel  # No projection needed if already N_MELS channels

        # Align the last dimension with the target: trim or zero-pad
        frame_gap = mel_spec.shape[2] - projected_mel.shape[2]
        if frame_gap < 0:
            projected_mel = projected_mel[:, :, :mel_spec.shape[2]]
        elif frame_gap > 0:
            projected_mel = F.pad(projected_mel, (0, frame_gap))

        # Compute loss; only the value sent to backward() is divided, so the
        # reported epoch loss is the true per-batch mean.
        loss = stft_loss(projected_mel, mel_spec)
        scaler.scale(loss / accumulation_steps).backward()

        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(dataloader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(dataloader)}")

def generate_audio_from_mel(mel_spectrogram, output_path="output.wav", sr=22050,
                            n_fft=2048, hop_length=512, n_iter=50):
    """Invert a log-mel spectrogram to audio and write it to ``output_path``.

    NOTE(review): this redefines the function of the same name from the
    audio-utilities cell and shadows it once this cell runs; keep a single copy.

    The previous version passed the transposed mel directly to the notebook's
    ``griffinlim`` helper, which runs ``librosa.istft`` on its input and so
    needs a linear-frequency STFT magnitude, not mel bands. This version undoes
    the dB scaling and lets librosa map mel -> STFT -> waveform.

    Args:
        mel_spectrogram: Array laid out as (frames, n_mels); it is transposed
            to (n_mels, frames) before inversion, as before. Assumed to be on
            the dB scale produced by ``preprocess_audio`` (``power_to_db``) -
            TODO confirm for model outputs.
        output_path: Destination audio file.
        sr: Sample rate for inversion and for the written file.
        n_fft: FFT size; default matches ``preprocess_audio``.
        hop_length: Hop length; default matches ``preprocess_audio``.
        n_iter: Number of Griffin-Lim iterations.
    """
    mel_db = mel_spectrogram.T  # -> (n_mels, frames)
    mel_power = librosa.db_to_power(mel_db)
    audio = librosa.feature.inverse.mel_to_audio(
        mel_power, sr=sr, n_fft=n_fft, hop_length=hop_length, n_iter=n_iter
    )
    sf.write(output_path, audio, samplerate=sr)
    print(f"Generated audio saved at {output_path}")




  0%|          | 0/48 [00:00<?, ?it/s]

sil t w ih ng k ah sp t w ih ng k ah sp l ih t ow sp s t aa r sp hh aw sp ay sp w ah n d er sp w ah n sil y uw w sp aa r sil ah b sp ah b ah v sp dh ah sp w er ah d sp s ow sp hh ay sil l ay k sp ah sp d ay m eh n d sp ih n sp ah sp s k ay sil t w ih ng k ow sp t w ih ng k ah sp l ih t ow sp s t aa r sil hh aw sp ay sp w ao ah n d er sp w ah sil y uw w sp aa r sil w ah n sp ah sp b l ey z ih ng sp s aa n sp ih s sp g ao n sil w eh n sp hh iy sp n ah th ih ng sp sh ay n s sp ah p ao n sil dh eh n sp y ah sp sh ow sp y ao sp l ih t ow sp l ay t sil t w ah ng k ow sp t w ih ng k ow l sp ao sp dh ah sp n ay t sil t w ih ng k ah sp t w ih ng k ah sp l ih t ow sp s t aa r sil hh aw sp ay sp w ah n d er sp w ah sil y uw w sp aa r sil dh eh n sp ah sp ch r aa v l er sil ih n sp ah sp d aa r k sil th ey ng k s sp y uw sp f uw oy sp y ah r sp t ay n iy sp s p ao sil hh iy sp k uh sp n ao sp s iy sp w ih ch sp w ey sp t uw sp g ow sil ih f sp y uw sp d ih sp n aa sp t w ih n k ow sp s ow sil t w 

  2%|▏         | 1/48 [02:04<1:37:17, 124.21s/it]

sil l ih s ah n sp t uw sp v ah sp r ih dh ah m sp ao f sp dh ah sp f ao l ih ng sp r ey n sil t eh l iy ng sp m iy sp jh ah s w sp ah t sp ah sp f uw l sp ay f sp b ih n sil ay sp w ih sh sp jh ah t sp ih sp w uw sp g ow sp eh n sp l eh sp m iy sp k r ay sp ih n sp v ey n sil eh n sp l eh sp m iy sp b iy sp ah l ow n sp ah g ey n sil v iy y sp ow n l iy sp g ah l sp ay sp k eh r sp ah b aw t sp hh ah z sp g ao n sp ow w ey sil l uw k ih n sp f ao r sp ah sp b r eh n sp n iy uw sp s t aa r sil b ah sp l ih ow sp dh ah sp sh iy sp n ow sp dh eh sp w eh n sp sh iy sp l ae f sp dh eh sp d ey sil ah l ao ng sp w ih f sp hh ah r sp sh iy sp t uw k sp m ay sp hh aa r sil r ey n sp p iy s sp t ah sp m iy sp n ow sp d ah s sp dh ah sp t iy m sp f eh sil f ao sp hh ah sp t ah sp s t iy uw sp m ay sp hh aa r t sp ow w ey sp w ey n sp sh iy sp d ow n sp k eh sil ay sp k eh n sp l ah v sp ah n ah v ah sp w eh n sp m ay sp hh aa sp s ow m w eh ah sp f aa r ah w ey sil v iy y sp ow l iy sp g ah l sp

  4%|▍         | 2/48 [03:13<1:10:19, 91.73s/it] 

sil s sp l ey t sp iy n iy sp f n iy ng sil sh ih s sp w ah n jh r ih ng sp w ah t sp k l ow s sp t uh uw sp w eh er sil sh ih sp p uw s sp ah n sp hh er sp m ey k ah p sil eh n sp b r eh sh ih s sp hh ah sp l ao ng sp b l ah n sp hh ey eh ah r sil hh eh n sp eh n sp sh iy sp eh s sp m iy sil d uw w sp ay sp l ah uw k sp aa uh hh r ay t sil eh n sp ay sp s ey sp y eh s sil hh y uw sp l uw k sp hh w ah n d eh er f uw l sil t ih n ay t sil w ih sil k ah sp t uw w sp ah sp p aa r t ih sil eh n sp eh v r ih w ah n sp t ah r n s sp t uh sp s iy sil d ih s sp p ih uw t ih f uw sp l ey d iy sil d eh s sp w ao k ih n eh sp er r aw uw n sp w ih sil m iy sil hh eh n sp eh n sp sh iy sp eh s sp m iy sil t uw sp y uw sp f ih l sp ao r ay t sil eh n sp ay sp s ey sp y eh s sil ay sp f ih l ow sp hh w ah n d eh er f uw l sil t ah n ay sil ay sp f ih ow sp hh w ah n eh er f uw l sil b ih k ah s sp ay sp s iy sil d ah sp l ah f sp l ay t sp ih n sp y ao er r sp ay s sil hh eh n sp d ow sp w ah n d eh 

  6%|▋         | 3/48 [04:26<1:02:25, 83.24s/it]

sil hh ih s sp l ey t sp ih n sp dh ah sp iy ih f n ih ng sil sh ih s sp w ao ah n d r ih ng sp w ah sp k l ow s sp t uw sp w eh sil sh iy sp p uh s sp ao n sp hh er sp m ey k ah sil hh eh m sp b r ah sh ah s sp hh ah sp l ao ng sp b l ao n sp hh eh er sil hh eh n sp dh eh n sp sh iy y sp ae s sp m iy sil n d y uw w sp ay sp l uh k sp ah ow uw r ay t sil hh eh n sp ay sp s ey sp y eh s sil y uw sp l uh k sp w ao n d er f ow l sp t ah n ay sil w iy sp g ow sp t uw w sp ah sp p aa r t iy sil hh eh n sp eh v r iy w ah n sp t ah n s sp t uw sp s iy sil d ih s sp p y uw t ih f uw sp l ey d iy sil n d eh s sp w ao k iy ih ng sp ah er r aw n sp w ih s sp m iy sil hh eh n sp d eh n sp sh iy y sp eh s sp m iy sil d uw sp y uh sp f iy ih l sp ao r ay sil eh n sp ay sp s ey sp y eh s sil hh ay sp f ih ah ow sp v w ah n d ah f ow l sp t ah n ay sil ay sp f ih ah ow sp v w ao ah n d ah f ow l sil b iy k ah z sp ay sp s iy sp d ah sp l ah v sp l ay t sil hh ih n sp y ao r sp ay s sil eh n sp d ah sp

In [None]:
# Show the module tree of the wrapper (the wrapped MBart model and its layers).
print(tacotron_model)


TacotronWrapper(
  (text_to_melody): MBartForConditionalGeneration(
    (model): MBartModel(
      (shared): MBartScaledWordEmbedding(250027, 1024, padding_idx=1)
      (encoder): MBartEncoder(
        (embed_tokens): MBartScaledWordEmbedding(250027, 1024, padding_idx=1)
        (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-11): 12 x MBartEncoderLayer(
            (self_attn): MBartSdpaAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=Tru

In [None]:
# Sanity check: print the shape of the first target batch only, then stop.
# The dataloader shuffles, so which sample is shown varies between runs.
for phonemes, mel_spec in dataloader:
    print("Mel Spec Shape:", mel_spec.shape)
    break


Mel Spec Shape: torch.Size([1, 80, 7218])


In [None]:
# Sanity check: run one batch through the wrapper in eval mode, without
# gradient tracking, and print the raw output shape. Compare it with the
# target shape printed by the previous cell.
tacotron_model.eval()
for phonemes, mel_spec in dataloader:
    with torch.no_grad():
        generated_mel = tacotron_model(phonemes)
        print("Generated Mel Shape:", generated_mel.shape)
        break  # only the first batch is inspected



Generated Mel Shape: torch.Size([1, 869, 250027])
