<a href="https://colab.research.google.com/github/SamJ70/BYOP/blob/main/ajeeb_umeed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from glob import glob
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F


In [2]:
def build_vocab():
    """Create the character-to-index table used to encode lyrics.

    The space character and the letters a-z receive the ids 1..27 in that
    order; the special '<pad>' token is mapped to 0.

    Returns:
        dict: Mapping from symbol (str) to integer id.
    """
    alphabet = " abcdefghijklmnopqrstuvwxyz"
    table = {}
    for position, symbol in enumerate(alphabet):
        # Ids start at 1 so that 0 stays free for padding.
        table[symbol] = position + 1
    table['<pad>'] = 0  # Padding index
    return table

# Module-level vocabulary shared by the dataset, collate function and model.
vocab = build_vocab()

def text_to_sequence(text, vocab):
    """Encode a string as a list of vocabulary indices.

    The text is lower-cased first; any character that is not a key of
    ``vocab`` (digits, punctuation, ...) is dropped rather than mapped to
    the padding id.

    Args:
        text: Input string.
        vocab: Mapping from single characters to integer ids.

    Returns:
        list[int]: One id per retained character, in input order.
    """
    indices = []
    for symbol in text.lower():
        if symbol in vocab:
            indices.append(vocab.get(symbol, 0))
    return indices



In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import librosa
import numpy as np
from glob import glob
import os
from tqdm import tqdm

# NOTE: this cell re-declares build_vocab/vocab exactly as an earlier cell
# does; the later definition silently replaces the earlier one.
def build_vocab():
    """Return the symbol-to-id mapping for lyrics (' ' and a-z -> 1..27, '<pad>' -> 0)."""
    symbols = " abcdefghijklmnopqrstuvwxyz"
    mapping = dict(zip(symbols, range(1, len(symbols) + 1)))
    mapping['<pad>'] = 0
    return mapping

# Shared vocabulary instance used by the rest of the notebook.
vocab = build_vocab()

In [4]:
class NUS48Dataset(Dataset):
    """Dataset of (mel spectrogram, encoded annotation text) pairs.

    Audio files are discovered with the pattern ``<base_dir>/**/sing/*.wav``;
    each one is paired with a ``.txt`` file that has the same path stem.

    Args:
        base_dir: Root directory that is searched recursively.
        sampling_rate: Rate the audio is resampled to when loaded.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel bands.
    """

    def __init__(self, base_dir, sampling_rate=22050, n_fft=1024, hop_length=256, n_mels=80):
        pattern = os.path.join(base_dir, "**/sing/*.wav")
        # glob returns files in arbitrary order; sort so that indices are
        # stable between runs and machines.
        self.audio_files = sorted(glob(pattern, recursive=True))
        # Swap only the file extension; a plain str.replace would also rewrite
        # any '.wav' that happens to occur in a directory name.
        self.annotation_files = [
            os.path.splitext(wav_path)[0] + '.txt' for wav_path in self.audio_files
        ]
        self.sampling_rate = sampling_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __len__(self):
        """Number of audio files found under ``base_dir``."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict with keys
                'mel_spec':   float32 tensor, dB-scaled mel spectrogram as
                              returned by librosa (mel bands x frames),
                'lyrics':     long tensor of vocabulary ids,
                'mel_len':    number of spectrogram frames (``shape[1]``),
                'lyrics_len': number of encoded characters.
        """
        wav_path = self.audio_files[idx]
        annotation_path = self.annotation_files[idx]

        # Load (and resample) the waveform.
        audio, _ = librosa.load(wav_path, sr=self.sampling_rate)

        # Power mel spectrogram, then convert to dB relative to its own peak.
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels
        )
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Read the whole annotation file as one line of text and encode it;
        # characters outside the vocabulary are dropped by text_to_sequence.
        with open(annotation_path, 'r', encoding='utf-8') as f:
            lyrics = f.read().strip().replace('\n', ' ')
        lyrics_seq = text_to_sequence(lyrics, vocab)

        return {
            'mel_spec': torch.tensor(mel_spec, dtype=torch.float32),
            'lyrics': torch.tensor(lyrics_seq, dtype=torch.long),
            'mel_len': mel_spec.shape[1],
            'lyrics_len': len(lyrics_seq)
        }


In [5]:
def collate_fn(batch):
    """Merge dataset items of differing lengths into one padded batch.

    Args:
        batch: List of dicts with the keys 'mel_spec', 'lyrics', 'mel_len'
            and 'lyrics_len'.

    Returns:
        dict with
            'mel_specs':      stacked spectrograms, right-padded with 0 along
                              their second axis,
            'lyrics':         stacked id sequences, right-padded with the
                              '<pad>' id,
            'mel_lengths':    tensor of the original 'mel_len' values,
            'lyrics_lengths': tensor of the original 'lyrics_len' values.
    """
    # Longest spectrogram / lyric sequence decides the padded sizes.
    longest_mel = max(sample['mel_len'] for sample in batch)
    longest_lyrics = max(sample['lyrics_len'] for sample in batch)

    padded_mels = [
        F.pad(
            sample['mel_spec'],
            (0, longest_mel - sample['mel_spec'].shape[1]),
            mode='constant',
            value=0,
        )
        for sample in batch
    ]
    padded_lyrics = [
        F.pad(
            sample['lyrics'],
            (0, longest_lyrics - len(sample['lyrics'])),
            mode='constant',
            value=vocab['<pad>'],
        )
        for sample in batch
    ]

    return {
        'mel_specs': torch.stack(padded_mels),
        'lyrics': torch.stack(padded_lyrics),
        'mel_lengths': torch.tensor([sample['mel_len'] for sample in batch]),
        'lyrics_lengths': torch.tensor([sample['lyrics_len'] for sample in batch])
    }


In [6]:
class NSFModel(nn.Module):
    """Lyrics-to-mel network.

    Pipeline: character embedding -> bidirectional LSTM -> linear projection
    -> per-sample linear interpolation along time to the requested number of
    frames -> scalar "pitch" MLP -> mel MLP on [features, pitch].
    """

    def __init__(self, vocab_size, hidden_dim=256, n_mels=80):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Text encoder: embedding (padding row fixed at the '<pad>' id) + BiLSTM.
        self.text_encoder = nn.Embedding(vocab_size, hidden_dim, padding_idx=vocab['<pad>'])
        self.text_lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Collapses the concatenated forward/backward LSTM states back to hidden_dim.
        self.length_regulator = nn.Linear(2 * hidden_dim, hidden_dim)

        # One scalar per frame (simplified pitch predictor).
        self.pitch_generator = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

        # Maps [frame features, pitch] to n_mels values per frame.
        self.mel_generator = nn.Sequential(
            nn.Linear(hidden_dim + 1, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, n_mels),
        )

    def forward(self, lyrics, lyrics_lengths, mel_lengths):
        """Predict a mel spectrogram for every item of the batch.

        Args:
            lyrics: Padded id tensor [B, L].
            lyrics_lengths: 1-D tensor with the true length of each sequence.
            mel_lengths: 1-D tensor with the number of frames to produce per item.

        Returns:
            Tensor [B, max(mel_lengths), n_mels]; frames past an item's own
            length are zero.
        """
        # Encode the characters, skipping padded positions inside the LSTM.
        embedded = self.text_encoder(lyrics)  # [B, L, H]
        packed_in = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lyrics_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        packed_out, _ = self.text_lstm(packed_in)
        encoded, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        encoded = self.length_regulator(encoded)  # [B, L, H]

        # Plain Python ints for slicing, interpolation sizes and pad amounts.
        char_counts = lyrics_lengths.tolist()
        frame_counts = mel_lengths.tolist()
        longest = max(frame_counts)

        mels = []
        for sequence, n_chars, n_frames in zip(encoded, char_counts, frame_counts):
            # Stretch the n_chars encoder states to n_frames along time.
            stretched = F.interpolate(
                sequence[:n_chars].t().unsqueeze(0),  # [1, H, n_chars]
                size=n_frames,
                mode='linear',
                align_corners=False,
            ).squeeze(0).t()  # [n_frames, H]

            pitch = self.pitch_generator(stretched)  # [n_frames, 1]
            mel = self.mel_generator(torch.cat([stretched, pitch], dim=-1))  # [n_frames, n_mels]

            # Zero-pad along time so all items can be stacked.
            missing = longest - n_frames
            if missing > 0:
                mel = F.pad(mel, (0, 0, 0, missing))

            mels.append(mel)

        return torch.stack(mels)

In [7]:
def train_model(model, data_loader, num_epochs=25, lr=1e-4):
    """Train ``model`` with Adam on a length-masked mean-squared error.

    Args:
        model: Module called as ``model(lyrics, lyrics_lengths, mel_lengths)``
            and returning [B, T, n_mels].
        data_loader: Iterable of batches as produced by ``collate_fn``.
        num_epochs: Number of passes over ``data_loader``.
        lr: Adam learning rate.

    Prints the mean batch loss after every epoch; returns nothing.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elementwise_mse = nn.MSELoss(reduction='none')
    # Batches are moved to wherever the model's parameters live.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        model.train()
        batch_losses = []

        for batch in tqdm(data_loader):
            target_mel = batch['mel_specs'].to(device)  # [B, n_mels, T]
            lyrics = batch['lyrics'].to(device)
            mel_lengths = batch['mel_lengths'].to(device)
            lyrics_lengths = batch['lyrics_lengths'].to(device)

            optimizer.zero_grad()

            # The model emits [B, T, n_mels]; swap axes to match the targets.
            predicted_mel = model(lyrics, lyrics_lengths, mel_lengths).transpose(1, 2)

            # True for real frames, False for the padding added by collation.
            frame_ids = torch.arange(target_mel.size(2), device=device)
            valid = frame_ids.unsqueeze(0).unsqueeze(0) < mel_lengths.unsqueeze(-1).unsqueeze(-1)
            valid = valid.expand_as(target_mel)

            # Average the squared error over valid elements only.
            per_element = elementwise_mse(predicted_mel, target_mel)
            loss = (per_element * valid).sum() / valid.sum()

            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        avg_loss = sum(batch_losses) / len(data_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


In [None]:
# import torch
# import torch.nn as nn
# import torchaudio
# import transformers

# # class AdvancedMusicGenerationModel(nn.Module):
# #     def __init__(self,
# #                  vocab_size,
# #                  d_model=512,
# #                  nhead=8,
# #                  num_layers=6,
# #                  spectrogram_dim=128):
# #         super().__init__()

# #         # Text Encoder (use pre-trained transformer)
# #         self.text_encoder = transformers.AutoModel.from_pretrained('bert-base-uncased')

# #         # Music-specific embedding
# #         self.music_embedding = nn.Sequential(
# #             nn.Linear(d_model, d_model * 2),
# #             nn.LayerNorm(d_model * 2),
# #             nn.GELU(),
# #             nn.Linear(d_model * 2, spectrogram_dim)
# #         )

# #         # Advanced Transformer for sequential generation
# #         self.music_transformer = nn.TransformerEncoder(
# #             nn.TransformerEncoderLayer(
# #                 d_model=spectrogram_dim,
# #                 nhead=nhead,
# #                 dim_feedforward=d_model * 4,
# #                 dropout=0.1
# #             ),
# #             num_layers=num_layers
# #         )

# #         # Conditioning mechanism
# #         self.style_embedding = nn.Linear(d_model, spectrogram_dim)

# #         # Pitch and rhythm estimator
# #         self.pitch_rhythm_estimator = nn.Sequential(
# #             nn.Linear(spectrogram_dim, spectrogram_dim // 2),
# #             nn.ReLU(),
# #             nn.Linear(spectrogram_dim // 2, 2)  # Pitch and rhythm features
# #         )

# #         # Advanced spectrogram generator
# #         self.spectrogram_generator = nn.Sequential(
# #             nn.ConvTranspose1d(spectrogram_dim, spectrogram_dim // 2, kernel_size=3, stride=2, padding=1),
# #             nn.BatchNorm1d(spectrogram_dim // 2),
# #             nn.ReLU(),
# #             nn.ConvTranspose1d(spectrogram_dim // 2, 1, kernel_size=3, stride=2, padding=1),
# #             nn.Tanh()  # Normalized output
# #         )

# #     def forward(self, text, text_mask):
# #         # Text encoding
# #         text_features = self.text_encoder(text, attention_mask=text_mask).last_hidden_state

# #         # Music embedding and conditioning
# #         music_embedding = self.music_embedding(text_features)
# #         style_embedding = self.style_embedding(text_features.mean(dim=1))

# #         # Transformer-based generation
# #         generated_music = self.music_transformer(music_embedding + style_embedding)

# #         # Pitch and rhythm estimation
# #         pitch_rhythm = self.pitch_rhythm_estimator(generated_music)

# #         # Spectrogram generation
# #         spectrogram = self.spectrogram_generator(generated_music.transpose(1, 2))

# #         return spectrogram, pitch_rhythm

# class AdvancedMusicGenerationModel(nn.Module):
#     def __init__(self,
#                  vocab_size,
#                  d_model=512,
#                  nhead=8,
#                  num_layers=6,
#                  spectrogram_dim=80,
#                  max_seq_length=1000):
#         super().__init__()

#         # Text Encoder (use pre-trained transformer)
#         self.text_encoder = transformers.AutoModel.from_pretrained('bert-base-uncased')

#         # Embedding for lyrics
#         self.lyrics_embedding = nn.Embedding(vocab_size, d_model, padding_idx=vocab['<pad>'])

#         # Positional encoding with safe max length
#         self.pos_encoder = nn.Embedding(max_seq_length, d_model)

#         # Music-specific embedding
#         self.music_embedding = nn.Sequential(
#             nn.Linear(d_model, d_model * 2),
#             nn.LayerNorm(d_model * 2),
#             nn.GELU(),
#             nn.Linear(d_model * 2, spectrogram_dim)
#         )

#         # Advanced Transformer for sequential generation
#         encoder_layer = nn.TransformerEncoderLayer(d_model=spectrogram_dim, nhead=nhead)
#         self.music_transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

#         # Spectrogram generator
#         self.spectrogram_generator = nn.Sequential(
#             nn.Linear(spectrogram_dim, spectrogram_dim * 2),
#             nn.ReLU(),
#             nn.Linear(spectrogram_dim * 2, spectrogram_dim)
#         )

#     def forward(self, lyrics, lyrics_lengths, mel_lengths):
#         # Safety check for input shapes
#         batch_size, seq_len = lyrics.size()
#         device = lyrics.device

#         # Create attention mask for text
#         text_mask = (lyrics != vocab['<pad>'])

#         # Embed lyrics
#         lyrics_emb = self.lyrics_embedding(lyrics)

#         # Add positional encoding with careful indexing
#         # Ensure position indices are within the embedding's range
#         max_pos = min(seq_len, 999)  # Assuming max_seq_length is 1000
#         positions = torch.clamp(
#             torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1),
#             max=999
#         )
#         pos_emb = self.pos_encoder(positions)

#         # Add positional embedding
#         lyrics_emb = lyrics_emb + pos_emb

#         # Transformer-based generation
#         max_mel_len = mel_lengths.max().item()

#         # Initialize output tensor
#         outputs = []

#         for i in range(batch_size):
#             # Prepare current sequence
#             curr_lyrics = lyrics_emb[i, :lyrics_lengths[i]]

#             # Music embedding
#             music_emb = self.music_embedding(curr_lyrics)

#             # Interpolate to match mel spectrogram length
#             if music_emb.size(0) != mel_lengths[i]:
#                 music_emb = F.interpolate(
#                     music_emb.unsqueeze(0).transpose(1, 2),
#                     size=mel_lengths[i],
#                     mode='linear',
#                     align_corners=False
#                 ).transpose(1, 2).squeeze(0)

#             # Transformer processing
#             processed_emb = self.music_transformer(music_emb.unsqueeze(1)).squeeze(1)

#             # Generate spectrogram
#             mel = self.spectrogram_generator(processed_emb)

#             # Pad to max length
#             if mel.size(0) < max_mel_len:
#                 mel = F.pad(mel, (0, 0, 0, max_mel_len - mel.size(0)))

#             outputs.append(mel)

#         return torch.stack(outputs)


In [8]:
# Build the dataset/loader, create the model and train it.
# base_dir points into /content/drive, so this presumably expects Google Drive
# to be mounted in Colab first -- TODO confirm.
base_dir = "/content/drive/MyDrive/nus-smc-corpus_48"
dataset = NUS48Dataset(base_dir)
if len(dataset) == 0:
    # Fail early with a readable message instead of a sampler error later on.
    raise FileNotFoundError(f"No '**/sing/*.wav' files found under {base_dir!r}")
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
# Prefer the GPU, but fall back to the CPU so the cell also runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = NSFModel(len(vocab)).to(device)
train_model(model, data_loader)

100%|██████████| 24/24 [01:45<00:00,  4.39s/it]


Epoch 1/25, Loss: 3750.6873


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 2/25, Loss: 3628.1267


100%|██████████| 24/24 [00:24<00:00,  1.00s/it]


Epoch 3/25, Loss: 2918.7384


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 4/25, Loss: 1171.2968


100%|██████████| 24/24 [00:22<00:00,  1.05it/s]


Epoch 5/25, Loss: 421.2151


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 6/25, Loss: 272.3601


100%|██████████| 24/24 [00:24<00:00,  1.02s/it]


Epoch 7/25, Loss: 243.5415


100%|██████████| 24/24 [00:23<00:00,  1.00it/s]


Epoch 8/25, Loss: 238.5823


100%|██████████| 24/24 [00:23<00:00,  1.03it/s]


Epoch 9/25, Loss: 236.3069


100%|██████████| 24/24 [00:24<00:00,  1.02s/it]


Epoch 10/25, Loss: 235.5260


100%|██████████| 24/24 [00:24<00:00,  1.02s/it]


Epoch 11/25, Loss: 237.4733


100%|██████████| 24/24 [00:23<00:00,  1.01it/s]


Epoch 12/25, Loss: 235.9925


100%|██████████| 24/24 [00:23<00:00,  1.03it/s]


Epoch 13/25, Loss: 235.1826


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 14/25, Loss: 236.1219


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 15/25, Loss: 235.4996


100%|██████████| 24/24 [00:23<00:00,  1.04it/s]


Epoch 16/25, Loss: 235.4267


100%|██████████| 24/24 [00:24<00:00,  1.00s/it]


Epoch 17/25, Loss: 235.6994


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 18/25, Loss: 235.3468


100%|██████████| 24/24 [00:24<00:00,  1.02s/it]


Epoch 19/25, Loss: 234.7868


100%|██████████| 24/24 [00:23<00:00,  1.04it/s]


Epoch 20/25, Loss: 236.8022


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


Epoch 21/25, Loss: 234.6680


100%|██████████| 24/24 [00:24<00:00,  1.00s/it]


Epoch 22/25, Loss: 234.6416


100%|██████████| 24/24 [00:23<00:00,  1.00it/s]


Epoch 23/25, Loss: 235.3977


100%|██████████| 24/24 [00:23<00:00,  1.02it/s]


Epoch 24/25, Loss: 236.2095


100%|██████████| 24/24 [00:24<00:00,  1.01s/it]

Epoch 25/25, Loss: 233.7417





In [10]:
# Persist the trained parameters (state dict only) next to the notebook.
torch.save(obj=model.state_dict(), f="nsf_model_weights.pth")

In [9]:
# base_dir = "/content/drive/MyDrive/nus-smc-corpus_48"
# dataset = NUS48Dataset(base_dir)
# data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
# model = AdvancedMusicGenerationModel(len(vocab)).to("cuda")
# train_model(model, data_loader)

In [None]:
# # Import required libraries
# import torch
# import librosa
# import soundfile as sf

# # Load model
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = NSFModel(len(vocab)).to(device)
# model.load_state_dict(torch.load("nsf_model_weights.pth"))

# # Generate singing voice
# lyrics = "sing me a song"
# audio = generate_singing_voice(model, lyrics, "output_song.wav")

In [11]:
def generate_singing(model, text, device=None, frames_per_char=10):
    """
    Generate a mel spectrogram from input text.

    Args:
        model: Trained NSFModel instance
        text: Input lyrics as string
        device: Device to run inference on. When None (default) the device of
            the model's first parameter is used, so a model on CPU and one on
            GPU both work without extra arguments.
        frames_per_char: Number of spectrogram frames requested per encoded
            character. This is a rough heuristic, not a learned duration.

    Returns:
        mel_spec: NumPy array of shape (frames, n_mels), where
            frames = number of encoded characters * frames_per_char.
            Note the time-major layout: it is the model output for the single
            batch item, not the (n_mels, frames) layout used by librosa.

    Raises:
        ValueError: If no character of ``text`` is in the vocabulary, or if
            ``frames_per_char`` is smaller than 1.
    """
    if frames_per_char < 1:
        raise ValueError("frames_per_char must be at least 1")

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    # Convert text to sequence (text_to_sequence lower-cases and drops
    # characters that are not in the vocabulary).
    lyrics_seq = text_to_sequence(text, vocab)
    if not lyrics_seq:
        # An empty sequence would otherwise fail inside pack_padded_sequence.
        raise ValueError("text contains no characters from the vocabulary")

    lyrics_tensor = torch.tensor([lyrics_seq], dtype=torch.long).to(device)
    lyrics_length = torch.tensor([len(lyrics_seq)], dtype=torch.long).to(device)

    # Requested output length in frames.
    estimated_mel_length = torch.tensor(
        [len(lyrics_seq) * frames_per_char], dtype=torch.long
    ).to(device)

    with torch.no_grad():
        # Model output is [1, frames, n_mels]
        mel_spec = model(lyrics_tensor, lyrics_length, estimated_mel_length)

    return mel_spec[0].cpu().numpy()

In [12]:
def mel_to_audio(mel_spec, sr=22050, n_fft=1024, hop_length=256):
    """
    Convert a dB-scaled mel spectrogram to audio using the Griffin-Lim algorithm.

    Args:
        mel_spec: Mel spectrogram array in decibels. It is handed to
            librosa.feature.inverse.mel_to_audio, which expects the mel bands
            on the second-to-last axis and the frames on the last axis,
            i.e. (n_mels, frames).
        sr: Sampling rate in Hz. Defaults to 22050, the rate NUS48Dataset
            uses when computing the training spectrograms.
        n_fft: FFT window size
        hop_length: Number of samples between successive frames

    Returns:
        audio: Generated audio waveform
    """
    # Convert from dB to power
    mel_power = librosa.db_to_power(mel_spec)

    # Invert the mel filterbank and estimate the phase with Griffin-Lim.
    audio = librosa.feature.inverse.mel_to_audio(
        mel_power,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_iter=32  # Number of Griffin-Lim iterations
    )

    return audio


In [13]:
def generate_singing_voice(model, text, output_path="generated_singing.wav", sr=22050):
    """
    Generate and save singing voice from text.

    Args:
        model: Trained NSFModel instance
        text: Input lyrics as string
        output_path: Path to save the generated audio file
        sr: Sampling rate in Hz, used both for the spectrogram inversion and
            for the written file so the two can never disagree.

    Returns:
        audio: The generated waveform that was written to ``output_path``.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed to bring ``sf`` into scope.
    import soundfile as sf

    # Model output for one item, laid out as (frames, n_mels).
    mel_spec = generate_singing(model, text)

    # librosa expects (n_mels, frames), so swap the axes before inverting.
    audio = mel_to_audio(mel_spec.T, sr=sr)

    # Save audio file
    sf.write(output_path, audio, sr)

    return audio


In [28]:
# Synthesize a short test phrase with the trained model and write it to disk.
lyrics = "rise in the sky"
generated_audio = generate_singing_voice(model, text=lyrics, output_path="output_song.wav")

In [15]:
!pip install soundfile
import soundfile as sf # Import the soundfile module and alias it as 'sf'


