In [1]:
# import os
# import librosa
# import numpy as np
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torchvision.transforms import Compose
# import wandb

# # Initialize a new wandb run
# # wandb.init(project='TransformerWaveNet')

# # Function to load an audio file and resample
# def load_audio(file_path, sample_rate=16000):
#     audio, sr = librosa.load(file_path, sr=sample_rate)
#     return audio

# def apply_dtw(audio, target_length, sample_rate=16000):
#     # Generate a reference signal of target length
#     ref_signal = np.linspace(0, 1, target_length)
    
#     # Ensure both signals are 2D with shape (1, N)
#     audio = audio.reshape(1, -1)
#     ref_signal = ref_signal.reshape(1, -1)
    
#     # Compute the DTW path
#     D, wp = librosa.sequence.dtw(X=ref_signal, Y=audio, metric='euclidean')
    
#     # Use the DTW path to resample the audio to the target length
#     # Extract the path indices that map ref_signal to audio
#     path_indices = wp[:, 1]  # Get the indices from the warping path
#     aligned_audio = audio.flatten()[path_indices]  # Align audio according to the path
    
#     # If the aligned audio is longer than the target length, trim it
#     if len(aligned_audio) > target_length:
#         aligned_audio = aligned_audio[:target_length]
#     elif len(aligned_audio) < target_length:
#         # If shorter, pad it
#         aligned_audio = np.pad(aligned_audio, (0, target_length - len(aligned_audio)), mode='constant')
    
#     return aligned_audio

# # Function to convert audio to a spectrogram
# def audio_to_spectrogram(audio, n_fft=2048, hop_length=512):
#     spectrogram = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
#     spectrogram = librosa.amplitude_to_db(np.abs(spectrogram))
#     return spectrogram

# # Custom Dataset class
# class AudioDataset(Dataset):
#     def __init__(self, root_dir, target_length=16000, transform=None):
#         self.root_dir = root_dir
#         self.files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(root_dir) for f in filenames if f.endswith('.mp3') or f.endswith('.wav')]
#         self.target_length = target_length
#         self.transform = transform

#     def __len__(self):
#         return len(self.files)

#     def __getitem__(self, idx):
#         audio_path = self.files[idx]
#         audio = load_audio(audio_path)
#         audio = apply_dtw(audio, self.target_length)
#         spectrogram = audio_to_spectrogram(audio)

#         if self.transform:
#             spectrogram = self.transform(spectrogram)

#         return torch.tensor(spectrogram, dtype=torch.float32)

# # Transform to move data to GPU
# class ToTensorGPU:
#     def __call__(self, tensor):
#         return torch.tensor(tensor).cuda()

# # Compose transforms
# transforms = Compose([
#     ToTensorGPU()
# ])

In [20]:
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose

# Function to load an audio file, resample it, and convert to spectrogram
def load_and_process_audio(file_path, sample_rate=16000, n_fft=2048, hop_length=512, max_length=1025):
    audio, _ = librosa.load(file_path, sr=sample_rate)
    spectrogram = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    spectrogram = librosa.amplitude_to_db(np.abs(spectrogram))
    
    # Pad or truncate the spectrogram to make sure all are the same size
    if spectrogram.shape[1] < max_length:
        # Pad the spectrogram
        padding = max_length - spectrogram.shape[1]
        spectrogram = np.pad(spectrogram, ((0, 0), (0, padding)), mode='constant')
    else:
        # Truncate the spectrogram
        spectrogram = spectrogram[:, :max_length]
    
    return spectrogram

# Custom Dataset class
class AudioDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(root_dir) for f in filenames if f.endswith('.mp3') or f.endswith('.wav')]
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        audio_path = self.files[idx]
        spectrogram = load_and_process_audio(audio_path)
        if self.transform:
            spectrogram = self.transform(spectrogram)
        return torch.tensor(spectrogram, dtype=torch.float32, device='cuda')  # Directly create tensor on GPU

# Example usage of the dataset and DataLoader
dataset = AudioDataset(root_dir='DATA')
loader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=4)

In [22]:
# Example of iterating over the DataLoader
for batch in loader:
    print("Input batch size:", batch.size())


RuntimeError: DataLoader worker (pid(s) 37412, 17824, 37448, 33588) exited unexpectedly

In [3]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class ConditionalWaveNet(nn.Module):
#     def __init__(self, audio_channels=1, num_channels=64, kernel_size=2, num_blocks=4, num_layers=10, num_condition_classes=10):
#         super(ConditionalWaveNet, self).__init__()
#         self.num_blocks = num_blocks
#         self.num_layers = num_layers
#         self.dilated_convs = nn.ModuleList()
#         self.condition_convs = nn.ModuleList()
#         self.residual_convs = nn.ModuleList()
#         self.skip_convs = nn.ModuleList()
        
#         # Embedding for conditioning
#         self.embedding = nn.Embedding(num_condition_classes, num_channels)
        
#         # Layers for processing the spectrogram input
#         self.spectrogram_conv = nn.Conv1d(audio_channels, num_channels, 1)
        
#         # Dilated convolutions and condition convolutions
#         for _ in range(num_blocks):
#             for i in range(num_layers):
#                 dilation = 2 ** i
#                 self.dilated_convs.append(nn.Conv1d(num_channels, 2 * num_channels, kernel_size, dilation=dilation, padding=dilation))
#                 self.condition_convs.append(nn.Conv1d(num_channels, 2 * num_channels, 1))
#                 self.residual_convs.append(nn.Conv1d(num_channels, num_channels, 1))
#                 self.skip_convs.append(nn.Conv1d(num_channels, num_channels, 1))
        
#         # Attention layer
#         self.attention = nn.MultiheadAttention(num_channels, num_heads=8)
        
#         # Output layers
#         self.final_conv1 = nn.Conv1d(num_channels, num_channels, 1)
#         self.final_conv2 = nn.Conv1d(num_channels, audio_channels, 1)

#     def forward(self, audio, spectrogram, condition):
#         # Embedding for condition
#         condition_embedding = self.embedding(condition)
        
#         # Process spectrogram
#         spectrogram = self.spectrogram_conv(spectrogram)
        
#         # Combine audio and spectrogram
#         x = audio + spectrogram
        
#         skip_connections = []
        
#         for b in range(self.num_blocks):
#             for l in range(self.num_layers):
#                 # Dilated convolution
#                 dilated = self.dilated_convs[b * self.num_layers + l](x)
#                 # Conditioned convolution
#                 conditioned = self.condition_convs[b * self.num_layers + l](condition_embedding)
#                 # Split for gated activation
#                 filtered, gate = torch.split(dilated + conditioned, dilated.size(1) // 2, dim=1)
#                 x = torch.tanh(filtered) * torch.sigmoid(gate)
#                 # Residual and skip connections
#                 x = self.residual_convs[b * self.num_layers + l](x)
#                 skip = self.skip_convs[b * self.num_layers + l](x)
#                 skip_connections.append(skip)
        
#         # Sum all skip connections
#         x = torch.sum(torch.stack(skip_connections), dim=0)
        
#         # Apply attention
#         x = x.permute(2, 0, 1)  # Rearrange for attention
#         x, _ = self.attention(x, x, x)
#         x = x.permute(1, 2, 0)  # Rearrange back
        
#         # Final convolutions
#         x = F.relu(self.final_conv1(x))
#         x = self.final_conv2(x)
        
#         return x

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerWaveNet(nn.Module):
    def __init__(self, audio_channels=1, num_channels=64, kernel_size=2, num_blocks=4, num_layers=10, num_condition_classes=10, num_heads=8):
        super(TransformerWaveNet, self).__init__()
        self.num_blocks = num_blocks
        self.num_layers = num_layers
        self.dilated_convs = nn.ModuleList()
        self.condition_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()
        
        # Embedding for conditioning
        self.embedding = nn.Embedding(num_condition_classes, num_channels)
        
        # Initial convolution layer for raw audio
        self.audio_conv = nn.Conv1d(audio_channels, num_channels, 1)
        
        # Initial convolution layer for spectrogram
        self.spectrogram_conv = nn.Conv1d(audio_channels, num_channels, 1)
        
        # Transformer block
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=num_channels, nhead=num_heads, dim_feedforward=num_channels * 4, batch_first=True),
            num_layers=3)
        
        # Dilated convolutions and condition convolutions
        for _ in range(num_blocks):
            for i in range(num_layers):
                dilation = 2 ** i
                self.dilated_convs.append(nn.Conv1d(num_channels, 2 * num_channels, kernel_size, dilation=dilation, padding=dilation))
                self.condition_convs.append(nn.Conv1d(num_channels, 2 * num_channels, 1))
                self.residual_convs.append(nn.Conv1d(num_channels, num_channels, 1))
                self.skip_convs.append(nn.Conv1d(num_channels, num_channels, 1))
        
        # Output layers
        self.final_conv1 = nn.Conv1d(num_channels, num_channels, 1)
        self.final_conv2 = nn.Conv1d(num_channels, audio_channels, 1)

    def forward(self, audio, spectrogram, condition):
        # Embedding for condition
        condition_embedding = self.embedding(condition)
        
        # Process audio and spectrogram
        audio = self.audio_conv(audio)
        spectrogram = self.spectrogram_conv(spectrogram)
        
        # Combine audio and spectrogram
        x = audio + spectrogram
        
        # Transformer processing
        # x = x.permute(2, 0, 1)  # Rearrange for transformer (seq_len, batch, features)
        x = self.transformer(x)
        # x = x.permute(1, 2, 0)  # Rearrange back (batch, features, seq_len)
        
        skip_connections = []
        
        for b in range(self.num_blocks):
            for l in range(self.num_layers):
                # Dilated convolution
                dilated = self.dilated_convs[b * self.num_layers + l](x)
                # Conditioned convolution
                conditioned = self.condition_convs[b * self.num_layers + l](condition_embedding)
                # Split for gated activation
                filtered, gate = torch.split(dilated + conditioned, dilated.size(1) // 2, dim=1)
                x = torch.tanh(filtered) * torch.sigmoid(gate)
                # Residual and skip connections
                x = self.residual_convs[b * self.num_layers + l](x)
                skip = self.skip_convs[b * self.num_layers + l](x)
                skip_connections.append(skip)
        
        # Sum all skip connections
        x = torch.sum(torch.stack(skip_connections), dim=0)
        
        # Final convolutions
        x = F.relu(self.final_conv1(x))
        x = self.final_conv2(x)
        
        return x
    
    def generate(self, audio, spectrogram, condition):
        """
        Generate audio using the model in an autoregressive manner.
        Assumes the model is already trained and in eval mode.
        """
        self.eval()  # Ensure the model is in evaluation mode
        with torch.no_grad():  # No need to track gradients
            # Assuming the inputs are already on the correct device and preprocessed
            generated_audio = self.forward(audio, spectrogram, condition)

            # Post-processing if necessary (e.g., applying a sigmoid to ensure output is in the correct range)
            generated_audio = torch.sigmoid(generated_audio)  # Example post-processing

        return generated_audio

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load a pre-trained VGGish model for audio feature extraction
vggish = torch.hub.load('harritaylor/torchvggish', 'vggish')

# Define the Perceptual Loss using VGGish as the feature extractor
class PerceptualLoss(nn.Module):
    def __init__(self, feature_extractor):
        super(PerceptualLoss, self).__init__()
        self.feature_extractor = feature_extractor
        self.feature_extractor.eval()  # Set to evaluation mode

    def forward(self, generated_audio, target_audio):
        with torch.no_grad():
            real_features = self.feature_extractor(target_audio)
        generated_features = self.feature_extractor(generated_audio)
        loss = F.l1_loss(generated_features, real_features)
        return loss

perceptual_loss = PerceptualLoss(vggish)

Using cache found in C:\Users\rahat/.cache\torch\hub\harritaylor_torchvggish_master
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" to C:\Users\rahat/.cache\torch\hub\checkpoints\vggish-10086976.pth
100%|██████████| 275M/275M [01:20<00:00, 3.58MB/s] 
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish_pca_params-970ea276.pth" to C:\Users\rahat/.cache\torch\hub\checkpoints\vggish_pca_params-970ea276.pth
100%|██████████| 177k/177k [00:00<00:00, 1.69MB/s]


In [8]:
class MultiScaleSpectrogramLoss(nn.Module):
    def __init__(self, scales=[1024, 2048, 4096]):
        super(MultiScaleSpectrogramLoss, self).__init__()
        self.scales = scales

    def forward(self, generated_audio, target_audio):
        loss = 0
        for scale in self.scales:
            gen_spec = torch.stft(generated_audio, n_fft=scale, return_complex=True)
            target_spec = torch.stft(target_audio, n_fft=scale, return_complex=True)
            loss += F.l1_loss(gen_spec.abs(), target_spec.abs())
        return loss / len(self.scales)

spectrogram_loss = MultiScaleSpectrogramLoss()

In [9]:
# For demonstration, let's assume we have a simple CNN as a discriminator
class SimpleAudioDiscriminator(nn.Module):
    def __init__(self):
        super(SimpleAudioDiscriminator, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(16 * 16, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x

    def intermediate_forward(self, x):
        x = F.relu(self.conv1(x))
        return x

discriminator = SimpleAudioDiscriminator()

class FeatureMatchingLoss(nn.Module):
    def __init__(self, discriminator):
        super(FeatureMatchingLoss, self).__init__()
        self.discriminator = discriminator
        self.discriminator.eval()

    def forward(self, generated_audio, target_audio):
        with torch.no_grad():
            real_features = self.discriminator.intermediate_forward(target_audio)
        generated_features = self.discriminator.intermediate_forward(generated_audio)
        loss = F.l1_loss(generated_features, real_features)
        return loss

feature_matching_loss = FeatureMatchingLoss(discriminator)

In [10]:
# Example of a composite loss
class CompositeLoss(nn.Module):
    def __init__(self, perceptual_loss, spectrogram_loss, feature_matching_loss):
        super(CompositeLoss, self).__init__()
        self.perceptual_loss = perceptual_loss
        self.spectrogram_loss = spectrogram_loss
        self.feature_matching_loss = feature_matching_loss

    def forward(self, generated_audio, target_audio):
        loss = (self.perceptual_loss(generated_audio, target_audio) +
                self.spectrogram_loss(generated_audio, target_audio) +
                self.feature_matching_loss(generated_audio, target_audio))
        return loss

In [11]:
import torch
from torch.utils.data import DataLoader
import os

def train(model, train_loader, val_loader, optimizer, criterion, epochs, device):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for i, (audio, spectrogram, condition) in enumerate(train_loader):
            audio, spectrogram, condition = audio.to(device), spectrogram.to(device), condition.to(device)
            optimizer.zero_grad()
            output = model(audio, spectrogram, condition)
            loss = CompositeLoss(output, audio)
            loss.backward()
            optimizer.step()
            
            # Log loss to wandb
            wandb.log({"train_loss": loss.item()})
            print(f"Epoch [{epoch + 1}/{epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item()}")
            # Save model checkpoint
            torch.save(model.state_dict(), f'TW_Checkpoint/model_TW_{epoch}.pt')
            
            # Generate synthetic data and add to train_loader
            if i % 10 == 0:  # Every 10 iterations, generate synthetic data
                with torch.no_grad():
                    synthetic_audio = model.generate(audio, spectrogram, condition)
                train_loader.dataset.append((synthetic_audio, spectrogram, condition))
        
        epoch_loss /= len(train_loader)
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss}")
        wandb.log({"epoch_loss": epoch_loss})
        # Validation loop
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for audio, spectrogram, condition in val_loader:
                audio, spectrogram, condition = audio.to(device), spectrogram.to(device), condition.to(device)
                output = model(audio, spectrogram, condition)
                val_loss += criterion(output, audio).item()
            val_loss /= len(val_loader)
        
        # Log validation loss to wandb
        wandb.log({"val_loss": val_loss})

In [12]:
from torch.optim import Adam
from torch.nn import MSELoss

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerWaveNet().to(device)
print(model)
optimizer = Adam(model.parameters(), lr=0.001)
# composite_loss = CompositeLoss(perceptual_loss, spectrogram_loss, feature_matching_loss)
composite_loss = CompositeLoss(perceptual_loss, spectrogram_loss, feature_matching_loss)

# train(model, train_loader, val_loader, optimizer, composite_loss, epochs=50, device=device)

TypeError: cannot unpack non-iterable ellipsis object