<a href="https://colab.research.google.com/github/SUDHARSAN270/Machine_learning/blob/main/audio_constructionupdated33.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torchaudio
import torchaudio.transforms as transforms
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

In [2]:
train_dataset = torchaudio.datasets.LIBRISPEECH(
    root='/media',
    url='train-clean-100',
    download=True
)

100%|██████████| 5.95G/5.95G [02:42<00:00, 39.2MB/s]


In [3]:
waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = train_dataset[0]

print("Waveform: ", waveform.shape)
print("Sample Rate: ", sample_rate)
print("Utterance: ", utterance)
print("Speaker ID: ", speaker_id)
print("Chapter ID: ", chapter_id)
print("Utterance ID: ", utterance_id)

Waveform:  torch.Size([1, 225360])
Sample Rate:  16000
Utterance:  CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED MISSUS RACHEL LYNDE LIVED JUST WHERE THE AVONLEA MAIN ROAD DIPPED DOWN INTO A LITTLE HOLLOW FRINGED WITH ALDERS AND LADIES EARDROPS AND TRAVERSED BY A BROOK
Speaker ID:  103
Chapter ID:  1240
Utterance ID:  0


In [78]:
class SoundProcessingModel(nn.Module):
    def __init__(self, num_classes=10):
        super(SoundProcessingModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)  # Output: 32 x 32 x 3

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)  # Output: 64 x 16 x 1

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        # Adjust the kernel size or stride of pool3 to avoid reducing the dimension to 0
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0)  # Output: 128 x 8 x 1

        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)

        self.lstm1 = nn.LSTM(input_size=128 * 8 , hidden_size=128, batch_first=True)  # Adjusted based on new input size
        self.lstm2 = nn.LSTM(input_size=128, hidden_size=64, batch_first=True)

        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))  # 1x64x7 -> 32x32x3
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))  # 32x32x3 -> 64x16x1
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))  # 64x16x1 -> 128x8x1

        x = self.flatten(x)  # Shape: (batch_size, 128 * 8 * 1)
        x = x.unsqueeze(1)  # Add sequence dimension: Shape: (batch_size, 1, 128 * 8 * 1)

        x, _ = self.lstm1(x)  # Output shape: (batch_size, seq_len, 128)
        x, _ = self.lstm2(x)  # Output shape: (batch_size, seq_len, 64)
        x = x[:, -1, :]  # Get the output of the last time step: Shape: (batch_size, 64)

        x = self.dropout(F.relu(self.fc1(x)))  # Shape: (batch_size, 128)
        x = self.fc2(x)  # Shape: (batch_size, num_classes)
        return x

# Define the model
model = SoundProcessingModel(num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [67]:
transform  = transforms.MelSpectrogram(
    sample_rate = 16000,
    n_mels =64

)



class CustomAudioDataset(torch.utils.data.Dataset):
  def __init__(self, dataset, transform = transform, fixed_length = 1227):
    self.dataset = dataset
    self.transform = transform
    self.fixed_length = fixed_length


  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    waveform,sample_rate,utterance,speaker_id, character_id, utterance_id = self.dataset[idx]
     # Apply padding or truncation if fixed_length is specified
    if self.fixed_length is not None:
      waveform = F.pad(waveform, (0, self.fixed_length - waveform.size(-1)))
      waveform = waveform[:,:self.fixed_length]

    if self.transform:
      waveform = self.transform(waveform)
      # Return only the waveform and character_id as a Tensor
      return waveform, torch.tensor(character_id) # Convert character_id to a Tensor


customdataset = CustomAudioDataset(train_dataset)
data_loader = DataLoader(customdataset, batch_size=10, shuffle=False)


In [68]:
# Loop through the data loader and feed batches to the model
for batch_idx, (waveform, utterance) in enumerate(data_loader):
    print(f"Batch {batch_idx} - Waveform: {waveform.shape}")
    print(f"Batch {batch_idx} - Utterance: {utterance}")

    # Move data to the appropriate device (CPU/GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    waveform = waveform.to(device)

    # Forward pass
    output = model(waveform)

    print(f"Batch {batch_idx} - Model Output: {output.shape}")
    print(f"Batch {batch_idx} - Model Output: {output}")

    if batch_idx == 1:  # Just to prevent printing too many items, remove this condition to print all
        break


Batch 0 - Waveform: torch.Size([10, 1, 64, 7])
Batch 0 - Utterance: tensor([1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240])
Batch 0 - Model Output: torch.Size([10, 1227])
Batch 0 - Model Output: tensor([[-0.0441, -0.0305, -0.0026,  ..., -0.0627, -0.0341,  0.0387],
        [-0.0634,  0.0548, -0.0386,  ...,  0.0149, -0.0283,  0.0428],
        [-0.0667,  0.0070, -0.0029,  ...,  0.0154, -0.0044,  0.0421],
        ...,
        [-0.0791,  0.0521, -0.0424,  ..., -0.0147, -0.0220, -0.0240],
        [-0.0452,  0.0264, -0.0321,  ..., -0.0670, -0.0338, -0.0277],
        [-0.0640,  0.0310, -0.0156,  ..., -0.0240, -0.0714, -0.0307]],
       grad_fn=<AddmmBackward0>)
Batch 1 - Waveform: torch.Size([10, 1, 64, 7])
Batch 1 - Utterance: tensor([1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240, 1240])
Batch 1 - Model Output: torch.Size([10, 1227])
Batch 1 - Model Output: tensor([[-0.0132,  0.0690,  0.0145,  ..., -0.0341, -0.0401,  0.0406],
        [-0.0252, -0.0444, -0.0233,  ..., -0.07

In [69]:
train_loader = DataLoader(customdataset, batch_size=10, shuffle=True)

In [85]:
import numpy as np

# Assuming character_ids is a list or NumPy array containing all target labels
unique_classes, counts = np.unique(character_id ,return_counts=True)
num_classes = len(unique_classes)
print("Number of classes:", num_classes)

Number of classes: 1
