In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torchaudio import transforms
import sys

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate= SAMPLE_RATE,
    n_fft = 1024,
    hop_length = 512,
    n_mels = 64
)

In [4]:
import os
class MyDataset(Dataset):
    def __init__(self, data_dir, dataset, transform, target_sample_rate, num_samples, device):
        self.data_dir = data_dir
        self.dataset = dataset
        self.transform = transform
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.device = device
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        audio_name = os.path.join(self.data_dir, self.dataset.iloc[idx, 0])
        label = self.dataset.iloc[idx, 1]
        signal, sr = torchaudio.load(audio_name)
        signal = signal.to(device)
        signal = self._resample(signal, sr)
        signal = self.cut_signal(signal)
        signal = self._right_pad(signal)
        signal = self.transform(signal)
        return label, signal
    
    def _right_pad(self, signal):
        if signal.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - signal.shape[1]
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
    
    def cut_signal(self, signal):
        if signal.shape[1]> self.num_samples:
            signal = signal[:, : self.num_samples]
        return signal
    
    def _resample(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

In [5]:
import pandas as pd
train_dir = 'C:/Users/hp/OneDrive/Desktop/AudioClassification/TrainAudioFiles'
dataset = pd.read_csv('train.csv')

In [6]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

In [7]:
TrainDataset = MyDataset(train_dir, train_df, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device)
TestDataset = MyDataset(train_dir, test_df, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device)

In [8]:
import os
import torchaudio

def check_audio_formats(directory):
    audio_formats = set()  # Use a set to store unique audio formats
    for file_name in os.listdir(directory):
        if file_name.endswith('.mp3'):  # Adjust the extension based on your audio format
            file_path = os.path.join(directory, file_name)
            info = torchaudio.info(file_path)
            audio_format = info[0].get("format", "unknown")
            audio_formats.add(audio_format)
    return audio_formats

  # Specify the directory containing your audio files
audio_formats = check_audio_formats('TrainAudioFiles')
print("Audio formats found:", audio_formats)




TypeError: 'AudioMetaData' object is not subscriptable

In [9]:
print(f"There are {len(TrainDataset)} sample in dataset")
signal, label = TrainDataset[0]
print(signal, label)

There are 4652 sample in dataset
neutral tensor([[[1.2748e-03, 1.5606e-03, 1.3529e-04,  ..., 5.0757e-03,
          3.5216e-03, 4.0229e-04],
         [7.3054e-02, 2.8878e-02, 1.0325e-02,  ..., 2.1807e-02,
          4.4059e-02, 1.1529e-03],
         [1.1629e-01, 1.3923e-01, 2.1254e-02,  ..., 1.1388e-01,
          3.8099e-02, 1.1796e-02],
         ...,
         [7.6484e-05, 1.1332e-04, 7.6166e-05,  ..., 1.3318e-02,
          2.0225e-03, 9.4653e-04],
         [7.8815e-05, 6.8507e-05, 5.0665e-05,  ..., 1.9693e-02,
          2.2802e-03, 3.7280e-04],
         [1.2547e-04, 8.8900e-05, 5.0916e-05,  ..., 2.6656e-02,
          1.5232e-03, 2.3198e-04]],

        [[2.3210e-03, 1.8789e-03, 2.1479e-04,  ..., 6.0703e-03,
          2.4248e-03, 1.1396e-04],
         [7.2223e-02, 2.6976e-02, 1.1857e-02,  ..., 2.2719e-02,
          4.2457e-02, 1.2342e-03],
         [1.2092e-01, 1.3189e-01, 2.1706e-02,  ..., 1.2151e-01,
          3.7379e-02, 1.2039e-02],
         ...,
         [1.2171e-04, 1.2420e-04, 1.07

In [10]:
class Classification(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1_layer = nn.Sequential(
            nn.Conv2d(in_channels= 1,
                      out_channels= 16,
                      padding= 2,
                      stride= 1,
                      kernel_size= 3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2_layer = nn.Sequential(
            nn.Conv2d(in_channels= 16,
                      out_channels= 32,
                      padding= 2,
                      stride= 1,
                      kernel_size= 3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3_layer = nn.Sequential(
            nn.Conv2d(in_channels= 32,
                      out_channels= 64,
                      padding= 2,
                      stride= 1,
                      kernel_size= 3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(64* 5 * 4, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv1_layer(x)
        x = self.conv2_layer(x)
        x = self.conv3_layer(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [11]:
BATCH_SIZE = 16
NUM_WORKERS = os.cpu_count()
train_loader = DataLoader(TrainDataset,
                          batch_size= BATCH_SIZE,
                          num_workers= NUM_WORKERS,
                          shuffle= True)
test_loader = DataLoader(TestDataset,
                          batch_size= BATCH_SIZE,
                          num_workers= NUM_WORKERS,
                          shuffle= False)


In [12]:
class_mapping = [
    "neutral",
    "joy",
    "disgust",
    "surprise",
    "sadness",
    "fear",
    "anger"
]

In [13]:
def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in tqdm(data_loader, desc="Training", leave=False):
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")

In [14]:
def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

In [15]:
EPOCHS = 10
LEARNING_RATE = 0.001
cnn = Classification().to(device)
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(),
                                lr=LEARNING_RATE)

# train model
train(cnn, train_loader, loss_fn, optimiser, device, EPOCHS)

Epoch 1


Training:   0%|          | 0/291 [00:00<?, ?it/s]

In [None]:
def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected

In [None]:
input, target = TrainDataset[0][0], TrainDataset[0][1] # [batch size, num_channels, fr, time]
input.unsqueeze_(0)

# make an inference
predicted, expected = predict(cnn, input, target,
                                class_mapping)