# Generating the spectrogram image dataset from raw audio files

This script generated the spectrogram image dataset, from the raw .wav audio files, that was used to train the CNN classifier. It is provided for information only and **doesn't need to be run again to classify an image using our classifier.** However, if you wish to reuse the code in this script, simply modify the hardcoded filepaths in the "Hardcoded filepaths" cell to match the directory layout on your machine.

**Note: The raw .wav audio files must already be split into train/validation/test directories on the drive/machine for this script to work as intended.**

In [2]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
import torch
import torchvision
import torchvision.transforms as transforms
import glob
import cv2
import matplotlib.pyplot as plt
import torchaudio
import pandas as pd

KeyboardInterrupt: 

### Hardcoded filepaths 

In [None]:
# Labels dataframe, read from the .csv of manually labelled training examples
labs = pd.read_csv('/mntDrive/My Drive/VoxCeleb_theo/label.csv')

# Collect the .wav filepaths under every ID directory of each split; the audio
# must already be organised as train/val/test on the drive
train, val, test = [
    glob.glob(pattern)
    for pattern in (
        '/mntDrive/My Drive/VoxCeleb_theo/train/*/*.wav',
        '/mntDrive/My Drive/VoxCeleb_theo/val/*/*.wav',
        '/mntDrive/My Drive/VoxCeleb_theo/test/*/*.wav',
    )
]

### Dataset generation loop

In [None]:
# Generate one grayscale mel-spectrogram .jpg for every .wav file of the training,
# validation, and testing datasets. Relies on `labs`, `train`, `val`, and `test`
# being defined beforehand.

# Root directory receiving the images, laid out as <split>/<label>/<id dir>_<clip>.jpg
IMAGE_ROOT = '/mntDrive/My Drive/VoxCeleb_theo/images/'

# Number of leading samples kept from each clip, for consistency across images.
# NOTE(review): this was previously described as "the first 6 seconds", which only
# holds at a 10 kHz sample rate (at 16 kHz it is 3.75 s) -- confirm the intended
# duration. The value is left unchanged so that regenerated images match the ones
# the classifier was trained on.
MAX_SAMPLES = 60000

# The transform does not depend on the file, so build it once rather than per clip
mel_transform = torchaudio.transforms.MelSpectrogram()

# Iterate through training, validation, and testing datasets of .wav raw audio files
for split_paths in [train, val, test]:
    # Iterate over every .wav filepath found in the current dataset
    for wav_path in split_paths:
        # Decompose .../<split>/<id dir>/<clip>.wav with os.path so that the
        # parsing does not depend on the path separator
        id_dir_path, file_name = os.path.split(wav_path)
        split_dir_path, id_dir = os.path.split(id_dir_path)
        split_name = os.path.basename(split_dir_path)
        clip_name = os.path.splitext(file_name)[0]

        # Extract the label for this sample: the ID directory name, minus its
        # first two characters, is the integer matched against the `id` column
        sample_id = int(id_dir[2:])
        matching_labels = labs[labs.id == sample_id].label.values
        if len(matching_labels) == 0:
            # Fail with an explicit message instead of an opaque IndexError
            raise KeyError('No label found for id %d (file: %s)' % (sample_id, wav_path))
        label = matching_labels[0]

        # Extract waveform and sample rate from the .wav file
        waveform, sample_rate = torchaudio.load(wav_path)
        # Keep only the leading samples of each audio file for consistency
        waveform = waveform[:, :MAX_SAMPLES]
        # Apply Mel Spectrogram transformation on the waveform (generates a tensor)
        specgram = mel_transform(waveform)
        # Take the log2 of the spectrogram, keep the first channel only, and
        # convert to a numpy array for exportation as image
        specgram = specgram.log2()[0, :, :].detach().numpy()
        # Scale (by 1), take absolute values, and convert to 8-bit for exportation in matplotlib
        specgram = cv2.convertScaleAbs(specgram, alpha=1)

        # Generate the destination filepath for the transformed spectrogram image
        new_path = os.path.join(
            IMAGE_ROOT, split_name, str(label), id_dir + '_' + clip_name + '.jpg'
        )
        # Create the <split>/<label> directory on first use; imsave does not create it
        os.makedirs(os.path.dirname(new_path), exist_ok=True)
        # For verification purposes
        print(new_path)
        # Save the image in grayscale as the image channel brings no classifying information
        plt.imsave(new_path, specgram, cmap='gray')