# Preprocessing existing numpy file

In [None]:
# These imports are repeated here because the notebook's import cell sits
# below this one; without them a fresh "Restart & Run All" fails with
# NameError on `pd` / `np`.
import numpy as np
import pandas as pd

# Show the stored records as a table with named columns.
# NOTE(review): allow_pickle=True unpickles arbitrary objects while loading;
# only use it on files from a trusted source.
pd.DataFrame(
    np.load(
        '/home/vedant/projects/ScreamDetection/resources/working_data/data_with_vggish.npy',
        allow_pickle=True,
    ),
    columns=['video_id','start_time','mid_ts','label','audio','vggish'],
)

# Creating ScreamDataset Class

In [2]:
from torch.utils.data import Dataset
import torch
import pandas as pd
import torchaudio
import os
import numpy as np

In [None]:
class UrbanSoundDataset(Dataset):
    """Audio dataset driven by a CSV annotation table.

    For each row of the annotation table, the audio file is loaded, brought
    to a fixed sample rate, channel count and length, and then passed
    through ``transformation``.

    Positional columns read from the annotation table:
        * column 0 - file name of the clip
        * column 5 - fold number (the clip lives in ``<audio_dir>/fold<N>/``)
        * column 6 - label returned alongside the signal

    Args:
        annotation_file: Path of the CSV file read with ``pd.read_csv``.
        audio_dir: Root directory containing the ``fold<N>`` sub-directories.
        transformation: Module applied to every prepared signal; it is moved
            to ``device`` once, at construction time.
        target_sample_rate: Sample rate every signal is resampled to.
        num_samples: Exact length (second dimension) of every returned signal
            before the transformation is applied.
        device: Device that signals and transforms are moved to.
    """

    def __init__(self, annotation_file,audio_dir,transformation,
                target_sample_rate,num_samples,device):
        self.annotations = pd.read_csv(annotation_file)
        self.audio_dir = audio_dir
        self.device = device
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.transformation = transformation.to(self.device)

    def __len__(self):
        """Return how many annotated clips the dataset holds."""
        return len(self.annotations)

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
        fold_number = self.annotations.iloc[index, 5]
        file_name = self.annotations.iloc[index, 0]
        return os.path.join(self.audio_dir, f"fold{fold_number}", file_name)

    def _get_audio_sample_label(self, index):
        """Return the value stored in column 6 of row ``index``."""
        return self.annotations.iloc[index, 6]

    def _resample_if_necessary(self,signal,sr):
        """Resample ``signal`` from ``sr`` to the target rate when they differ."""
        if sr == self.target_sample_rate:
            return signal
        resampler = torchaudio.transforms.Resample(sr,self.target_sample_rate)
        return resampler.to(self.device)(signal)

    def _mix_down_if_necessary(self,signal):
        """Average over the first dimension when it holds more than one channel."""
        if signal.shape[0] <= 1:
            return signal
        return torch.mean(signal,dim=0,keepdim=True)

    def _right_pad_if_necessary(self,signal):
        """Zero-pad the end of the last dimension up to ``num_samples``."""
        deficit = self.num_samples - signal.shape[1]
        if deficit > 0:
            # Padding tuple is (left, right) for the last dimension.
            signal = torch.nn.functional.pad(signal,(0,deficit))
        return signal

    def _cut_if_necessary(self,signal):
        """Drop everything beyond the first ``num_samples`` samples."""
        if signal.shape[1] <= self.num_samples:
            return signal
        return signal[:,:self.num_samples]

    def __getitem__(self,index):
        """Return ``(transformed_signal, label)`` for row ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        waveform, source_rate = torchaudio.load(audio_sample_path)
        waveform = waveform.to(self.device)
        # Normalise rate -> channels -> length, in that order.
        waveform = self._resample_if_necessary(waveform, source_rate)
        waveform = self._mix_down_if_necessary(waveform)
        waveform = self._right_pad_if_necessary(waveform)
        waveform = self._cut_if_necessary(waveform)
        # Finally apply the configured transformation.
        return self.transformation(waveform), label



In [3]:
class ScreamDataset(Dataset):
    """Dataset of audio records stored in a pickled NumPy array file.

    The array at ``numpy_file`` is loaded into a DataFrame whose columns are
    named by ``schema``. Items are read positionally: column 3 is returned
    as the label and column 4 is used as the raw signal.

    Args:
        numpy_file: Path of the ``.npy`` file to load.
        schema: Column names for the loaded array, one per column.
        transformation: Module applied to each signal; it is moved to
            ``device`` once, at construction time.
        device: Device the transformation and each signal are moved to.
    """

    def __init__(self,numpy_file,schema,transformation,device):
        # NOTE(review): allow_pickle=True unpickles arbitrary objects while
        # loading; only point this at files from a trusted source.
        self.data = pd.DataFrame(np.load(numpy_file,allow_pickle=True),columns=schema)
        self.device = device
        self.transformation = transformation.to(self.device)

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        # Measure the underlying frame: calling len(self) here would re-enter
        # this method and recurse until RecursionError.
        return len(self.data)

    def _get_label(self,index):
        """Return the value stored in column 3 of row ``index``."""
        label = self.data.iloc[index, 3]
        return label

    def __getitem__(self,index):
        """Return ``(transformed_signal, label)`` for row ``index``.

        The stored signal is converted to a float32 tensor, moved to
        ``self.device`` and passed through ``self.transformation``.
        """
        label = self._get_label(index)
        raw_signal = self.data.iloc[index, 4]
        # presumably each cell holds a numeric NumPy array — TODO confirm
        # against the file. NumPy arrays have neither .to_numpy() nor
        # .to(device), so build a tensor explicitly before moving it.
        if torch.is_tensor(raw_signal):
            signal = raw_signal.to(dtype=torch.float32)
        else:
            signal = torch.as_tensor(np.asarray(raw_signal, dtype=np.float32))
        signal = signal.to(self.device)
        #Transformations
        signal = self.transformation(signal)
        return signal, label
 


In [4]:
# Inputs for the ScreamDataset: the stored array file and its column names.
np_path = '/home/vedant/projects/ScreamDetection/resources/working_data/data_with_vggish.npy'
schema = ['video_id', 'start_time', 'mid_ts', 'label', 'audio', 'vggish']

# Transformation applied to every signal returned by the dataset.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=44100, n_fft=1024, hop_length=512, n_mels=64
)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

sd = ScreamDataset(np_path, schema, mel_spectrogram, DEVICE)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Fetch the first (signal, label) pair from the ScreamDataset built above.
sd[0]

In [None]:
if __name__ == "__main__":
    # Locations of the annotation CSV and the audio root directory.
    ANNOTATIONS_FILE = '/home/vedant/projects/ScreamDetection/CNN/data/UrbanSound8K/metadata/UrbanSound8K.csv'
    AUDIO_DIR = '/home/vedant/projects/ScreamDetection/CNN/data/UrbanSound8K/audio'

    # Target sample rate and fixed signal length handed to the dataset.
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050

    # Use the GPU when one is visible to torch, otherwise stay on the CPU.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {DEVICE}")

    # Transformation applied to every prepared signal.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
    )

    usd = UrbanSoundDataset(
        ANNOTATIONS_FILE,
        AUDIO_DIR,
        mel_spectrogram,
        SAMPLE_RATE,
        NUM_SAMPLES,
        DEVICE,
    )

    print(f"There are {len(usd)} samples in the dataset")
    signal, label = usd[0]

    # NOTE(review): looks like a leftover breakpoint anchor — confirm it is
    # unused and remove.
    a = 1