In [None]:
from google.colab import drive
import sys
# Mount Google Drive into the Colab runtime. With force_remount=False an
# already-mounted drive is left in place instead of being remounted.
drive.mount('/content/drive/', force_remount=False)

# Path of the "CRF table.xlsx" spreadsheet on the mounted drive.
pat_data_sheet_path = "/content/drive/MyDrive/Study_materials/Voice disorder detection project/data/Raw_data/CRF table.xlsx"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
from random import sample
import torch
from torch.nn import functional as F
import torch.nn as nn
import numpy as np
import pandas as pd
import torchaudio.transforms as ta_trans

In [None]:
class ToOneHot(nn.Module):
    """Encode a class label as a one-hot ``int64`` vector.

    Args:
        num_classes: Length of the one-hot vector. Defaults to 10, the value
            that used to be hard-coded, so ``ToOneHot()`` behaves as before.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.num_classes = int(num_classes)

    def __call__(self, classification):
        """Return the one-hot encoding of ``classification``.

        Args:
            classification: Class index to encode. A ``bool`` is treated as an
                already-final label and is returned unchanged.

        Returns:
            The input itself when it is a ``bool``; otherwise the result of
            ``F.one_hot`` with ``num_classes`` classes, with all size-1 axes
            squeezed away.
        """
        if isinstance(classification, bool):
            return classification
        label = torch.Tensor([classification]).to(torch.int64)
        # Wrapping the label in a list adds a leading size-1 axis; squeeze()
        # removes it (together with any other size-1 axis).
        return F.one_hot(label, self.num_classes).squeeze()
class CFloat(nn.Module):
    """Cast a tensor to the complex dtype produced by ``Tensor.cfloat()``."""

    def __call__(self, data):
        """Return ``data.cfloat()``; ``data`` must expose a ``cfloat`` method."""
        return data.cfloat()
class ToTensor(nn.Module):
    """Convert a NumPy array into a ``float32`` tensor."""

    def __init__(self):
        # nn.Module keeps its bookkeeping (_parameters, _modules, ...) in
        # attributes created by its own __init__; without this call repr(),
        # .parameters() and .to() raise AttributeError on the instance.
        super().__init__()
        # Retained for backward compatibility. __call__ does not move data to
        # this device, and the CPU fallback is the plain string "cpu" rather
        # than a torch.device.
        self.device = torch.device('cuda') if torch.cuda.is_available() else "cpu"

    def __call__(self, sample):
        """Return ``sample`` as a float tensor.

        Args:
            sample: A ``numpy.ndarray`` accepted by ``torch.from_numpy``.

        Returns:
            The result of ``torch.from_numpy(sample).float()``.
        """
        return torch.from_numpy(sample).float()

class Inflate(nn.Module):
    """Reshape ``sample`` to ``(1, 1, len(sample))``."""

    def __call__(self, sample):
        """Return ``sample`` with two leading size-1 axes added."""
        n_steps = len(sample)
        return sample.reshape(1, 1, n_steps)

class Deflate(nn.Module):
    """Reshape ``sample`` to one dimension whose length is ``sample.shape[2]``."""

    def __call__(self, sample):
        """Return ``sample`` reshaped to ``(sample.shape[2],)``."""
        n_steps = sample.shape[2]
        return sample.reshape(n_steps)


class PadWhiteNoise(nn.Module):
    """Pad a signal to a fixed length with low-amplitude Gaussian noise.

    Args:
        length: Target number of samples. Signals longer than this are
            returned untouched.
    """

    def __init__(self, length):
        # Initialise nn.Module so repr(), .parameters() and .to() work.
        super().__init__()
        self.length = length

    def __call__(self, sample, sr=50000):
        """Append noise to ``sample`` until it holds ``self.length`` entries.

        Args:
            sample: Floating-point tensor; its length is taken with
                ``len(sample)`` and it is concatenated with 1-D noise.
            sr: Unused; kept so existing callers that pass it keep working.

        Returns:
            ``sample`` itself when it is longer than ``self.length``;
            otherwise a new tensor made of ``sample`` followed by noise.
        """
        if len(sample) > self.length:
            return sample

        mean = sample.mean()
        variance = sample.var()
        # var() is NaN for fewer than two samples and mean() is NaN for an
        # empty tensor. A NaN would otherwise be handed to torch.normal, so
        # fall back to 0 for either statistic.
        loc = 0.0 if torch.isnan(mean) else mean.item()
        scale = 0.0 if torch.isnan(variance) else variance.item()

        # NOTE(review): torch.normal's second argument is a standard deviation
        # but the variance is passed here; together with the 1/1200000 scaling
        # this is kept numerically unchanged - confirm the intended amplitude.
        noise = torch.normal(
            loc,
            scale,
            size=(self.length - len(sample),),
            device=sample.device,
        ) / 1200000
        # Both tensors already live on sample.device, so no extra .to() call.
        return torch.cat((sample, noise))


class Truncate(nn.Module):
    """Keep the first ``N`` entries of a signal and return them as one row.

    Args:
        N: Maximum number of leading entries to keep; converted with ``int``.
    """

    def __init__(self, N):
        # Initialise nn.Module so repr(), .parameters() and .to() work.
        super().__init__()
        self.N = int(N)

    def __call__(self, sample):
        """Return ``sample[:N]`` reshaped to ``(1, -1)``.

        Inputs shorter than ``N`` are not padded; the slice simply keeps
        everything that is there.
        """
        return sample[:self.N].reshape(1, -1)

class WaveformToInput(torch.nn.Module):
    """Turn a raw waveform into log-mel patches.

    The STFT / mel settings are read from ``CommonParams`` and the patch hop
    settings from ``YAMNetParams``.
    """

    def __init__(self):
        super().__init__()
        target_rate = CommonParams.TARGET_SAMPLE_RATE
        win_samples = int(round(target_rate * CommonParams.STFT_WINDOW_LENGTH_SECONDS))
        hop_samples = int(round(target_rate * CommonParams.STFT_HOP_LENGTH_SECONDS))
        # Smallest power of two that is not below the window length.
        n_fft = 2 ** int(np.ceil(np.log(win_samples) / np.log(2.0)))

        # Guard against configuration drift: the rest of the pipeline expects
        # exactly these framing values.
        assert win_samples == 400
        assert hop_samples == 160
        assert n_fft == 512

        self.mel_trans_ope = VGGishLogMelSpectrogram(
            target_rate,
            n_fft=n_fft,
            win_length=win_samples,
            hop_length=hop_samples,
            f_min=CommonParams.MEL_MIN_HZ,
            f_max=CommonParams.MEL_MAX_HZ,
            n_mels=CommonParams.NUM_MEL_BANDS,
        )
        # Note that the STFT filtering logic is exactly the same as that of a
        # conv kernel: the center of the kernel, not its left edge, is aligned
        # with the start of the signal.

    # TODO: move hard-coded numbers into configuration.
    def __call__(self, waveform):
        """Return the first log-mel patch of ``waveform``.

        The waveform is assumed to be sampled at
        ``CommonParams.SVD_SAMPLE_RATE``. Only index 0 along the leading axis
        of the patch tensor is returned; the NumPy spectrogram computed by
        ``wavform_to_log_mel`` is discarded.
        """
        patches, _ = self.wavform_to_log_mel(
            waveform=waveform, sample_rate=CommonParams.SVD_SAMPLE_RATE
        )
        first_patch = patches[0]
        return first_patch.reshape(*patches.shape[1:])

    def wavform_to_log_mel(self, waveform, sample_rate):
        '''
        Args:
            waveform: torch tsr [num_audio_channels, num_time_steps]
            sample_rate: per second sample rate of ``waveform``
        Returns:
            tuple ``(patches, spectrogram)``: ``patches`` is a 4-D torch tsr
            ``[num_chunks, 1, frames, num_freq]`` (``num_chunks`` is 1 when the
            patch hop differs from the patch window); ``spectrogram`` is a
            NumPy copy of the un-chunked ``[T, C]`` log-mel frames.
        '''
        mono = waveform.mean(axis=0, keepdims=True)  # average over channels
        to_target_rate = ta_trans.Resample(sample_rate, CommonParams.TARGET_SAMPLE_RATE)
        frames = self.mel_trans_ope(to_target_rate(mono))
        frames = frames.squeeze(dim=0).T  # [1, C, T] -> [T, C]
        spectrogram = frames.cpu().numpy().copy()

        frames_per_patch = int(round(
            CommonParams.PATCH_WINDOW_IN_SECONDS / CommonParams.STFT_HOP_LENGTH_SECONDS
        ))

        if YAMNetParams.PATCH_HOP_SECONDS == YAMNetParams.PATCH_WINDOW_SECONDS:
            # Non-overlapping windows: drop the trailing frames that do not
            # fill a whole patch, then split into equal chunks.
            n_patches = frames.shape[0] // frames_per_patch
            usable = frames[:n_patches * frames_per_patch]
            patches = usable.reshape(n_patches, 1, frames_per_patch, usable.shape[-1])
            return patches, spectrogram

        # Custom hop length: keep the frames covered by the sliding windows
        # and return them as a single chunk.
        # TODO performance optimization with zero copy
        hop_frames = int(round(
            YAMNetParams.PATCH_HOP_SECONDS / CommonParams.STFT_HOP_LENGTH_SECONDS
        ))
        n_windows = (frames.shape[0] - frames_per_patch) // hop_frames + 1
        usable = frames[:frames_per_patch + (n_windows - 1) * hop_frames]
        patches = usable.reshape(1, 1, -1, usable.shape[-1])
        return patches, spectrogram

class VGGishLogMelSpectrogram(ta_trans.MelSpectrogram):
    '''
    Log mel-spectrogram transform that follows the input processing pipeline
    of Google's VGGish model.
    '''

    def forward(self, waveform):
        r"""
        Args:
            waveform (torch.Tensor): Tensor of audio of dimension (..., time)

        Returns:
            torch.Tensor: Log mel frequency spectrogram of size (..., ``n_mels``, time)
        """
        # NOTE: at mel_features.py:98 the reference implementation applies
        # np.abs to the FFT output, i.e. it uses the magnitude (power 1).
        # torchaudio's MelSpectrogram defaults its spectrogram to power 2.0,
        # so the square root is taken here instead of changing the power
        # argument in the constructor.
        magnitude = self.spectrogram(waveform) ** 0.5

        mel_magnitude = self.mel_scale(magnitude)
        # LOG_OFFSET is added before the logarithm.
        return torch.log(mel_magnitude + CommonParams.LOG_OFFSET)
