In [None]:
!pip install python_speech_features
!pip install librosa

# Importing Packages

In [None]:
import os
import random
import warnings

import IPython.display as ipd
import librosa   #for audio processing
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.io import wavfile #for audio processing
warnings.filterwarnings("ignore")

# Loading the audio files

In [None]:
def tran_loader(filename):
    """Parse a transcription file into a ``{recording name: text}`` dict.

    Each usable line has the form ``<s> text </s> (name)``: the part before
    ``</s>`` (with the ``<s>`` marker removed) is the text, and the part after
    it, with parentheses, spaces and the newline removed, is the name.

    Args:
        filename: Path of the UTF-8 encoded transcription file.

    Returns:
        dict mapping each recording name to its transcription text. The text
        keeps whatever surrounding whitespace it has in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    name_to_text = {}
    with open(filename, encoding="utf-8") as f:
        # The first line is always discarded.
        # NOTE(review): presumably a header line -- confirm the file has one,
        # otherwise the first utterance is silently dropped.
        f.readline()
        for line in f:
            parts = line.split("</s>")
            if len(parts) < 2:
                # Blank or malformed line (no "</s>"): skip it instead of
                # crashing with IndexError.
                continue
            text = parts[0].replace("<s>", "")
            name = parts[1]
            for unwanted in ("(", ")", "\n", " "):
                name = name.replace(unwanted, "")
            name_to_text[name] = text
    return name_to_text

In [None]:
# Load the {recording name: text} mapping from the training transcription file.
# NOTE(review): this is a hard-coded, machine-specific absolute path, so the
# cell only runs where that file exists -- consider a configurable data dir.
transcription=tran_loader(r"C:\Users\sam\Desktop\lang\AMHARIC\data\trsTrain.txt")

In [None]:
def meta_data(trans, path ):
    """Collect the wav path, transcript and duration of every recording.

    Args:
        trans: dict mapping a recording name (file name without ``.wav``) to
            its transcription text.
        path: Directory containing the ``.wav`` files. A trailing separator is
            optional.

    Returns:
        Tuple ``(filenames, target, duration_of_recordings)`` of three lists in
        the iteration order of ``trans``: the wav file paths, the transcripts,
        and the durations in seconds.
    """
    filenames = []
    target = []
    duration_of_recordings = []
    for name, label in trans.items():
        # os.path.join inserts the separator only when it is missing, so both
        # 'valid/' and 'valid' resolve to the same file.
        filename = os.path.join(path, name + ".wav")
        filenames.append(filename)
        # sr=None keeps the file's native sample rate, so that
        # samples / rate is the true duration in seconds.
        audio, fs = librosa.load(filename, sr=None)
        duration_of_recordings.append(float(len(audio) / fs))
        target.append(label)
    return filenames, target, duration_of_recordings

In [None]:
# Build the wav path, transcript and duration lists for every transcribed
# recording, looking for the audio under 'valid/' (relative to the working dir).
# NOTE(review): the transcripts were loaded from trsTrain.txt but the audio is
# read from 'valid/' -- confirm these two belong to the same split.
filenames, target,duration_of_recordings= meta_data(transcription,'valid/')

In [None]:
import pandas as pd 
# One row per recording: wav path ('key'), transcript ('text') and length in
# seconds ('duration').
data=pd.DataFrame({'key': filenames,'text': target, 'duration':duration_of_recordings})

In [None]:
# Preview the first rows of the metadata table.
data.head()

In [None]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

class Audio_load():
    """Stateless helpers for loading, normalising, augmenting and plotting audio.

    All helpers are static and are called on the class itself, e.g.
    ``Audio_load.open(path)``. An ``aud`` argument is always a ``(sig, sr)``
    tuple where ``sig`` is a torch tensor shaped ``[channel, time]`` and ``sr``
    is the sample rate in Hz.
    """

    # ----------------------------
    # Load an audio file. Return the signal as a tensor and the sample rate
    # ----------------------------
    @staticmethod
    def open(audio_file, sr=44100):
        """Load ``audio_file`` with librosa, resampled to ``sr`` Hz.

        Returns ``(sig, sr)``. librosa yields a 1-D array for mono audio, so
        the signal is promoted to ``[1, time]`` to give the other helpers the
        channel dimension they index.
        """
        sig, sr = librosa.load(audio_file, sr=sr)
        sig = torch.from_numpy(np.atleast_2d(sig))
        return (sig, sr)

    # ----------------------------
    # Convert the audio to the desired number of channels
    # ----------------------------
    @staticmethod
    def rechannel(aud, new_channel):
        """Return ``aud`` with exactly ``new_channel`` channels.

        Down-mixing to one channel keeps only the first channel; any other
        target count is built by repeating the first channel.
        """
        sig, sr = aud

        if sig.shape[0] == new_channel:
            # Nothing to do
            return aud

        first_channel = sig[:1, :]
        if new_channel == 1:
            # Convert to mono by selecting only the first channel
            resig = first_channel
        else:
            # Convert by duplicating the first channel
            resig = torch.cat([first_channel] * new_channel)

        return (resig, sr)

    # ----------------------------
    # Resample the audio to a new sample rate
    # ----------------------------
    @staticmethod
    def resample(aud, newsr):
        """Return ``aud`` resampled to ``newsr`` Hz (unchanged if already there)."""
        sig, sr = aud

        if sr == newsr:
            # Nothing to do
            return aud

        # Resample operates along the last (time) axis, so every channel is
        # converted in a single call.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)
        return (resig, newsr)

    # ----------------------------
    # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
    # ----------------------------
    @staticmethod
    def pad_trunc(aud, max_ms):
        """Return ``aud`` cut or zero-padded to exactly ``max_ms`` milliseconds.

        Short signals are padded with a random split of zeros between the
        beginning and the end.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: sr // 1000 would drop the fractional
        # samples-per-millisecond (44100 Hz -> 44 instead of 44.1).
        max_len = int(sr * max_ms // 1000)

        if sig_len > max_len:
            # Truncate the signal to the given length
            sig = sig[:, :max_len]

        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s of the same dtype/device as the signal
            pad_begin = sig.new_zeros((num_rows, pad_begin_len))
            pad_end = sig.new_zeros((num_rows, pad_end_len))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sr)

    # ----------------------------
    # Shift the signal in time by a random amount, wrapping around the end
    # ----------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        """Circularly shift the signal by a random fraction of its length.

        The fraction is drawn uniformly from ``[0, shift_limit)``.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only; rolling the flattened tensor would
        # wrap samples from one channel into the next.
        return (sig.roll(shift_amt, dims=-1), sr)

    # ----------------------------
    # Generate a Mel spectrogram in decibels
    # ----------------------------
    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of ``aud`` converted to decibels."""
        sig, sr = aud
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = torchaudio.transforms.MelSpectrogram(
            sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(spec)
        return (spec)

    # ----------------------------
    # Augment the spectrogram by masking frequency bands and time steps
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply frequency and time masks to ``spec``.

        Each mask spans at most ``max_mask_pct`` of its axis and is filled
        with the mean value of the spectrogram.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = torchaudio.transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = torchaudio.transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec

    # ----------------------------
    # Plotting helpers
    # ----------------------------
    @staticmethod
    def show_wave(aud, label='', ax=None):
        """Plot the first channel of ``aud`` on ``ax`` (a new 3x3 figure if None)."""
        sig, sr = aud
        if ax is None:
            _, ax = plt.subplots(1, 1, figsize=(3, 3))

        ax.plot(sig[0])
        ax.set_title(label)

    @staticmethod
    def show_transform(orig, trans):
        """Overlay the first channel of the original and transformed audio.

        Either argument may be ``None``, in which case that curve is skipped.
        """
        # Unpack inside the guards so that passing None really is supported.
        if orig is not None:
            osig, _ = orig
            plt.plot(osig[0], 'm', label="Orig.")
        if trans is not None:
            tsig, _ = trans
            plt.plot(tsig[0], 'c', alpha=0.5, label="Transf.")
        plt.legend()
        plt.show()

    @staticmethod
    def show_spectro(spec, label='', ax=None, figsize=(6,6)):
        """Show ``spec`` as an image, titled with ``label`` and its shape."""
        if ax is None:
            _, ax = plt.subplots(1, 1, figsize=figsize)
        # Reduce first dimension if it is greyscale
        ax.imshow(spec if (spec.shape[0]==3) else spec.squeeze(0))
        ax.set_title(f'{label}, {list(spec.shape)}')
    

In [None]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
    """Dataset that yields ``(augmented Mel spectrogram, label)`` pairs.

    Each item is loaded from disk and pushed through the ``Audio_load``
    pipeline: resample, rechannel, pad/truncate, random time shift, Mel
    spectrogram, then frequency/time masking.

    Args:
        df: DataFrame with one row per audio file.
        data_path: Prefix that is concatenated in front of each file path.
        path_col: Name of the column holding the file path relative to
            ``data_path``.
        label_col: Name of the column holding the label returned with each
            spectrogram.
    """
    def __init__(self, df, data_path, path_col='relative_path', label_col='classID'):
        self.df = df
        self.data_path = str(data_path)
        self.path_col = path_col
        self.label_col = label_col
        self.duration = 4000    # clip length in milliseconds
        self.sr = 44100         # target sample rate in Hz
        self.channel = 2        # target number of channels
        self.shift_pct = 0.4    # upper bound of the random time shift

    # ----------------------------
    # Number of items in dataset
    # ----------------------------
    def __len__(self):
        return len(self.df)

    # ----------------------------
    # Get i'th item in dataset
    # ----------------------------
    def __getitem__(self, idx):
        """Return ``(aug_sgram, class_id)`` for the ``idx``-th row of ``df``."""
        # idx is a position in 0..len-1, so index positionally: label-based
        # .loc fails or picks the wrong row when the frame's index is not
        # 0..n-1. Going column-first keeps the label's original dtype.
        # Absolute file path of the audio file - concatenate the audio directory with
        # the relative path
        audio_file = self.data_path + self.df[self.path_col].iloc[idx]
        # Get the Class ID
        class_id = self.df[self.label_col].iloc[idx]

        aud = Audio_load.open(audio_file)
        # Some sounds have a higher sample rate, or fewer channels compared to the
        # majority. So make all sounds have the same number of channels and same
        # sample rate. Unless the sample rate is the same, the pad_trunc will still
        # result in arrays of different lengths, even though the sound duration is
        # the same.
        reaud = Audio_load.resample(aud, self.sr)
        rechan = Audio_load.rechannel(reaud, self.channel)

        dur_aud = Audio_load.pad_trunc(rechan, self.duration)
        shift_aud = Audio_load.time_shift(dur_aud, self.shift_pct)
        sgram = Audio_load.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = Audio_load.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        return aug_sgram, class_id