### Update : 
#### Added audio widgets so the effect of each change can be heard

# 🦉 Cornell Birdcall Identification:
![](https://imgc.artprintimages.com/img/print/a-tawny-frogmouth-owl-podargus-strigoides-at-the-fort-worth-zoo_u-l-pncfu00.jpg?h=550&p=0&w=550&background=fbfbfb)

# Introduction:
### This notebook aims at analysing various transformations and important functions that can be used to encode/transform audio data

In [None]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import librosa
import librosa.display

import IPython.display as ipd

## Picking a random audio file for all the operations moving forward

In [None]:
# Alternative sample recording:
# filename = "../input/birdsong-recognition/train_audio/nutwoo/XC462016.mp3"
filename = '../input/birdsong-recognition/train_audio/balori/XC101614.mp3'

# Load the file; the result is unpacked into the sample tensor and its rate.
waveform, sample_rate = torchaudio.load(filename)

print("Shape of waveform {}".format(waveform.size()))
print("Sample rate of waveform {}".format(sample_rate))

# Plot the transposed tensor, converted to NumPy explicitly like the other
# plotting cells in this notebook.
plt.figure(figsize=(14, 5))
plt.plot(waveform.t().numpy())


## Original Audio:

In [None]:
# Inline audio player for the untouched recording.
ipd.Audio(data=waveform, rate=sample_rate)

# Transformations:
## ***torchaudio.transforms***
### Torchaudio supports the [following](https://pytorch.org/audio/transforms.html) transformations. We will be looking at some of them here.

## Transformation : Spectrogram on log scale

In [None]:
# Apply torchaudio's Spectrogram transform with its default settings.
specgram = torchaudio.transforms.Spectrogram()(waveform)

print("Shape of Spectrogram {}".format(specgram.shape))

# Draw index 0 of the leading axis, truncated to the first 1200 columns, on a
# log2 scale. log2 is element-wise, so slicing first gives the same image.
plt.figure(figsize=(14, 5))
plt.imshow(specgram[0, :, :1200].log2().numpy(), cmap='gray')

## Transformation : MelSpectrogram on log scale

In [None]:
# Apply torchaudio's MelSpectrogram transform with its default settings.
# NOTE: this rebinds `specgram`, which a later cell feeds to compute_deltas.
specgram = torchaudio.transforms.MelSpectrogram()(waveform)

print("Shape of MelSpectrogram {}".format(specgram.shape))

# Draw index 0 of the leading axis, truncated to the first 1000 columns, on a
# log2 scale. log2 is element-wise, so slicing first gives the same image.
plt.figure(figsize=(14, 5))
plt.imshow(specgram[0, :, :1000].log2().numpy(), cmap='gray')

## Transformation : Resampling the waveform, one channel at a time

In [None]:
# Down-sample by a factor of ten. Integer division keeps the target rate an
# int; true division would hand a float to both Resample and the audio player
# (torchaudio expects whole-number resampling frequencies).
new_sample_rate = sample_rate // 10

# Index of the row of `waveform` to resample.
channel = 0

# Select one row and reshape it to (1, N) before resampling.
resampled = torchaudio.transforms.Resample(sample_rate, new_sample_rate)(
    waveform[channel, :].view(1, -1)
)

print("Shape of resampled waveform: {}".format(resampled.size()))

plt.figure(figsize=(14, 5))
plt.plot(resampled[0, :].numpy())

## Resampled Audio:

In [None]:
# Inline audio player for the resampled signal, played at the reduced rate.
ipd.Audio(data=resampled, rate=new_sample_rate)

## Transformation : Mu-Law encoding
### The signal must be between [-1,1] for Mu-Law encoding

In [None]:
# Report the value range and mean of the waveform, to check it against the
# [-1, 1] range mentioned for Mu-Law encoding.
print(
    "Min. of waveform {} \n Max. of waveform {} \n Mean of waveform {}".format(
        waveform.min(),
        waveform.max(),
        waveform.mean(),
    )
)

### Our signal is already between [-1, 1], but had it not been, we would have used the following normalizing function

In [None]:
def normalize(signal):
    """Centre *signal* on zero and scale it so its peak magnitude is 1.

    Args:
        signal: Object supporting ``mean()``, ``abs()`` and ``max()``
            (used in this notebook with a ``torch.Tensor`` waveform).

    Returns:
        The mean-subtracted signal divided by its largest absolute value.
        A constant signal has a zero peak after centring; it is returned
        as all zeros instead of being divided by zero (which gives NaN).
    """
    centered = signal - signal.mean()
    peak = centered.abs().max()
    # Guard against 0/0 -> NaN for constant (e.g. completely silent) input.
    if peak == 0:
        return centered
    return centered / peak

#Normalizing waveform
# print("After normalizing waveform...")
# print("Min. of waveform {}".format(normalize(waveform).min().item()))
# print("Max. of waveform {}".format(normalize(waveform).max().item()))
# print("Mean. of waveform {}".format(normalize(waveform).mean().item()))

In [None]:
# Mu-Law encode the waveform with the transform's default settings.
encoded = torchaudio.transforms.MuLawEncoding()(waveform)

print("Shape of encoded waveform {}".format(encoded.shape))

# Plot row 0 of the encoded signal.
plt.figure(figsize=(14, 5))
plt.plot(encoded[0].numpy())

## Mu-Law encoded Audio:

In [None]:
# Inline audio player for the Mu-Law encoded signal.
ipd.Audio(data=encoded, rate=sample_rate)

## Now let's decode this waveform

In [None]:
# Invert the Mu-Law encoding with the decoder's default settings.
reconstructed = torchaudio.transforms.MuLawDecoding()(encoded)

print("Shape of recovered waveform {}".format(reconstructed.shape))

# Plot row 0 of the decoded signal.
plt.figure(figsize=(14, 5))
plt.plot(reconstructed[0].numpy())

## Decoded Audio:

In [None]:
# Inline audio player for the decoded (reconstructed) signal.
ipd.Audio(data=reconstructed, rate=sample_rate)

### Sounds just like the original!

## Comparing original waveform with its reconstructed version

In [None]:
# Relative error is undefined where the original sample is exactly zero:
# dividing by it yields inf (or NaN for 0/0), which skews or poisons the
# median. Compare only the samples with non-zero amplitude.
nonzero = waveform != 0
if nonzero.any():
    abs_diff = (waveform - reconstructed).abs()
    err = (abs_diff[nonzero] / waveform.abs()[nonzero]).median()
else:
    # Nothing to compare against (all-silent input): report NaN explicitly.
    err = waveform.new_tensor(float("nan"))

print("Median error difference between original waveform and its reconstructed version is {:.2%}".format(err))

# Functions:
## All the transformations we have seen so far rely on stateless *functions* for their computations, which are available under *torchaudio.functional*
## torchaudio.functional
### Functions to perform common audio operations [Link](https://pytorch.org/audio/functional.html)

## Functional: Mu Law encoding using functional 

In [None]:
# Same Mu-Law encoding as before, this time through the functional API with
# the number of quantization channels passed explicitly.
mu_law_encoding_waveform = torchaudio.functional.mu_law_encoding(
    waveform,
    quantization_channels=256,
)

print("Shape of transformed waveform: {}".format(mu_law_encoding_waveform.shape))

# Plot row 0 of the encoded signal.
plt.figure(figsize=(14, 5))
plt.plot(mu_law_encoding_waveform[0].numpy())

### Observe how the output from *torchaudio.functional.mu_law_encoding* is the same as the output from *torchaudio.transforms.MuLawEncoding*

## Functional: Compute_deltas 
### To compute delta coefficients of a tensor

In [None]:
# Delta coefficients of `specgram` (last assigned in the MelSpectrogram cell),
# using a window length of 3.
computed = torchaudio.functional.compute_deltas(specgram.contiguous(), win_length=3)

print("Shape of Computed deltas {}".format(computed.size()))

# Deltas are signed, and log2 of a negative value is NaN, which leaves holes
# in the image. Plot the log2 of the magnitude instead so every coefficient
# is drawn.
plt.figure(figsize=(14, 5))
plt.imshow(computed[0, :, :1000].abs().log2().numpy(), cmap='gray')

## Functional: gain
### Applies amplification/attenuation to the whole waveform

In [None]:
# Apply a 5 dB gain to the whole waveform.
gain_waveform = torchaudio.functional.gain(waveform, gain_db=5.0)

# Report the value range and mean after the gain has been applied.
print(
    "Min. of gain_waveform {} \nMax. of gain_waveform {} \nMean of gain_waveform {}".format(
        gain_waveform.min(),
        gain_waveform.max(),
        gain_waveform.mean(),
    )
)

## gain_waveform Audio:

In [None]:
# Inline audio player for the gain-adjusted signal.
ipd.Audio(data=gain_waveform, rate=sample_rate)

## Functional: dither
### Increases the perceived dynamic range of audio stored at a particular bit-depth

In [None]:
# Apply dithering with the function's default settings.
dither_waveform = torchaudio.functional.dither(waveform)

# Report the value range and mean of the dithered signal.
print(
    "Min of dither_waveform: {}\nMax of dither_waveform: {}\nMean of dither_waveform: {}".format(
        dither_waveform.min(),
        dither_waveform.max(),
        dither_waveform.mean(),
    )
)

## Dither_waveform Audio:

In [None]:
# Inline audio player for the dithered signal.
ipd.Audio(data=dither_waveform, rate=sample_rate)

# Applying Filters to our waveform: using torchaudio.functional

## Filters : Low-pass filter(Second order)

In [None]:
# Biquad low-pass filter with a cutoff frequency of 3000.
lowpass_waveform = torchaudio.functional.lowpass_biquad(
    waveform,
    sample_rate,
    cutoff_freq=3000,
)

# Report the value range and mean of the filtered signal.
print(
    "Min. of lowpass_waveform: {}\nMax. of lowpass_waveform: {}\nMean of lowpass_waveform: {}".format(
        lowpass_waveform.min(),
        lowpass_waveform.max(),
        lowpass_waveform.mean(),
    )
)

# Plot the transposed filtered signal.
plt.figure(figsize=(14, 5))
plt.plot(lowpass_waveform.t().numpy())

## Filtered (low-pass) Audio:

In [None]:
# Inline audio player for the low-pass filtered signal.
ipd.Audio(data=lowpass_waveform, rate=sample_rate)

## Filters : High-pass filter(Second order)

In [None]:
# Biquad high-pass filter with a cutoff frequency of 2000.
highpass_waveform = torchaudio.functional.highpass_biquad(
    waveform,
    sample_rate,
    cutoff_freq=2000,
)

# Report the value range and mean of the filtered signal.
print(
    "Min of highpass_waveform: {}\nMax of highpass_waveform: {}\nMean of highpass_waveform: {}".format(
        highpass_waveform.min(),
        highpass_waveform.max(),
        highpass_waveform.mean(),
    )
)

# Plot the transposed filtered signal.
plt.figure(figsize=(14, 5))
plt.plot(highpass_waveform.t().numpy())

## Filtered (high-pass) Audio:

In [None]:
# Inline audio player for the high-pass filtered signal.
ipd.Audio(data=highpass_waveform, rate=sample_rate)

### Reference:    torchaudio [tutorial](https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html#functional)

# If you like my kernel, do upvote 🕊️🕊️🕊️