### Task 2 Question 2:
Audio Reconstruction - Pick a 5-second audio sample of your liking. Use Random Fourier Features (RFF) and Linear Regression to learn the mapping from time (t) to amplitude (A), where t is the time point, and A is the audio amplitude at that time. Play the reconstructed audio and the original audio to demonstrate reconstruction. Calculate the Root Mean Squared Error (RMSE) and Signal-to-Noise Ratio (SNR) to evaluate the reconstruction. **[1.5 Mark]**


In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import soundfile
import torchaudio

In [30]:
from IPython.display import Audio
Audio('./gt_bach.wav')

In [31]:
audio, sr = torchaudio.load('./gt_bach.wav')
sr, audio.shape

(44100, torch.Size([1, 308207]))

In [32]:
audio = audio[0]

In [33]:
time = audio.shape[0] / sr
time

6.988820861678004

In [34]:
crop_duration = int(0.9944*sr)
cropped_audio = audio[crop_duration:-crop_duration]
cropped_audio.shape, cropped_audio.shape[0]/sr

(torch.Size([220501]), 5.000022675736961)

In [35]:
Audio(cropped_audio, rate = sr)

In [36]:
X = torch.arange(0,len(cropped_audio)).unsqueeze(1).float()
X = X/X.max() * 200 - 100
X
print(X.shape)

torch.Size([220501, 1])


In [37]:
def create_rff_features(X, num_features, sigma):
    from sklearn.kernel_approximation import RBFSampler
    rff = RBFSampler(n_components=num_features, gamma=1/(2 * sigma**2))
    X = rff.fit_transform(X)
    return X

In [38]:
num_features = 5000
sigma = 0.008

X_rff = create_rff_features(X, num_features, sigma)
X_rff.shape

(220501, 5000)

In [39]:
X_rff

array([[-0.01960973,  0.01543949, -0.01826531, ...,  0.01113905,
         0.01901612,  0.01583603],
       [-0.01996943,  0.01441375, -0.01891631, ...,  0.01228623,
         0.01953381,  0.01625846],
       [-0.01992387,  0.01330226, -0.01942268, ...,  0.01337184,
         0.01986162,  0.01665781],
       ...,
       [-0.00428488, -0.00677866,  0.00258781, ...,  0.01650404,
        -0.01912225, -0.01410334],
       [-0.00703561, -0.0082265 ,  0.000798  , ...,  0.01565746,
        -0.01961021, -0.01358902],
       [-0.00960736, -0.0095899 , -0.00097874, ...,  0.01474455,
        -0.01989989, -0.01306853]], dtype=float32)

In [40]:
X_rff_tensor = torch.tensor(X_rff, dtype=torch.float32)

In [41]:
X_rff_tensor.shape

torch.Size([220501, 5000])

In [42]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_rff, cropped_audio)

pred_audio = model.predict(X_rff)

In [43]:
print(pred_audio.shape)

(220501,)


In [44]:
Audio(pred_audio, rate = sr)

In [45]:
cropped_audio

tensor([-0.0354, -0.0360, -0.0349,  ...,  0.2139,  0.2098,  0.2065])

In [46]:
pred_audio

array([0.0421737 , 0.04149561, 0.04079884, ..., 0.03437545, 0.03299397,
       0.03182875], dtype=float32)

In [47]:
def calculate_rmse(original, reconstructed):

    original = original.numpy()
    reconstructed = reconstructed

    mse = np.mean((original - reconstructed) ** 2)
    rmse = np.sqrt(mse)
    return rmse

In [48]:
def calculate_snr(original, reconstructed):

    original = original.numpy()
    reconstructed = reconstructed

    mse = np.mean((original - reconstructed) ** 2)
    max_pixel = 1.0
    snr = 20 * np.log10(max_pixel / np.sqrt(mse))
    return snr

In [49]:
rmse = calculate_rmse(cropped_audio, pred_audio)
rmse

np.float32(0.11622695)

In [50]:
snr = calculate_snr(cropped_audio, pred_audio)
snr

np.float32(18.693863)