In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from einops import rearrange
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

In [3]:
from ay2.datasets.audio import wavefake
from ay2.torch.data.audio import WaveDataset

In [4]:
dataset = wavefake.WaveFake_AudioDs(
    root_path="/home/ay/data/DATA/2-datasets/1-df-audio/WaveFake"
)

data = dataset.data
data = data[data["label"] == 1].reset_index(drop=True)

In [5]:
path = data["audio_path"][0]

In [6]:
path

'/home/ay/data/DATA/2-datasets/1-df-audio/WaveFake/jsut_real/wav/BASIC5000_0001.wav'

In [7]:
audio, sample_rate = torchaudio.load(path)

In [8]:
ds = WaveDataset(data, max_wave_length=48000)
dl = torch.utils.data.DataLoader(
    ds, batch_size=32, num_workers=8, drop_last=True, shuffle=True
)

In [29]:
class AudioCS(nn.Module):
    def __init__(self):
        super().__init__()

        n = 1600
        m = 800

        self.phi = nn.Linear(n, m, bias=False)
        self.psi = nn.Linear(m, n, bias=False)

    def forward(self, x):
        x = rearrange(x, "b (n s) -> b n s", s=1600)
        x = self.psi(self.phi(x))
        x = rearrange(x, "b n s -> b (n s)")
        return x

In [33]:
class LitModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AudioCS()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x = batch["audio"]
        x = x[:, 0, :]
        y_hat = self(x)
        loss = torch.mean(torch.square(y_hat - x))
        self.log_dict({"loss": loss}, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)


model = LitModel()
trainer = Trainer(
    max_epochs=50,
    devices=[0],
    accelerator="gpu",
)
trainer.fit(model, train_dataloaders=dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type    | Params
----------------------------------
0 | model | AudioCS | 2.6 M 
----------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.240    Total estimated model params size (MB)


Training: |                                                                                                   …

`Trainer.fit` stopped: `max_epochs=50` reached.


In [37]:
audio = ds[1]["audio"]

In [38]:
audio

tensor([[-0.0041, -0.0041, -0.0040,  ...,  0.0307,  0.0241,  0.0177]])

In [39]:
model.model(audio)

tensor([[-0.0041, -0.0041, -0.0042,  ...,  0.0246,  0.0240,  0.0240]],
       grad_fn=<ViewBackward0>)

In [40]:
torch.mean(torch.abs(model.model(audio) - audio))

tensor(0.0054, grad_fn=<MeanBackward0>)

In [43]:
s = torchaudio.transforms.Spectrogram(n_fft=512)

In [46]:
s1 = s(audio)
s2 = s(model.model(audio))

In [48]:
s1  - s2

tensor([[[ 5.1732e-02, -5.5174e-03, -2.0109e-02,  ...,  1.7372e-02,
          -3.5412e-02,  1.9059e-02],
         [ 3.2077e-02, -8.1467e-03, -4.2563e-03,  ...,  3.3082e-02,
          -2.8403e-02,  3.7974e-02],
         [ 9.3463e-04, -8.3178e-04,  5.9568e-04,  ...,  3.0843e-02,
          -6.8149e-05,  8.5059e-02],
         ...,
         [-2.2727e-07, -2.2271e-10, -6.2995e-10,  ..., -1.1599e-07,
          -3.1666e-08,  1.2203e-05],
         [-2.1735e-07, -2.0275e-09, -3.9816e-09,  ..., -7.2028e-08,
          -7.1756e-08,  1.1380e-05],
         [-2.1771e-07, -3.0161e-09, -3.1714e-09,  ..., -3.8723e-09,
          -1.9946e-07,  1.1655e-05]]], grad_fn=<SubBackward0>)