In [1]:
import torch
from argparse import Namespace

![](../imgs/framework.png)

In [None]:
As shown in the image, our method processes the input data using the following steps:
1. convert the input speech into Log-Frequency Spectorgram.

# Torch model

In [2]:
from model import AudioModel

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
cfg = Namespace(
    one_stem=False,
    use_f0_loss=False,
    style_shuffle=True,
    feat_shuffle=True,
    feature_extractor="ResNet",
    vocoder_classes=8,
    betas=[1, 1, 0.5, 0.5],
    aug_policy="ss",
    use_speed_loss=True,
    use_compression_loss=True,
    feat_con_loss=True
)

In [4]:
demo_model = AudioModel(feature_extractor='ResNet', cfg=cfg)

  WeightNorm.apply(module, name, dim)


In [5]:
x = torch.randn(3, 1, 257, 257)
test_res = demo_model.forward(x, stage='test')
for k, v in test_res.items():
    print(k, v.shape)

hidden_states torch.Size([3, 256, 17, 17])
content_feature torch.Size([3, 512])
speed_logit torch.Size([3, 16])
compression_logit torch.Size([3, 10])
vocoder_feature torch.Size([3, 512])
vocoder_logit torch.Size([3, 9])
content_voc_logit torch.Size([3, 9])
feature torch.Size([3, 1024])
logit torch.Size([3])


In [6]:
x = torch.randn(3, 1, 257, 257)
batch = {
    'label' : torch.randint(0,2, (3,))
}
train_res = demo_model.forward(x, stage="train", batch=batch)
print("print train res")
print("#"*10)
for k, v in train_res.items():
    print(k, v.shape)
print("#"*10, '\n', "print batch res")
print("#"*10)
for k, v in batch.items():
    print(k, v.shape)

print train res
##########
hidden_states torch.Size([3, 256, 17, 17])
content_feature torch.Size([3, 512])
speed_logit torch.Size([3, 16])
compression_logit torch.Size([3, 10])
vocoder_feature torch.Size([3, 512])
vocoder_logit torch.Size([3, 9])
content_voc_logit torch.Size([3, 9])
feature torch.Size([3, 1024])
logit torch.Size([3])
shuffle_logit torch.Size([3])
########## 
 print batch res
##########
label torch.Size([3])
shuffle_label torch.Size([3])


# Lit Model

We use the pytorch_lightning to process the data flow, compute the loss and train the model.

In [7]:
from lit_model import AudioModel_lit

In [25]:
lit_model = AudioModel_lit(cfg=cfg)

In [26]:
x = torch.randn(3, 1, 48000)
batch = {
    "label": torch.randint(0, 2, (3,)),
    "audio": x,
    "sample_rate": 16000,
    "speed_label": torch.randint(0, 10, (3,)),
    "compression_label": torch.randint(0, 10, (3,)),
}

lit_model._shared_eval_step(batch=batch, batch_idx=0)

/Volumes/GEIL2T/Softwares/anaconda3/envs/RobustSpeechDetection/lib/python3.9/site-packages/pytorch_lightning/core/module.py:441: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


{'hidden_states': tensor([[[[3.8436e-01, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           [9.3586e-01, 4.7689e-01, 1.5935e+00,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           [1.0593e+00, 2.2496e-01, 9.2046e-03,  ..., 0.0000e+00,
            0.0000e+00, 0.0000e+00],
           ...,
           [2.8328e+00, 2.7327e+00, 1.4546e+00,  ..., 2.0292e+00,
            2.2150e+00, 2.2302e+00],
           [1.5272e-01, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
            0.0000e+00, 9.1004e-01],
           [1.9589e+00, 1.2398e+00, 1.2448e+00,  ..., 2.2864e+00,
            1.6821e+00, 1.9849e+00]],
 
          [[8.0397e-01, 4.4891e-01, 4.6121e-01,  ..., 7.7326e-01,
            7.7137e-01, 8.7131e-01],
           [2.8932e+00, 6.2080e-01, 0.0000e+00,  ..., 1.1709e+00,
            3.2606e+00, 2.6546e+00],
           [3.8176e+00, 2.5936e+00, 3.1476e+00,  ..., 1.1554e-01,
            4.2990e-01, 7.6645e-01],
           ...,
           [2.6201e+0