In [7]:
import sys
sys.path.append('..')
import torch
import torch.nn as nn
import math
from dataclasses import dataclass, field
from data.unimodal import get_raw_librispeech_dataset
#from fairseq.examples.data2vec.models import Data2VecAudioModel
from fairseq.models.wav2vec import Wav2Vec2Model, Wav2Vec2Config
from src.models.data2vec_audio import Data2VecAudioModel, Data2VecAudioConfig

### Fairseq model padding mask test

In [9]:
def _get_feat_extract_output_lengths(conv_feature_layers, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            return torch.floor((input_length - kernel_size) / stride + 1)

        conv_cfg_list = conv_feature_layers

        for i in range(len(conv_cfg_list)):
            input_lengths = _conv_out_length(
                input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
            )

        return input_lengths.to(torch.long)

In [24]:
# Toy boolean padding mask for two sequences of 12 positions:
# row 0 has no padding, row 1 is padded (True) on its last four positions.
padding_mask = torch.zeros(2, 12, dtype=torch.bool)
padding_mask[1, 8:] = True
padding_mask

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False],
        [False, False, False, False, False, False, False, False,  True,  True,
          True,  True]])

In [25]:
# Invert the mask numerically (1 where not padded) and count per row.
valid_positions = 1 - padding_mask.long()
input_lengths = valid_positions.sum(dim=-1)

# Five layer specs; the helper reads index 1 as kernel size and index 2 as stride.
conv_feature_layers = [[512, 3, 1]] + [[512, 2, 1] for _ in range(4)]

output_lengths = _get_feat_extract_output_lengths(conv_feature_layers, input_lengths)

In [26]:
# Rebind padding_mask to an all-zero float tensor with 2 rows and 10 columns.
n_rows, n_cols = 2, 10
padding_mask = torch.zeros((n_rows, n_cols))
padding_mask

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [27]:
# For every row, write a 1 at column (output length - 1).
row_index = torch.arange(padding_mask.size(0))
padding_mask[row_index, output_lengths - 1] = 1

In [28]:
# Display the lengths returned by _get_feat_extract_output_lengths (one entry per mask row).
output_lengths

tensor([6, 2])

In [29]:
# Display the mask after the indexed assignment: each row should hold a
# single 1 at column (output length - 1) and zeros elsewhere.
padding_mask

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [30]:
# Reverse cumulative sum along the last axis: after flip -> cumsum -> flip,
# every position up to and including the marked column is 1, the rest 0.
mirrored = padding_mask.flip([-1])
filled_to_marker = mirrored.cumsum(-1).flip([-1])

# Invert and cast so that True appears only after the marked column.
# NOTE: this rebinds padding_mask from its own previous value, so the cell
# is not idempotent when re-run.
padding_mask = (1 - filled_to_marker).bool()
padding_mask

tensor([[False, False, False, False, False, False,  True,  True,  True,  True],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True]])

In [4]:
@dataclass
class Wav2Vec2ConfigSmall(Wav2Vec2Config):
    """Wav2Vec2Config subclass that re-declares five fields with the defaults below.

    Everything not listed here is inherited unchanged from Wav2Vec2Config.
    """

    # --- transformer encoder: 1 layer, 1 head, 128-dim embeddings, 256-dim FFN ---
    encoder_layers: int = field(
        metadata={"help": "num encoder layers in the transformer"},
        default=1,
    )
    encoder_embed_dim: int = field(
        metadata={"help": "encoder embedding dimension"},
        default=128,
    )
    encoder_ffn_embed_dim: int = field(
        metadata={"help": "encoder embedding dimension for FFN"},
        default=256,
    )
    encoder_attention_heads: int = field(
        metadata={"help": "num encoder attention heads"},
        default=1,
    )

    # --- convolutional feature extractor ---
    # Stored as a string containing a Python list expression of
    # (dim, kernel_size, stride) tuples, as described by the help text.
    conv_feature_layers: str = field(
        metadata={
            "help": "string describing convolutional feature extraction layers in form of a python list that contains "
            "[(dim, kernel_size, stride), ...]"
        },
        default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
    )

In [8]:
# Instantiate the wav2vec 2.0 model from the reduced config's defaults.
small_cfg = Wav2Vec2ConfigSmall()
model = Wav2Vec2Model(cfg=small_cfg)



In [11]:
# Total number of scalar entries across all tensors yielded by model.parameters().
per_tensor_sizes = [weights.numel() for weights in model.parameters()]
sum(per_tensor_sizes)

4613504