### Step 1: Mount the Google Drive

Remember to use GPU runtime before mounting your Google Drive. (Runtime --> Change runtime type).

### Step 2: Open the project directory

Replace `Your_Dir` with your own path.

### Step 3: Install required packages

After installing them, Colab will require you to restart the session.

### Step 4: Start your experiments!

- Remember to download and copy the dataset to this directory: `Your_Dir/emg2qwerty/data`.
- You may now start your experiments with any scripts! Below are examples of single-user training and testing (greedy decoding).
- **There are two ways to track the logs:**
  - 1. Keep `--multirun`, and the logs will not be printed here, but they will be saved in the folder `logs`, e.g., `logs/2025-02-09/18-24-15/submitit_logs/`.
  - 2. Comment out `--multirun` and the logs will be printed in this notebook, but they will not be saved.

#### Training

- The checkpoints are saved in the folder `logs`, e.g., `logs/2025-02-09/18-24-15/checkpoints/`.

In [1]:
import torch
print(torch.__version__)  # PyTorch version
print(torch.version.cuda)  # CUDA version
print(torch.backends.cudnn.version())  # cuDNN version

2.3.0
12.1
8902


In [4]:
import torch.nn as nn
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)

In [5]:
out.shape

torch.Size([10, 32, 512])

In [11]:
class TDSConv2dBlock(nn.Module):
    """A 2D temporal convolution block as per "Sequence-to-Sequence Speech
    Recognition with Time-Depth Separable Convolutions, Hannun et al"
    (https://arxiv.org/abs/1904.02619).

    Args:
        channels (int): Number of input and output channels. For an input of
            shape (T, N, num_features), the invariant we want is
            channels * width = num_features.
        width (int): Input width. For an input of shape (T, N, num_features),
            the invariant we want is channels * width = num_features.
        kernel_width (int): The kernel size of the temporal convolution.
    """

    def __init__(self, channels: int, width: int, kernel_width: int) -> None:
        super().__init__()
        self.channels = channels
        self.width = width

        self.conv2d = nn.Conv2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=(1, kernel_width),
        )
        self.relu = nn.ReLU()
        self.layer_norm = nn.LayerNorm(channels * width)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        T_in, N, C = inputs.shape  # TNC

        # TNC -> NCT -> NcwT
        x = inputs.movedim(0, -1).reshape(N, self.channels, self.width, T_in)
        print(x.shape)
        x = self.conv2d(x)
        print(x.shape)
        x = self.relu(x)
        x = x.reshape(N, C, -1).movedim(-1, 0)  # NcwT -> NCT -> TNC

        # Skip connection after downsampling
        T_out = x.shape[0]
        x = x + inputs[-T_out:]

        # Layer norm over C
        return self.layer_norm(x)  # TNC

In [12]:
T_in = 256 
N = 32
B = 2
C = 16


In [13]:
data = torch.rand(T_in, N, C * B)

In [16]:
block = TDSConv2dBlock(
    channels=C, width=B, kernel_width=4
)

In [19]:
print("Original:", data.shape)
result = block(data)
print("After TDSConv2dBlock", result.shape)

Original: torch.Size([256, 32, 32])
torch.Size([32, 16, 2, 256])
torch.Size([32, 16, 2, 253])
After TDSConv2dBlock torch.Size([253, 32, 32])


In [21]:
class TDSAttnBlock(nn.Module):
    def __init__(self, channels: int, width: int, num_heads: int, num_layers: int) -> None:
        super().__init__()
        self.channels = channels
        self.width = width
        self.C = self.channels * self.width

        encoder_layer = nn.TransformerEncoderLayer(d_model=self.C, nhead=num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        # T_in, N, C = inputs.shape  # TNC

        # TNC -> NTC
        x = inputs.permute(1, 0, 2)
        x = self.encoder(x)
        # NCT -> TNC
        x = x.permute(1, 0, 2) 
        return x

In [20]:
data.shape

torch.Size([256, 32, 32])

In [22]:
attn_block = TDSAttnBlock(
    channels=C, width=B, num_heads=4, num_layers=2
)

In [23]:
attn_block(data).shape

torch.Size([256, 32, 32])