In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch.nn.functional as torch_functional
import torch.nn as nn
import torch

from sonosco.models.seq2seq_tds import TDSEncoder, TDSDecoder
from sonosco.training import Experiment, ModelTrainer
from sonosco.common.path_utils import parse_yaml
from sonosco.datasets import create_data_loaders

In [3]:
channels = [2, 3, 4]
kernel_sizes = [21, 21, 21]

encoder = TDSEncoder(
    input_dim=80,
    in_channel=1,
    channels=channels,
    kernel_sizes=kernel_sizes,
    dropout=0.2,
    bottleneck_dim=1024)

In [12]:
decoder = TDSDecoder()

In [18]:
import numpy as np

batch_dim = 8
time_dim = 80
input_dim = 80
output_time_dim = 20
vocab_dim = 600

xs = np.random.rand(batch_dim, time_dim, input_dim)
xlens = np.random.randint(low=time_dim, high=time_dim + 1, size=batch_dim)
y_labels = np.random.randint(low=1, high=vocab_dim, size=(batch_dim, output_time_dim, vocab_dim))

In [19]:
outs, outs_lens = encoder(torch.from_numpy(xs).float(), torch.from_numpy(xlens))
outs.shape

torch.Size([8, 10, 1024])

In [20]:
outs_lens

tensor([10, 10, 10, 10, 10, 10, 10, 10])

In [22]:
decoder(outs, outs_lens, torch.from_numpy(y_labels))

RuntimeError: input must have 2 dimensions, got 3

In [31]:
class View(nn.Module):
    def __init__(self, shape):
        super(View, self).__init__()
        self.shape = shape
        
    def forward(self, input):
        return input.view(self.shape)

class Reorder(nn.Module):
    def __init__(self, shape):
        super(Reorder, self).__init__()
        self.shape = shape
        
    def forward(self, input):
        return input.permute(self.shape)

In [32]:
class TDSBlock(nn.Module):
    
    def __init__(self, c: int, kw: int, h: int, dropout: float)-> None:
        super().__init__()
        l:int = c * h
        self.conv = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(21, 1), stride=(2, 1), padding=(20, 5)), #calc padding
            nn.ReLU(),
            nn.Dropout2d(0.2),
            nn.LayerNorm(3),
            View((-1,l,1,0)),
            Reorder((1,0,2,3)),
            nn.Linear(l,l),
            nn.ReLU(),
            nn.Dropout2d(dropout),
            nn.Linear(l,l),
            Reorder((1,0,2,3)),
            View((-1,h,c,0)),
            nn.Dropout2d(dropout),
            nn.LayerNorm(3)
        )
        
    def forward(self, input):
        return self.conv(input)

In [33]:
b = TDSBlock(1,2,3,0.5)


In [34]:
Experiment.create("CV_STS")
config = parse_yaml("./config.yaml")["train"]

In [35]:
train_loader, val_loader = create_data_loaders(**config)


In [46]:
def custom_loss(batch, model):
    batch_x, batch_y, input_lengths, target_lengths = batch
    model_output, output_lengths = model(batch_x)
    loss = torch_functional.ctc_loss(model_output.transpose(0, 1), batch_y, output_lengths, target_lengths)
    return loss, model_output

In [47]:
class ConvSeqNet(nn.Module):
    def __init__(self):
        super(ConvSeqNet, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=(21, 1), stride=(2, 1), padding=(20, 5)), #calc padding
            nn.ReLU(),
            nn.Dropout2d(0.2),
            nn.LayerNorm(3),
            TDSBlock(10,21,82,0.2)
        #     TDSBlock(10,21,82,0.2),
        #     nn.Conv2d(10, 14, kernel_size=(21, 1), stride=(2, 1), padding=(20, 5)), #calc padding
        #     nn.ReLU(),
        #     nn.Dropout2d(0.2),
        #     nn.LayerNorm(3),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     nn.Conv2d(14, 18, kernel_size=(21, 1), stride=(2, 1), padding=(20, 5)), #calc padding
        #     nn.ReLU(),
        #     nn.Dropout2d(0.2),
        #     nn.LayerNorm(3),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     TDSBlock(10,21,82,0.2),
        #     View((-1,1440,1,0)),
        #     Reorder((1,0,3,2)),
        #     nn.Linear(1440,1024),
        )
    def forward(self, input):
        return self.net(input)

In [48]:
model = ConvSeqNet()

trainer = ModelTrainer(model, loss=custom_loss, epochs=config["max_epochs"],
                           train_data_loader=train_loader, val_data_loader=val_loader,
                           lr=config["learning_rate"], custom_model_eval=True)

In [49]:
trainer.start_training()

RuntimeError: Given normalized_shape=[3], expected input with shape [*, 3], but got input of size[32, 10, 51, 213]