## Imports

In [1]:
import torch

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dataset import FelixLRS2Dataset
from felix_lipnet import FelixLipNet
from sklearn.preprocessing import LabelEncoder

## Dataset and Dataloader

In [2]:
dataset = FelixLRS2Dataset(alignment_file = '../data/lrs2_v1/mvlrs_v1/dummy_test.txt',
                          root_dir='../data/lrs2_v1/mvlrs_v1/main/')

In [3]:
batch_size = 1
dummy_loader = DataLoader(dataset,
                       batch_size=batch_size,
                       shuffle=False)
                       #num_workers=12)

In [4]:
iter_ = iter(dummy_loader)

In [5]:
data = next(iter_)
print(data['target_lengths'])

tensor([27])


In [6]:
data['alignments']

tensor([[14, 27, 17,  ...,  0,  0,  0]], dtype=torch.int32)

In [7]:
data['frames'].shape

torch.Size([1, 300, 96, 96])

In [8]:
# Report split sizes
print('Training set has {} instances'.format(len(dataset)))

Training set has 529 instances


In [9]:
# Create our vocab list
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!0123456789 "]
char_to_num = LabelEncoder()
char_to_num.fit(vocab)

In [10]:
len(char_to_num.classes_)

40

In [11]:
x_ = char_to_num.inverse_transform(data['alignments'][0])

In [12]:
for i in x_:
    print(i)

a
n
d
 
f
o
r
 
m
e
 
t
h
e
 
s
u
r
p
r
i
s
e
 
w
a
s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 


In [13]:
blank_label = len(char_to_num.classes_)
blank_label

40

In [14]:
device = 'cuda'

In [15]:
# Create an instance of the PyTorch model
num_classes = len(char_to_num.classes_)+1
num_classes

41

In [16]:
model = FelixLipNet(num_classes)

In [17]:
model.to(device)

FelixLipNet(
  (conv1): Conv3d(1, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (maxpool1): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (maxpool2): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv3d(256, 300, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (maxpool3): MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=2, end_dim=-1)
  (lstm1): LSTM(43200, 128, bidirectional=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (lstm2): LSTM(256, 128, bidirectional=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=41, bias=True)
  (log_softmax): LogSoftmax(dim=1)
  (relu): ReLU()
)

In [18]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
loss_fn = nn.CTCLoss(blank=blank_label, reduction='mean', zero_infinity=True)

In [20]:
frames = data['frames'].to(device)
alignments = data['alignments'].to(device)
target_lengths = data['target_lengths'].to(device)

In [21]:
print(frames.shape)
print(alignments.shape)
print(target_lengths)

torch.Size([1, 300, 96, 96])
torch.Size([1, 10000])
tensor([27], device='cuda:0')


In [22]:
alignments

tensor([[14, 27, 17,  ...,  0,  0,  0]], device='cuda:0', dtype=torch.int32)

In [23]:
batch_size = frames.shape[0]
batch_size

1

In [24]:
frames.shape

torch.Size([1, 300, 96, 96])

In [25]:
frames = frames.unsqueeze(dim=1)
frames.shape

torch.Size([1, 1, 300, 96, 96])

In [26]:
optimizer.zero_grad()

In [27]:
y_pred = model(frames.cuda())

RuntimeError: input.size(-1) must be equal to input_size. Expected 144, got 42624

In [None]:
# Zero your gradients for every batch!
optimizer.zero_grad()

# Make predictions for this batch
outputs = model(frames.unsqueeze(dim=4))

input_lengths = torch.full((len(target_lengths),), 10000, dtype=torch.long)

# Compute the loss and its gradients
loss = loss_fn(outputs.permute(1,0,2), alignments, input_lengths, target_lengths)
loss.backward()

# Adjust learning weights
optimizer.step()

## Training

In [None]:
input_lengths = torch.full((batch_size,), 10000, dtype=torch.long)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(dummy_loader):
        # Every data instance is an input + label pair
        frames = data['frames'].to(device)
        alignments = data['alignments'].to(device)
        target_lengths = data['target_lengths'].to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(frames.unsqueeze(dim=4))

        input_lengths = torch.full((len(target_lengths),), 10000, dtype=torch.long)
        
        # Compute the loss and its gradients
        loss = loss_fn(outputs.permute(1,0,2), alignments, input_lengths, target_lengths)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 10 == 9:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(dummy_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

## Train!

In [None]:
# Create our vocab list
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!0123456789 "]
char_to_num = LabelEncoder()
char_to_num.fit(vocab)

In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 1

best_vloss = 1_000_000.

torch.autograd.set_detect_anomaly(True)
for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    avg_vloss = 0
    
#     # Disable gradient computation and reduce memory consumption.
#     with torch.no_grad():
#         for i, vdata in enumerate(validation_loader):
#             vinputs, vlabels = vdata
#             voutputs = model(vinputs)
#             vloss = loss_fn(voutputs, vlabels)
#             running_vloss += vloss

#     avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

#     # Track best performance, and save the model's state
#     if avg_vloss < best_vloss:
#         best_vloss = avg_vloss
#         model_path = 'model_{}_{}'.format(timestamp, epoch_number)
#         torch.save(model.state_dict(), model_path)

    epoch_number += 1

In [None]:
for i, data in enumerate(dummy_loader):
    # Every data instance is an input + label pair
    frames = data['frames'].to(device)
    alignments = data['alignments'].to(device)
    target_lengths = data['target_lengths'].to(device)

    # Zero your gradients for every batch!
    optimizer.zero_grad()

    # Make predictions for this batch
    outputs = model(frames.unsqueeze(dim=4))

    input_lengths = torch.full((len(target_lengths),), 10000, dtype=torch.long)

    # Compute the loss and its gradients
    loss = loss_fn(outputs.permute(1,0,2), alignments, input_lengths, target_lengths)
    loss.backward()

    # Adjust learning weights
    optimizer.step()

    # Gather data and report
    print(loss.item())

    break

In [None]:
outputs.permute(1,0,2).shape

In [None]:
alignments.shape

In [None]:
input_lengths

In [None]:
target_lengths

In [None]:
loss = loss_fn(outputs.permute(1,0,2), alignments, input_lengths, target_lengths)
loss.item()