In [1]:
import os
import time
import math
from datasetloadingpy import dataloaders
import copy
import torch.optim as optim
from torch.optim import lr_scheduler

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

from with_other_resnet import TrueResNet, BasicBlock
from super_simple_pytorch_nn import CustomResNet


In [2]:
# Use the first CUDA GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [9]:
def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best-scoring weights.

    Each epoch runs one pass over ``dataloaders['train']`` (with gradient
    updates) followed by one pass over ``dataloaders['test']`` (no gradients).
    The weights from the epoch with the highest 'test' accuracy are kept and
    loaded back into ``model`` before returning.

    Args:
        model: module called as ``model(inputs.unsqueeze(1), lengths=lengths)``.
            Batches are moved to the device its parameters live on.
        dataloaders: mapping with 'train' and 'test' entries. Each entry is
            iterated for ``(inputs, lengths, labels)`` batches and must expose
            a sized ``.dataset``. (Inside this function the name shadows the
            ``dataloaders`` factory imported at the top of the notebook.)
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch, after the train pass.
        num_epochs: number of train/test epochs to run.

    Returns:
        The same ``model`` object, with the best 'test'-accuracy weights loaded.
    """
    since = time.time()

    # Feed batches to wherever the model's weights already are. The previous
    # hard-coded ``.cuda()`` on the inputs crashed on CPU-only machines.
    model_device = next(model.parameters()).device

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and an evaluation phase
        for phase in ['train', 'test']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            for inputs, lengths, labels in dataloaders[phase]:
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # track history only while training
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs.unsqueeze(1), lengths=lengths)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # criterion's value is scaled by the batch size so that the
                # epoch figure below is a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                # Keep a plain Python count so the accuracy maths below also
                # works when a loader yields no batches.
                running_corrects += torch.sum(preds == labels).item()

            if is_training:
                scheduler.step()

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Snapshot the weights whenever the 'test' accuracy improves.
            # NOTE(review): model selection uses the 'test' split, so the
            # reported best accuracy is not an unbiased held-out estimate.
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [10]:
def _average_batch(x, lengths):
    return torch.stack( [torch.mean( x[index][:,0:i], 1 ) for index, i in enumerate(lengths)],0 )

In [11]:
def threeD_to_2D_tensor(x):
    """Fold the third axis of a 5-D tensor into the batch axis.

    Maps ``(n_batch, n_channels, s_time, sx, sy)`` to
    ``(n_batch * s_time, n_channels, sx, sy)``; the ``s_time`` slices of one
    batch element end up in consecutive rows of the result.
    """
    batch, channels, frames, height, width = x.shape
    # Bring the time axis next to the batch axis before merging the two.
    time_major = x.permute(0, 2, 1, 3, 4)
    return time_major.reshape(batch * frames, channels, height, width)


def _3d_block(in_size, out_size, kernel_size, stride, padding, bias=False, relu_type='prelu'):
    return nn.Sequential(
        nn.Conv3d(in_size, out_size, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias),
        nn.BatchNorm3d(out_size),
        nn.PReLU(num_parameters=out_size) if relu_type== 'prelu' else nn.ReLU()
    )


In [18]:
class LipreadLSTMv1(nn.Module):
    """Sequence classifier: 3D-conv stem -> per-frame ResNet -> BiLSTM -> linear.

    Args:
        num_classes: number of output logits produced by the final linear layer.
        pretrained: when True, torchvision ResNet-18 ImageNet weights are copied
            into the trunk (non-strict match) and every trunk parameter is
            frozen (``requires_grad = False``).
        relu_type: forwarded to both the 3D stem and the ResNet trunk; the stem
            uses PReLU for ``'prelu'`` and ReLU for any other value.
    """

    def __init__(self, num_classes, pretrained=False, relu_type='prelu'):
        super(LipreadLSTMv1, self).__init__()
        self.num_classes = num_classes
        self.frontend_out = 64
        self.backend_out = 512
        lstm_hidden = 256

        # Stem: temporal kernel 5 / stride 1 / padding 2 keeps the frame count,
        # spatial kernel 7 / stride 2 / padding 3 halves height and width.
        # relu_type is now forwarded here too, so the stem's activation agrees
        # with the one requested for the ResNet trunk.
        self.front3D = _3d_block(
            1, self.frontend_out,
            kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3),
            bias=False, relu_type=relu_type,
        )
        self.max_pool_3D = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
        self.resnet = TrueResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
        self.lstm = nn.LSTM(
            input_size=self.backend_out,
            hidden_size=lstm_hidden,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )
        # A bidirectional LSTM concatenates both directions: 2 * hidden features.
        self.fc2 = nn.Linear(2 * lstm_hidden, num_classes)

        self._initialize_weights_randomly()

        if pretrained:
            pretrained_model = models.resnet18(pretrained=True)
            # strict=False: keys present on only one side are skipped silently.
            # NOTE(review): trunk parameters with no torchvision counterpart
            # keep their random init and are frozen below as well - confirm
            # that this is intended.
            self.resnet.load_state_dict(pretrained_model.state_dict(), strict=False)
            for param in self.resnet.parameters():
                param.requires_grad = False

    def forward(self, x, lengths):
        """Return one row of ``num_classes`` logits per sample.

        Args:
            x: 5-D tensor unpacked as (batch, channels, frames, height, width);
                the stem is built for a single input channel.
            lengths: per-sample count of leading frames to average over after
                the LSTM.
        """
        # Unpacking all five dims keeps the requirement that x is exactly 5-D.
        batch_size, _channels, _frames, _height, _width = x.size()

        x = self.front3D(x)
        x = self.max_pool_3D(x)
        n_frames = x.shape[2]

        # Fold time into the batch axis so the 2-D trunk sees one frame per row.
        x = threeD_to_2D_tensor(x)
        x = self.resnet(x)
        # Presumably (batch * frames, 512) here - TODO confirm against TrueResNet.
        x = x.view(batch_size, n_frames, x.size(1))

        x, _ = self.lstm(x)
        # (batch, frames, features) -> (batch, features, frames) for pooling.
        x = x.transpose(1, 2)
        x = _average_batch(x, lengths)
        return self.fc2(x)

    def _initialize_weights_randomly(self):
        """Re-initialise conv, batch-norm and linear layers of every submodule.

        Conv and linear weights are drawn from N(0, sqrt(2 / n)), where n is
        ``prod(kernel_size) * out_channels`` for convolutions and the number of
        input features for linear layers. Conv biases are zeroed; batch-norm
        layers get weight 1 and bias 0. Linear biases are left untouched.
        """
        def he_std(n):
            return math.sqrt(2.0 / float(n))

        conv_types = (nn.Conv3d, nn.Conv2d, nn.Conv1d)
        norm_types = (nn.BatchNorm3d, nn.BatchNorm2d, nn.BatchNorm1d)

        for m in self.modules():
            if isinstance(m, conv_types):
                n = np.prod(m.kernel_size) * m.out_channels
                nn.init.normal_(m.weight, 0.0, he_std(n))
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

            elif isinstance(m, norm_types):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

            elif isinstance(m, nn.Linear):
                n = m.weight[0].nelement()
                nn.init.normal_(m.weight, 0.0, he_std(n))

In [19]:
# Dataset root directory. Set the LRW_DATA_DIR environment variable to point
# the notebook at another location; the original path remains the default.
data_path = os.environ.get('LRW_DATA_DIR', '/home/taylorpap/Bootcamp/CroppedLRW')

# Words used as class labels for this experiment.
temp_words_list = ['ABSOLUTELY', 'BUDGET', 'EVERYONE', 'HOUSE', 'MILITARY', 'PUBLIC', 'RESULT', 'SIGNIFICANT',
                   'WEATHER']
# Single-word list; not referenced by the loader call below.
new_temp_words_list = ['BUDGET']

# Indexed by partition name ('train' / 'test') further down the notebook.
datasets = dataloaders(data_dir=data_path, label_fp=temp_words_list, batch_size=32, workers=8)

Partition train loaded
Partition test loaded


In [20]:
# Pull one batch from the training loader to sanity-check the tensor shapes.
first_batch = next(iter(datasets['train']))
train_features, train_lengths, train_labels = first_batch
print(f"Feature batch shape: {train_features.shape}")
print(f"Labels batch shape: {train_labels.shape}")

Feature batch shape: torch.Size([32, 29, 88, 88])
Labels batch shape: torch.Size([32])


In [21]:
model_ft = LipreadLSTMv1(len(temp_words_list), pretrained=True)
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# pretrained=True freezes the ResNet trunk (requires_grad=False), so not every
# parameter is trained: hand the optimizer only the ones that still need
# gradients.
trainable_params = [p for p in model_ft.parameters() if p.requires_grad]
optimizer_ft = optim.SGD(trainable_params, lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [22]:
# Show the module tree of the assembled model as this cell's output.
model_ft

LipreadLSTMv1(
  (front3D): Sequential(
    (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): PReLU(num_parameters=64)
  )
  (max_pool_3D): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
  (resnet): TrueResNet(
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): PReLU(num_parameters=64)
        (relu2): PReLU(num_parameters=64)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), st

In [23]:
# Train for 25 epochs. train_model returns the model with its best-scoring
# weights loaded, so model_ft is rebound to that result.
model_ft = train_model(
    model=model_ft,
    dataloaders=datasets,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=25,
)

Epoch 0/24
----------
train Loss: 2.1761 Acc: 0.1556
test Loss: 2.1539 Acc: 0.1711

Epoch 1/24
----------
train Loss: 2.1383 Acc: 0.1798
test Loss: 2.1315 Acc: 0.1689

Epoch 2/24
----------
train Loss: 2.1065 Acc: 0.1973
test Loss: 2.0901 Acc: 0.2089

Epoch 3/24
----------
train Loss: 2.0861 Acc: 0.2139
test Loss: 2.0898 Acc: 0.2044

Epoch 4/24
----------
train Loss: 2.0662 Acc: 0.2193
test Loss: 2.1184 Acc: 0.2156

Epoch 5/24
----------
train Loss: 2.0517 Acc: 0.2357
test Loss: 2.0722 Acc: 0.2000

Epoch 6/24
----------
train Loss: 2.0235 Acc: 0.2508
test Loss: 2.0993 Acc: 0.2067

Epoch 7/24
----------
train Loss: 2.0037 Acc: 0.2644
test Loss: 2.0217 Acc: 0.2378

Epoch 8/24
----------
train Loss: 1.9895 Acc: 0.2684
test Loss: 2.0112 Acc: 0.2489

Epoch 9/24
----------
train Loss: 1.9852 Acc: 0.2688
test Loss: 2.0110 Acc: 0.2511

Epoch 10/24
----------
train Loss: 1.9808 Acc: 0.2702
test Loss: 2.0084 Acc: 0.2600

Epoch 11/24
----------
train Loss: 1.9730 Acc: 0.2752
test Loss: 2.0074 Acc