In [2]:
import os
import time
import math
from datasetloadingpy import dataloaders
import copy
import torch.optim as optim
from torch.optim import lr_scheduler

import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

from with_other_resnet import TrueResNet, BasicBlock
from super_simple_pytorch_nn import CustomResNet


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            #for batch_idx, (inputs, lengths, labels) in enumerate(dataloaders[phase]):
            for inputs, lengths, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs.unsqueeze(1).cuda(), lengths=lengths)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [4]:
def _average_batch(x, lengths):
    return torch.stack( [torch.mean( x[index][:,0:i], 1 ) for index, i in enumerate(lengths)],0 )

In [5]:
def threeD_to_2D_tensor(x):
    n_batch, n_channels, s_time, sx, sy = x.shape
    x = x.transpose(1, 2)
    return x.reshape(n_batch*s_time, n_channels, sx, sy)


def _3d_block(in_size, out_size, kernel_size, stride, padding, bias=False, relu_type='prelu'):
    return nn.Sequential(
        nn.Conv3d(in_size, out_size, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias),
        nn.BatchNorm3d(out_size),
        nn.PReLU(num_parameters=out_size) if relu_type== 'prelu' else nn.ReLU()
    )


In [6]:
class LipreadLSTMv1(nn.Module):
    def __init__(self, num_classes, pretrained_res=False, relu_type='prelu'):
        super(LipreadLSTMv1, self).__init__()
        self.num_classes = num_classes
        self.frontend_out = 64
        self.backend_out = 512

        self.front3D = _3d_block(1, self.frontend_out, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
        self.max_pool_3D = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))
        self.resnet = TrueResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
        self.lstm = nn.LSTM(
            input_size=self.backend_out,
            hidden_size=256,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
            dropout=0.1
        )
        self.conv1 = nn.Conv1d(512, 256, kernel_size=3)
        self.batchnorm1 = nn.BatchNorm1d(256)
        if relu_type == 'relu':
            self.relu1 = nn.ReLU()
        elif relu_type == 'prelu':
            self.relu1 = nn.PReLU(num_parameters=256)
        self.fc2 = nn.Linear(256, num_classes)

        self._initialize_weights_randomly()

        if pretrained_res:
            pretrained_model = models.resnet18(pretrained=True)
            #pretrained_model.fc = nn.Linear(pretrained_model.fc.in_features, num_classes)
            self.resnet.load_state_dict(pretrained_model.state_dict(), strict=False)
            for param in self.resnet.parameters():
                param.requires_grad = False


    def forward(self, x, lengths):
        B, C, T, H, W = x.size()
        #print("Initial Shape: " + str(x.shape))
        x = self.front3D(x)
        #print("3D Out Shape: " + str(x.shape))
        x = self.max_pool_3D(x)
        #print("3D Max Pool Shape: " + str(x.shape))
        Tnew = x.shape[2]
        #transposed = x.transpose(1, 2).contiguous()
        #x = transposed.view(-1, 64, x.size(3), x.size(4))
        x = threeD_to_2D_tensor(x)
        #print("ResNet In Shape: " + str(x.shape))
        x = self.resnet(x)
        #print("ResNet Out Shape: " + str(x.shape))
        x = x.view(B, Tnew, x.size(1))
        #x = x.transpose(1, 2)
        #print("DataTransform Out Shape: " + str(x.shape))

        #x = self.fc(x)
        #x = self.bn2(x)
        #print("First Batch Norm OutShape: " + str(x.shape))
        #x = x.view(x.shape[0], -1, self.num_classes)
        x, _ = self.lstm(x)
        #print("LSTM Out Shape: " + str(x.shape))
        x = x.transpose(1, 2)
        #print("Transposed Shape: {}".format(str(x.shape)))
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.relu1(x)
        #print("Avg Batch InShape: " + str(x.shape))
        x = _average_batch(x, lengths)
        #print("Avg Batch OutShape: " + str(x.shape))
        x = self.fc2(x)
        #print("Last Liner Out Shape: " + str(x.shape))
        return x


    def _initialize_weights_randomly(self):

        use_sqrt = True

        if use_sqrt:
            def f(n):
                return math.sqrt( 2.0/float(n) )
        else:
            def f(n):
                return 2.0/float(n)

        for m in self.modules():
            if isinstance(m, nn.Conv3d) or isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d):
                n = np.prod( m.kernel_size ) * m.out_channels
                m.weight.data.normal_(0, f(n))
                if m.bias is not None:
                    m.bias.data.zero_()

            elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

            elif isinstance(m, nn.Linear):
                n = float(m.weight.data[0].nelement())
                m.weight.data = m.weight.data.normal_(0, f(n))

In [7]:
data_path = '/home/taylorpap/Bootcamp/CroppedLRW'
temp_words_list = ['ABSOLUTELY', 'BUDGET', 'EVERYONE', 'HOUSE', 'MILITARY', 'PUBLIC', 'RESULT', 'SIGNIFICANT',
                   'WEATHER']
datasets = dataloaders(data_dir=data_path, label_fp=temp_words_list, batch_size=32, workers=8)

Partition train loaded
Partition test loaded


In [8]:
train_features, train_lengths, train_labels = next(iter(datasets['train']))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([32, 29, 88, 88])
Labels batch shape: torch.Size([32])


In [9]:
model_ft = LipreadLSTMv1(len(temp_words_list), pretrained_res=True)


model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)



In [10]:
model_ft

LipreadLSTMv1(
  (front3D): Sequential(
    (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): PReLU(num_parameters=64)
  )
  (max_pool_3D): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
  (resnet): TrueResNet(
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): PReLU(num_parameters=64)
        (relu2): PReLU(num_parameters=64)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), st

In [11]:
model_ft = train_model(model_ft, datasets, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=1)

Epoch 0/0
----------
Initial Shape: torch.Size([32, 1, 29, 88, 88])
3D Out Shape: torch.Size([32, 64, 29, 44, 44])
3D Max Pool Shape: torch.Size([32, 64, 29, 22, 22])
ResNet In Shape: torch.Size([928, 64, 22, 22])
ResNet Out Shape: torch.Size([928, 512])
DataTransform Out Shape: torch.Size([32, 29, 512])
LSTM Out Shape: torch.Size([32, 29, 512])
Transposed Shape: torch.Size([32, 512, 29])
Avg Batch InShape: torch.Size([32, 256, 27])
Avg Batch OutShape: torch.Size([32, 256])
Last Liner Out Shape: torch.Size([32, 9])
Initial Shape: torch.Size([32, 1, 29, 88, 88])
3D Out Shape: torch.Size([32, 64, 29, 44, 44])
3D Max Pool Shape: torch.Size([32, 64, 29, 22, 22])
ResNet In Shape: torch.Size([928, 64, 22, 22])
ResNet Out Shape: torch.Size([928, 512])
DataTransform Out Shape: torch.Size([32, 29, 512])
LSTM Out Shape: torch.Size([32, 29, 512])
Transposed Shape: torch.Size([32, 512, 29])
Avg Batch InShape: torch.Size([32, 256, 27])
Avg Batch OutShape: torch.Size([32, 256])
Last Liner Out Shape:

In [13]:
torch.save(model_ft.state_dict(), '/home/taylorpap/Bootcamp/lipreadlstmv1_2.pth') #60% Acc on Test Set for 9 Words

In [14]:
model_ft_2 = LipreadLSTMv1(len(temp_words_list), pretrained=False)

model_ft_2.load_state_dict(torch.load('/home/taylorpap/Bootcamp/lipreadlstmv1_2.pth'))

model_ft_2 = model_ft_2.to(device)

model_ft_2 = train_model(model_ft_2, datasets, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=45)

Epoch 0/44
----------
train Loss: 0.9723 Acc: 0.6770
test Loss: 1.1632 Acc: 0.5956

Epoch 1/44
----------
train Loss: 0.9637 Acc: 0.6815
test Loss: 1.1648 Acc: 0.5978

Epoch 2/44
----------
train Loss: 0.9644 Acc: 0.6851
test Loss: 1.1616 Acc: 0.5978

Epoch 3/44
----------
train Loss: 0.9618 Acc: 0.6858
test Loss: 1.1647 Acc: 0.6022

Epoch 4/44
----------
train Loss: 0.9551 Acc: 0.6821
test Loss: 1.1589 Acc: 0.6044

Epoch 5/44
----------
train Loss: 0.9715 Acc: 0.6807
test Loss: 1.1636 Acc: 0.6000

Epoch 6/44
----------
train Loss: 0.9621 Acc: 0.6815
test Loss: 1.1649 Acc: 0.5978

Epoch 7/44
----------
train Loss: 0.9687 Acc: 0.6780
test Loss: 1.1611 Acc: 0.6000

Epoch 8/44
----------
train Loss: 0.9741 Acc: 0.6772
test Loss: 1.1658 Acc: 0.6022

Epoch 9/44
----------
train Loss: 0.9728 Acc: 0.6804
test Loss: 1.1601 Acc: 0.6044

Epoch 10/44
----------
train Loss: 0.9676 Acc: 0.6851
test Loss: 1.1675 Acc: 0.5956

Epoch 11/44
----------
train Loss: 0.9644 Acc: 0.6831
test Loss: 1.1617 Acc

KeyboardInterrupt: 

In [4]:
checkpoint_path = '/home/taylorpap/Bootcamp/test_model5/ckpt.pth.tar'
checkpoint = torch.load(checkpoint_path)
print(checkpoint['epoch_idx'])

40
