<a href="https://colab.research.google.com/github/PandaBoi/CSLR_with_RL/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Import All Required Modules

In [None]:
import math
import numpy as np
import pandas as pd
from functools import partial
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable
import pdb
from itertools import groupby
!pip install easydict
from easydict import EasyDict as ED
device = "cuda" if torch.cuda.is_available() else "cpu"

#maybe will need for later
# !pip install git+https://github.com/enhuiz/phoenix-datasets
# !pip install xmltodict
# from phoenix_datasets import PhoenixVideoTextDataset
 
# from torch.utils.data import DataLoader



# 3D-ResNet Implementation

In [None]:

def get_inplanes():
    """Return the base channel widths of the four residual stages.

    A fresh list is built on every call, so callers may mutate the result.
    """
    stage_widths = [64, 128, 256, 512]
    return stage_widths


def conv3x3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3x3 ``nn.Conv3d`` with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride handed to the convolution unchanged.

    Returns:
        The freshly constructed ``nn.Conv3d`` module.
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv3d(in_planes, out_planes, **conv_options)


def conv1x1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1x1 ``nn.Conv3d`` (pointwise channel projection).

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Stride handed to the convolution unchanged.

    Returns:
        The freshly constructed ``nn.Conv3d`` module.
    """
    conv_options = dict(kernel_size=1, stride=stride, bias=False)
    return nn.Conv3d(in_planes, out_planes, **conv_options)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3x3 convolutions plus an additive shortcut.

    Args:
        in_planes: Channels of the incoming tensor.
        planes: Channels produced by both convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        downsample: Optional callable applied to the input to produce the
            shortcut branch; when ``None`` the input itself is the shortcut.
    """

    # Output channels are ``planes * expansion``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, downsample=None):
        super().__init__()

        # Only the first convolution carries the stride.
        self.conv1 = conv3x3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1x1 -> 3x3x3 -> 1x1x1 plus an additive shortcut.

    Args:
        in_planes: Channels of the incoming tensor.
        planes: Channels inside the bottleneck; the block outputs
            ``planes * expansion`` channels.
        stride: Stride of the middle 3x3x3 convolution.
        downsample: Optional callable applied to the input to produce the
            shortcut branch; when ``None`` the input itself is the shortcut.
    """

    # The last 1x1x1 convolution widens the output to ``planes * expansion``.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, downsample=None):
        super().__init__()

        widened = planes * self.expansion
        self.conv1 = conv1x1x1(in_planes, planes)
        self.bn1 = nn.BatchNorm3d(planes)
        self.conv2 = conv3x3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm3d(planes)
        self.conv3 = conv1x1x1(planes, widened)
        self.bn3 = nn.BatchNorm3d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the three conv/BN stages, add the shortcut and apply ReLU."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.relu(self.bn2(self.conv2(main)))
        main = self.bn3(self.conv3(main))

        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)


class ResNet(nn.Module):
    """3D ResNet: stem convolution, four residual stages, pooled linear classifier.

    Args:
        block: Residual block class. It must expose an ``expansion`` class
            attribute and accept ``(in_planes, planes, stride, downsample)``.
        layers: Number of blocks in each of the four stages.
        block_inplanes: Base channel width of each of the four stages.
        n_input_channels: Channels of the input tensor.
        conv1_t_size: Temporal kernel size of the stem convolution.
        conv1_t_stride: Temporal stride of the stem convolution.
        no_max_pool: When true, the max-pool after the stem is skipped.
        shortcut_type: ``'A'`` selects parameter-free shortcuts (strided
            average pooling plus zero-padded channels); any other value
            selects a 1x1x1 convolution followed by batch norm.
        widen_factor: Multiplier applied to every entry of ``block_inplanes``.
        n_classes: Output size of the final linear layer.
    """

    def __init__(self,
                 block,
                 layers,
                 block_inplanes,
                 n_input_channels=3,
                 conv1_t_size=7,
                 conv1_t_stride=1,
                 no_max_pool=False,
                 shortcut_type='B',
                 widen_factor=1.0,
                 n_classes=400):
        super().__init__()

        block_inplanes = [int(x * widen_factor) for x in block_inplanes]

        # Running channel count; _make_layer updates it after every stage.
        self.in_planes = block_inplanes[0]
        self.no_max_pool = no_max_pool

        self.conv1 = nn.Conv3d(n_input_channels,
                               self.in_planes,
                               kernel_size=(conv1_t_size, 7, 7),
                               stride=(conv1_t_stride, 2, 2),
                               padding=(conv1_t_size // 2, 3, 3),
                               bias=False)
        self.bn1 = nn.BatchNorm3d(self.in_planes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, block_inplanes[0], layers[0],
                                       shortcut_type)
        self.layer2 = self._make_layer(block,
                                       block_inplanes[1],
                                       layers[1],
                                       shortcut_type,
                                       stride=2)
        self.layer3 = self._make_layer(block,
                                       block_inplanes[2],
                                       layers[2],
                                       shortcut_type,
                                       stride=2)
        self.layer4 = self._make_layer(block,
                                       block_inplanes[3],
                                       layers[3],
                                       shortcut_type,
                                       stride=2)

        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes)

        # Kaiming init for every 3D convolution, identity affine for batch norm.
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _downsample_basic_block(self, x, planes, stride):
        """Parameter-free (type 'A') shortcut.

        Subsamples ``x`` with a kernel-1 average pool of the given stride and
        appends zero-filled channels so the result has ``planes`` channels.
        """
        out = F.avg_pool3d(x, kernel_size=1, stride=stride)
        # Allocate the padding directly on the input's device and dtype so the
        # concatenation works wherever the model lives.
        zero_pads = torch.zeros(out.size(0),
                                planes - out.size(1),
                                out.size(2),
                                out.size(3),
                                out.size(4),
                                dtype=out.dtype,
                                device=out.device)

        # NOTE(review): ``out.data`` detaches the shortcut from autograd, so no
        # gradient flows through this branch. Kept as-is; confirm it is intended.
        out = torch.cat([out.data, zero_pads], dim=1)

        return out

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        """Build one residual stage of ``blocks`` blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and the shortcut adapter; the
        adapter is created when the stride or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.in_planes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(self._downsample_basic_block,
                                     planes=planes * block.expansion,
                                     stride=stride)
            else:
                downsample = nn.Sequential(
                    conv1x1x1(self.in_planes, planes * block.expansion, stride),
                    nn.BatchNorm3d(planes * block.expansion))

        layers = []
        layers.append(
            block(in_planes=self.in_planes,
                  planes=planes,
                  stride=stride,
                  downsample=downsample))
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run stem, the four stages, global average pooling and the classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        if not self.no_max_pool:
            x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        # Flatten everything but the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def generate_model(model_depth, **kwargs):
    """Instantiate a ``ResNet`` of the requested depth.

    Args:
        model_depth: One of 10, 18, 34, 50, 101, 152 or 200.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor.

    Returns:
        The constructed ``ResNet``. Depths below 50 use ``BasicBlock``,
        the others use ``Bottleneck``.

    Raises:
        AssertionError: If ``model_depth`` is not a supported depth.
    """
    assert model_depth in [10, 18, 34, 50, 101, 152, 200]

    # Blocks per stage for every supported depth.
    stage_blocks = {
        10: [1, 1, 1, 1],
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        152: [3, 8, 36, 3],
        200: [3, 24, 36, 3],
    }
    block_cls = BasicBlock if model_depth < 50 else Bottleneck

    return ResNet(block_cls, stage_blocks[model_depth], get_inplanes(), **kwargs)

# Transformer Implementation

`PositionalEncoding` is the sinusoidal position-encoding module introduced in the paper "Attention Is All You Need". For simplicity, however, our implementation uses a single embedding layer for the positions instead.

In [None]:

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    Args:
        d_model: Size of the feature dimension (odd sizes are supported).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions for which the table is precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over dimension 1 of x.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of the first ``x.size(0)`` positions, then apply dropout.

        ``x`` is indexed along dimension 0 by position, i.e. it is expected
        in sequence-first layout with ``d_model`` as the last dimension.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:

class SelfAttention(nn.Module):
    """Multi-head attention with one ``head_dim x head_dim`` projection shared by all heads.

    Args:
        embed_size: Size of the embedding dimension; must be divisible by ``heads``.
        heads: Number of attention heads.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        per_head = self.head_dim
        # The projections act on the last (head_dim) axis, so the same weights
        # are applied to every head.
        self.values = nn.Linear(per_head, per_head, bias=False)
        self.keys = nn.Linear(per_head, per_head, bias=False)
        self.queries = nn.Linear(per_head, per_head, bias=False)
        self.fc_out = nn.Linear(heads * per_head, embed_size)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        Args:
            values: Tensor reshaped to ``(N, value_len, heads, head_dim)``.
            keys: Tensor reshaped to ``(N, key_len, heads, head_dim)``.
            query: Tensor reshaped to ``(N, query_len, heads, head_dim)``.
            mask: Optional tensor broadcastable to
                ``(N, heads, query_len, key_len)``; positions where it equals
                0 receive a score of -1e20 before the softmax.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        batch = query.shape[0]
        query_len = query.shape[1]
        heads, head_dim = self.heads, self.head_dim

        def split_heads(tensor):
            # (N, length, embed) -> (N, length, heads, head_dim)
            return tensor.reshape(batch, tensor.shape[1], heads, head_dim)

        v_proj = self.values(split_heads(values))
        k_proj = self.keys(split_heads(keys))
        q_proj = self.queries(split_heads(query))

        # Dot product of every query with every key, per example and head:
        # (N, heads, query_len, key_len).
        scores = torch.einsum("nqhd,nkhd->nhqk", [q_proj, k_proj])

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Scale by sqrt(embed_size) and normalise over the key axis.
        weights = torch.softmax(scores / (self.embed_size ** (1 / 2)), dim=3)

        # Weighted sum of values, (N, query_len, heads, head_dim), with the
        # two trailing axes then flattened back into the embedding axis.
        context = torch.einsum("nhql,nlhd->nqhd", [weights, v_proj])
        merged = context.reshape(batch, query_len, heads * head_dim)

        return self.fc_out(merged)


class TransformerBlock(nn.Module):
    """Attention sub-layer plus feed-forward sub-layer, each with residual, LayerNorm and dropout.

    Args:
        embed_size: Size of the embedding dimension.
        heads: Number of attention heads.
        dropout: Dropout probability used after each normalisation.
        forward_expansion: Hidden width of the feed-forward network as a
            multiple of ``embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Return the block output for ``query`` attending over ``key``/``value``."""
        attended = self.attention(value, key, query, mask)

        # Residual around attention, then LayerNorm, then dropout.
        hidden = self.dropout(self.norm1(attended + query))

        # Same residual / norm / dropout pattern around the feed-forward net.
        projected = self.feed_forward(hidden)
        return self.dropout(self.norm2(projected + hidden))


class Encoder(nn.Module):
    """Transformer encoder over a sequence of scalar inputs.

    Each scalar of the input is projected to ``embed_size`` by a
    ``Linear(1, embed_size)`` layer (``hacky``), a learned position embedding
    is added, and the result passes through ``num_layers`` transformer blocks.

    Args:
        src_vocab_size: Size of ``word_embedding`` (constructed but not used
            by ``forward``).
        embed_size: Size of the embedding dimension.
        num_layers: Number of ``TransformerBlock`` layers.
        heads: Number of attention heads per block.
        device: Stored on the instance as ``self.device``; ``forward`` does
            not consult it.
        forward_expansion: Feed-forward width multiplier of each block.
        dropout: Dropout probability.
        max_length: Number of positions in the position embedding table.
    """

    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size,)
        self.hacky = nn.Linear(1,embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def hacky_embedding(self, x):
        """Project every scalar of ``x`` (N, seq) to a vector: (N, seq, embed_size).

        One batched ``Linear`` call over a trailing feature axis replaces a
        per-time-step Python loop.
        """
        return self.hacky(x.reshape(x.shape[0], x.shape[1], -1))

    def forward(self, x, mask):
        """Encode ``x`` of shape (N, seq_length).

        ``mask`` is accepted for interface compatibility but is not applied:
        every layer is called with ``mask=None``.
        """
        N, seq_length = x.shape
        # Build the position indices directly on the input's device so the
        # lookup works wherever the model and its inputs live.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
        out = self.dropout(
            self.hacky_embedding(x.float()) + self.position_embedding(positions)
        )

        # In the encoder the query, key and value are all the same tensor.
        for layer in self.layers:
            out = layer(out, out, out, None)

        return out


class DecoderBlock(nn.Module):
    """Decoder layer: masked self-attention followed by a ``TransformerBlock``.

    Args:
        embed_size: Size of the embedding dimension.
        heads: Number of attention heads.
        forward_expansion: Feed-forward width multiplier of the inner block.
        dropout: Dropout probability.
        device: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Self-attend over ``x`` with ``trg_mask``, then attend over ``value``/``key`` with ``src_mask``."""
        self_attended = self.attention(x, x, x, trg_mask)
        # Residual + LayerNorm + dropout produce the query for the next block.
        decoder_query = self.dropout(self.norm(self_attended + x))
        return self.transformer_block(value, key, decoder_query, src_mask)


class Decoder(nn.Module):
    """Transformer decoder producing per-position scores over the target vocabulary.

    Args:
        trg_vocab_size: Size of the target vocabulary (embedding rows and
            output width).
        embed_size: Size of the embedding dimension.
        num_layers: Number of ``DecoderBlock`` layers.
        heads: Number of attention heads per block.
        forward_expansion: Feed-forward width multiplier of each block.
        dropout: Dropout probability.
        device: Stored on the instance and forwarded to every block;
            ``forward`` does not consult it.
        max_length: Number of positions in the position embedding table.
    """

    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode token indices ``x`` (N, seq_length) against ``enc_out``.

        Returns:
            Tensor of shape (N, seq_length, trg_vocab_size) holding the raw
            outputs of the final linear layer (no softmax is applied).
        """
        N, seq_length = x.shape
        # Build the position indices directly on the input's device so the
        # lookup works wherever the model and its inputs live.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # The encoder output serves as both value and key in every block.
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        return self.fc_out(x)


class Transformer(nn.Module):
    """Encoder-decoder transformer built from ``Encoder`` and ``Decoder``.

    Args:
        src_vocab_size: Forwarded to the encoder.
        trg_vocab_size: Forwarded to the decoder; also the output width.
        src_pad_idx: Source value treated as padding by ``make_src_mask``.
        trg_pad_idx: Stored on the instance; not used by the mask helpers.
        embed_size: Size of the embedding dimension.
        num_layers: Number of layers in both encoder and decoder.
        forward_expansion: Feed-forward width multiplier.
        heads: Number of attention heads.
        dropout: Dropout probability.
        device: Stored on the instance and forwarded to encoder and decoder.
        max_length: Number of positions in the position embedding tables.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0.3,
        device="cpu",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask (N, 1, 1, src_len) that is False where ``src`` equals ``src_pad_idx``.

        The mask is derived from ``src`` and therefore already lives on the
        same device as ``src``.
        """
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a lower-triangular mask of ones, shape (N, 1, trg_len, trg_len).

        The mask is created on the device of ``trg`` so it can be combined
        with the attention scores without an extra transfer.
        """
        N, trg_len = trg.shape
        causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        return causal.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        """Encode ``src``, decode ``trg`` against it and return the decoder output.

        No softmax is applied to the returned tensor.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out



# Dataset and DataLoader

PyTorch provides a way to handle all the data through a dataset class. The data is then automatically converted to tensors during iteration.

* `PhoenixVideo`: creates a PyTorch dataset that loads the features and labels. It also provides helper methods for word-to-index and index-to-word conversion.

* `Voc`: stores all the words in two dictionaries, one mapping each word to its index and one mapping each index back to its word.

* `caffeFeatureLoader`: loads all the pre-trained features.

Code inspired by [Dilated SLR](https://github.com/ustc-slr/DilatedSLR).

In [None]:
import os
import struct
import logging
import random

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from skimage import io, transform
from PIL import Image

class PhoenixVideo(Dataset):
    """Dataset of pre-extracted clip features with sentence annotations.

    Args:
        corpus_dir: Directory containing the ``<phase>.corpus.csv`` files.
        ANNOT_DIR: Annotation directory; the vocabulary is read from
            ``automatic/newtrainingClasses.txt`` and, for training, the
            frame alignment from ``automatic/train.alignment``.
        video_path: Root directory of the per-phase feature folders.
        phase: ``'train'``, ``'dev'`` or ``'test'``; only this split is loaded.
        DEBUG: When truthy, keep only the first 101 samples.
    """

    def __init__(self, corpus_dir, ANNOT_DIR, video_path, phase, DEBUG=False):
        self.ANNOT_DIR = ANNOT_DIR
        self.vocab_file = os.path.join(ANNOT_DIR,'automatic/newtrainingClasses.txt')
        self.image_type = 'png'
        self.max_video_len = 10000
        self.corpus_dir = corpus_dir
        self.video_path = video_path
        self.phase = phase
        self.alignment = {}
        self.voc = Voc(self.vocab_file)

        self.phoenix_dataset = self.load_video_list()
        self.data_dict = self.phoenix_dataset[phase]
        if DEBUG:
            self.data_dict = self.data_dict[:101]
        logging.info('[DATASET: {:s}]: total {:d} samples.'.format(phase, len(self.data_dict)))

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.data_dict)

    def __getitem__(self, idx):
        """Return the sample dict for ``idx``.

        Keys are ``id``, ``data``, ``label`` and ``label_text``; in the
        training phase ``gloss`` and ``gloss_text`` are added.
        """
        cur_vid_info = self.data_dict[idx]
        sample = {
            'id': cur_vid_info['id'],
            'data': self.load_video(cur_vid_info['path']),
            'label': cur_vid_info['label'],
            'label_text': cur_vid_info['label_text'],
        }
        if self.phase == 'train':
            glosses = cur_vid_info['gloss'].split(' ')
            sample['gloss'], sample['gloss_text'] = self.clip_glosses(sample, glosses)
        # Per the original author's note: data (num_clips, 512),
        # label (1, len_of_sentence), gloss (num_clips, 10) - TODO confirm.
        return sample

    def clip_glosses(self, sample, glosses, window_size = 8, skip_size = 4):
        """Cut the frame-level gloss sequence into overlapping windows.

        Every window holds ``window_size`` glosses wrapped in ``<BOS>`` and
        ``<EOS>``; window starts advance by ``skip_size`` and stop before
        ``len(glosses) - window_size``. A mismatch between the number of
        windows and ``sample['data'].shape[0]`` is reported on stdout but is
        not treated as an error.

        Returns:
            ``(gloss_clips, gloss_text_clips)``: the windows as a NumPy array
            of vocabulary indices and as lists of gloss strings.
        """
        window_starts = range(0, len(glosses) - window_size, skip_size)
        gloss_text_clips = [
            ['<BOS>', *glosses[start:start + window_size], '<EOS>']
            for start in window_starts
        ]
        gloss_clips = np.array([self.sentence2index(clip) for clip in gloss_text_clips])

        # Explicit comparison instead of assert so the diagnostic is still
        # printed when Python runs with assertions disabled (-O).
        num_feature_clips = sample['data'].shape[0]
        num_gloss_clips = np.shape(gloss_clips)[0]
        if num_feature_clips != num_gloss_clips:
            print("name: ",sample['id'])
            print("sample and gloss shape dont match: ",num_feature_clips, num_gloss_clips)
        return gloss_clips, gloss_text_clips

    def load_video(self, video_name):
        """Load every ``*.pool5`` feature file in ``video_name`` into one tensor."""
        feat = caffeFeatureLoader.loadVideoC3DFeature(video_name, 'pool5')
        feat = torch.tensor(feat)
        return feat

    def load_video1(self, video_name):
        """Load the raw frames of a video folder as a stacked tensor.

        NOTE(review): reads ``self.sample``, which is never assigned in this
        class, so the training phase raises AttributeError here. This looks
        like a legacy code path - confirm before use.
        """
        frames_list = glob.glob(os.path.join(video_name, '*.{:s}'.format(self.image_type)))
        frames_list.sort()
        num_frame = len(frames_list)
        if self.phase=='train' and self.sample and num_frame > self.max_video_len:
            # Randomly drop frames until max_video_len remain.
            for _ in range(num_frame-self.max_video_len):
                frames_list.pop(np.random.randint(len(frames_list)))
        frames_tensor_list = [self.load_image(frame_file) for frame_file in frames_list]
        video_tensor = torch.stack(frames_tensor_list, dim=0)
        return video_tensor

    def load_image(self, img_name):
        """Open an image and pass it through ``self.transform``.

        NOTE(review): ``self.transform`` is never assigned in this class;
        it must be set by the caller before this method is used.
        """
        image = Image.open(img_name)
        image = self.transform(image)
        return image

    def load_gloss(self): # TO DO using:- https://github.com/enhuiz/phoenix-datasets/blob/570481bf03a46555ca219f79ace1a2cfab149f8c/phoenix_datasets/corpora.py#L35
        """Build a per-video table of space-joined ``gloss`` and ``signstate`` strings.

        The alignment file is joined with ``automatic/trainingClasses.txt`` on
        ``classlabel``; the gloss is the sign state with trailing ``0``/``1``/``2``
        characters stripped, and the id is taken from path components
        ``[3:-2]`` of the alignment path.
        """
        gloss_path = os.path.join(self.ANNOT_DIR,'automatic/train.alignment')
        read = partial(pd.read_csv, sep=" ", na_filter=False)
        ali = read(gloss_path, header=None, names=["id", "classlabel"])
        classes = read(os.path.join(self.ANNOT_DIR,"automatic/trainingClasses.txt"))
        df = pd.merge(ali, classes, how="left", on="classlabel")
        del df["classlabel"]
        df["gloss"] = df["signstate"].apply(lambda s: s.rstrip("012"))

        df["id"] = df["id"].apply(lambda s: "/".join(s.split("/")[3:-2]))
        grouped = df.groupby("id")

        gdf = grouped["gloss"].agg(" ".join)
        sdf = grouped["signstate"].agg(" ".join)

        df = pd.merge(gdf, sdf, "inner", "id")
        return df

    def load_video_list(self):
        """Read the corpus CSV of the current phase into a list of sample dicts.

        Returns:
            ``{phase: [info, ...]}`` where every info dict has ``id``,
            ``path``, ``label_text`` and ``label`` (plus ``gloss`` when
            training). Outlier ids are skipped, as are training samples
            without an entry in the gloss table.
        """
        phoenix_dataset = {}
        outliers = ['13April_2011_Wednesday_tagesschau_default-14'] # '05July_2010_Monday_heute_default-8'
        for task in ['train', 'dev', 'test']:
            if task != self.phase:
                continue
            corpus = pd.read_csv(os.path.join(self.corpus_dir, '{:s}.corpus.csv'.format(task)), sep='|')
            videonames = corpus['folder'].values
            annotation = corpus['annotation'].values
            if self.phase == 'train':
                glosses_df = self.load_gloss()
            ids = corpus['id'].values
            num_sample = len(ids)
            video_infos = []
            for i in range(num_sample):
                if ids[i] in outliers:
                    continue
                tmp_info = {
                    'id': ids[i],
                    'path': os.path.join(self.video_path, task, videonames[i].replace('*.png', '')),
                    'label_text': annotation[i],
                    'label': np.array(self.sentence2index(annotation[i].split(' ')))
                }
                if self.phase == 'train':
                    try:
                        tmp_info['gloss'] = glosses_df.loc[ids[i]]['gloss']
                    except KeyError:
                        # No alignment for this video: leave it out of training.
                        continue
                video_infos.append(tmp_info)
            phoenix_dataset[task] = video_infos
        return phoenix_dataset

    def sentence2index(self, sent):
        """Map an iterable of words to vocabulary indices, using ``<UNK>`` for unknown words."""
        word2index = self.voc.word2index
        return [
            word2index[word] if word in word2index else word2index['<UNK>']
            for word in sent
        ]

    def index2sentence(self, indices):
        """Map an iterable of vocabulary indices back to words."""
        return [self.voc.index2word[ind] for ind in indices]


class Voc():
    """Vocabulary read from a class file, with ``PAD`` at index 0 and four special tokens appended.

    Attributes:
        vocab_file: Path the vocabulary was read from.
        word2index: Mapping from word to integer index.
        index2word: Mapping from integer index to word.
        num_words: Total number of entries, special tokens included.
    """

    def __init__(self, vocab_file):
        PAD_token = 0
        self.vocab_file = vocab_file
        self.word2index = {'PAD': PAD_token}
        self.index2word = {PAD_token: 'PAD'}
        self.num_words = 1

        with open(self.vocab_file, 'r') as fid:
            for line_no, raw_line in enumerate(fid):
                # The first line of the file is skipped.
                if line_no == 0:
                    continue
                # The word is the first space-separated field of the line.
                word = raw_line.strip().split(' ')[0]
                if word in self.word2index:
                    continue
                self.word2index[word] = self.num_words
                self.index2word[self.num_words] = word
                self.num_words += 1

        # Special tokens take the next four consecutive indices.
        for special in ('<UNK>', '<BOS>', '<EOS>', '<BLANK>'):
            self.word2index[special] = self.num_words
            self.index2word[self.num_words] = special
            self.num_words += 1

class caffeFeatureLoader():
    """Readers for binary feature files: a five-int shape header followed by float values."""

    @staticmethod
    def loadVideoC3DFeature(sample_name, feattype = 'pool5'):
        """Load every ``*.<feattype>`` file in ``sample_name``, in sorted filename order.

        Returns:
            A list with one flat list of floats per feature file.
        """
        featnames = sorted(glob.glob(os.path.join(sample_name, '*.' + feattype)))
        return [caffeFeatureLoader.loadC3DFeature(name)[0] for name in featnames]

    @staticmethod
    def loadC3DFeature(filename):
        """Read one feature file.

        Returns:
            ``(feat, blob_shape)``: the flat list of float values and the
            header ``[num, chanel, length, height, width]``.

        Raises:
            struct.error: If the file is shorter than its header announces.
        """
        header = struct.Struct("5i")
        with open(filename, 'rb') as fileData:
            blob_shape = list(header.unpack(fileData.read(header.size)))
            m = 1
            for dim in blob_shape:
                m *= dim
            if m <= 0:
                return [], blob_shape
            # Read and decode the whole payload in one call instead of
            # issuing one 4-byte read per value.
            payload = fileData.read(m * struct.calcsize("f"))
            feat = list(struct.unpack("%df" % m, payload))
        return feat, blob_shape



# Self-critic REINFORCE Implementation

This class represents the agent used in the paper. The functions `select_action_sample` and `select_action_argmax` are used for training and inference/critic respectively.

In [None]:

class SC_REINFORCE():
    """REINFORCE agent with a self-critic baseline on top of the Transformer policy.

    ``select_action_sample`` produces the sampled rollout (with log-probabilities
    for the policy gradient), ``select_action_argmax`` produces the greedy
    rollout whose reward is used as the baseline, and ``update_parameters``
    applies one gradient step from the two reward sequences.
    """

    def __init__(self, opts):
        """Build the policy network and its optimizer.

        Args:
            opts: attribute-style options; reads ``src_vocab_size``,
                ``trg_vocab_size``, ``src_pad_idx``, ``trg_pad_idx``,
                ``max_len`` and ``device``.
        """
        self.opts = opts
        print(self.opts)
        # Use the device the caller configured instead of a notebook global so
        # the model and every tensor created here always agree.
        self.device = opts.device
        # NOTE(review): embed_size and dropout are hard-coded here rather than
        # taken from ``opts`` - confirm this is intended.
        self.model = Transformer(src_vocab_size = opts.src_vocab_size ,
                                 trg_vocab_size = opts.trg_vocab_size,
                                 src_pad_idx = opts.src_pad_idx,
                                 trg_pad_idx = opts.trg_pad_idx,
                                 embed_size=512,
                                 max_length = opts.max_len,
                                 device = opts.device,
                                 dropout=0.5,
                                 )
        self.model = self.model.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
        # Epsilon-greedy exploration schedule: epsilon is multiplied by
        # ``decay`` on every action selection until it reaches ``min_epsilon``.
        self.epsilon = 0.5
        self.decay = 0.95
        self.min_epsilon = 0.1

    def _decay_epsilon(self):
        """Apply one step of the exploration decay schedule."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.decay)

    def _log_policy(self, state):
        """Return the per-position log-probabilities the model assigns to each token."""
        logits = self.model(state['data'].to(self.device), state['gloss'].to(self.device))
        # log_softmax avoids the -inf that softmax(...).log() yields on underflow.
        return F.log_softmax(logits, -1)

    def select_action_sample(self, state):
        """Sample one token per output position (used for the reward R).

        Args:
            state: dict with the ``'data'`` and ``'gloss'`` tensors fed to the model.

        Returns:
            ``(action, log_prob)``: the list of chosen token indices and a tensor
            stacking the log-probability of each chosen token (one row per
            position). ``.item()`` below assumes a batch of one.
        """
        log_probs = self._log_policy(state)
        probs = log_probs.exp()
        self._decay_epsilon()
        vocab_size = probs.shape[-1]
        action = []
        for i in range(probs.shape[1]):
            # Uniform draw in [0, 1): explore with probability epsilon.
            if np.random.rand() < self.epsilon:
                action.append(np.random.randint(0, vocab_size))
            else:
                action.append(probs[:, i, :].multinomial(1).item())
        log_prob = torch.stack([log_probs[:, i, a] for i, a in enumerate(action)])
        return action, log_prob

    def select_action_argmax(self, state, inference = False):
        """Pick the most likely token per output position (used for R_hat).

        Args:
            state: dict with the ``'data'`` and ``'gloss'`` tensors fed to the model.
            inference: when True, decoding is purely greedy - no exploration and
                no epsilon decay.

        Returns:
            The list of chosen token indices.
        """
        # Only token ids are returned, so no autograd graph is needed.
        with torch.no_grad():
            log_probs = self._log_policy(state)
        if not inference:
            self._decay_epsilon()
        vocab_size = log_probs.shape[-1]
        action = []
        for i in range(log_probs.shape[1]):
            explore = (not inference) and np.random.rand() < self.epsilon
            if explore:
                action.append(np.random.randint(0, vocab_size))
            else:
                action.append(log_probs[:, i, :].argmax().item())
        return action

    def inference(self, ph_video, state):
        """Greedily decode a gloss sequence for the clip held in ``state``.

        Starts from ``<BOS>`` and repeatedly appends the token predicted for the
        last position until ``<EOS>`` is produced or ``opts.max_len`` tokens
        have been generated. ``state['gloss']`` is overwritten while decoding.

        Args:
            ph_video: dataset object exposing ``voc.word2index``.
            state: dict holding at least the ``'data'`` tensor of the clip.

        Returns:
            A ``(1, T)`` tensor of token indices, ``<BOS>`` included.
        """
        word2index = ph_video.voc.word2index
        eos = word2index['<EOS>']
        inp = torch.tensor([[word2index['<BOS>']]], device=self.device)

        while True:
            state['gloss'] = inp
            action = self.select_action_argmax(state, inference=True)
            # ``action`` holds one predicted token per input position; the
            # prediction for the newest position is the next word.
            next_word = action[-1]
            step = torch.tensor([[next_word]], device=self.device)
            inp = torch.cat([inp, step], dim=-1)
            if next_word == eos or inp.shape[-1] >= self.opts.max_len:
                break
        return inp

    def update_parameters(self, rewards, rewards_argmax, log_probs, gamma = 1.):
        """Run one policy-gradient step with the greedy reward as baseline.

        For every step ``i`` the discounted return ``R`` of the sampled rollout
        is accumulated backwards and the loss adds
        ``-(sum of log_probs[i]) * (R - R_hat)``, where ``R_hat`` is the summed
        reward of the greedy rollout. The total is scaled by
        ``0.5 / len(rewards)``.

        Args:
            rewards: per-step rewards of the sampled rollout.
            rewards_argmax: per-step rewards of the greedy rollout.
            log_probs: per-step log-probability tensors from
                ``select_action_sample``.
            gamma: discount factor.
        """
        if len(rewards) == 0:
            # Nothing was collected: no gradient to apply.
            return
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        rewards_argmax = torch.as_tensor(rewards_argmax, dtype=torch.float32)
        R = torch.zeros(1, 1)
        R_hat = rewards_argmax.sum()
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = (R - R_hat).to(self.device)
            loss = loss - (log_probs[i] * advantage).sum()
        loss = 0.5 * loss / len(rewards)
        self.optimizer.zero_grad()
        loss.backward()
        # In-place variant; the un-suffixed ``clip_grad_norm`` is deprecated.
        utils.clip_grad_norm_(self.model.parameters(), 80)
        self.optimizer.step()


# Environment class

This class is designed to handle the dynamics of the data, i.e., moving from one state to another. The environment also lets us know when a particular video is completed and calculates the rewards for the actions taken in an episode.

In [None]:
class Environment():
    """Episode driver that feeds the clips of one video at a time to the agent.

    One episode corresponds to one sample drawn from the data loader. Each
    ``step`` stores the agent's actions and returns the next clip; when the
    clips are exhausted the episode ends and the reward ``100 - WER(%)`` of the
    accumulated actions against the sample's label is returned.
    """

    def __init__(self, ph_dataset ,dataLoader, window_size = 8, skip = 4, phase = 'train'):
        """
        Args:
            ph_dataset: dataset exposing ``voc.word2index`` and ``index2sentence``.
            dataLoader: sized iterable yielding one sample dict per video.
            window_size: stored on the instance; not used by the methods below.
            skip: stored on the instance; not used by the methods below.
            phase: ``'train'`` yields gloss targets with every clip, any other
                value yields the clip data only.
        """
        self.ph_dataset = ph_dataset
        self.iterator = iter(dataLoader)
        self.window_size = window_size
        self.skip = skip
        self.current_video = None
        self.sample = None
        self.gen = None
        self.actions = []
        # Ask the loader itself: plain iterators are not guaranteed to be sized.
        self.num_vids = len(dataLoader)
        self.phase = phase

    def append_a(self, actions, a):
        """Return ``actions`` extended with the new actions ``a`` as a flat array.

        The first batch of actions is taken whole; for later batches only the
        second half of ``a`` is appended (the first half is dropped).
        """
        if len(actions) == 0:
            merged = a
        else:
            # np.append returns a new array - it must be kept, not discarded.
            merged = np.append(actions, a[len(a) // 2:])
        return np.array(merged).flatten()

    def next_video(self):
        """Start a new episode and return the first clip of the next sample."""
        self.actions = []
        self.sample = next(self.iterator)
        self.gen = iter(self.gen_data_and_gloss())
        data,_,_ = self.step()
        return data

    def gen_data_and_gloss(self):
        """Yield one state dict per clip (second axis of ``sample['data']``)."""
        sample = self.sample
        for i in range(sample['data'].shape[1]):
            if self.phase == 'train':
                yield {'data': sample['data'][:,i,:], 'gloss': sample['gloss'][:,i,:], 'gloss_text': sample['gloss_text'][i]}
            else:
                yield{'data': sample['data'][:,i,:]}

    def get_wer(self, ref,actions):
        """Return the word error rate (in percent) of ``actions`` against ``ref``.

        ``<BOS>``/``<EOS>`` tokens are removed from ``actions`` and consecutive
        repeats are collapsed before scoring. The decoded sentences are kept in
        ``self.ref`` and ``self.hyp`` for logging.

        Args:
            ref: tensor whose first row holds the reference token indices.
            actions: predicted token indices (any array-like; it is flattened).
        """
        word2index = self.ph_dataset.voc.word2index
        actions = np.asarray(actions).flatten()
        keep = (actions != word2index['<EOS>']) & (actions != word2index['<BOS>'])
        hyp_list = [token for token, _ in groupby(actions[keep])]
        ref_words = list(self.ph_dataset.index2sentence(ref.numpy()[0]))
        hyp_words = list(self.ph_dataset.index2sentence(hyp_list))
        self.ref = ' '.join(ref_words)
        self.hyp = ' '.join(hyp_words)
        # Score word sequences: passing the joined strings would align
        # characters and yield a character error rate instead.
        wer = get_wer_delsubins(ref_words, hyp_words)
        return wer[0]*100

    def step(self,action = None):
        """Record ``action`` and advance to the next clip.

        Returns:
            ``(clip, reward, done)``. While clips remain, ``reward`` is 0 and
            ``done`` is False. Once they are exhausted ``clip`` is None,
            ``done`` is True and ``reward`` is ``100 - WER(%)``.
        """
        done = False
        reward = 0
        # Identity test: ``!= None`` is evaluated element-wise on arrays.
        if action is not None:
            self.actions = self.append_a(self.actions, action)
        clip = next(self.gen, None)

        if clip is None:
            done = True
            self.ref = self.sample['label']
            reward = 100 - self.get_wer(self.ref, self.actions)

        return clip, reward, done


    

# WER calculation

Word error rate is given by:

$WER = \frac{S + I + D}{N} \times 100$

where $S$, $I$ and $D$ are the numbers of substitutions, insertions and deletions needed to align the hypothesis sentence with the reference sentence, and $N$ is the total number of words in the reference.

In [None]:
def get_wer_delsubins(ref, hyp, debug=False):
    """Align ``hyp`` to ``ref`` by edit distance and return the error rates.

    Args:
        ref: reference sequence (e.g. a list of words).
        hyp: hypothesis sequence.
        debug: when True, print the alignment and the operation counts
            (elements must be strings in that case).

    Returns:
        ``(wer, sub_rate, ins_rate, del_rate)``: total, substitution, insertion
        and deletion counts, each divided by ``len(ref)``. For an empty
        reference the counts are divided by 1 instead, so an empty/empty pair
        scores 0.0 and no ZeroDivisionError is raised.
    """
    DEL_PENALTY = 1
    SUB_PENALTY = 1
    INS_PENALTY = 1

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    n_ref = len(ref)
    n_hyp = len(hyp)

    # costs[i][j]: cheapest way to turn ref[:i] into hyp[:j] (Levenshtein table).
    costs = [[0] * (n_hyp + 1) for _ in range(n_ref + 1)]
    # backtrace[i][j]: operation that produced costs[i][j].
    backtrace = [[OP_OK] * (n_hyp + 1) for _ in range(n_ref + 1)]

    # First column: reach an empty hypothesis by deleting every reference item.
    for i in range(1, n_ref + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: reach the hypothesis from an empty reference by insertions only.
    for j in range(1, n_hyp + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if ref[i - 1] == hyp[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue
            substitutionCost = costs[i - 1][j - 1] + SUB_PENALTY
            insertionCost = costs[i][j - 1] + INS_PENALTY
            deletionCost = costs[i - 1][j] + DEL_PENALTY
            best = min(substitutionCost, insertionCost, deletionCost)
            costs[i][j] = best
            # Ties are resolved in the order substitution, insertion, deletion.
            if best == substitutionCost:
                backtrace[i][j] = OP_SUB
            elif best == insertionCost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk back along the chosen operations and count each kind.
    i = n_ref
    j = n_hyp
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    lines = []
    if debug:
        print("OP\tREF\tHYP")
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            numCor += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("OK\t" + ref[i] + "\t" + hyp[j])
        elif op == OP_SUB:
            numSub += 1
            i -= 1
            j -= 1
            if debug:
                lines.append("SUB\t" + ref[i] + "\t" + hyp[j])
        elif op == OP_INS:
            numIns += 1
            j -= 1
            if debug:
                lines.append("INS\t" + "****" + "\t" + hyp[j])
        elif op == OP_DEL:
            numDel += 1
            i -= 1
            if debug:
                lines.append("DEL\t" + ref[i] + "\t" + "****")
    if debug:
        for line in reversed(lines):
            print(line)
        print("#cor " + str(numCor))
        print("#sub " + str(numSub))
        print("#del " + str(numDel))
        print("#ins " + str(numIns))

    # Guard against an empty reference.
    denom = float(max(n_ref, 1))
    return (numSub + numDel + numIns) / denom, numSub / denom, numIns / denom, numDel / denom

In [None]:

# Locations of the dataset, the pre-extracted features and the annotations.
BASE_DIR = "/content/drive/MyDrive/RL_Project_CodeBase"
PHOE_DIR = os.path.join(BASE_DIR,'phoenix2014-release/phoenix-2014-multisigner')
FEAT_DIR = os.path.join(BASE_DIR,'c3d_res_phoenix_body_iter5_120k')
ANNOT_DIR = os.path.join(PHOE_DIR,'annotations')
CORPUS_DIR = os.path.join(ANNOT_DIR,'manual')

# The sampled rollout (env) and the greedy baseline rollout (env2) have to see
# the same video at every iteration. Every iterator of a shuffling DataLoader
# draws its own permutation, so each environment gets its own loader, and both
# loaders shuffle with identically seeded generators.
TRAIN_SHUFFLE_SEED = 0

train_dataset = PhoenixVideo(CORPUS_DIR, ANNOT_DIR, FEAT_DIR, 'train' )
train_loader = DataLoader(train_dataset,
    batch_size=1, shuffle=True,
    generator=torch.Generator().manual_seed(TRAIN_SHUFFLE_SEED))
train_loader_baseline = DataLoader(train_dataset,
    batch_size=1, shuffle=True,
    generator=torch.Generator().manual_seed(TRAIN_SHUFFLE_SEED))

# Evaluation only averages the WER over all videos, so no shuffling is needed.
dev_dataset = PhoenixVideo(CORPUS_DIR, ANNOT_DIR, FEAT_DIR, 'dev' )
dev_loader = DataLoader(dev_dataset,
    batch_size=1, shuffle=False)

test_dataset = PhoenixVideo(CORPUS_DIR, ANNOT_DIR, FEAT_DIR, 'test' )
test_loader = DataLoader(test_dataset,
    batch_size=1, shuffle=False)

env = Environment(train_dataset, train_loader)
env2 = Environment(train_dataset, train_loader_baseline)

env_dev = Environment(dev_dataset, dev_loader, phase='dev')
env_test = Environment(test_dataset, test_loader, phase='test')

# Hyper-parameters handed to the agent.
opts = ED({
    "src_vocab_size": 512,
    "trg_vocab_size": train_dataset.voc.num_words,
    "src_pad_idx": 0,
    "trg_pad_idx": 0,
    "embed_size" : 512,
    "num_layers": 6,
    "forward_expansion": 4,
    "heads": 8,
    "dropout": 0.3,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "max_len": 520,
    "log_freq":10,
})

agent = SC_REINFORCE(opts)
# Per-video word error rates (in percent) collected by the loops below.
train_wers = []
dev_wers = []
test_wers = []


In [None]:
#debug
# state = env.next_video()
# for key, values in state.items():
#   print("{}: ".format(key),np.shape(values))

In [None]:
#debug
# print(state['gloss'])
# print(state['gloss_text'])

# Main Training and Inference Loops

In [None]:


# ---- Training: one episode (video) per iteration ----
for vid_num in range(env.num_vids):
    state = env.next_video()
    state2 = env2.next_video()

    reward_s = []   # rewards of the sampled rollout
    reward_a = []   # rewards of the greedy (baseline) rollout
    log_probs = []  # log-probabilities of the sampled actions, one tensor per step
    done_s = done_a = False

    while not (done_s and done_a):
        if not done_s:
            a_s, log_prob = agent.select_action_sample(state)
            state, r_s, done_s = env.step(a_s)
            reward_s.append(r_s)
            log_probs.append(log_prob)

        if not done_a:
            a_a = agent.select_action_argmax(state2)
            state2, r_a, done_a = env2.step(a_a)
            reward_a.append(r_a)

    # The final reward of an episode is 100 - WER(%), so invert it on that scale.
    train_wers.append(100 - reward_s[-1])

    if vid_num % opts.log_freq == 0:
        print("*"*100)
        print("video: ",vid_num)
        print("ref: ",env.ref)
        print("hyp_s: ",env.hyp)
        print("hyp_a: ",env2.hyp)
        print("wer:", np.mean(train_wers))
        os.makedirs(os.path.join(BASE_DIR,'Logs/'), exist_ok=True)
        torch.save(agent.model.state_dict(), os.path.join(BASE_DIR,'Logs/trial.pt'))

    # log_probs stays a list of tensors: converting it to a NumPy array would
    # detach it from (or fail on) the autograd graph needed by backward().
    agent.update_parameters(reward_s, reward_a, log_probs=log_probs)
print("Train WER AVG: ", np.mean(train_wers))

# ---- Evaluation on the dev split ----
agent.model.eval()  # disable dropout while decoding
with torch.no_grad():
    for vid_num in range(env_dev.num_vids):
        print('*'*5, "NEW VIDEO", '*'*5)
        state = env_dev.next_video()
        print('data: ', np.shape(env_dev.sample['data']), 'label: ', np.shape(env_dev.sample['label']))

        rewards = []
        done = False

        while not done:
            a = agent.inference(dev_dataset,state)
            # inference returns a (1, T) tensor of token ids; the environment
            # expects a flat sequence of ids.
            state, r, done = env_dev.step(a.squeeze(0).tolist())
            rewards.append(r)
        dev_wers.append(100 - rewards[-1])

print("Dev WER AVG: ", np.mean(dev_wers))







# Supervised Learning route

To help isolate the issues observed with the RL training, the same model was also trained with plain supervised learning (SL).

`SmoothCrossEntropyLoss` is used for applying label smoothing to the output and labels.

In [None]:
from torch.nn.modules.loss import _WeightedLoss

class SmoothCrossEntropyLoss(_WeightedLoss):
    """Cross-entropy against label-smoothed one-hot targets.

    The true class receives probability ``1 - smoothing`` and the remaining
    ``smoothing`` mass is shared equally by the other classes.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        """
        Args:
            weight: optional per-class factor applied to the log-probabilities.
            reduction: ``'mean'``, ``'sum'``, or anything else for no reduction.
            smoothing: smoothing amount, checked in ``forward`` to be in [0, 1).
        """
        super().__init__(weight=weight, reduction=reduction)
        self.reduction = reduction
        self.weight = weight
        self.smoothing = smoothing

    def k_one_hot(self, targets:torch.Tensor, n_classes:int, smoothing=0.0):
        """Turn class indices of shape (N,) into smoothed (N, n_classes) targets."""
        off_value = smoothing / (n_classes - 1)
        on_value = 1. - smoothing
        with torch.no_grad():
            smoothed = torch.full((targets.size(0), n_classes), off_value,
                                  device=targets.device)
            smoothed.scatter_(1, targets.data.unsqueeze(1), on_value)
        return smoothed

    def reduce_loss(self, loss):
        """Apply the configured reduction to the per-sample losses."""
        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss

    def forward(self, inputs, targets):
        """Return the smoothed cross-entropy of logits ``inputs`` for ``targets``."""
        assert 0 <= self.smoothing < 1

        soft_targets = self.k_one_hot(targets, inputs.size(-1), self.smoothing)
        log_preds = F.log_softmax(inputs, -1)

        if self.weight is not None:
            log_preds = log_preds * self.weight.unsqueeze(0)

        per_sample = -(soft_targets * log_preds).sum(dim=-1)
        return self.reduce_loss(per_sample)

# Main Training Loop

In [None]:

def collate_fn_video(batch, padding=6):
    """Collate whole-video samples into one zero-padded batch.

    Every sample is a dict with ``'data'`` (frames x features), ``'label'`` and
    ``'id'``. Videos are padded with zeros up to the longest one in the batch.
    ``padding`` is accepted but not used.

    Returns:
        dict with ``'data'`` (batch, features, frames), ``'label'`` (all labels
        concatenated), ``'len_data'``, ``'len_label'`` and ``'id'``.
    """
    frame_counts = [sample['data'].shape[0] for sample in batch]
    label_counts = [len(sample['label']) for sample in batch]
    longest = max(frame_counts)
    feat_dim = batch[0]['data'].shape[1]
    padded = torch.zeros(len(frame_counts), longest, feat_dim)

    flat_labels = []
    sample_ids = []
    for row, (sample, n_frames) in enumerate(zip(batch, frame_counts)):
        frames = sample['data']
        flat_labels.extend(sample['label'])
        padded[row, :n_frames, :] = torch.FloatTensor(frames)
        sample_ids.append(sample['id'])

    return {
        'data': padded.permute(0, 2, 1),
        'label': torch.LongTensor(flat_labels),
        'len_data': torch.LongTensor(frame_counts),
        'len_label': torch.LongTensor(label_counts),
        'id': sample_ids,
    }

def collate_fn_clip(batch):
    """Collate clip-level samples by concatenating them along the first axis.

    Every sample is a dict with ``'data'``, ``'label'``, ``'id'`` and
    ``'gloss'``.

    Returns:
        dict with ``'data'`` and ``'gloss'`` concatenated over the samples,
        ``'id'`` (list of ids) and ``'label'`` (id -> label tensor with a
        leading axis of size 1).
    """
    sample_ids = []
    clips = []
    glosses = []
    labels_by_id = {}
    for sample in batch:
        clip = sample['data']
        label = sample['label']
        sample_ids.append(sample['id'])
        glosses.append(torch.tensor(sample['gloss']))
        clips.append(clip)
        labels_by_id[sample['id']] = torch.tensor(label).unsqueeze(0)

    return {
        'data': torch.cat(clips, dim=0),
        'label': labels_by_id,
        'id': sample_ids,
        'gloss': torch.cat(glosses, dim=0),
    }


# Label-smoothed cross-entropy shared by every supervised training step.
smoothLoss = SmoothCrossEntropyLoss(smoothing = 0.2)
def SL_train(model ,clip, label):
    """Run one supervised optimisation step and return the predicted tokens.

    The model receives ``label`` without its last column as decoder input and
    is trained to predict ``label`` without its first column. The step uses the
    module-level ``optimi`` optimizer, ``smoothLoss`` criterion and ``device``.

    Returns:
        The arg-max token predictions, squeezed, detached and moved to the CPU.
    """
    model.train()
    clip = clip.to(device)
    label = label.to(device)

    decoder_input = label[:, :-1]
    target = label[:, 1:].reshape(-1)

    logits = model(clip, decoder_input)
    optimi.zero_grad()
    flat_logits = logits.view(-1, logits.size(-1)).float()
    loss = smoothLoss(flat_logits, target)
    loss.backward()
    optimi.step()

    predictions = logits.argmax(dim=-1)
    return predictions.squeeze().detach().cpu()

# Supervised run: unshuffled clip-level batches built by collate_fn_clip.
train_dataset = PhoenixVideo(CORPUS_DIR, ANNOT_DIR, FEAT_DIR, 'train' )
train_loader = DataLoader(train_dataset,
    batch_size=1, shuffle=False, collate_fn=collate_fn_clip)

# Only the agent's model is used here; it is optimised by ``optimi`` below.
agent = SC_REINFORCE(opts)

# The environment is used for its action/WER helpers only.
env = Environment(train_dataset, train_loader)
optimi = optim.Adam(agent.model.parameters(),lr = 1e-6)

wers = []  # per-video word error rates, in percent
for i,batch in enumerate(train_loader):
    # collate_fn_clip already returns tensors; re-wrapping them with
    # torch.tensor would only copy them and emit a warning.
    data, gloss = batch['data'].squeeze(), batch['gloss']
    print(data.shape, gloss.shape)
    a = SL_train(agent.model, data, gloss)
    actions = env.append_a([], a)
    n = 0
    for values in batch['label'].values():
        # NOTE(review): 512 looks like a per-clip output length - confirm.
        offset = batch['data'].shape[0]*512
        # Keep the slice one-dimensional so that BOS/EOS positions found by
        # get_wer index the tokens themselves.
        aa = np.asarray(actions[n:n+offset])
        n += offset
        # get_wer already returns the WER in percent.
        wers.append(env.get_wer(values, aa))
    if (i+1) % opts.log_freq==0:
        print("*"*100)
        print("video: ",i+1)
        print("ref: ",env.ref)
        print("hyp_s: ",env.hyp)
        print("wer:", np.mean(wers))
        os.makedirs(os.path.join(BASE_DIR,'Logs/'), exist_ok=True)
        torch.save(agent.model.state_dict(), os.path.join(BASE_DIR,'Logs/trial_SL_Epoch{}.pt'.format(i+1)))
