In [1]:
from IPython.display import HTML, display

# Inject a CSS rule that sets elements with the `.container` class to 95% width,
# so wide outputs are not squeezed into the default narrow column.
display(
    HTML("<style>.container { width:95% !important; }</style>")
)

In [2]:
import os
import json
import torch
import random
import pandas as pd
import numpy as np

from data_handlers import YCDataset, SampleBatchIdx
from utils import compute_normalization_parameters
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm
from pathlib import Path
from matplotlib import pyplot as plt

def opj(x, y, *more):
    """Join path components with ``os.path.join``.

    A named function is used instead of a lambda bound to a name so that
    tracebacks and ``repr`` show ``opj`` rather than ``<lambda>``.

    Args:
        x: First path component.
        y: Second path component.
        *more: Optional further components appended after ``y``.

    Returns:
        The joined path, exactly as ``os.path.join(x, y, *more)`` returns it.
    """
    return os.path.join(x, y, *more)

In [3]:
# Filesystem locations used by this notebook.
# NOTE(review): the first path is absolute and machine-specific; the other two are
# relative — presumably resolved against the dataset root or the working directory
# (confirm against the code that consumes them).
ann_path = Path("/common/users/dm1487/YouCookII")
videos_path = Path("raw_frames/raw_videos")
steps_path = Path("raw_text")

In [4]:
# Load the training and validation CSVs from the current working directory.
training_df = pd.read_csv("training_with_labels.csv")
validation_df = pd.read_csv("validation_with_labels.csv")

# Display (rows, columns) of each table as the cell output.
tuple(frame.shape for frame in (training_df, validation_df))

((1194, 8), (417, 8))

In [5]:
# Wrap the training table in the project dataset and feed it to a DataLoader
# through the project's custom batch sampler.
# NOTE(review): the meaning of the positional arguments 8 and 24 is defined by
# SampleBatchIdx in data_handlers — confirm there before changing them.
train_dataset = YCDataset(training_df)
batch_sampler = SampleBatchIdx(train_dataset, 8, 24)
train_dl = DataLoader(
    dataset=train_dataset,
    batch_sampler=batch_sampler,
)

In [6]:
class NonLinearBlock(nn.Module):
    """Fully connected layer, optional batch normalisation, then ReLU.

    Args:
        in_feat: Size of the last dimension of the input.
        out_feat: Size of the last dimension of the output.
        batch_norm: When truthy, a ``BatchNorm1d(out_feat)`` is applied between
            the linear layer and the ReLU.
    """

    def __init__(self, in_feat, out_feat, batch_norm):
        super().__init__()
        self.fc = nn.Linear(in_feat, out_feat)
        self.relu = nn.ReLU()
        self.do_batchnorm = batch_norm
        if self.do_batchnorm:
            self.norm_func = nn.BatchNorm1d(out_feat)

    def forward(self, x):
        """Apply linear -> (batch norm) -> ReLU over the last dimension of ``x``.

        Args:
            x: Tensor whose last dimension has size ``in_feat``; any number of
                leading dimensions is accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``out_feat``.
        """
        x = self.fc(x)
        # TODO: we can switch positions of relu and batch norm to see what happens
        if self.do_batchnorm:
            # nn.Linear keeps the features on the last axis, whereas BatchNorm1d
            # treats axis 1 as the feature axis. Collapsing all leading axes into
            # one batch axis makes the two agree for inputs of any rank and is a
            # no-op for plain (batch, features) inputs.
            flat = x.reshape(-1, x.shape[-1])
            x = self.norm_func(flat).reshape(x.shape)
        return self.relu(x)

In [7]:
class NonLinearMapping(nn.Module):
    """Residual mapping applied to standardised features.

    The input is first standardised with the ``norm_mean`` / ``norm_sigma``
    buffers (zeros / ones until initialised). It is then passed through
    ``num_layers - 1`` ``NonLinearBlock`` layers followed by one linear layer,
    and the result is added back onto the standardised input.

    Args:
        feat: Feature size; every layer maps ``feat`` -> ``feat``.
        num_layers: Total number of layers in the residual branch. With
            ``num_layers <= 0`` the branch is disabled and ``forward`` returns
            the standardised input unchanged.
        normalization_params: Optional list/tuple ``(mean[, sigma])`` used to
            initialise the normalisation buffers. Values of any other type are
            ignored here; call ``initialize_normalization`` explicitly instead.
        batch_norm: Forwarded to every ``NonLinearBlock``.
    """

    def __init__(self, feat, num_layers, normalization_params=None, batch_norm=False):
        super().__init__()
        self.nonlin_mapping = nn.Sequential(
            *[NonLinearBlock(feat, feat, batch_norm) for _ in range(num_layers - 1)]
        )

        # For no layers the residual branch does nothing. ``None`` is stored
        # instead of a lambda so the module stays picklable and ``forward`` can
        # skip the branch entirely.
        self.lin_mapping = nn.Linear(feat, feat) if num_layers > 0 else None

        self.register_buffer('norm_mean', torch.zeros(feat))
        self.register_buffer('norm_sigma', torch.ones(feat))

        # Only list/tuple values are treated as statistics; anything else that
        # lands in this positional slot is left untouched rather than raising.
        if isinstance(normalization_params, (tuple, list)):
            self.initialize_normalization(normalization_params)

    def initialize_normalization(self, normalization_params):
        """Copy precomputed statistics into the normalisation buffers.

        Args:
            normalization_params: ``None`` (no-op) or a sequence whose first
                element is the mean and whose optional second element is the
                standard deviation. Further elements are ignored. Elements may
                be tensors or anything ``torch.as_tensor`` accepts.
        """
        if normalization_params is None:
            return
        buffers = (self.norm_mean, self.norm_sigma)
        with torch.no_grad():
            # zip stops at the shorter sequence, so a mean-only input leaves
            # norm_sigma at its current value.
            for buf, value in zip(buffers, normalization_params):
                buf.copy_(torch.as_tensor(value))

    def forward(self, x):
        """Standardise ``x`` and add the learned residual.

        Args:
            x: Tensor whose last dimension has size ``feat``.

        Returns:
            ``x_norm + residual(x_norm)`` where
            ``x_norm = (x - norm_mean) / norm_sigma``; just ``x_norm`` when the
            module was built with ``num_layers <= 0``.
        """
        x = (x - self.norm_mean) / self.norm_sigma
        if self.lin_mapping is None:
            return x
        res = self.nonlin_mapping(x)
        # TODO: maybe add a dropout here
        res = self.lin_mapping(res)
        return x + res

In [13]:
class EmbeddingsMapping(nn.Module):
    """Holds separate ``NonLinearMapping`` networks for video and text features,
    plus an optional third one used to compute distractors.

    Args:
        feat: Feature size shared by all mappings.
        video_layers: Number of layers in the video mapping.
        text_layers: Number of layers in the text mapping.
        drop_layers: Number of layers in the distractor mapping.
        learnable_drop: When truthy, ``drop_mapping`` is created; otherwise the
            attribute does not exist.
        normalization_dataset: Optional dataset passed to
            ``compute_normalization_parameters``; its result initialises the
            video and text normalisation buffers.
        batch_norm: Enables batch normalisation inside every mapping.
    """

    def __init__(self, feat, video_layers=2, text_layers=2, drop_layers=1, learnable_drop=False, normalization_dataset=None, batch_norm=False):
        super().__init__()
        # batch_norm has to be passed by keyword: the third positional slot of
        # NonLinearMapping is normalization_params, so a positional bool would
        # never reach the batch-norm switch.
        self.video_mapping = NonLinearMapping(feat, video_layers, batch_norm=batch_norm)
        self.text_mapping = NonLinearMapping(feat, text_layers, batch_norm=batch_norm)

        if learnable_drop:
            self.drop_mapping = NonLinearMapping(feat, drop_layers, batch_norm=batch_norm)

        if normalization_dataset is not None:
            norm_params = compute_normalization_parameters(normalization_dataset, feat)
            # The first two entries go to the video mapping and the remainder to
            # the text mapping — presumably (video mean, video sigma, text mean,
            # text sigma); confirm against utils.compute_normalization_parameters.
            # drop_mapping is not initialised here and keeps its default buffers.
            self.video_mapping.initialize_normalization(norm_params[:2])
            self.text_mapping.initialize_normalization(norm_params[2:])

    def map_video(self, x):
        """Return ``x`` passed through the video mapping."""
        return self.video_mapping(x)

    def map_text(self, z):
        """Return ``z`` passed through the text mapping."""
        return self.text_mapping(z)

    def compute_distractors(self, v):
        """Return ``v`` passed through the distractor mapping.

        Raises:
            AttributeError: If the model was built with ``learnable_drop``
                falsy, because ``drop_mapping`` is then never created.
        """
        return self.drop_mapping(v)

In [14]:
# Build the mapping model with a learnable distractor branch, initialise its
# normalisation from the training dataset, and move it to device 'cuda:1'.
model = EmbeddingsMapping(
    512,
    video_layers=2,
    text_layers=2,
    drop_layers=2,
    learnable_drop=True,
    normalization_dataset=train_dataset,
).to('cuda:1')

In [21]:
# Pull a single batch from the training loader.
dp = next(iter(train_dl))

# One distractor vector per sample: the mean over the first axis of that
# sample's step features, stacked along a new leading axis.
distractor = torch.stack(
    [steps.mean(0) for steps in dp['step_feature']],
    0,
).to('cuda:1')

# Move the video and step features to the same device as the model.
vf = dp['video_feature'].to('cuda:1')
sf = dp['step_feature'].to('cuda:1')

In [22]:
# Smoke test: run each mapping once on the sampled batch and show the output shapes.
(
    model.map_video(vf).shape,
    model.map_text(sf).shape,
    model.compute_distractors(distractor).shape,
)

(torch.Size([24, 500, 512]), torch.Size([24, 16, 512]), torch.Size([24, 512]))