# Train Bimodal AE with Attention using C3D features

## Split video ids into **train_ids** and **test_ids** sets by class label

In [None]:
import torch
import torch.nn as nn
import numpy as np

from layers.AEwithAttention import AEwithAttention


In [None]:
# from random import randint
# import json

# # num test classes
# n_test_classes = 20

# # load all video ids and labels
# with open('data/activity_net.v1-3.min.json', 'r') as f:
#     anet_json = json.load(f)
    
# # extract class labels
# labels = set()
# for k,v in anet_json['database'].items():
#     if(len(v['annotations']) > 0):
#         labels.add(v['annotations'][0]['label'])
# all_classes = list(labels)

# # pick n classes at random to form the test set
# test_classes = [all_classes[randint(0,len(all_classes))] for i in range(n_test_classes)]

# # extract sets of test/training video ids by class label
# test_ids = set()
# train_ids = set()
# for k,v in anet_json['database'].items():
#     if(len(v['annotations']) > 0):
#         if v['annotations'][0]['label'] in test_classes:
#             test_ids.add(k)
#         else:
#             train_ids.add(k)

In [None]:
# Build a small train/test split of video ids for quick experiments.
# At most `num_vids` annotated videos are kept (the first ones in the order the
# JSON database lists them, not a random sample), and `n_test_classes` of the
# first `num_classes` class labels are held out as test classes.

from random import randint, sample
import json

# load all video ids and labels
with open('data/activity_net.v1-3.min.json', 'r') as f:
    anet_json = json.load(f)

# extract class labels (the label of each video's first annotation)
labels = set()
for k, v in anet_json['database'].items():
    if v['annotations']:
        labels.add(v['annotations'][0]['label'])
# sorted so the class subset taken below does not depend on set iteration
# order, which can differ between interpreter runs for strings
all_classes = sorted(labels)

# limit the number of videos being processed and number of classes to ensure we get more videos per class
num_vids = 100
num_classes = 10
n_test_classes = 2

# limit the number of classes
all_classes = all_classes[:num_classes]

# pick n distinct classes at random to form the test set; independent randint
# draws could return the same class twice and silently shrink the test set
test_classes = sample(all_classes, min(n_test_classes, len(all_classes)))

# NOTE(review): only the test classes are restricted to `all_classes`; a video
# whose label is outside that subset still goes to train_ids and counts towards
# `num_vids` -- confirm whether training should also be limited to the subset.

count = 0
# extract sets of test/training video ids
test_ids = set()
train_ids = set()
for k, v in anet_json['database'].items():
    if count >= num_vids:
        # quota reached, nothing further can be added
        break
    if not v['annotations']:
        continue
    if v['annotations'][0]['label'] in test_classes:
        test_ids.add(k)
    else:
        train_ids.add(k)
    count += 1
            

In [None]:
# Merge both splits and prefix every id with 'v_' (presumably the key format
# used by the feature and caption dicts -- verify against the data files).
selected_vids = list(map('v_{}'.format, train_ids | test_ids))

## Load C3D video features into **vids** dict

In [None]:
# import pickle
# import numpy as np
# import h5py

# fname = '../data/vid_c3d_feats/sub_activitynet_v1-3.c3d.hdf5'
# f = h5py.File(fname,'r+')    

# # extract c3d features as numpy arrays
# vids_list = list(f.keys())

# anet_c3d = {}
# for vid in vids_list:
#     vid_c3d_feat = np.array(f[vid]['c3d_features'])
#     anet_c3d[vid] = vid_c3d_feat

# vids = anet_c3d

# # fname = 'anet_c3d.pkl'
# # vids = pickle.load(open(fname, 'rb'))


In [None]:
import pickle
# import numpy as np
# import h5py

# fname = 'data/vid_c3d_feats/sub_activitynet_v1-3.c3d.hdf5'
# f = h5py.File(fname,'r+')    


# anet_c3d = {}
# for vid in selected_vids:
#     vid_c3d_feat = np.array(f[vid]['c3d_features'])
#     anet_c3d[vid] = vid_c3d_feat

# vids = anet_c3d

fname = 'data/anet_c3d.pkl'
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the features are loaded.
with open(fname, 'rb') as pkl_file:
    vids = pickle.load(pkl_file)


## Load caption GloVe features into **caps** dict

In [None]:

# fname = 'data/anet_captions.all.glove.pkl'
# caps = pickle.load(open(fname, 'rb'))

# # transpose feature vectors to get Lx300 dimensions
# caps_t = {}
# for k,v in caps.items():
#     caps_t[k] = v.t()
# caps = caps_t

In [None]:

fname = 'data/anet_captions.all.glove.pkl'
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the captions are loaded.
with open(fname, 'rb') as pkl_file:
    caps = pickle.load(pkl_file)

# number of caption entries before restricting to the selected videos
print(len(caps))

# keep only the captions of the selected videos; ids without a caption entry
# are skipped
caps_limited = {v: caps[v] for v in selected_vids if v in caps}

caps = caps_limited


In [None]:
# Drop every video that has no caption entry, so that each key remaining in
# vids is also a key of caps.

vids_k = set(vids)
caps_k = set(caps)

diff = vids_k.difference(caps_k)
for orphan in diff:
    del vids[orphan]


In [None]:
# Report the size of both dicts after filtering, one count per line.
print('num videos : {}'.format(len(vids)), 'num captions : {}'.format(len(caps)), sep='\n')


In [None]:
# Transpose every caption feature matrix (intended layout after this step is
# L x 300, i.e. one row per token).
caps_t = {vid: feats.t() for vid, feats in caps.items()}
caps = caps_t

## Preprocess **vids** and **caps** to make all feature vectors the same size

In [None]:
# Use the 75th percentile of the sequence lengths to fix the feature representation dimensions

In [None]:
import pandas as pd

# First-axis length of every video feature array.
src = list(vids.items())
T = [feats.shape[0] for _, feats in src]

# 75th percentile of those lengths (quantile on a one-column DataFrame
# yields a one-entry Series)
T_fixed = pd.DataFrame(T).quantile(.75)

T_fixed

In [None]:
# First-axis length of every caption feature matrix.
src = list(caps.items())
L = [feats.shape[0] for _, feats in src]

# 75th percentile of those lengths (quantile on a one-column DataFrame
# yields a one-entry Series)
L_fixed = pd.DataFrame(L).quantile(.75)

L_fixed

In [None]:
# Copy the first five (key, value) pairs of vids into a small dict for
# quick manual inspection.
test = {}
for k, v in vids.items():
    test[k] = v
    if len(test) >= 5:
        break

In [None]:
import torch 

def preprocess_embeddings_dict(embeddings_dict, num_feats, T):
    """Bring every embedding in a dict to the common shape (T, num_feats).

    Each embedding is flattened to one dimension, zero-padded or cropped to
    ``T * num_feats`` values by ``unify_embedding_length`` and reshaped into
    rows of ``num_feats`` values.

    Args:
        embeddings_dict: mapping from key to an array/tensor exposing
            ``reshape``.
        num_feats: number of columns of every output embedding.
        T: number of rows of every output embedding.

    Returns:
        A new dict with the same keys and the resized embeddings as values.
    """
    flat_len = T * num_feats
    return {
        key: unify_embedding_length(emb.reshape(-1), flat_len).reshape(-1, num_feats)
        for key, emb in embeddings_dict.items()
    }


def unify_embedding_length(emb, target_len):
    """Return a 1-D tensor holding exactly ``target_len`` values of ``emb``.

    A shorter input is padded with zeros at the end; a longer input is cropped
    and the surplus trailing values are dropped. The result keeps the dtype of
    the input in every branch (padding with NumPy float64 zeros used to promote
    only the padded results), and it never shares memory with ``emb``.

    Args:
        emb: flat (1-D) NumPy array, torch tensor or sequence of numbers.
        target_len: required number of elements.

    Returns:
        A new ``torch.Tensor`` of shape ``(target_len,)``.
    """
    # as_tensor accepts arrays and tensors alike and, unlike torch.tensor(),
    # does not warn when it is handed an existing tensor
    flat = torch.as_tensor(emb)
    emb_len = flat.shape[0]
    if emb_len < target_len:
        # new_zeros matches the dtype and device of the data being padded
        padding = flat.new_zeros(target_len - emb_len)
        return torch.cat((flat, padding))
    # Covers both "longer" and "equal"; clone so the caller's data is not
    # aliased by the returned tensor.
    return flat[:target_len].clone()


In [None]:
import numpy as np

# Target shape of every embedding: (T_fixed, n_feats_v) for videos and
# (L_fixed, n_feats_t) for captions. These constants replace the percentile
# values computed earlier.
# Alternative configuration kept for reference:
#   n_feats_v = 500, n_feats_t = 300, T_fixed = 600, L_fixed = 60
n_feats_v = n_feats_t = 500
T_fixed = L_fixed = 600

# Resize both modalities and rebind vids/caps to the processed dicts.
vids = vids_processed = preprocess_embeddings_dict(vids, n_feats_v, T_fixed)
caps = caps_processed = preprocess_embeddings_dict(caps, n_feats_t, L_fixed)


In [None]:
# Training hyper-parameters.
learning_rate, n_epochs = 0.01, 2

# passed as the third argument of AEwithAttention
n_filt = 4

# Input sizes handed to the two autoencoders (feature count and sequence
# length per modality).
# Alternative configuration kept for reference:
#   n_feat_v = 500, n_feat_t = 300, T = 600, L = 60
n_feat_v = n_feat_t = 500
T = L = 600

## Train on paired video/caption (v, t) data using reconstruction, joint, cross, and cycle losses

In [None]:
### input ###
# vids
# caps

# One autoencoder per modality: model_v for video features, model_t for captions.
model_v = AEwithAttention(n_feat_v, T, n_filt)
model_t = AEwithAttention(n_feat_t, L, n_filt)

criterion = nn.MSELoss()

# SGD's `momentum` is a float factor. The boolean True used previously is
# numerically 1.0, i.e. a velocity that never decays; 0.9 is the conventional
# setting.
sgd_momentum = 0.9

# optimizers over all parameters of each model
optimizer_v = torch.optim.SGD(model_v.parameters(), lr=learning_rate, momentum=sgd_momentum)
optimizer_t = torch.optim.SGD(model_t.parameters(), lr=learning_rate, momentum=sgd_momentum)

# optimizers over the `encoder_` attribute of each model
optimizer_E_v = torch.optim.SGD(model_v.encoder_.parameters(), lr=learning_rate, momentum=sgd_momentum)
optimizer_E_t = torch.optim.SGD(model_t.encoder_.parameters(), lr=learning_rate, momentum=sgd_momentum)

# optimizers over the `decoder_` attribute of each model
optimizer_G_v = torch.optim.SGD(model_v.decoder_.parameters(), lr=learning_rate, momentum=sgd_momentum)
optimizer_G_t = torch.optim.SGD(model_t.decoder_.parameters(), lr=learning_rate, momentum=sgd_momentum)


In [None]:
# training
# One optimisation step per (video, caption) pair; the total loss is the sum of
# reconstruction, joint (latent alignment), cross-modal and cycle terms.

# Loss weights for the joint, cross and cycle terms (constant across steps).
a1, a2, a3 = 1, 1, 1

# NOTE(review): optimizer_v / optimizer_t are built over the whole models while
# the E_* / G_* optimizers are built over `encoder_` / `decoder_`. If those are
# registered sub-modules, every parameter is stepped twice per sample --
# confirm this is intended.
optimizers = (
    optimizer_v, optimizer_t,
    optimizer_E_v, optimizer_E_t,
    optimizer_G_v, optimizer_G_t,
)

for epoch in range(n_epochs):
    for counter, (i, v) in enumerate(vids.items(), start=1):
        t = caps[i]

        # as_tensor avoids the copy-construct warning that torch.tensor()
        # raises when it is handed an existing tensor
        v = torch.as_tensor(v).float()
        t = torch.as_tensor(t).float()

        # Latent codes and cross-modal decodings are computed once and reused
        # by the joint, cross and cycle terms below.
        z_v = model_v.encoder(v)
        z_t = model_t.encoder(t)
        t_from_v = model_t.decoder(z_v)
        v_from_t = model_v.decoder(z_t)

        # Compute recons loss. Each output is reshaped to the shape of its own
        # target; using the video shape for caption outputs only worked while
        # both modalities happened to share one shape.
        loss_recons_v = criterion(model_v(v).reshape(v.shape), v)
        loss_recons_t = criterion(model_t(t).reshape(t.shape), t)
        loss_recons = loss_recons_v + loss_recons_t
        # the following losses require paired video/caption data (v and t)
        # model_v and model_t are the corresponding models for video and captions respectively

        # Compute joint loss
        loss_joint = criterion(z_v, z_t)

        # Compute cross loss
        loss_cross1 = criterion(t_from_v.reshape(t.shape), t)
        loss_cross2 = criterion(v_from_t.reshape(v.shape), v)
        loss_cross = loss_cross1 + loss_cross2

        # Compute cycle loss (t -> v -> t and v -> t -> v); the decoder output
        # is fed to the other encoder without reshaping
        loss_cycle1 = criterion(model_t.decoder(model_v.encoder(v_from_t)).reshape(t.shape), t)
        loss_cycle2 = criterion(model_v.decoder(model_t.encoder(t_from_v)).reshape(v.shape), v)
        loss_cycle = loss_cycle1 + loss_cycle2

        # Compute total loss
        loss = loss_recons + a1 * loss_joint + a2 * loss_cross + a3 * loss_cycle

        # Backprop and optimize
        for opt in optimizers:
            opt.zero_grad()

        loss.backward()

        for opt in optimizers:
            opt.step()

        print ('Epoch[{}/{}], Step[{}/{}] Loss: {}\n'.format(epoch + 1, n_epochs, counter, len(vids), loss.item()))

# torch.save(model.state_dict(), 'out/model.sd')
