In [1]:
#imports
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
import math

In [2]:
%run dataloader.ipynb

In [3]:
batch_size=500
train_dataloader = DataLoader(train_set, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_set, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_set, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)

In [4]:
gpu_id = 'cuda:1'
device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')

In [12]:
class AE(torch.nn.Module):
    def __init__(self):
        super().__init__()
          
        # Building an linear encoder with Linear
        # layer followed by Relu activation function
        # 81 ==> 8
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(752, 856),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(856, 1024)
        )
          
        # Building an linear decoder with Linear
        # layer followed by Relu activation function
        # 8 ==> 81
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(1024, 856),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(856, 752),
            torch.nn.Sigmoid()
        )
  
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [13]:
# input features
###
#idxs = torch.arange(300) # text, 300
#idxs = torch.arange(300, 381) # speech, 81
#idxs = torch.arange(381, 752) # visual, 371
#idxs = torch.arange(381) # text + speech, 381
#idxs = torch.cat((torch.arange(300), torch.arange(381, 752))) # text + visual, 671
#idxs = torch.arange(300, 752) # speech + visual, 452
idxs = torch.arange(752) # text + speech + visual, 752

device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')
ae = AE()
ae.to(device)

optimiser = torch.optim.Adam(ae.parameters(), lr=0.001, weight_decay=1e-5)
epochs = 50
patience = 20

print('step', 'error')
errors = []
best_error = np.inf
num_bad_epochs = 0
num_epochs = 0
best_epoch = 0
for epoch in range(1, epochs+1):
    for batch_idx, batch in enumerate(train_dataloader, 0):
        x_p,x_c,y,hid,x_p_len=map(lambda x: x.to(device), batch)
        speech_batch = []
        for i, sent in enumerate(x_p):
            for j, word in enumerate(sent):
                if j < x_p_len[i]:
                    speech_batch.append(word[idxs].detach().tolist())
        speech_batch = torch.tensor(speech_batch, dtype=torch.float32, device=device)
        optimiser.zero_grad()
        _, output = ae(speech_batch)
        error = torch.nn.functional.mse_loss(output, speech_batch)
        error.backward()
        optimiser.step()

    error = error.detach().tolist()
    print(epoch, error)
    errors.append(error)

    if error < best_error:
        num_bad_epochs = 0
        best_error = error
        best_epoch = epoch
        num_epochs = epoch
        torch.save(ae, 'autoencoder_all_2.pth')
    else:
        num_bad_epochs += 1
        num_epochs = epoch
        if num_bad_epochs == patience:
            break

plt.plot(range(1, num_epochs+1), errors, 'r')
plt.title('train error')
plt.xlabel('epochs')
plt.ylabel('error')
plt.show()

print('best model after {} epochs with error {}'.format(best_epoch, best_error))
ae = torch.load('autoencoder_all_2.pth')

step error


  context_w=np.array(self.language_sdk[hid]['context_embedding_indexes'])
  context_of=np.array(self.word_aligned_openface_sdk[hid]['context_features'])
  context_cvp=np.array(self.word_aligned_covarep_sdk[hid]['context_features'])


1 0.060116421431303024
2 0.04257666692137718
3 0.04062328487634659
4 0.040538087487220764
5 0.03969992324709892
6 0.038339436054229736
7 0.03792921081185341
8 0.0379788912832737
9 0.03732626885175705
10 0.0370081290602684
11 0.03813313692808151
12 0.03730037808418274


KeyboardInterrupt: 

In [11]:
for batch_idx, batch in enumerate(train_dataloader, 0):
    x_p,x_c,y,hid,x_p_len=map(lambda x: x.to('cuda:1'), batch)
    speech_batch = []
    for i, sent in enumerate(x_p):
        for j, word in enumerate(sent):
            if j < x_p_len[i]:
                speech_batch.append(word[torch.arange(752)].detach().tolist())
    speech_batch = torch.tensor(speech_batch, dtype=torch.float32, device='cuda:1')
    print(speech_batch[0])
    print(ae(speech_batch[0])[1])
    if batch_idx == 0:
        break

  context_w=np.array(self.language_sdk[hid]['context_embedding_indexes'])
  context_of=np.array(self.word_aligned_openface_sdk[hid]['context_features'])
  context_cvp=np.array(self.word_aligned_covarep_sdk[hid]['context_features'])


tensor([-1.1076e-01,  3.0786e-01, -5.1980e-01,  3.5138e-02,  1.0368e-01,
        -5.2505e-02, -1.8021e-01, -1.1839e-01, -5.4253e-02,  2.4980e+00,
        -3.0241e-01,  4.3233e-02, -9.5862e-02, -9.3529e-02, -1.9817e-01,
        -2.6599e-01, -3.4703e-01,  1.4518e+00, -4.9013e-01,  4.1637e-02,
         1.1185e-01, -1.9023e-02, -1.8716e-01, -1.0407e-01, -4.3665e-01,
         7.3561e-02,  1.9546e-02, -1.5012e-01,  1.8499e-01, -2.4364e-01,
         2.0327e-01,  2.8916e-01, -2.1694e-01,  2.8351e-01, -1.0092e-01,
        -4.2189e-02, -7.3457e-02,  2.7325e-01, -1.2898e-01, -5.9407e-02,
        -7.3329e-02,  1.2490e-02, -2.0459e-01, -4.4558e-01,  4.0863e-02,
         2.4588e-01, -2.6111e-01, -8.6821e-02,  1.3628e-01,  1.1094e-01,
        -1.0835e-01,  9.8775e-03,  1.7394e-01,  6.4750e-03,  2.7467e-01,
        -9.7433e-03,  1.6561e-01, -1.6975e-01, -1.2561e-01, -7.1688e-02,
        -5.6815e-02, -2.8632e-01, -2.4231e-01,  2.7819e-01,  2.4112e-01,
        -9.1420e-03, -5.3634e-02,  4.3907e-01,  3.9