In [1]:
from datetime import datetime
import os
import pickle
import math
import time

import torch
from torch import nn, optim
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import numpy as np

In [2]:
# Read the pickled bag-of-words training transcripts.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
with open('embeddings/bag_of_words_train.pickle', mode='rb') as train_file:
    transcript_embeddings = pickle.load(train_file)

In [3]:
# Keep only the second field of every transcript record; the loop below treats
# it as a sequence of turns whose first element is a sparse row vector.
transcript_qna = [transcript_embeddings[i][1] for i in range(len(transcript_embeddings))]

# Walk each transcript two turns at a time: turn k and turn k + 1 form one pair
# (presumably question followed by answer -- TODO confirm against the pickling
# code). A trailing unpaired turn is dropped. Each sparse row is densified into
# a 1-D array here.
qna_pairs = []
for turns in transcript_qna:
    for first in range(0, len(turns) - 1, 2):
        second = first + 1
        qna_pairs.append((turns[first][0].toarray()[0], turns[second][0].toarray()[0]))
print(len(qna_pairs))

34586


In [4]:
class QADataset(Dataset):
    """Map-style dataset over a collection of pairs (question/answer, going by the class name)."""

    def __init__(self, qna_list):
        """
        Args:
            qna_list (sequence): Indexable collection; every item must expose
                its first member at position 0 and its second at position 1.
                The collection is stored by reference, not copied.
        """
        self.qna = qna_list

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.qna)

    def __getitem__(self, idx):
        """Return positions 0 and 1 of the item at ``idx`` as a 2-tuple."""
        pair = self.qna[idx]
        return pair[0], pair[1]

In [5]:
class AverageMeter(object):
    """Running-average tracker holding the latest value, weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set ``val``, ``avg``, ``sum`` and ``count`` back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and recompute the mean.

        ``avg`` is the weighted sum divided by the total count, so a first
        call with ``n=0`` raises ZeroDivisionError.
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [6]:
class SiameseNetwork(nn.Module):
    """Two-layer feed-forward encoder: Linear -> ReLU -> Linear.

    Both linear layers have their weights re-initialised with Kaiming-normal
    values; biases keep the ``nn.Linear`` default initialisation.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_size (int): Width of the input vectors.
            hidden_size (int): Width of the hidden layer.
            output_size (int): Width of the produced embedding.
        """
        super().__init__()
        # Layers are created and initialised in this exact order so the
        # random-number stream is consumed the same way on every construction.
        self.L1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        nn.init.kaiming_normal_(self.L1.weight)
        self.L2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        nn.init.kaiming_normal_(self.L2.weight)

    def forward(self, x):
        """Encode ``x`` and return the output of the second linear layer."""
        hidden = torch.relu(self.L1(x))
        return self.L2(hidden)

In [7]:
# Training history shared across calls. train_for_epoch appends to these lists,
# so re-run this cell to clear them before starting a fresh experiment.
loss_per_epoch = []     # one entry per training minibatch (despite the name)
cos_sim_per_epoch = []  # one entry per epoch: mean dev-set cosine similarity


def train(network, train_data, dev_data, output_path, batch_size=16, n_epochs=1, lr=0.001, start_epoch=2):
    """ Train the Siamese Network and checkpoint it after every epoch.

    A checkpoint is written after each epoch (not only after the best one) to
    ``output_path + "_" + str(epoch_number)``.

    @param network (network): SiameseNetwork
    @param train_data (Dataset): pairs used for optimisation
    @param dev_data (Dataset): pairs used for the per-epoch cosine-similarity check
    @param output_path (str): Path prefix to which model weights are written.
    @param batch_size (int): Number of examples in a single batch
    @param n_epochs (int): Number of training epochs
    @param lr (float): Learning rate
    @param start_epoch (int): Number given to the first epoch of this run. It
        only affects the progress messages and the checkpoint suffix. The
        default of 2 keeps the numbering that used to be hard-coded here
        (the notebook resumes from the ``model.weights_1`` checkpoint).
    """
    optimizer = optim.Adam(network.parameters(), lr=lr)
    loss_func = nn.CosineEmbeddingLoss()

    # Number of the last epoch this call will run. The banner previously
    # printed ``n_epochs + 2``, one more than the last epoch actually reached.
    last_epoch = start_epoch + n_epochs - 1

    for epoch in range(start_epoch, start_epoch + n_epochs):
        print("Epoch {:} out of {:}".format(epoch, last_epoch))
        # The returned dev score is already printed and recorded by
        # train_for_epoch, so it is not kept here.
        train_for_epoch(network, train_data, dev_data, optimizer, loss_func, batch_size)
        print("Saving model for epoch: ", epoch)

        torch.save(network.state_dict(), output_path + "_" + str(epoch))
        print("")


def train_for_epoch(network, train_data, dev_data, optimizer, loss_func, batch_size):
    """ Train network for single epoch, then score it on the dev data.

    For every minibatch the loss is the ``loss_func`` value of the aligned
    (question, answer) pairs with target +1, plus one target -1 term per
    answer, each pairing that answer with a question drawn at random from the
    other rows of the same batch.

    Side effects: appends each batch loss (a Python float) to the module-level
    ``loss_per_epoch`` list and the dev score to ``cos_sim_per_epoch``.

    @param network (Network): SiameseNetwork
    @param train_data (Dataset): yields (question, answer) pairs for training
    @param dev_data (Dataset): yields (question, answer) pairs for evaluation
    @param optimizer (nn.Optimizer): Adam Optimizer
    @param loss_func (cos_sim): Cosine Similarity Loss Function
    @param batch_size (int): Number of examples in a single batch

    @return dev_cos_sim (float): Mean cosine similarity between the encoded
        question and answer of each dev pair (NaN when dev_data is empty)
    """

    n_minibatches = math.ceil(len(train_data) / batch_size)
    loss_meter = AverageMeter()
    # NOTE(review): batches are drawn in dataset order (no shuffle), so the
    # in-batch negatives always come from neighbouring pairs -- confirm this
    # is intended.
    train_generator = data.DataLoader(train_data, batch_size=batch_size)
    dev_generator = data.DataLoader(dev_data, batch_size=batch_size)

    with tqdm(total=n_minibatches) as prog:
        for questions, answers in train_generator:
            optimizer.zero_grad()   # remove any baggage in the optimizer
            # Call the module rather than .forward() so registered hooks run.
            output_q = network(questions.float())
            output_a = network(answers.float())
            n_pairs = output_a.shape[0]

            # Positive term. CosineEmbeddingLoss documents the target as shape
            # (N) for (N, D) inputs, so build one label per row instead of a
            # 0-D tensor.
            loss = loss_func(output_q, output_a, output_q.new_ones(n_pairs))

            # Negative terms: one mismatched question per answer. A batch
            # holding a single pair has no other row to draw from; skip it
            # there (np.random.choice would raise on an empty candidate list).
            if n_pairs > 1:
                negative_target = -output_q.new_ones(1)
                for a_index in range(n_pairs):
                    candidates = [i for i in range(n_pairs) if i != a_index]
                    q_index = int(np.random.choice(candidates))
                    loss = loss + loss_func(output_q[q_index].unsqueeze(0),
                                            output_a[a_index].unsqueeze(0),
                                            negative_target)

            loss.backward()
            optimizer.step()

            # Record a plain float: keeping the tensor itself would also keep
            # its whole autograd graph alive for every batch.
            batch_loss = loss.item()
            loss_per_epoch.append(batch_loss)
            prog.update(1)
            loss_meter.update(batch_loss)

    print ("Average Train Loss: {}".format(loss_meter.avg))
    print("Evaluating on cosine similarity dev set",)
    cosine_sim = 0.
    with torch.no_grad():
        for questions, answers in dev_generator:
            output_q = network(questions.float())
            output_a = network(answers.float())
            cosine_sim += torch.sum(torch.nn.functional.cosine_similarity(output_q, output_a)).item()

    n_dev = len(dev_data)
    # An empty dev set has no meaningful score; report NaN instead of
    # failing with a division by zero.
    avg_cosine_sim = cosine_sim / n_dev if n_dev else float("nan")
    cos_sim_per_epoch.append(avg_cosine_sim)
    print("- dev cosine similarity: {:.2f}".format(avg_cosine_sim))
    return avg_cosine_sim

In [8]:
# Layer widths. The input width is the length of one dense question vector.
input_size = len(qna_pairs[0][0])
hidden_dim, output_dim = 256, 128  # the original inline comments read "#1000" and "#768"

print("\n".join(["=" * 80, "INITIALIZING", "=" * 80]))

# First 80% of the pairs go to training, the remainder to the dev set.
test_val_split = int(0.8 * len(qna_pairs))
train_data = QADataset(qna_pairs[:test_val_split])
dev_data = QADataset(qna_pairs[test_val_split:])

# Build the encoder and resume from the weights stored in model.weights_1,
# timing both steps together.
start = time.time()
network = SiameseNetwork(input_size, hidden_dim, output_dim)
saved_state = torch.load("./results/model.weights_1")
network.load_state_dict(saved_state)
elapsed = time.time() - start
print("took {:.2f} seconds\n".format(elapsed))

print("\n".join(["=" * 80, "TRAINING", "=" * 80]))
output_dir = "results/"
output_path = "{}model.weights".format(output_dir)

# Make sure the checkpoint directory exists before train() writes into it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

train(network, train_data, dev_data, output_path)

INITIALIZING


  0%|          | 0/1730 [00:00<?, ?it/s]

took 0.41 seconds

TRAINING
Epoch 2 out of 3


100%|██████████| 1730/1730 [10:30<00:00,  3.00it/s]


Average Train Loss: 1.3333989169556282
Evaluating on cosine similarity dev set
- dev cosine similarity: 0.02
Saving model for epoch:  2



In [10]:
# Read the pickled bag-of-words test transcripts.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
with open('embeddings/bag_of_words_test.pickle', mode='rb') as test_file:
    test_embeddings = pickle.load(test_file)

In [11]:
# Peek at the first test transcript to see how the pickled structure is laid out.
print(test_embeddings[0])

[[<1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 40 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 52 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 61 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 53 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 57 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 50 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>, <1x65245 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Com

In [28]:
# Rebuild the encoder and restore the weights stored in model.weights_2.
network = SiameseNetwork(input_size, hidden_dim, output_dim)
network.load_state_dict(torch.load("./results/model.weights_2"))


def _encode(sparse_row):
    """Densify one sparse row, run it through ``network`` and return a NumPy array.

    Must be called with gradient tracking disabled, otherwise ``.numpy()``
    fails on the output tensor.
    """
    dense = torch.tensor(sparse_row.toarray(), dtype=torch.float32)
    return network.forward(dense).numpy()


# Each output transcript mirrors the input layout: element 0 is the list of
# encoded statement rows, element 1 the list of (encoded row, second field)
# tuples built from the Q&A entries.
embeded_transcripts = []
with torch.set_grad_enabled(False):
    for i in range(len(test_embeddings)):
        statement = test_embeddings[i][0]
        qna = test_embeddings[i][1]
        embed_statement = [_encode(statement[j]) for j in range(len(statement))]
        embed_qna = [(_encode(qna[j][0]), qna[j][1]) for j in range(len(qna))]
        embeded_transcripts.append([embed_statement, embed_qna])

In [30]:
# Persist the encoded test transcripts.
# NOTE(review): the file name says "epoch3" while the embeddings were produced
# with the model.weights_2 checkpoint -- confirm which label is intended.
with open('embeddings/siameseBOW_epoch3.pickle', mode='wb') as out_file:
    pickle.dump(embeded_transcripts, out_file)