# IMPORTING AND LOADING

In [1]:
import subprocess
import torch
import torch.nn as nn
import time
from torch.autograd import Variable
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import seaborn as sns

experiment = "seperate_models"

# Only the first MAX_ROWS rows of every .npy file are used.
MAX_ROWS = 2000
# Rows are de-interleaved with this stride: slice k holds rows k, k+STRIDE, ...
# NOTE(review): presumably each run of STRIDE consecutive rows is one story and
# k is the sentence position inside it — confirm against the data producer.
STRIDE = 5


def _load_split(split):
    """Load the vectors and sentences arrays of one split ("train" or "test").

    Returns:
        (vectors, sentences), each truncated to the first MAX_ROWS rows.
    """
    vectors = np.load("data/" + experiment + "/" + split + "_vectors.npy")[:MAX_ROWS]
    sentences = np.load("data/" + experiment + "/" + split + "_sentences.npy")[:MAX_ROWS]
    return vectors, sentences


def _strided_prefixes(rows, stride=STRIDE):
    """Stack the first 2, 3, ..., `stride` strided slices of `rows`.

    Slice k is rows[k::stride]. Trailing rows that do not fill a complete group
    of `stride` are dropped so that all slices have the same length and can be
    stacked into one regular array.

    Returns:
        A list of `stride - 1` arrays; entry j stacks slices 0 .. j+1 along a
        new leading axis.
    """
    usable = (len(rows) // stride) * stride
    slices = tuple(rows[offset:usable:stride] for offset in range(stride))
    return [np.asarray(slices[:count]) for count in range(2, stride + 1)]


# The four-way unpacking below relies on STRIDE == 5 (prefixes of 2, 3, 4, 5 slices).
train_vectors, train_sentences = _load_split("train")
dataset1, dataset2, dataset3, dataset4 = _strided_prefixes(train_vectors)
sentences1, sentences2, sentences3, sentences4 = _strided_prefixes(train_sentences)

test_vectors, test_sentences = _load_split("test")
test_dataset1, test_dataset2, test_dataset3, test_dataset4 = _strided_prefixes(test_vectors)
test_sentences1, test_sentences2, test_sentences3, test_sentences4 = _strided_prefixes(test_sentences)

# Kept for compatibility: later cells still reference these names as globals.
vec = None
sen = None


# MODEL DEFINITION AND PARAMETERS

In [2]:
class BasicGRU(torch.nn.Module):
    """Bidirectional GRU that maps a padded batch of vector sequences to one vector each.

    The forward and backward GRU outputs are summed, the output at the last
    valid time step of every sequence is selected, and a linear layer is
    applied to it.

    Args:
        hidden_size: size of the input vectors, of the GRU hidden state and of
            the output vectors.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, hidden_size, n_layers=1):
        super(BasicGRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        # `torch.nn` is spelled out because a later cell of this notebook defines
        # a function called `nn`, which shadows the `torch.nn as nn` import.
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, dropout=0, bidirectional=True)
        self.lin = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a batch of sequences.

        Args:
            input_seq: batch-first padded input, (batch, max_len, hidden_size).
            input_lengths: valid length of every sequence, as accepted by
                pack_padded_sequence (sorted in decreasing order).
            hidden: optional initial hidden state passed to the GRU.

        Returns:
            Tensor of shape (batch, 1, hidden_size).
        """
        packed = torch.nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths, batch_first=True)

        outputs, hidden = self.gru(packed, hidden)

        outputs, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        # Sum the two directions, which the GRU concatenates on the last axis.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        # Pick each sequence's own last valid step. Indexing position -1 would
        # read zero padding for every sequence shorter than the longest one.
        last_step = torch.as_tensor(out_lengths, dtype=torch.long, device=outputs.device) - 1
        batch_index = torch.arange(outputs.size(0), device=outputs.device)
        last_outputs = outputs[batch_index, last_step]

        output = self.lin(last_outputs.unsqueeze(1))
        return output

In [3]:
class StoryVectors(Dataset):
    """Dataset that serves the leading slices of a stacked array as input and the last one as target.

    Args:
        dataset: array whose first axis indexes the stacked slices and whose
            second axis indexes the samples (`.shape[0]` and `.shape[1]` are read).
        sentences: indexable as `sentences[slice][sample]`, parallel to `dataset`.

    Raises:
        ValueError: if `dataset` has fewer than two slices, since at least one
            input slice and one target slice are needed.
    """

    def __init__(self, dataset, sentences):
        self.dataset = dataset
        self.type = self.dataset.shape[0]
        self.sen = sentences
        if self.type < 2:
            raise ValueError(
                "StoryVectors needs at least 2 slices along axis 0, got %d" % self.type
            )

    def __len__(self):
        """Number of samples (size of the second axis of the array)."""
        return self.dataset.shape[1]

    def __getitem__(self, idx):
        """Return `[X, len(X), y, sentences]` for sample `idx`.

        X holds the vectors of all slices but the last, y is a one-element list
        with the vector of the last slice, and sentences holds the sentence of
        every slice (inputs and target).
        """
        target_pos = self.type - 1
        X = [self.dataset[pos][idx] for pos in range(target_pos)]
        y = [self.dataset[target_pos][idx]]
        sentences = [self.sen[pos][idx] for pos in range(self.type)]

        return [X, len(X), y, sentences]
      
def vocab_collate_func(batch):
    """Collate dataset items of the form [X, length, y, sentences] into batch tensors.

    Returns:
        [FloatTensor of all X, LongTensor of all lengths, FloatTensor of all y,
        list of the per-item sentences] — in the order the items appear in `batch`.
    """
    items = list(batch)

    inputs = [item[0] for item in items]
    seq_lengths = [item[1] for item in items]
    targets = [item[2] for item in items]
    texts = [item[3] for item in items]

    return [
        torch.FloatTensor(inputs),
        torch.LongTensor(seq_lengths),
        torch.FloatTensor(targets),
        texts,
    ]

In [4]:
# Load the four saved models from disk. The map_location dict tells torch.load to
# place storages that were saved on 'cuda:0' onto the CPU instead.
# NOTE(review): torch.load unpickles the file contents, so only load files that
# come from a trusted source.
# NOTE(review): presumably each file holds a whole pickled BasicGRU module
# (test_model calls .eval() on it and invokes it directly); if so, the class
# definition must already be in scope when this cell runs — confirm.
model1 = torch.load("model/model1_2000.tar", map_location={'cuda:0': 'cpu'})
model2 = torch.load("model/model2_2000.tar", map_location={'cuda:0': 'cpu'})
model3 = torch.load("model/model3_2000.tar", map_location={'cuda:0': 'cpu'})
model4 = torch.load("model/model4_2000.tar", map_location={'cuda:0': 'cpu'})

# EXPERIMENTS

In [47]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
def distr(array):
    """Scatter-plot the values of `array` in ascending order against their position.

    The figure is written to '1.pdf' in the working directory and then shown.
    """
    positions = list(range(len(array)))
    ascending = sorted(array)
    plt.scatter(positions, ascending)
    plt.savefig('1.pdf')
    plt.show()
    
def freq(array):
    """Draw a kernel density estimate of the values in `array` on a white-grid style."""
    sns.set_style('whitegrid')
    samples = np.array(array)
    # NOTE(review): newer seaborn releases deprecate `bw` in favour of
    # `bw_method` / `bw_adjust` — confirm against the installed version.
    sns.kdeplot(samples, bw=0.5)
    

def nn(qvec, vectors, array, k=5):
    """Print the `k` entries of `array` whose vectors are most cosine-similar to `qvec`.

    Each printed line shows the tuple (array[i], score) followed by the row
    index i, from the highest score to the lowest.

    Args:
        qvec: query vector (list or 1-D array). It is not modified.
        vectors: 2-D array-like with one candidate vector per row.
        array: indexable parallel to the rows of `vectors` (e.g. sentences).
        k: how many top matches to print.

    NOTE(review): this function's name shadows the `torch.nn as nn` import made
    at the top of the notebook; code that runs after this cell and needs the
    torch module should refer to `torch.nn` explicitly.
    """
    # Work on normalized copies so that the caller's arrays are left untouched.
    query = np.asarray(qvec, dtype=float)
    query = query / np.linalg.norm(query)

    candidates = np.asarray(vectors, dtype=float)
    candidates = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)

    scores = np.dot(query, candidates.T).flatten()
    sorted_args = np.argsort(scores)[::-1]
    for row in sorted_args[:k]:
        print((array[row], scores[row]), row)

def analyse(predicted, actual, sentences, mode_vectors, mode_sentences): #mode can be train or test
    """Rank each actual vector among all candidate vectors by dot product with the prediction.

    For every (prediction, actual) pair, the prediction is scored against every
    row of `mode_vectors`. The rank is the position, in descending score order,
    of the first score that `np.isclose` considers equal to dot(prediction, actual).

    Args:
        predicted: sequence of predicted vectors.
        actual: sequence of target vectors, same length as `predicted`.
        sentences: per-sample sentences; only bounds the iteration length.
        mode_vectors: 2-D array of candidate vectors, one per row.
        mode_sentences: unused; kept for interface compatibility.

    Returns:
        List of int ranks (0 = best); -1 where no candidate score matches.

    Raises:
        AssertionError: if `predicted` and `actual` differ in length.
    """
    assert len(predicted) == len(actual), "Check dimensions"

    candidates = np.asarray(mode_vectors)

    ranks = []
    for p, a, _ in zip(predicted, actual, sentences):
        scores = np.dot(p, candidates.T).flatten()
        score_actpred = np.dot(p, a)
        descending = np.sort(scores)[::-1]
        # Vectorised lookup of the first matching position in descending order.
        hits = np.flatnonzero(np.isclose(descending, score_actpred))
        ranks.append(int(hits[0]) if hits.size else -1)

    return ranks
  
def show_inp_out(actual, predicted, mode_vectors, mode_sentences):
    """Print the nearest stored sentence for `actual` and the five nearest for `predicted`.

    Both arguments are squeezed and converted to lists before being passed to
    `nn`, which does the lookup in `mode_vectors` / `mode_sentences` and prints.
    """
    for heading, tensor, top_k in (("Actual Output", actual, 1),
                                   ("Predicted Output", predicted, 5)):
        print(heading)
        nn(tensor.squeeze().tolist(), mode_vectors, mode_sentences, k=top_k)

In [21]:
# Flatten the first test array from 3-D to 2-D by merging its first two axes,
# and flatten the matching sentence array to 1-D in the same order.
print(test_dataset1.shape)
no, sample, dim = test_dataset1.shape
d = test_dataset1.reshape(no*sample, dim)  # one vector per row
s = test_sentences1.reshape(no*sample)  # assumes test_sentences1 holds exactly no*sample entries — TODO confirm

(2, 400, 4800)


In [42]:

def get_loader(dataset, sentences):
    """Wrap `dataset` / `sentences` in a StoryVectors dataset and return a DataLoader over it.

    The loader yields shuffled batches of 32 items collated by
    `vocab_collate_func`, using 4 worker processes.
    """
    return torch.utils.data.DataLoader(
        dataset=StoryVectors(dataset, sentences),
        batch_size=32,
        collate_fn=vocab_collate_func,
        shuffle=True,
        num_workers=4,
    )
    


def test_model(model, test_loader, test_vectors, test_sentences):
    """Run `model` over every batch of `test_loader` and collect the ranks from `analyse`.

    Args:
        model: callable module taking (data, lengths); it is switched to eval mode.
        test_loader: iterable yielding (data, lengths, labels, sentences) batches.
        test_vectors: candidate vectors forwarded to `analyse`.
        test_sentences: candidate sentences forwarded to `analyse`.

    Returns:
        Flat list with the ranks of all samples, in the order the loader yields them.
    """
    model.eval()
    ranks = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, lengths, labels, sentences in test_loader:
            pred = model(data, lengths)
            # Drop only the singleton axis 1. A bare squeeze() would also drop the
            # batch axis when a batch holds a single sample, and the ranking would
            # then iterate over scalars instead of vectors.
            pred = pred.detach().squeeze(1).numpy()
            labels = labels.detach().squeeze(1).numpy()
            batch_ranks = analyse(pred.tolist(), labels.tolist(), sentences, test_vectors, test_sentences)
            ranks.extend(batch_ranks)
    return ranks

# One shuffled loader per test array (test_dataset1..4 stack 2..5 strided slices).
test_loader1 = get_loader(test_dataset1, test_sentences1)
test_loader2 = get_loader(test_dataset2, test_sentences2)
test_loader3 = get_loader(test_dataset3, test_sentences3)
test_loader4 = get_loader(test_dataset4, test_sentences4)
 
# Each model is ranked against the last slice ([-1]) of its own array, which is
# the same slice StoryVectors serves as the target `y`.
ranks1 = test_model(model1, test_loader1, test_dataset1[-1], test_sentences1[-1]) 
ranks2 = test_model(model2, test_loader2, test_dataset2[-1], test_sentences2[-1]) 
ranks3 = test_model(model3, test_loader3, test_dataset3[-1], test_sentences3[-1]) 
ranks4 = test_model(model4, test_loader4, test_dataset4[-1], test_sentences4[-1]) 



In [52]:
# Same evaluation for model1, but on the training arrays: candidates are the
# last slice of dataset1 and the loader is built by the same get_loader helper.
train_loader1 = get_loader(dataset1, sentences1)
rankst1 = test_model(model1, train_loader1, dataset1[-1], sentences1[-1]) 