# STORY CLOZE GENERATOR

In [2]:
import torch
import torch.nn as nn
import time
from torch.autograd import Variable
from torch.utils.data import Dataset
import pandas as pd
import numpy as np


# Global configuration for the notebook.
# BATCH_SIZE is the batch size handed to the DataLoader instances below.
BATCH_SIZE = 32
# NOTE(review): StoryVectors.__init__ assigns its own local `n = 10`; this
# module-level value is not read by any live code in the visible cells.
n = 10
# CSV with the story sentences and the .npy file with their precomputed vectors.
csv_file = "/Users/shrey/Text Generation/story_cloze/data/process_data.csv"
numpy_file = "vectors_"+ str("preprocessed") + ".npy"

# NOTE(review): the optimizer below is created with a hard-coded lr=1e-4 and
# training() is invoked as training(5), so neither of the next two constants
# is referenced by live code in the visible cells.
learning_rate = 3e-4
num_epochs = 10 # number of epochs to train

## MODEL

In [3]:
class BasicGRU(nn.Module):
    """Bidirectional GRU over sequences of vectors, followed by a linear layer.

    The GRU input size and hidden size are both ``hidden_size``, so every
    time step of the input must be a vector with ``hidden_size`` features.
    The forward and backward GRU outputs are summed (not concatenated), which
    keeps the per-step output width at ``hidden_size``.

    Args:
        hidden_size: number of features per input vector and per output vector.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers; forced to 0 when ``n_layers == 1``.
    """

    def __init__(self, hidden_size, n_layers=1, dropout=0.3):
        super(BasicGRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # Input and hidden sizes are both `hidden_size` because the inputs are
        # already vectors with `hidden_size` features (no embedding layer).
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

        self.lin = nn.Linear(hidden_size, hidden_size)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Run the GRU over a padded batch and project every time step.

        Args:
            input_seq: padded batch indexed as (batch, time, hidden_size)
                (``batch_first=True`` is used for packing).
            input_lengths: true length of each sequence; must be sorted in
                decreasing order, as required by ``pack_padded_sequence``.
            hidden: optional initial hidden state passed to the GRU.

        Returns:
            Tensor indexed as (batch, time, hidden_size) with the same number
            of time steps as ``input_seq``.
        """
        # Pack so the GRU skips the padded time steps.
        packed = torch.nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        # Unpack back to the full padded width of the input. Without
        # `total_length` the result is only as long as the longest sequence in
        # the batch, which breaks shape agreement with fixed-width targets.
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
            outputs, batch_first=True, total_length=input_seq.size(1))
        # Sum the forward and backward halves of the bidirectional output.
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        # NOTE: padded positions are zero before this layer, so after it they
        # hold the linear layer's bias rather than zeros.
        output = self.lin(outputs)
        return output
    
# Instantiate the network, the loss and the optimizer used by training().
# hidden_size=4800 presumably matches the width of the precomputed sentence
# vectors — TODO confirm against the .npy file.
model = BasicGRU(hidden_size = 4800)
# Summed (not averaged) squared error over every element of the prediction.
criterion = torch.nn.MSELoss(reduction='sum')
# NOTE(review): lr is hard-coded here; the `learning_rate` constant defined at
# the top of the notebook is not used.
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4)

## DATA

In [4]:
# Module-level debugging aids: every StoryVectors construction appends the
# five sentence vectors of the first story to `story_0` and, when the file
# holds at least two stories, those of the second story to `story_1`.
story_0 = []
story_1 = []


class StoryVectors(Dataset):
    """PyTorch dataset of five-sentence stories stored as sentence vectors.

    The vector file is read as a flat sequence in which every five consecutive
    rows form one story. Each item pairs the first four sentence vectors
    (input) with the last four (target), i.e. the target is the input shifted
    by one sentence.
    """

    def __init__(self, numpy_file, csv_file):
        """Load the sentence vectors and the matching sentences.

        @param numpy_file: path to a .npy file holding one vector per row,
            five consecutive rows per story
        @param csv_file: path to the CSV file with the story sentences

        Raises:
            ValueError: if the number of vectors is not a multiple of five.
        """
        # Only the sentences of the first `n` CSV rows (columns 1 onward) are kept.
        n = 10
        self.sentences = pd.read_csv(csv_file).values[:n,1:].reshape(-1).tolist()
        vecn = np.load(numpy_file)
        vec = vecn.tolist()

        if len(vec) % 5 != 0:
            raise ValueError(
                "expected 5 vectors per story, but got {} vectors".format(len(vec)))

        # Position-wise views: v1 holds every story's first sentence, and so on.
        v1, v2, v3, v4, v5 = (vec[offset::5] for offset in range(5))
        m = len(v1)

        # Record the first two stories for inspection, skipping any that is
        # absent so that small files do not raise IndexError.
        for store, idx in ((story_0, 0), (story_1, 1)):
            if idx < m:
                store.append([v1[idx], v2[idx], v3[idx], v4[idx], v5[idx]])

        self.X = []
        self.y = []
        for i in range(m):
            self.X.append([v1[i], v2[i], v3[i], v4[i]])
            self.y.append([v2[i], v3[i], v4[i], v5[i]])

    def __len__(self):
        """Return the number of stories."""
        return len(self.X)

    def __getitem__(self, idx):
        """
        Triggered when you call dataset[i].

        Returns [input vectors, number of input vectors, target vectors].
        """
        return [self.X[idx], len(self.X[idx]), self.y[idx]]
    

In [None]:
def vocab_collate_func(batch):
    """
    Collate function for DataLoader: zero-pads every input sequence to 4 time
    steps and orders the batch by decreasing sequence length.

    Each element of `batch` is indexed as (input vectors, length, target).
    Returns [inputs as FloatTensor, lengths as LongTensor, targets as FloatTensor].
    """
    padded_rows, targets, seq_lens = [], [], []

    for datum in batch:
        vectors, seq_len, target = datum[0], datum[1], datum[2]
        # Append zero rows after the real vectors; the feature axis is untouched.
        missing_steps = 4 - seq_len
        padded_rows.append(
            np.pad(np.array(vectors),
                   pad_width=((0, missing_steps), (0, 0)),
                   mode="constant", constant_values=0))
        targets.append(target)
        seq_lens.append(seq_len)

    # Longest sequences first, as pack_padded_sequence expects.
    order = np.argsort(seq_lens)[::-1]
    features = torch.FloatTensor(np.array(padded_rows)[order])
    lengths = torch.LongTensor(np.array(seq_lens)[order])
    labels = torch.FloatTensor(np.array(targets)[order])
    return [features, lengths, labels]

In [7]:
# Build the dataset from the vector/CSV files and wrap it in a shuffling
# DataLoader that uses vocab_collate_func to pad and length-sort each batch.
# Batches are prepared by 4 worker processes.
train_dataset = StoryVectors(numpy_file, csv_file)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True,
                                           num_workers=4)

In [13]:
# Pull a single batch (inputs, lengths, labels) to inspect its shape.
# The built-in next() is used because iterator objects expose __next__,
# not a .next() method, under Python 3.
d1, l1, la1 = next(iter(train_loader))
la1.shape

## TRAINING

In [None]:
def training(num_epochs):
    """Train the global `model` on the global `train_loader`.

    Uses the module-level `model`, `optimizer`, `criterion` and
    `train_loader`. After every batch the prediction shape and the batch loss
    are printed; after every epoch the elapsed wall-clock time is printed.

    Args:
        num_epochs: number of full passes over `train_loader`.
    """
    for epoch in range(num_epochs):
        t0 = time.time()
        for i, (data, lengths, labels) in enumerate(train_loader):
            # Re-entered every step so the model is in training mode even if
            # something switched it to eval mode between batches.
            model.train()
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(data, lengths)
            print(y_pred.shape)
            loss = criterion(y_pred, labels)

            # Backward and optimize
            loss.backward()
            optimizer.step()

            # `loss` is a 0-dim tensor; .item() extracts the Python float
            # (indexing it with [0] raises on current PyTorch releases).
            loss_data = loss.item()
            print(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                format(epoch, i * len(data), len(train_loader.dataset),
                       100. * i / len(train_loader), loss_data))

        print('Time taken by the epoch: {} seconds'.format(time.time() - t0))

            # validate every 100 iterations
    #         if i > 0 and i % 100 == 0:
    #             # validate
    #             val_acc = test_model(val_loader, model)
    #             print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
    #                        epoch+1, num_epochs, i+1, len(train_loader), val_acc))

# Run 5 epochs. NOTE(review): the `num_epochs` constant defined at the top of
# the notebook (10) is not what is passed here.
training(5)

## VALIDATION

In [20]:
# Load the sentence table and replace the global `model` with a checkpoint
# read from disk; tensors saved on cuda:0 are remapped to the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source. The result is later called like a module, so the file
# presumably stores a whole model rather than a state dict — TODO confirm.
df = pd.read_csv(csv_file)
model = torch.load("model/GRU_Ln_np_100.tar", map_location={'cuda:0': 'cpu'})

# `array`: every cell from column 3 onward, flattened row by row into one list.
# `vecn`: the sentence-vector matrix; `vec` is the same data as nested lists.
# presumably array[i] is the sentence whose vector is vecn[i] — verify.
array = df.values[:,3:].reshape(-1).tolist()
vecn = np.load("vectors_"+ str("preprocessed") + ".npy")
vec = vecn.tolist()
print(len(vec))


1995


In [None]:
# Rebuild the dataset and loader from the same files used for training; the
# evaluation below therefore runs on the training data.
train_dataset = StoryVectors(numpy_file, csv_file)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True,
                                           num_workers=4)

def test_model(test_loader):
    """Run the global `model` over `test_loader` and collect retrieval ranks.

    For every batch the predictions are passed to `analyse` together with the
    first input vector of each story, the targets, and the global `vecn` /
    `array` lookup tables.

    Args:
        test_loader: iterable yielding (data, lengths, labels) batches.

    Returns:
        Four lists (ranks1, ranks2, ranks3, ranks4), one per predicted
        position, as produced by `analyse` and concatenated over all batches.
    """
    model.eval()
    ranks1, ranks2, ranks3, ranks4 = [], [], [], []
    collected = (ranks1, ranks2, ranks3, ranks4)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Iterate the loader that was passed in, not the global train_loader.
        for data, lengths, labels in test_loader:
            pred = model(data, lengths)
            s1s = data[:, 0, :].tolist()
            batch_ranks = analyse(s1s, pred.tolist(), vecn, labels.tolist(), array)
            for bucket, new_ranks in zip(collected, batch_ranks):
                bucket.extend(new_ranks)

    return ranks1, ranks2, ranks3, ranks4


# Evaluate on the loader built above (the training data) and keep the four
# per-position rank lists.
ranks1, ranks2, ranks3, ranks4 = test_model(train_loader)
    

## Experiments

In [30]:
#%matplotlib inline
# from matplotlib import interactive
# interactive(True)
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
def distr(array):
    """Scatter-plot the values of `array` in ascending order against their
    position, save the figure as '1.pdf', then display it."""
    positions = list(range(len(array)))
    ascending = sorted(array)
    plt.scatter(positions, ascending)
    plt.savefig('1.pdf')
    plt.show()
    
def freq(array):
    """Show a histogram of `array` with unit-width bins whose edges run from
    min(array) up to max(array)."""
    lowest = min(array)
    highest = max(array)
    bin_edges = np.arange(lowest, highest + 1)
    plt.hist(array, bins=bin_edges)
    plt.show()
    

def nn(qvec, vectors, array, k=5):
    """Print the `k` entries of `array` most cosine-similar to `qvec`.

    NOTE(review): this definition rebinds the module-level name `nn`, which
    the import section at the top of the file binds to `torch.nn`; code run
    afterwards that expects `nn.Module` will find this function instead.

    Args:
        qvec: query vector (list or 1-D array). It is not modified.
        vectors: 2-D array with one candidate vector per row.
        array: sequence in which array[i] is the item described by vectors[i].
        k: number of nearest neighbours to print.

    Each printed line shows the (item, similarity) pair followed by the row
    index of the match, best match first. Nothing is returned.
    """
    # Normalise into new arrays so the caller's data is never changed in place.
    query = np.asarray(qvec, dtype=float)
    query = query / np.linalg.norm(query)
    matrix = np.asarray(vectors, dtype=float)
    matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)

    # With unit-length vectors the dot product is the cosine similarity.
    scores = np.dot(query, matrix.T).flatten()
    sorted_args = np.argsort(scores)[::-1]
    sentences = [(array[a], scores[a]) for a in sorted_args[:k]]
    for s, index in zip(sentences, sorted_args):
        print(s, index)

def analyse(s1s, predicted, vectors, actual, array):
    """Rank each predicted vector's true target among all candidate vectors.

    For every story in the batch, the nearest stored sentence of each story
    vector is printed via `nn`. Then, for every predicted position, the
    similarity between prediction and target is looked up in the descending
    list of similarities between the prediction and every row of `vectors`;
    the first position whose score is close to it (np.isclose) is the rank,
    or -1 when none is. For the third position `show_inp_out` is also called.

    Args:
        s1s: first input vector of each story in the batch.
        predicted: per story, the sequence of predicted vectors.
        vectors: 2-D array of candidate vectors, one per row.
        actual: per story, the sequence of target vectors.
        array: items passed through to `nn` for printing.

    Returns:
        Tuple of four lists with the ranks for positions 0, 1, 2 and 3.
    """
    assert len(predicted) == len(actual), "Oh"

    vectors = np.asarray([row / np.linalg.norm(row) for row in vectors.tolist()])
    ranks_by_position = ([], [], [], [])

    for pred_seq, act_seq, first_vec in zip(predicted, actual, s1s):  # one story per iteration
        story = [first_vec] + act_seq
        print(len(story))
        for sentence_vec in story:
            nn(sentence_vec, vectors, array, k=1)

        for position, (p, a) in enumerate(zip(pred_seq, act_seq)):  # one time step per iteration
            p /= np.linalg.norm(p)
            a /= np.linalg.norm(a)
            scores = np.dot(p, vectors.T).flatten()
            score_actpred = np.dot(p, a)

            descending = sorted(scores, reverse=True)
            rank = next(
                (index for index, score in enumerate(descending)
                 if np.isclose(score, score_actpred)),
                -1)

            # Positions beyond the fourth are computed but not recorded.
            if position < len(ranks_by_position):
                ranks_by_position[position].append(rank)
            if position == 2:
                show_inp_out(a, p)

    return ranks_by_position
  
def show_inp_out(actual, predicted):
    """Print the nearest stored sentence for the actual vector (top 1) and
    the nearest stored sentences for the predicted vector (top 5).

    Both arguments must provide `.squeeze().tolist()`. Lookups go through
    `nn` using the global `vecn` matrix and `array` list.
    """
    reports = (
        ("Actual Output", actual, 1),
        ("Predicted Output", predicted, 5),
    )
    for heading, vector, top_k in reports:
        print(heading)
        nn(vector.squeeze().tolist(), vecn, array, k=top_k)
        


In [101]:
# Scratch cell: take the fourth input vector of batch rows p..q-1 and measure
# the Euclidean distance between the actual and predicted vectors.
# NOTE(review): `p`, `q`, `act` and `pred` are only assigned in the
# commented-out "Old Code" section, so this cell depends on state left over
# from an earlier interactive session.
inp = d1.numpy()[p:q,3,:].squeeze().tolist()
np.linalg.norm(act-pred) 

0.8380942

In [226]:
# import subprocess
# #subprocess.run(["python", "import sys; print sys.version_info[0]"], stdout=subprocess.PIPE)
# subprocess.check_call(["python", ""])

import subprocess

#p = subprocess.check_output(["python", "/Users/shrey/test.py"])
#p = subprocess.check_output(["echo" "hi"])
#p = subprocess.run(["echo $(python /Users/shrey/test.py)"], shell=True, stdout=subprocess.PIPE)
# Run an external script with /usr/bin/python (argument list, no shell),
# capturing stdout and stderr, then print the captured stdout as raw bytes.
p1 = subprocess.run(['/usr/bin/python', '/Users/shrey/test.py'], stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print (p1.stdout)


b'hello\n'


## ROUGH

In [None]:
############## Pack Padded and Pad packed #########################
# Pad all your sequences with 0 so that they all have the same length,
# but record the actual (unpadded) length of each sequence in an integer `lengths` vector.
# Pass the function pack_padded_sequence a tensor of Batch_Size * Longest Sequence length * num_directions (other dimensions),
# together with the integer `lengths` tensor, where len(lengths) = Batch_Size.


################ Rough Work #####################################

# pack = torch.nn.utils.rnn.pack_padded_sequence(X_t, lengths , batch_first=True)
# unpack, s = torch.nn.utils.rnn.pad_packed_sequence(pack, batch_first=True)

# a = [torch.tensor([3,4,5,6]), torch.tensor([1,2,3]), torch.tensor([3,4])]
# print(a)
# b = torch.nn.utils.rnn.pad_sequence(a, batch_first=True)
# print(b)
# c = torch.nn.utils.rnn.pack_padded_sequence(b, batch_first=True, lengths=[4,3,2])
# print(c)
# d, _ = torch.nn.utils.rnn.pad_packed_sequence(c)
# print(d)

# batch_size = 3
# max_length = 3
# hidden_size = 2
# n_layers =1

# # container
# batch_in = torch.zeros((batch_size, 1, max_length))

# #data
# vec_1 = torch.FloatTensor([[1, 2, 3]])
# vec_2 = torch.FloatTensor([[1, 2, 0]])
# vec_3 = torch.FloatTensor([[1, 0, 0]])

# batch_in[0] = vec_1
# batch_in[1] = vec_2
# batch_in[2] = vec_3

# batch_in = Variable(batch_in)

# seq_lengths = [3,2,1] # list of integers holding information about the batch size at each sequence step

# # pack it
# # pack = torch.nn.utils.rnn.pack_padded_sequence(batch_in, seq_lengths, batch_first=True)
# # unpack, _ = torch.nn.utils.rnn.pad_packed_sequence(pack)



# steps = []
# batch_sizes = []
# X_t = X_t.transpose(0, 1)

# # # lengths is a Tensor, so we must convert to [int] before reversed()
# # lengths_iter = reversed(lengths.tolist())

# # batch_size = X_t.size(1)

# # if len(lengths) != batch_size:
# #     raise ValueError("Expected `len(lengths)` to be equal to batch_size, but got "
# #                      "{} (batch_size={}).".format(len(lengths), batch_size))

# # prev_l = 0
# # for i, l in enumerate(lengths_iter):
# #     if l > prev_l:
# #         c_batch_size = batch_size - i
# #         print(X_t[prev_l:l, :c_batch_size])
# #         steps.append(X_t[prev_l:l, :c_batch_size].contiguous().view(-1, *X_t.size()[2:]))
# #         batch_sizes.extend([c_batch_size] * (l - prev_l))
# #         prev_l = l

# #     elif prev_l > l:
# #         raise ValueError("'lengths' array has to be sorted in decreasing order")

# padded_vec = np.pad(np.array([v1[0]]),
#                                 pad_width=((3,0), (0,0)),
#                                 mode="constant", constant_values=0)
# print(padded_vec[1,:])
# len(v1[0])

In [379]:
# import pandas as pd
# df = pd.read_csv("/Users/shrey/Text Generation/story_cloze/data/stories.csv")

# import numpy as np
# n = 10 
# array = df.values[:n,1:].reshape(-1).tolist()
# vecn = np.load("vectors_"+ str(n) + ".npy")
# vec = vecn.tolist()
# len(vec)

# def nn(qvec, vectors, array, k=5):
#     #qvec /= norm(qvec)
#     scores = np.dot(qvec, vectors.T).flatten()
#     sorted_args = np.argsort(scores)[::-1]
#     sentences = [array[a] for a in sorted_args[:k]]
#     for i, s in enumerate(sentences):
#         print (s, sorted_args[i])
        
# vt = vec[::6]
# v1 = vec[1::6]
# v2 = vec[2::6]
# v3 = vec[3::6]
# v4 = vec[4::6]
# v5 = vec[5::6]

# v0 = np.zeros(4800).tolist()
# X = []
# y = []
# lengths = []
# m = len(v1)
# for i in range(m):
#     X.append([v1[i], v2[i], v3[i], v4[i]])
#     y.append(v5[i])
#     lengths.append(4)
#     X.append([v0, v1[i], v2[i], v3[i]])
#     y.append(v4[i])
#     lengths.append(3)
#     X.append([v0, v0, v1[i], v2[i]])
#     y.append(v3[i])
#     lengths.append(2)
#     X.append([v0 , v0 , v0 , v1[i]])
#     y.append(v2[i])
#     lengths.append(1)

    
# data = [ (k, m, l)  for k, m, l in sorted(zip(X,y,lengths), key=lambda pair: pair[2], reverse=True)]
# X = list(zip(*data))[0]
# y = list(zip(*data))[1]
# lengths = list(zip(*data))[2]
    
# X = np.asarray(X) #.reshape(5, 8, 4, 4800) # X.shape is (samples, timesteps, features)
# y = np.asarray(y) #.reshape(5, 8, 4800)
# lengths = np.asarray(lengths) #.reshape(5, 8)

# from torch.autograd import Variable
# X_t =torch.FloatTensor(X)
# y_t =torch.FloatTensor(y)
# lengths = torch.IntTensor(lengths)
# print(X_t.requires_grad)


########################## PyTorch Basics ###################
# import torch
# x = torch.ones(2, 2, requires_grad=True)
# print(x)

# y = x * 2
# z = y + 5
# out = z * z / 2
# out = out.mean()

# out.grad


############ Loading Validation ##############################

# val_dataset = VocabDataset(val_data, char2id)
# val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True)

# test_dataset = VocabDataset(test_data, char2id)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=False)


# array = df.values[:,3:].reshape(-1).tolist()
# vecn = np.load("vectors_"+ str("preprocessed") + ".npy")
# vec = vecn.tolist()
# print(len(vec))

# def nn(qvec, vectors, array, k=5):
#     qvec /= np.linalg.norm(qvec)
#     vectors = np.asarray([ i / np.linalg.norm(i) for i in vectors.tolist()])
#     scores = np.dot(qvec, vectors.T).flatten()
#     #distr(scores)
#     #analyse(scores)
#     sorted_args = np.argsort(scores)[::-1]
#     sentences = [(array[a], scores[a]) for a in sorted_args[:k]]
#     for i, s in enumerate(sentences):
#         print (s, sorted_args[i])
        
# #len(story_0[0][0])        
# # story_0s = [nn(i, vecn, array, k=1) for i in story_0[0]]
# # story_1s = [nn(i, vecn, array, k=1) for i in story_1[0]]
# # print(story_1s)


######################## Old Code ################################################

# train_dataset = StoryVectors(numpy_file, csv_file)
# train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
#                                            batch_size=BATCH_SIZE,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True,
#                                            num_workers=4)

# d1, l1, la1 = iter(train_loader).next()

# d1[1,3,:].shape
# la1[1,:].shape

# p = 1
# q = 2
# pred = model(d1[p:q,:,:], torch.tensor([1]))
# pred = pred.squeeze().detach().numpy()

# # nn(vec[7], vecn, array)
# # array[7]

# # nn(vec[7], vecn, array)
# # array[7]
# print("Input Sentence")
# nn(d1.numpy()[p:q,3,:].squeeze().tolist(), vecn, array, k=1)


# print("Actual Output")
# nn(la1.numpy()[p:q,:].squeeze().tolist(), vecn, array, k=1)

# print("Predicted Output")
# nn(pred.tolist(), vecn, array, k=20)

# act = la1.numpy()[p:q,:].squeeze().T
# #analyse(pred, vecn, act)
# # norm_pred  = pred / np.linalg.norm(pred)

# # norm_act =  act / np.linalg.norm(act)
# np.dot(pred, act)

# import torch
# import torch.nn as nn
# m = nn.Linear(20, 30)
# inp = torch.randn(128, 20)
# output = m(inp)
# print(output.size())
# print(inp.size())


# # pred_ = np.load('pred.npy')

# # np.dot(pred, pred_.T)
# #print(pred_.shape)
# nn(pred_, dataset2[2,:,:], sentences2[2,:], k=5)
# pred.shape

### Testing rough

In [8]:
# Load the vector matrices and the matching sentence arrays of one experiment.
# NOTE(review): `experiment` is not assigned in any visible cell; it must
# already exist in the session.
test_vectors = np.load("data/"+ experiment + "/test_vectors.npy")
test_sentences = np.load("data/"+ experiment + "/test_sentences.npy")
train_vectors = np.load("data/"+ experiment + "/train_vectors.npy")
train_sentences = np.load("data/"+ experiment + "/train_sentences.npy")

# Concatenate both arrays in the SAME order (train first, then test) so that
# row i of `vectors` still corresponds to entry i of `sentences`.
vectors = np.concatenate((train_vectors, test_vectors), axis=0)
sentences = np.concatenate((train_sentences, test_sentences), axis=0)

def pad(vector, length, max_len=4):
    """Zero-pad a 2-D array along its first axis up to `max_len` rows.

    Args:
        vector: 2-D array with one row per time step.
        length: number of rows `vector` currently has.
        max_len: number of rows of the result (defaults to 4, the story
            context length used elsewhere in this file).

    Returns:
        A new array with `max_len - length` zero rows appended after the
        existing rows; the second axis is left unchanged.

    Raises:
        ValueError: if `length` is greater than `max_len`.
    """
    if length > max_len:
        raise ValueError(
            "length {} exceeds max_len {}".format(length, max_len))
    padded_vec = np.pad(vector,
                        pad_width=((0, max_len - length), (0, 0)),
                        mode="constant", constant_values=0)
    return padded_vec

def nn(qvec, vectors, array, k=5):
    """Print the `k` entries of `array` whose rows in `vectors` have the
    largest dot product with `qvec`, best match first.

    No normalisation is applied here, so the scores are plain dot products.
    Progress messages are printed before scoring and before sorting. Each
    result line shows the (item, score) pair followed by its row index.
    """
    print("computing scores")
    similarity = np.dot(qvec, vectors.T).flatten()
    print("sorting scores")
    ranking = np.argsort(similarity)[::-1]
    top_hits = [(array[row], similarity[row]) for row in ranking[:k]]
    for hit, row in zip(top_hits, ranking):
        print (hit, row)
        
        
def suggestions(vector, length, vectors, sentences, k=5):
    """Predict the next sentence vector for a story prefix and print the `k`
    closest stored sentences.

    Args:
        vector: 2-D array holding the `length` sentence vectors of the prefix.
        length: number of rows in `vector`.
        vectors: candidate sentence vectors, one per row.
        sentences: sentences[i] is the text belonging to vectors[i].
        k: number of suggestions to print.

    Uses the global `model` and the `pad` / `nn` helpers. Nothing is returned.
    """
    pv1 = pad(vector, length)
    print("predicting vector")
    # NOTE(review): the model is always told the sequence has 4 steps, so for
    # shorter prefixes the zero padding is fed to the GRU as real input —
    # confirm this matches how the loaded checkpoint was trained.
    pred = model(torch.FloatTensor([pv1]), torch.LongTensor([4]))
    pred = pred.detach().numpy().squeeze(axis=0)
    # Keep only the output of the last time step.
    pred = pred[-1]
    print("searching sentence")
    # Honour the caller's `k` instead of a hard-coded 5.
    nn(pred, vectors, sentences, k=k)
 

# Story prefixes of 1 to 4 sentences taken from the start of train_vectors.
v1 = np.asarray([train_vectors[0]])
v2 = np.asarray(train_vectors[0:2])
v3 = np.asarray(train_vectors[0:3])
v4 = np.asarray(train_vectors[0:4])

# Lengths of the prefixes above. NOTE(review): this rebinds `l1`, which an
# earlier cell used for a batch of sequence lengths.
l1, l2, l3, l4 = 1, 2, 3, 4

# Suggest a continuation for the 4-sentence prefix, searching only the
# training sentences, then print the first five training sentences for
# comparison.
suggestions(v4, l4, train_vectors, train_sentences, k=5)
print(train_sentences[0], train_sentences[1], train_sentences[2], train_sentences[3], train_sentences[4])
# suggestions(v2, l2, vectors, sentences, k=5)
# suggestions(v3, l3, vectors, sentences, k=5)
# suggestions(v4, l4, vectors, sentences, k=5)

In [None]:
# test_vectors = np.load("data/"+ experiment + "/test_vectors.npy")[:5]
# test_sentences = np.load("data/"+ experiment + "/test_sentences.npy")[:5]

# test_vectors, temp = test_vectors[:1], np.expand_dims(test_vectors[4], axis=0)


# test_vectors = np.pad(test_vectors, pad_width=((0, 3), (0,0)), mode="constant", constant_values=0)

# print(test_vectors.shape, temp.shape)

# test_vectors = np.concatenate((test_vectors, temp), axis=0)

# print(test_vectors.shape)



#nn(pred, d, s, k=5)

# test_vectors = np.load("data/"+ experiment + "/test_vectors.npy")
# test_sentences = np.load("data/"+ experiment + "/test_sentences.npy")

# for data, lengths, labels, sentences in test_loader:
#     pred = model(data, lengths)
#     pred = pred.detach().numpy().squeeze(axis=0)
#     pred = pred[-1]
#     print("searching sentence")
#     nn(pred, test_vectors, test_sentences, k=5)
    
# test_sentences[:5]  

# test_dataset = StoryVectors(dataset4[:, 6:7, :], sentences4[:, 6:7])
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
#                                            batch_size=1,
#                                            collate_fn=vocab_collate_func,
#                                            shuffle=True,
#                                            num_workers=4)

# model4.eval()

# for data, lengths, labels, sentences in test_loader:
#         pred = model4(data, lengths)

        
# pred = pred.detach().numpy().squeeze()
# print(pred.shape)
# print(sentences)
# no, sample, dim = dataset4.shape
# d = dataset4.reshape(no*sample, dim)
# s = sentences4.reshape(no*sample)
# print(d.shape)
# nn(pred, d, s, k=5)
