Example PyTorch code for what the paper's model might look like.
A quick tutorial on PyTorch can be found at https://cs230-stanford.github.io/pytorch-getting-started.html
There's also the PyTorch documentation intro at https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html,
which goes into a few more details.


In [192]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader


torch.manual_seed(1)

import numpy as np



In [193]:
'''
Modified LSTM model
This contains both the embedding layers, Pytorch's default LSTM layer,
and 2 linear layers
'''
class PathLSTM(nn.Module):
    """LSTM classifier over paths of (entity, type, relation) id triples.

    Every step of a path is turned into three embeddings (entity, type and
    relation) which are concatenated and fed to an LSTM. The final hidden
    state of each path goes through two linear layers and a log-softmax,
    giving log-probabilities over the tags.
    """

    def __init__(self, e_emb_dim, t_emb_dim, r_emb_dim, hidden_dim, vocab_size, tagset_size):
        """Build the embedding tables, the LSTM and the two linear layers.

        Args:
            e_emb_dim: size of each entity embedding.
            t_emb_dim: size of each type embedding.
            r_emb_dim: size of each relation embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in each of the three embedding tables.
            tagset_size: number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # All three tables share vocab_size rows; they differ only in width.
        self.entity_embeddings = nn.Embedding(vocab_size, e_emb_dim)
        self.type_embeddings = nn.Embedding(vocab_size, t_emb_dim)
        self.rel_embeddings = nn.Embedding(vocab_size, r_emb_dim)

        # The LSTM consumes the concatenated triple embedding of each step
        # and produces hidden states of size hidden_dim.
        lstm_input_dim = e_emb_dim + t_emb_dim + r_emb_dim
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim)

        # Two linear layers map the final hidden state to tag scores.
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, tagset_size)

    def forward(self, paths, path_lengths):
        """Return log-probabilities over the tags for each path in the batch.

        Args:
            paths: long tensor indexed as [batch, step, component], where the
                last axis holds the entity, type and relation id (in that
                order) of each step.
            path_lengths: unpadded length of each path; packing is done with
                PyTorch's defaults, so the batch is expected to be ordered by
                decreasing length.

        Returns:
            Tensor with one row of tag log-probabilities per path.
        """
        # Look up each component of the triple. Embeddings of padding steps
        # are fetched too, but packing below keeps them out of the LSTM.
        entity_vecs = self.entity_embeddings(paths[:, :, 0])
        type_vecs = self.type_embeddings(paths[:, :, 1])
        rel_vecs = self.rel_embeddings(paths[:, :, 2])

        # Join the three embeddings of a step along the feature axis.
        step_vecs = torch.cat((entity_vecs, type_vecs, rel_vecs), 2)

        # The LSTM expects (step, batch, feature), so swap the first two axes.
        steps_first = step_vecs.permute(1, 0, 2)

        # Pack so that no computation is spent on the padded steps.
        packed_steps = nn.utils.rnn.pack_padded_sequence(steps_first, path_lengths)

        # final_hidden is the hidden state at the last real (unpadded) step of
        # each path, which is all we need. To inspect the per-step outputs,
        # unpack the first return value with nn.utils.rnn.pad_packed_sequence.
        _, (final_hidden, _) = self.lstm(packed_steps)

        # Pass the hidden state of the last LSTM layer through the linear layers.
        hidden_act = F.relu(self.linear1(final_hidden[-1]))
        tag_scores = self.linear2(hidden_act)

        # The paper uses relu as the final activation, but NLLLoss needs
        # log-probabilities, hence the log-softmax here.
        return F.log_softmax(tag_scores, dim=1)
      

In [194]:
# Hand-built example vocabularies; later these maps should be generated
# automatically from the real vocab. Each name maps to its position in the list.
e_to_ix = {
    name: ix
    for ix, name in enumerate(
        ['Sam', 'Weijia', 'Rosa', 'Joey', 'Song1', 'Song2', 'Song3', 'Pop']
    )
}
t_to_ix = {name: ix for ix, name in enumerate(['u', 's', 't'])}
r_to_ix = {
    name: ix
    for ix, name in enumerate(
        ['rate', 'category', 'belong', '_rate', '_category', '_belong', 'UNK']
    )
}

# Each example is (path, tag), where a path is a list of
# [entity, type, relation] steps. The paths could instead be stored transposed,
# as [[entity1, entity2, ...], [type1, type2, ...], [rel1, rel2, ...]],
# since the model splits them by component anyway.
training_data = [
    (
        [
            ['Sam', 'u', 'rate'],
            ['Song1', 's', 'category'],
            ['Pop', 't', '_belong'],
            ['Song2', 's', 'UNK'],
        ],
        1,
    ),
    (
        [
            ['Sam', 'u', 'rate'],
            ['Song2', 's', 'UNK'],
        ],
        1,
    ),
    (
        [
            ['Sam', 'u', 'rate'],
            ['Song1', 's', '_rate'],
            ['Joey', 'u', 'rate'],
            ['Song3', 's', 'UNK'],
        ],
        0,
    ),
]

In [195]:
def find_max_length(data):
    """Return the length of the longest path in data, or 0 if data is empty.

    data is an iterable of (path, tag) pairs; only the path is inspected.
    """
    return max((len(path) for path, _ in data), default=0)

def prepare_path(seq, e_to_ix, t_to_ix, r_to_ix, max_len, pad_num):
    """Convert a path of [entity, type, relation] steps into a padded id tensor.

    Ids are shifted so the three vocabularies do not collide: relation ids
    start at 0, type ids come after all relation ids, and entity ids come
    after all type and relation ids. Paths shorter than max_len are padded
    with rows of pad_num; longer paths are left as they are.

    Returns a long tensor with one [entity_id, type_id, relation_id] row per step.
    """
    type_offset = len(r_to_ix)
    entity_offset = len(t_to_ix) + len(r_to_ix)

    rows = [
        [entity_offset + e_to_ix[step[0]], type_offset + t_to_ix[step[1]], r_to_ix[step[2]]]
        for step in seq
    ]

    # Fill up to max_len with padding rows (nothing is added if already long enough).
    rows.extend([pad_num, pad_num, pad_num] for _ in range(max_len - len(rows)))

    return torch.tensor(rows, dtype=torch.long)

# Padding id for paths before they are packed. It is one past the largest
# shifted id, so it never appears in a real path.
PATH_PADDING = len(e_to_ix) + len(t_to_ix) + len(r_to_ix)
max_len = find_max_length(training_data)

# formatted_data holds (padded_path_tensor, tag, original_path_length) tuples.
formatted_data = []
for path, tag in training_data:
    padded_path = prepare_path(path, e_to_ix, t_to_ix, r_to_ix, max_len, PATH_PADDING)
    formatted_data.append((padded_path, tag, len(path)))
print(formatted_data)

[(tensor([[10,  7,  0],
        [14,  8,  1],
        [17,  9,  5],
        [15,  8,  6]]), 1, 4), (tensor([[10,  7,  0],
        [15,  8,  6],
        [18, 18, 18],
        [18, 18, 18]]), 1, 2), (tensor([[10,  7,  0],
        [14,  8,  3],
        [13,  7,  0],
        [16,  8,  6]]), 0, 4)]


In [196]:
def sort_batch(batch, targets, lengths):
    """Reorder a batch so that its paths are in decreasing order of length.

    The same permutation is applied to the paths and to their targets.

    Returns:
        (sorted_paths, sorted_targets, sorted_lengths)
    """
    sorted_lengths, order = torch.sort(lengths, dim=0, descending=True)
    sorted_paths = batch.index_select(0, order)
    sorted_targets = targets.index_select(0, order)
    return sorted_paths, sorted_targets, sorted_lengths

In [198]:
# Toy hyperparameters for the tiny example dataset; the values reported in
# the paper are noted next to each one.
E_EMBEDDING_DIM = 3 # 64 in paper
T_EMBEDDING_DIM = 3 # 32 in paper
R_EMBEDDING_DIM = 3 # 32 in paper
HIDDEN_DIM = 6 # possibly the paper's "unit number" of 256 -- TODO confirm
TARGET_SIZE = 2

# Entity, type and relation ids share a single id space, plus 1 id for padding.
vocab_size = len(e_to_ix) + len(t_to_ix) + len(r_to_ix) + 1 # plus 1 for padding
model = PathLSTM(E_EMBEDDING_DIM, T_EMBEDDING_DIM, R_EMBEDDING_DIM, HIDDEN_DIM, vocab_size, TARGET_SIZE)
# Negative log likelihood loss; it expects log-probabilities, which is what
# PathLSTM.forward returns through log_softmax.
loss_function = nn.NLLLoss() # negative log likelihood loss
# Alternative: nn.CrossEntropyLoss() applies the log-softmax itself, so it
# would be the choice if the model returned unnormalized scores (e.g. the
# relu output used in the paper) instead of log-probabilities.


# The paper tunes l2 regularization over {1e-5, 1e-4, 1e-3, 1e-2}; this is
# assumed to correspond to the optimizer's weight_decay -- TODO confirm.
# The paper finds the learning rate from {0.001, 0.002, 0.01, 0.02} by grid search.
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=.001)

# DataLoader groups the formatted (path, tag, length) tuples into batches.
# With batch_size=3 and three examples, every epoch is a single batch.
train_loader = DataLoader(dataset=formatted_data, batch_size=3, shuffle=False)

for epoch in range(300):  # tiny data so 300 epochs
    for path_batch, targets, lengths in train_loader:

        # Sort by path length, longest first, so that the paths can be packed
        # inside the model.
        s_path_batch, s_targets, s_lengths = sort_batch(path_batch, targets, lengths)

        # PyTorch accumulates gradients, so clear them before each batch.
        model.zero_grad()

        # Run the forward pass.
        tag_scores = model(s_path_batch, s_lengths)

        # Compute the loss against the targets (sorted the same way as the
        # paths), backpropagate, and update the parameters with .step().
        loss = loss_function(tag_scores, s_targets)
        loss.backward()
        optimizer.step()

        # Print the batch loss every 30th epoch.
        if epoch % 30 == 0:
            print("loss is:", loss.item())
        

loss is: 0.6723564267158508
loss is: 0.05917897820472717
loss is: 0.002011139178648591
loss is: 0.001670647761784494
loss is: 0.0018656658940017223
loss is: 0.0018535585841163993
loss is: 0.001736925565637648
loss is: 0.0015992639819160104
loss is: 0.001500293263234198
loss is: 0.0014211212983354926


In [199]:
# See what the scores are after training, on the training dataset.
# Gradients are not needed for inference, so autograd tracking is disabled.
with torch.no_grad():
    test_loader = DataLoader(dataset=formatted_data, batch_size=3, shuffle=False)
    # Iterate the loader built for this cell and use the targets it yields,
    # so the cell does not depend on variables left over from training.
    for path_batch, target_batch, lengths in test_loader:
        # The model packs its input, so paths must be sorted longest first.
        # The printed rows therefore follow the length-sorted order, which
        # may differ from the order of formatted_data.
        s_path_batch, s_targets, s_lengths = sort_batch(path_batch, target_batch, lengths)
        tag_scores = model(s_path_batch, s_lengths)
        print(tag_scores)

tensor([[-7.1787e+00, -7.6301e-04],
        [-2.0246e-03, -6.2034e+00],
        [-6.6478e+00, -1.2978e-03]])
