## Exercise 7-1

## StructuredE

In this exercise we will study StructuredE, a latent distance model. StructuredE models structured knowledge-base (KB) data by learning latent representations of entities and linear operators for relations. The scoring function is trained to give existing triples a higher score than non-existing ones.

The StructuredE model scores a triple $\langle a_i, a_j, r_k \rangle$ from the given knowledge graph (KG) as:

$f_{ijk}^{SE} = - ||W_k^sa_i - W_k^oa_j||$

where, for the specific $k$-th relation, the matrices $W_k^s$ and $W_k^o$ transform the global latent features of the entities.

To evaluate the method we will use the WordNet dataset. In WordNet, entities correspond to word senses and relationships define lexical relations between them.

In [1]:
import torch.nn as nn
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from utils import get_minibatches, sample_negatives, accuracy, auc
from time import time

In [20]:
class StructuredE(nn.Module):
    """
    StructuredE embedding model
    ----------------------
    Bordes, Antoine, et al.
    "Learning Structured Embeddings of Knowledge Bases." AAAI. 2011.

    Every entity has a k-dimensional embedding and every relation has two
    k-dimensional vectors, `W_s` and `W_o`. They are applied to the head and
    tail embeddings by element-wise multiplication, i.e. they act as diagonal
    relation operators. `forward` returns the distance between the two
    transformed embeddings, so a SMALLER value means a more plausible triple.
    """

    def __init__(self, n_e, n_r, k, margin, distance='l2', gpu=False):
        """
        StructuredE embedding model
        ----------------------

        Params:
        -------
            n_e: int
                Number of entities in dataset.

            n_r: int
                Number of relationships in dataset.

            k: int
                Embedding size.

            margin: float
                Margin size for StructuredE's hinge loss.

            distance: {'l1', 'l2'}
                Distance measure to be used in the loss. Any value other
                than 'l1' falls back to the l2 distance.

            gpu: bool, default: False
                Whether to use GPU or not.
        """
        super(StructuredE, self).__init__()

        # Hyperparams
        self.n_e = n_e
        self.n_r = n_r
        self.k = k
        self.gamma = margin
        self.distance = distance
        self.gpu = gpu

        # Nets: one vector per entity, two vectors (head side / tail side)
        # per relation.
        self.emb_E = nn.Embedding(self.n_e, self.k)
        self.W_s = nn.Embedding(self.n_r, self.k)
        self.W_o = nn.Embedding(self.n_r, self.k)

        # Initialization: uniform in [-6/sqrt(k), 6/sqrt(k)]
        r = 6/np.sqrt(self.k)
        self.emb_E.weight.data.uniform_(-r, r)
        self.W_s.weight.data.uniform_(-r, r)
        self.W_o.weight.data.uniform_(-r, r)

        # Copy all params to GPU if specified
        if self.gpu:
            self.cuda()

    def forward(self, X):
        """
        Compute the distance of every triple in X.

        Params:
        -------
        X: np.array of integer indices, one triple per row
            Columns are (head entity, relationship, tail entity).

        Returns:
        --------
        f: tensor of size Mx1
            l1 or l2 distance between the transformed head and tail
            embeddings; lower means more plausible.
        """
        X = Variable(torch.from_numpy(X)).long()
        X = X.cuda() if self.gpu else X

        # Decompose X into head, relationship, tail
        hs, rel, ts = X[:, 0], X[:, 1], X[:, 2]

        e_hs = self.emb_E(hs)
        e_ts = self.emb_E(ts)
        w_s = self.W_s(rel)
        w_o = self.W_o(rel)

        diff = w_s*e_hs - w_o*e_ts

        # keepdim=True in both branches so the output is Mx1 whatever the
        # distance measure is.
        if self.distance == 'l1':
            f = torch.sum(torch.abs(diff), 1, keepdim=True)
        else:
            f = torch.sqrt(torch.sum(diff**2, 1, keepdim=True))
        return f

    def ranking_loss(self, y_pos, y_neg, C=1, average=True):
        """
        Compute max margin ranking loss.

        Each term is max(0, margin + d_pos - d_neg): positive triples are
        pushed to a distance at least `margin` below their negatives.

        Params:
        -------
        y_pos: tensor of size Mx1
            Distances of the positive samples.

        y_neg: tensor of size (C*M)x1
            Distances of the negative samples, laid out as C stacked groups
            of M so that entry j*M + i belongs to positive sample i.

        C: int, default: 1
            Number of negative samples per positive sample.

        average: bool, default: True
            Accepted for interface compatibility but currently not used:
            the loss is always averaged (nn.MarginRankingLoss default).

        Returns:
        --------
        loss: scalar tensor
        """
        y_pos = y_pos.view(-1).repeat(C)  # tile to line up with y_neg
        y_neg = y_neg.view(-1)

        # target = -1 tells MarginRankingLoss that the first input (positive
        # distance) should be ranked LOWER than the second (negative one).
        target = Variable(torch.from_numpy(-np.ones(y_pos.size(0), dtype=np.float32)))
        # Keep the target on the same device as the scores.
        target = target.cuda() if self.gpu else target

        loss = nn.MarginRankingLoss(margin=self.gamma)
        return loss(y_pos, y_neg, target)

    def normalize_embeddings(self):
        """Rescale, in place, every entity embedding whose l2 norm exceeds 1."""
        self.emb_E.weight.data.renorm_(p=2, dim=0, maxnorm=1)

    def predict(self, X, sigmoid=False):
        """
        Evaluate triples and return the result as a numpy array.

        Params:
        -------
        X: np.array of integer indices, one triple per row
            Same format as in `forward`.

        sigmoid: bool, default: False
            Whether to squash the distances with a sigmoid. Because the
            distances are non-negative, the squashed values are >= 0.5 and
            grow with the distance (higher = less plausible).

        Returns:
        --------
        y_pred: np.array of size Mx1
        """
        y_pred = self.forward(X).view(-1, 1)

        if sigmoid:
            y_pred = torch.sigmoid(y_pred)

        # .cpu() is a no-op for tensors that already live on the CPU.
        return y_pred.data.cpu().numpy()

In [21]:
# Set random seed
# The same seed is given to NumPy's and PyTorch's global random number
# generators so that random draws made through either library are repeatable.
randseed = 9999
np.random.seed(randseed)
torch.manual_seed(randseed)

<torch._C.Generator at 0x7fd9ea19b370>

In [22]:
# Data loading
# Dictionary lookups; their lengths give the entity / relationship counts.
idx2ent = np.load('data/wordnet/bin/idx2ent.npy')
idx2rel = np.load('data/wordnet/bin/idx2rel.npy')
n_e, n_r = len(idx2ent), len(idx2rel)

# Training triples and the labelled validation split
X_train = np.load('data/wordnet/bin/train.npy')
X_val = np.load('data/wordnet/bin/val.npy')
y_val = np.load('data/wordnet/bin/y_val.npy')

# Keep only the validation rows whose label equals 1 (positive samples)
is_positive = y_val.ravel() == 1
X_val_pos = X_val[is_positive, :]

# Number of rows in each split
M_train, M_val = len(X_train), len(X_val)

# Model parameters
k, distance, margin = 50, 'l2', 1.0
model = StructuredE(
    n_e=n_e,
    n_r=n_r,
    k=k,
    margin=margin,
    distance=distance,
    gpu=False,
)

In [23]:
normalize_embed = True
C = 5 # Negative Samples
n_epoch = 20
lr = 0.1  # Base learning rate; the per-epoch rate is derived from it below
lr_decay_every = 20
#weight_decay = 1e-4
mb_size = 100  
print_every = 100
average = False  # Forwarded to model.ranking_loss
# Optimizer Initialization
#solver = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
solver = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
# Begin training
for epoch in range(n_epoch):
    print('Epoch-{}'.format(epoch+1))
    print('----------------')
    it = 0
    # Shuffle and chunk data into minibatches
    mb_iter = get_minibatches(X_train, mb_size, shuffle=True)

    # Anneal learning rate: halve it once every `lr_decay_every` epochs.
    # The rate is computed from the unchanged base `lr`; overwriting `lr`
    # itself would compound the decay on every epoch past the first step.
    epoch_lr = lr * (0.5 ** (epoch // lr_decay_every))
    for param_group in solver.param_groups:
        param_group['lr'] = epoch_lr

    for X_mb in mb_iter:
        start = time()

        # Build batch with negative sampling
        m = X_mb.shape[0]
        # C x M negative samples, stacked as C groups of M rows
        # (sample_negatives presumably corrupts each triple -- confirm in utils)
        X_neg_mb = np.vstack([sample_negatives(X_mb, n_e) for _ in range(C)])
        X_train_mb = np.vstack([X_mb, X_neg_mb])

        # Labels follow the model output (a distance): 0 for the m positive
        # rows, 1 for the C*m negative rows.
        y_true_mb = np.vstack([np.zeros([m, 1]), np.ones([C*m, 1])])

        # Training step
        y = model.forward(X_train_mb)
        y_pos, y_neg = y[:m], y[m:]
        loss = model.ranking_loss(y_pos, y_neg, C=C, average=average)
        loss.backward()
        solver.step()
        solver.zero_grad()

        if normalize_embed:
            model.normalize_embeddings()

        end = time()
        # Training logs
        if it % print_every == 0:
            # Training auc
            pred = model.predict(X_train_mb, sigmoid=True)
            train_acc = auc(pred, y_true_mb)

            # Validation auc. predict() returns a numpy array for both CPU
            # and GPU models; 1 - sigmoid(distance) turns the distance into
            # a score that is higher for more plausible triples, matching
            # the y_val == 1 positive convention used above.
            y_prob_val = 1 - model.predict(X_val, sigmoid=True)
            val_acc = auc(y_prob_val, y_val)

            # Scalar extraction that works whether or not the loss tensor
            # provides .item().
            loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]

            print('Iter-{}; loss: {:.4f}; train_auc: {:.4f}; val_auc: {:.4f}; time per batch: {:.2f}s'
                    .format(it, loss_value, train_acc, val_acc, end-start))


        it += 1

Epoch-1
----------------
Iter-0; loss: 0.9946; train_auc: 0.5140; val_auc: 0.5005; time per batch: 0.04s
Iter-100; loss: 1.0126; train_auc: 0.4468; val_auc: 0.5104; time per batch: 0.04s
Iter-200; loss: 0.9943; train_auc: 0.5278; val_auc: 0.5209; time per batch: 0.03s
Iter-300; loss: 0.9966; train_auc: 0.5237; val_auc: 0.5288; time per batch: 0.03s
Iter-400; loss: 0.9969; train_auc: 0.5080; val_auc: 0.5364; time per batch: 0.04s
Iter-500; loss: 0.9931; train_auc: 0.5189; val_auc: 0.5436; time per batch: 0.03s
Iter-600; loss: 0.9871; train_auc: 0.5669; val_auc: 0.5494; time per batch: 0.04s
Iter-700; loss: 0.9872; train_auc: 0.5507; val_auc: 0.5543; time per batch: 0.04s
Iter-800; loss: 0.9868; train_auc: 0.5381; val_auc: 0.5593; time per batch: 0.04s
Iter-900; loss: 0.9776; train_auc: 0.5567; val_auc: 0.5630; time per batch: 0.04s
Iter-1000; loss: 0.9750; train_auc: 0.5668; val_auc: 0.5666; time per batch: 0.04s
Iter-1100; loss: 0.9813; train_auc: 0.5415; val_auc: 0.5698; time per batc

Iter-200; loss: 0.4332; train_auc: 0.7295; val_auc: 0.6050; time per batch: 0.04s
Iter-300; loss: 0.4013; train_auc: 0.7397; val_auc: 0.6048; time per batch: 0.04s
Iter-400; loss: 0.3733; train_auc: 0.7734; val_auc: 0.6059; time per batch: 0.03s
Iter-500; loss: 0.3474; train_auc: 0.7699; val_auc: 0.6031; time per batch: 0.04s
Iter-600; loss: 0.3606; train_auc: 0.7543; val_auc: 0.6044; time per batch: 0.04s
Iter-700; loss: 0.3928; train_auc: 0.7519; val_auc: 0.6010; time per batch: 0.04s
Iter-800; loss: 0.3881; train_auc: 0.7400; val_auc: 0.6034; time per batch: 0.04s
Iter-900; loss: 0.3356; train_auc: 0.7743; val_auc: 0.6007; time per batch: 0.04s
Iter-1000; loss: 0.3390; train_auc: 0.7678; val_auc: 0.5992; time per batch: 0.04s
Iter-1100; loss: 0.3819; train_auc: 0.7332; val_auc: 0.6022; time per batch: 0.04s
Epoch-10
----------------
Iter-0; loss: 0.3108; train_auc: 0.7772; val_auc: 0.6015; time per batch: 0.04s
Iter-100; loss: 0.3939; train_auc: 0.7412; val_auc: 0.5987; time per bat

Iter-400; loss: 0.1004; train_auc: 0.8716; val_auc: 0.5687; time per batch: 0.04s
Iter-500; loss: 0.1210; train_auc: 0.8459; val_auc: 0.5666; time per batch: 0.04s
Iter-600; loss: 0.1012; train_auc: 0.8659; val_auc: 0.5663; time per batch: 0.03s
Iter-700; loss: 0.1069; train_auc: 0.8518; val_auc: 0.5666; time per batch: 0.03s
Iter-800; loss: 0.1206; train_auc: 0.8730; val_auc: 0.5656; time per batch: 0.03s
Iter-900; loss: 0.1097; train_auc: 0.8561; val_auc: 0.5660; time per batch: 0.04s
Iter-1000; loss: 0.0985; train_auc: 0.8700; val_auc: 0.5663; time per batch: 0.03s
Iter-1100; loss: 0.0771; train_auc: 0.8936; val_auc: 0.5661; time per batch: 0.04s
Epoch-18
----------------
Iter-0; loss: 0.0983; train_auc: 0.8761; val_auc: 0.5662; time per batch: 0.04s
Iter-100; loss: 0.1179; train_auc: 0.8648; val_auc: 0.5666; time per batch: 0.04s
Iter-200; loss: 0.1228; train_auc: 0.8650; val_auc: 0.5657; time per batch: 0.04s
Iter-300; loss: 0.0998; train_auc: 0.8859; val_auc: 0.5652; time per bat

## Task: Randomly select 10 entities from the entity set and display their k nearest neighbours (k-NN, k=5)

In [None]:
###########################
###### Your Code Here
###########################