In [1]:
import numpy as np
import torch
from torch import optim
import math
from metric import get_mrr, get_recall
import datetime
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import pickle
from entmax import  entmax_bisect


In [2]:
# Select the compute device. GPU 0 is only pinned when CUDA is really present,
# so this cell no longer raises on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(0)

# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load dataset files that come from a trusted source.
# The context managers close the handles that bare open() calls left dangling.
with open('data/diginetica/test.txt', 'rb') as test_file:
    test64 = pickle.load(test_file)
with open('data/diginetica/train.txt', 'rb') as train_file:
    train64 = pickle.load(train_file)

# Element 1 of each split is iterated below as a list of sessions and element 2
# as one item per session (presumably the prediction target -- verify against
# the preprocessing script).
train64_x = train64[1]
train64_y = train64[2]

test64_x = test64[1]
test64_y = test64[2]

# One position list per session: 1..len(session) for the items plus a final
# len(session)+1 entry, i.e. one more position than there are items.
train_pos = [list(range(1, len(items) + 2)) for items in train64[1]]
test_pos = [list(range(1, len(items) + 2)) for items in test64[1]]

# Collect every raw item id that occurs as an input or as a target in either split.
item_set = set()
for sessions, targets in ((train64[1], train64[2]), (test64[1], test64[2])):
    for items in sessions:
        item_set.update(items)
    item_set.update(targets)

# Map raw ids to dense ids 1..N in sorted order; 0 is never assigned here.
item_list = sorted(item_set)
item_dict = {item: index for index, item in enumerate(item_list, start=1)}


In [3]:
# Re-encode every raw item id through item_dict so that sessions and targets
# are expressed in the dense id space built in the previous cell.
train64_x = [[item_dict[raw_id] for raw_id in session] for session in train64[1]]
train64_y = [item_dict[raw_id] for raw_id in train64[2]]

test64_x = [[item_dict[raw_id] for raw_id in session] for session in test64[1]]
test64_y = [item_dict[raw_id] for raw_id in test64[2]]

In [4]:
# Length of the longest session over both splits (0 when both are empty).
max_length = max(
    (len(session) for split in (train64_x, test64_x) for session in split),
    default=0,
)

# Zero-initialised buffers; the position buffers are one column wider because
# every position list carries one entry more than its session.
train_seqs = np.zeros((len(train64_x), max_length))
train_poses = np.zeros((len(train64_x), max_length + 1))
test_seqs = np.zeros((len(test64_x), max_length))
test_poses = np.zeros((len(test64_x), max_length + 1))

# Right-align each session (and its positions) so the zeros end up on the left.
for row, (session, positions) in enumerate(zip(train64_x, train_pos)):
    n_items = len(session)
    train_seqs[row, -n_items:] = session
    train_poses[row, -n_items - 1:] = positions

for row, (session, positions) in enumerate(zip(test64_x, test_pos)):
    n_items = len(session)
    test_seqs[row, -n_items:] = session
    test_poses[row, -n_items - 1:] = positions

target_seqs = np.array(train64_y)
target_test_seqs = np.array(test64_y)

In [5]:
# Rebuild item_set, this time over the remapped ids of both splits
# (session items as well as targets).
item_set = {item_id for session in train64_x for item_id in session}
item_set.update(train64_y)
for session in test64_x:
    item_set.update(session)
item_set.update(test64_y)

In [6]:
# Convert the padded numpy buffers and the targets to float32 tensors.
train_x = torch.from_numpy(train_seqs).float()
train_pos = torch.from_numpy(train_poses).float()
train_y = torch.from_numpy(target_seqs).float()

test_x = torch.from_numpy(test_seqs).float()
test_pos = torch.from_numpy(test_poses).float()
test_y = torch.from_numpy(target_test_seqs).float()

# One-column tensors holding the constant id 40841, one row per session.
train_label = torch.full((len(train64_x), 1), 40841.0)
test_label = torch.full((len(test64_x), 1), 40841.0)

In [7]:
# Append the constant train_label / test_label column to the padded sequences.
# The result is rebuilt from train_seqs / test_seqs instead of from train_x /
# test_x themselves, which makes the cell idempotent: re-running it no longer
# stacks one more column onto the tensors on every execution.
train_x = torch.cat((torch.Tensor(train_seqs), train_label), 1)
test_x = torch.cat((torch.Tensor(test_seqs), test_label), 1)

In [8]:
class DualAttention(nn.Module):
    """Session scorer built from an entmax self-attention and an additive attention.

    Item and position embeddings are concatenated, passed through a
    self-attention block, and the representation at the last sequence slot is
    used as the query of a second ("global") attention over the remaining
    slots. Both attentions use ``entmax_bisect`` with an alpha that is predicted
    from the data. The decoder scores candidates with a scaled cosine
    similarity against the item embedding table.

    Args:
        item_dim: size of the item embeddings (also the decoder output size).
        pos_dim: size of the position embeddings.
        n_items: the item embedding table gets ``n_items + 1`` rows, row 0 is padding.
        n_pos: number of rows of the position embedding table, row 0 is padding.
        w: scale applied to the cosine similarities in the decoder.
        atten_way: stored on the instance; not read anywhere in this class.
        decoder_way: stored on the instance; not read anywhere in this class.
        dropout: dropout probability used by ``forward``.
        activate: ``'relu'`` or ``'selu'``; activation used inside the self-attention.

    Raises:
        ValueError: if ``activate`` is neither ``'relu'`` nor ``'selu'``.
    """

    def __init__(self, item_dim, pos_dim, n_items, n_pos, w, atten_way='dot', decoder_way='bilinear', dropout=0,
                 activate='relu'):
        super(DualAttention, self).__init__()
        self.item_dim = item_dim
        self.pos_dim = pos_dim
        dim = item_dim + pos_dim
        self.dim = dim
        self.n_items = n_items
        # Module/parameter names and creation order are kept unchanged so that
        # existing checkpoints and seeded initialisations stay compatible.
        self.embedding = nn.Embedding(n_items + 1, item_dim, padding_idx=0, max_norm=1.5)
        self.pos_embedding = nn.Embedding(n_pos, pos_dim, padding_idx=0, max_norm=1.5)
        self.atten_way = atten_way
        self.decoder_way = decoder_way
        self.atten_w0 = nn.Parameter(torch.Tensor(1, dim))
        self.atten_w1 = nn.Parameter(torch.Tensor(dim, dim))
        self.atten_w2 = nn.Parameter(torch.Tensor(dim, dim))
        self.atten_bias = nn.Parameter(torch.Tensor(dim))
        self.w_f = nn.Linear(2 * dim, item_dim)
        self.dropout = nn.Dropout(dropout)
        self.self_atten_w1 = nn.Linear(dim, dim)
        self.self_atten_w2 = nn.Linear(dim, dim)

        self.LN = nn.LayerNorm(dim)
        self.LN2 = nn.LayerNorm(item_dim)  # not used below; kept for checkpoint compatibility
        self.is_dropout = True
        self.attention_mlp = nn.Linear(dim, dim)
        self.alpha_w = nn.Linear(dim, 1)
        self.w = w

        if activate == 'relu':
            self.activate = F.relu
        elif activate == 'selu':
            self.activate = F.selu
        else:
            # Previously an unknown name left self.activate undefined and only
            # failed later, inside the first forward pass.
            raise ValueError("activate must be 'relu' or 'selu', got %r" % (activate,))

        self.initial_()

    def initial_(self):
        """Initialise the raw attention parameters and zero the padding rows."""
        init.normal_(self.atten_w0, 0, 0.05)
        init.normal_(self.atten_w1, 0, 0.05)
        init.normal_(self.atten_w2, 0, 0.05)
        init.constant_(self.atten_bias, 0)
        init.constant_(self.attention_mlp.bias, 0)
        init.constant_(self.embedding.weight[0], 0)
        init.constant_(self.pos_embedding.weight[0], 0)

    def _apply_dropout(self, tensor):
        """Apply ``self.dropout`` only while ``self.is_dropout`` is set."""
        return self.dropout(tensor) if self.is_dropout else tensor

    def _score_items(self, x, pos):
        """Shared pipeline of ``forward`` and ``predict``: ids -> candidate scores."""
        x_embeddings = self.embedding(x)  # B, seq, item_dim
        pos_embeddings = self.pos_embedding(pos)  # B, seq, pos_dim
        mask = (x != 0).float()  # B, seq -- 0 marks padding
        x_ = torch.cat((x_embeddings, pos_embeddings), 2)  # B, seq, item_dim + pos_dim
        x_s = x_[:, :-1, :]  # B, seq-1, item_dim + pos_dim
        # The alpha of the self-attention is predicted from the last slot and
        # repeated for every query position of the actual sequence length.
        alpha_ent = self.get_alpha(x=x_[:, -1, :], number=0, seq_len=x_.size(1))
        m_s, x_n = self.self_attention(x_, x_, x_, mask, alpha_ent)
        alpha_global = self.get_alpha(x=m_s, number=1)
        global_c = self.global_attention(m_s, x_n, x_s, mask, alpha_global)  # B, 1, dim
        return self.decoder(global_c, m_s)

    def forward(self, x, pos):
        """Return candidate scores for a batch, with dropout enabled.

        Args:
            x: integer item ids, shape (B, seq); 0 is padding.
            pos: integer position ids, same shape as ``x``.

        Returns:
            Tensor of shape (B, n_candidates), where the candidates are rows
            ``1:-1`` of the item embedding table.
        """
        self.is_dropout = True
        return self._score_items(x, pos)

    def get_alpha(self, x=None, number=None, seq_len=70):
        """Predict the entmax alpha (a value in [1.00001, 2)) from ``x``.

        Args:
            x: features fed to ``self.alpha_w``.
            number: 0 returns the alpha for the self-attention, repeated
                ``seq_len`` times along dim 1; 1 returns the alpha for the
                global attention without repetition. Any other value returns None.
            seq_len: number of query positions to repeat the alpha for when
                ``number == 0``. Defaults to 70, the value that used to be
                hard-coded here.
        """
        if number == 0:
            alpha_ent = torch.sigmoid(self.alpha_w(x)) + 1
            alpha_ent = self.add_value(alpha_ent).unsqueeze(1)
            alpha_ent = alpha_ent.expand(-1, seq_len, -1)
            return alpha_ent
        if number == 1:
            alpha_global = torch.sigmoid(self.alpha_w(x)) + 1
            alpha_global = self.add_value(alpha_global)
            return alpha_global

    def add_value(self, value):
        """Replace entries that are exactly 1 by 1.00001.

        The result is used as the alpha of ``entmax_bisect``; this keeps it
        strictly above 1 even when the sigmoid underflows to 0.
        """
        return value.masked_fill(value == 1, 1.00001)

    def self_attention(self, q, k, v, mask=None, alpha_ent=1):
        """Entmax self-attention followed by a residual feed-forward and LayerNorm.

        Returns:
            Tuple ``(c, x_n)``: the attended last slot with shape (B, 1, dim)
            and the attended remaining slots with shape (B, seq-1, dim).
        """
        q_ = self._apply_dropout(self.activate(self.attention_mlp(q)))
        scores = torch.matmul(q_, k.transpose(1, 2)) / math.sqrt(self.dim)
        if mask is not None:
            # Padded key positions get -inf so that they receive zero weight.
            mask = mask.unsqueeze(1).expand(-1, q.size(1), -1)
            scores = scores.masked_fill(mask == 0, -np.inf)
        alpha = entmax_bisect(scores, alpha_ent, dim=-1)

        att_v = torch.matmul(alpha, v)  # B, seq, dim
        feed_forward = self.self_atten_w2(self.activate(self.self_atten_w1(att_v)))
        att_v = self._apply_dropout(feed_forward) + att_v
        att_v = self.LN(att_v)
        c = att_v[:, -1, :].unsqueeze(1)
        x_n = att_v[:, :-1, :]
        return c, x_n

    def global_attention(self, target, k, v, mask=None, alpha_ent=1):
        """Additive attention of ``target`` over ``k``; returns the weighted sum of ``v``.

        The last column of ``mask`` is dropped before masking because ``k`` and
        ``v`` are passed in without the last sequence slot.
        """
        alpha = torch.matmul(
            torch.relu(k.matmul(self.atten_w1) + target.matmul(self.atten_w2) + self.atten_bias),
            self.atten_w0.t())  # (B, seq-1, 1)
        if mask is not None:
            mask = mask.unsqueeze(-1)
            mask = mask[:, :-1, :]
            alpha = alpha.masked_fill(mask == 0, -np.inf)
        alpha = entmax_bisect(alpha, alpha_ent, dim=1)
        c = torch.matmul(alpha.transpose(1, 2), v)  # (B, 1, dim)
        return c

    def decoder(self, global_c, self_c):
        """Score candidates by ``w`` times the cosine similarity to the session vector.

        Args:
            global_c: output of the global attention, shape (B, 1, dim).
            self_c: last-slot output of the self-attention, shape (B, 1, dim).

        Returns:
            Tensor of shape (B, n_candidates); column j belongs to row j + 1 of
            the item embedding table (rows ``1:-1`` are scored).
        """
        c = self._apply_dropout(torch.selu(self.w_f(torch.cat((global_c, self_c), 2))))
        # Drop only the length-1 sequence axis. A bare squeeze() also removed
        # the batch axis for a batch of one and broke the code below.
        c = c.squeeze(1)
        l_c = c / torch.norm(c, dim=-1, keepdim=True)
        candidates = self.embedding.weight[1:-1]
        l_emb = candidates / torch.norm(candidates, dim=-1, keepdim=True)
        z = self.w * torch.matmul(l_c, l_emb.t())

        return z

    def predict(self, x, pos, k=20):
        """Return the indices of the ``k`` best-scored candidates, dropout disabled.

        The indices refer to decoder columns, i.e. they are 0-based (column j
        is row j + 1 of the item embedding table).
        """
        self.is_dropout = False
        result = self._score_items(x, pos)
        rank = torch.argsort(result, dim=1, descending=True)
        return rank[:, 0:k]


In [11]:
# Set RUN_TRAINING to True to retrain from scratch. With False (the default)
# this cell only builds the data loaders, a freshly initialised model and the
# optimizer -- exactly what it did while the training loop was commented out --
# and the checkpoint loaded further down is used for evaluation.
RUN_TRAINING = False
N_EPOCHS = 50
BATCH_SIZE = 512

w_list = [20]
record = list()
for w in w_list:
    # Fixed seeds and deterministic cuDNN for repeatable runs.
    np.random.seed(1)
    torch.manual_seed(1)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    train_sets = TensorDataset(train_x.long(), train_pos.long(), train_y.long())
    train_dataload = DataLoader(train_sets, batch_size=BATCH_SIZE, shuffle=True)
    # Use the device selected at the top of the notebook instead of a hard-coded .cuda().
    criterion = nn.CrossEntropyLoss().to(device)
    # The evaluation cell below relies on test_x / test_pos / test_y being
    # integer tensors and on test_dataload existing.
    test_x, test_pos, test_y = test_x.long(), test_pos.long(), test_y.long()
    all_test_sets = TensorDataset(test_x, test_pos, test_y)
    test_dataload = DataLoader(all_test_sets, batch_size=BATCH_SIZE, shuffle=False)
    # NOTE(review): 40842 and 71 are dataset-specific sizes (item table and
    # position table) -- confirm against the vocabulary and max_length above.
    model = DualAttention(100, 100, 40842, 71, w, dropout=0.5, activate='relu').to(device)
    opti = optim.Adam(model.parameters(), lr=0.001, weight_decay=0, amsgrad=True)
    best_result = 0
    total_time = 0
    best_result_5 = 0
    best_result_ = []

    if not RUN_TRAINING:
        continue

    for epoch in range(N_EPOCHS):
        start_time = datetime.datetime.now()
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        losses = 0
        for step, (x_train, pos_train, y_train) in enumerate(train_dataload):
            opti.zero_grad()
            q = model(x_train.to(device), pos_train.to(device))
            # Targets are 1-based item ids; the decoder columns are 0-based.
            loss = criterion(q, y_train.to(device) - 1)
            loss.backward()
            opti.step()
            losses += loss.item()
            if (step + 1) % 100 == 0:
                # Mean over the step + 1 batches seen so far (the former
                # `losses / step + 1` divided by step and then added one).
                print("[%02d/%d] [%03d/%d] mean_loss : %0.2f" % (
                    epoch, N_EPOCHS, step, len(train_sets) / BATCH_SIZE, losses / (step + 1)))
        end_time = datetime.datetime.now()

        with torch.no_grad():
            # Top-20 predictions for every test batch, concatenated once.
            y_pre_all = torch.cat(
                [model.predict(x_test.to(device), pos_test.to(device), 20)
                 for x_test, pos_test, y_test in test_dataload], 0)
            y_pre_cpu = y_pre_all.cpu()
            y_pre_all_10 = y_pre_cpu[:, :10].contiguous()
            y_pre_all_5 = y_pre_cpu[:, :5].contiguous()

            recall = get_recall(y_pre_all, test_y.to(device).unsqueeze(1) - 1)
            recall_10 = get_recall(y_pre_all_10, test_y.unsqueeze(1) - 1)
            recall_5 = get_recall(y_pre_all_5, test_y.unsqueeze(1) - 1)
            mrr = get_mrr(y_pre_all, test_y.to(device).unsqueeze(1) - 1)
            mrr_10 = get_mrr(y_pre_all_10, test_y.unsqueeze(1) - 1)
            mrr_5 = get_mrr(y_pre_all_5, test_y.unsqueeze(1) - 1)

            print("Recall@20: " + "%.4f" %recall + " Recall@10: " + "%.4f" %recall_10 +"  Recall@5:" + "%.4f" %recall_5)
            print("MRR@20:" + "%.4f" % mrr.tolist() + " MRR@10:" + "%.4f" % mrr_10.tolist() + " MRR@5:" + "%.4f" % mrr_5.tolist())
            if best_result < recall:
                # Keep the checkpoint with the best Recall@20 seen so far.
                # The BestModel/ directory must already exist.
                best_result = recall
                best_result_ = [recall_5, recall_10, recall, mrr_5, mrr_10, mrr]
                torch.save(model.state_dict(), 'BestModel/best_dn_w_%s.pth' % str(w))
            print("best result: " + str(best_result))
            print("==================================")
    record.append(best_result_)

if RUN_TRAINING:
    print(record)

In [14]:
# Rebuild the architecture and restore the saved weights for w = 20.
# The model is placed on the device selected at the top of the notebook, and
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model = DualAttention(100, 100, 40842, 71, 20, atten_way='MLP', decoder_way='trilinear2', dropout=0.5, activate='relu').to(device)
model.load_state_dict(torch.load('BestModel/best_dn_w_20.pth', map_location=device))


<All keys matched successfully>

In [15]:
# Evaluate the restored model on the whole test set: Recall and MRR at 20, 10 and 5.
with torch.no_grad():
    # Collect the top-20 predictions per batch and concatenate once, instead of
    # re-allocating three growing tensors on every batch.
    top20_batches = [
        model.predict(x_test.to(device), pos_test.to(device), 20)
        for x_test, pos_test, y_test in test_dataload
    ]
    y_pre_all = torch.cat(top20_batches, 0)
    # The top-10 / top-5 lists are prefixes of the ranked top-20 list; as
    # before they are kept on the CPU.
    y_pre_cpu = y_pre_all.cpu()
    y_pre_all_10 = y_pre_cpu[:, :10].contiguous()
    y_pre_all_5 = y_pre_cpu[:, :5].contiguous()

    # Targets are 1-based item ids while predictions are 0-based decoder columns.
    recall = get_recall(y_pre_all, test_y.to(device).unsqueeze(1)-1)
    recall_10 = get_recall(y_pre_all_10, test_y.unsqueeze(1)-1)
    recall_5 = get_recall(y_pre_all_5, test_y.unsqueeze(1)-1)
    mrr = get_mrr(y_pre_all, test_y.to(device).unsqueeze(1)-1)
    mrr_10 = get_mrr(y_pre_all_10, test_y.unsqueeze(1)-1)
    mrr_5 = get_mrr(y_pre_all_5, test_y.unsqueeze(1)-1)

    print("Recall@20: " + "%.4f" %recall + " Recall@10: " + "%.4f" %recall_10 +"  Recall@5:" + "%.4f" %recall_5)
    print("MRR@20:" + "%.4f" % mrr.tolist() + " MRR@10:" + "%.4f" % mrr_10.tolist() + " MRR@5:" + "%.4f" % mrr_5.tolist())

Recall@20: 0.5376 Recall@10: 0.4029  Recall@5:0.2869
MRR@20:0.1899 MRR@10:0.1805 MRR@5:0.1651


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  hits = (pre == truths).nonzero()
