In [1]:
import pandas as pd
import numpy as np
import torch as th
from torch.utils.data.dataloader import DataLoader

In [2]:
MIN_USER_FREQ = 20
MIN_ITEM_FREQ = 100

#ratings = pd.read_csv("data/Books.csv", header=None, nrows=500000)
ratings = pd.read_csv("data/Books.csv", header=None)
ratings.columns = ['userId', 'itemId', 'rate', 'timestamp']

# user filtering
userFreq = ratings.groupby(['userId'])['userId'].count()
validSet = set(userFreq.loc[userFreq >= MIN_USER_FREQ].index)
ratings = ratings.loc[ratings['userId'].apply(lambda x: x in validSet), :]

# item filtering
itemFreq = ratings.groupby(['itemId'])['itemId'].count()
validSet = set(itemFreq.loc[itemFreq >= MIN_ITEM_FREQ].index)
ratings = ratings.loc[ratings['itemId'].apply(lambda x: x in validSet), :]

In [3]:
# encode his
ukv, ikv = list(enumerate(ratings['userId'].unique())), list(enumerate(ratings['itemId'].unique()))
userRawId = {encId: rawId for encId, rawId in ukv}
userEncId = {rawId: encId for encId, rawId in ukv}

# encode tar, id 0 is for padding, item encode id start from 1
itemRawId = {encId + 1: rawId for encId, rawId in ikv}
itemEncId = {rawId: encId + 1 for encId, rawId in ikv}

# 编码
ratings['userId'] = ratings['userId'].apply(lambda x: userEncId[x])
ratings['itemId'] = ratings['itemId'].apply(lambda x: itemEncId[x])

ratings.sort_values(by=['userId', 'timestamp'], inplace=True, ignore_index=True)

In [4]:
winSize = 15

def padOrCut(seq, l):
    if (len(seq) < l): return np.concatenate([seq, (l - len(seq)) * [0]])
    elif (len(seq) > l): return seq[:l]
    else: return seq

# generate user sample by sliding window
def genUserSamples(userDf):
    userDf.reset_index(drop=True, inplace=True)
    his, tar = [], []
    for i in range(1, userDf.shape[0]): # enumerate y from 1
        # x = window [i - winSize, i - 1], y = item[i]
        his.append(padOrCut(userDf.iloc[max(0, i - winSize):i]['itemId'].values, winSize))
        tar.append(userDf.iloc[i]['itemId'])
    # split train and test as 9:1
    i = int(len(his) * 0.9)
    trainHis, testHis = his[:i], his[i:]
    trainTar, testTar = tar[:i], tar[i:]

    return (np.stack(trainHis), np.stack(trainTar)), (np.stack(testHis), np.stack(testTar))

samples = ratings.groupby('userId') \
    .filter(lambda x: x.shape[0] >= 10) \
    .groupby('userId') \
    .apply(genUserSamples)

trainHis = samples.apply(lambda x: x[0][0]).values
trainTar = samples.apply(lambda x: x[0][1]).values
testHis = samples.apply(lambda x: x[1][0]).values
testTar = samples.apply(lambda x: x[1][1]).values

assert len(trainHis) > 0
trainHis = np.concatenate(trainHis).astype(np.int32)
trainTar = np.concatenate(trainTar).astype(np.int32)
testHis = np.concatenate(testHis).astype(np.int32)
testTar = np.concatenate(testTar).astype(np.int32)

In [5]:
class Dataset:
    def __init__(self, his, tar):
        self.his = his
        self.tar = tar
        assert self.his.shape[0] == self.tar.shape[0]
    
    def __getitem__(self, i):
        return self.his[i], self.tar[i]
    
    def __len__(self):
        return self.his.shape[0]

trainData = DataLoader(
    Dataset(trainHis, trainTar),
    batch_size = 1028,
    shuffle=True
)

In [6]:
class MIND(th.nn.Module):
    def __init__(self, D, K, R, L, nNeg, embedNum):
        super(MIND, self).__init__()
        self.D = D
        self.K = K
        self.R = R
        self.L = L
        self.nNeg = nNeg
        # weights initialization
        self.itemEmbeds = th.nn.Embedding(embedNum, D, padding_idx=0)
        # matmul([batch_size, k, 1, dim], [k, dim, dim']) = [batch_size, k, 1, dim']
        self.dense1 = th.empty(K, D, 2 * D)
        self.dense2 = th.empty(K, 2 * D, D)
        # one S for all routing operations, first dim is for batch broadcasting
        self.S = th.empty(D, D)
        th.nn.init.normal_(self.S, mean=0.0, std=1.0)
        self.dense1 = th.nn.init.normal_(self.dense1, mean=0.0, std=1.0)
        self.dense2 = th.nn.init.normal_(self.dense2, mean=0.0, std=1.0)
        self.opt = th.optim.Adam(self.parameters(), lr=0.05)

    # output caps' length is in (0, 1)
    def squash(self, caps, bs):
        n = th.norm(caps, dim=2).view(bs, self.K, 1)
        nSquare = th.pow(n, 2)

        return (nSquare / ((1 + nSquare) * n + 1e-9)) * caps
    
    def B2IRouting(self, his, bs):
        # B2I dynamic routing, input behaviors, output caps
        # init b, bji = b[j][i] no grad for b: https://github.com/Ugenteraan/CapsNet-PyTorch/blob/master/CapsNet-PyTorch.ipynb
        b = th.normal(0, 1, (self.K, self.L)).detach()
        # k is fixed for batch forward, cannot find a way to use variant k with batch
        I = self.itemEmbeds(his) # (batch_size, len, dim)
        # bilinear transform & l2 norm, Sei is fixde during routing 
        I = th.matmul(I, self.S) # (batch_size, L, dim)
        I = I / (th.norm(I, dim=2) + 1e-9).view(bs, self.L, 1)
        for i in range(self.R):
            # routing, cut w's gradient, because caps have gradients from w and w is changing over loops
            w = th.softmax(b, dim=1).detach() # (K, L)
            if i != self.R - 1:
                # no grads
                with th.no_grad():
                    caps = self.squash(th.matmul(w, I), bs) # (batch_size, K, dim)
                    # update routing logits
                    _b = th.matmul(caps, th.transpose(I, 1, 2)) # (batch_size, K, L), _bji = _b[j][i]
                    # sum over batch dim first, then add to b
                    b += th.sum(_b, dim=0) # (K, L)
            else:
                caps = self.squash(th.matmul(w, I), bs) # (batch_size, K, dim)
                # skip routing logits update in last round
        # mlp
        caps = th.matmul(caps.view(bs, self.K, 1, self.D), self.dense1)
        caps = th.matmul(caps, self.dense2).view(bs, self.K, self.D)
        # l2 norm
        caps = caps / (th.norm(caps, dim=2).view(bs, self.K, 1) + 1e-9)
        
        return caps
    
    def labelAwareAttation(self, caps, tar, bs, p=2):
        # label-aware attention, input caps and targets, output logits
        tar = self.itemEmbeds(tar) # (batch_size, dim)
        # in-batch negative sampling
        his = th.matmul(
            th.softmax(
                th.pow(th.transpose(th.matmul(caps, tar.view(bs, self.D, 1)), 1, 2), p),
                dim=2
            ), 
            caps
        ).view(bs, self.D) # (batch_size, dim)

        # pos logits
        tmp = 0.01
        his = his / (th.norm(his, dim=1).view(bs, 1) + 1e-9)
        tar = tar / (th.norm(tar, dim=1).view(bs, 1) + 1e-9)
        posLogits = th.sigmoid(th.sum(his * tar, dim=1) / tmp)

        # neg logits
        tarNeg = tar[th.multinomial(th.ones(bs), self.nNeg * bs, replacement=True)].view(bs, self.nNeg, self.D) # (batch_size, nNeg, D)
        tarNegT = th.transpose(tarNeg, 1, 2) # (batch_size, D, nNeg)
        hisNeg = th.matmul(
            th.softmax(
                th.pow(th.transpose(th.matmul(caps, tarNegT), 1, 2), p),
                dim=2
            ),  # (batch_size, nNeg, K)
            caps
        ) # (batch_size, nNeg, D)
        # hisNeg[b][i].dot(tarNeg[b][i]) for all b, i
        negLogits = th.sigmoid(th.sum(hisNeg * tarNeg, dim=2).view(bs * self.nNeg) / tmp)

        logits = th.concat([posLogits, negLogits])
        labels = th.concat([th.ones(bs, ), th.zeros(bs * self.nNeg)])

        return logits, labels

In [7]:
model = MIND(D=8, K=3, R=3, L=winSize, nNeg=5, embedNum=len(itemEncId) + 1)
BCELoss = th.nn.BCELoss()
for epoch in range(30):
    epochTotalLoss = 0
    for step, (his, tar) in enumerate(trainData):
        bs = his.shape[0]
        caps = model.B2IRouting(his, bs)
        logits, labels = model.labelAwareAttation(caps, tar, bs)

        loss = BCELoss(logits, labels)
        loss.backward()
        model.opt.step()
        model.opt.zero_grad()
        epochTotalLoss += loss
        if (step % 200 == 0):
            print('Epoch {:02d} | Step {:05d} | Loss {:.6f}'.format(
                epoch,
                step,
                epochTotalLoss / (step + 1),
            ))

Epoch 00 | Step 00000 | Loss 21.873215
Epoch 00 | Step 00200 | Loss 20.231533
Epoch 00 | Step 00400 | Loss 18.824614
Epoch 00 | Step 00600 | Loss 17.532839
Epoch 00 | Step 00800 | Loss 16.270983
Epoch 00 | Step 01000 | Loss 15.079107
Epoch 00 | Step 01200 | Loss 13.975615
Epoch 00 | Step 01400 | Loss 12.952223
Epoch 00 | Step 01600 | Loss 12.010619
Epoch 00 | Step 01800 | Loss 11.153078
Epoch 00 | Step 02000 | Loss 10.388562
Epoch 00 | Step 02200 | Loss 9.704868
Epoch 00 | Step 02400 | Loss 9.096917
Epoch 00 | Step 02600 | Loss 8.560026
Epoch 00 | Step 02800 | Loss 8.081849
Epoch 00 | Step 03000 | Loss 7.653366
Epoch 01 | Step 00000 | Loss 1.504871
Epoch 01 | Step 00200 | Loss 1.428777
Epoch 01 | Step 00400 | Loss 1.387291
Epoch 01 | Step 00600 | Loss 1.353326
Epoch 01 | Step 00800 | Loss 1.325077
Epoch 01 | Step 01000 | Loss 1.298531
Epoch 01 | Step 01200 | Loss 1.277683
Epoch 01 | Step 01400 | Loss 1.257927
Epoch 01 | Step 01600 | Loss 1.241712
Epoch 01 | Step 01800 | Loss 1.225214
E

Epoch 13 | Step 01600 | Loss 0.670315
Epoch 13 | Step 01800 | Loss 0.669800
Epoch 13 | Step 02000 | Loss 0.669225
Epoch 13 | Step 02200 | Loss 0.668835
Epoch 13 | Step 02400 | Loss 0.668437
Epoch 13 | Step 02600 | Loss 0.667910
Epoch 13 | Step 02800 | Loss 0.667550
Epoch 13 | Step 03000 | Loss 0.667188
Epoch 14 | Step 00000 | Loss 0.648145
Epoch 14 | Step 00200 | Loss 0.661366
Epoch 14 | Step 00400 | Loss 0.661049
Epoch 14 | Step 00600 | Loss 0.660568
Epoch 14 | Step 00800 | Loss 0.659733
Epoch 14 | Step 01000 | Loss 0.659392
Epoch 14 | Step 01200 | Loss 0.659010
Epoch 14 | Step 01400 | Loss 0.658431
Epoch 14 | Step 01600 | Loss 0.658142
Epoch 14 | Step 01800 | Loss 0.657884
Epoch 14 | Step 02000 | Loss 0.657328
Epoch 14 | Step 02200 | Loss 0.656913
Epoch 14 | Step 02400 | Loss 0.656462
Epoch 14 | Step 02600 | Loss 0.656250
Epoch 14 | Step 02800 | Loss 0.655990
Epoch 14 | Step 03000 | Loss 0.655708
Epoch 15 | Step 00000 | Loss 0.669415
Epoch 15 | Step 00200 | Loss 0.650782
Epoch 15 | S

Epoch 27 | Step 00000 | Loss 0.570636
Epoch 27 | Step 00200 | Loss 0.572867
Epoch 27 | Step 00400 | Loss 0.572551
Epoch 27 | Step 00600 | Loss 0.572387
Epoch 27 | Step 00800 | Loss 0.572314
Epoch 27 | Step 01000 | Loss 0.572428
Epoch 27 | Step 01200 | Loss 0.572437
Epoch 27 | Step 01400 | Loss 0.572343
Epoch 27 | Step 01600 | Loss 0.572245
Epoch 27 | Step 01800 | Loss 0.571855
Epoch 27 | Step 02000 | Loss 0.571794
Epoch 27 | Step 02200 | Loss 0.571854
Epoch 27 | Step 02400 | Loss 0.571801
Epoch 27 | Step 02600 | Loss 0.571634
Epoch 27 | Step 02800 | Loss 0.571454
Epoch 27 | Step 03000 | Loss 0.571522
Epoch 28 | Step 00000 | Loss 0.560554
Epoch 28 | Step 00200 | Loss 0.566919
Epoch 28 | Step 00400 | Loss 0.567684
Epoch 28 | Step 00600 | Loss 0.568508
Epoch 28 | Step 00800 | Loss 0.569248
Epoch 28 | Step 01000 | Loss 0.569279
Epoch 28 | Step 01200 | Loss 0.569365
Epoch 28 | Step 01400 | Loss 0.569589
Epoch 28 | Step 01600 | Loss 0.569080
Epoch 28 | Step 01800 | Loss 0.568967
Epoch 28 | S

In [79]:
testData = DataLoader(
    Dataset(testHis, testTar),
    batch_size = 8,
    shuffle=True
)

with th.no_grad():
    ie = model.itemEmbeds.weight
    ie /= th.norm(ie, dim=1).view(ie.shape[0], 1) + 1e-9
    n, top = ie.shape[0], 30
    hit, total = 0, 0
    for his, tar in testData:
        bs = his.shape[0]
        caps = model.B2IRouting(his, bs)
        # TODO: should change to label aware attention
        logits = th.matmul(caps, th.transpose(ie, 0, 1)).detach().numpy()

        # index就是encId, 对第三维进行快速选择，第kth小元素(从0开始)的原始索引值将位于其排序后的最终位置，其他元素小于他的在左，大于他的在右
        res = np.argpartition(logits, kth=n - top, axis=2)[:, :, -top:]

        for r, truth in zip(res, tar):
            # 合并K个兴趣的召回结果
            r = set(r.flatten())
            if (truth.item() in r): hit += 1
            total += 1

    print("precision@{}: {}".format(model.K * top, hit / total))

precision@90: 0.0058033789204125245


In [89]:
th.norm(ie, dim=1)


tensor([0., 1., 1.,  ..., 1., 1., 1.], grad_fn=<CopyBackwards>)

### 模型过程实验代码 

In [None]:
D = 2
K = 4
R = 1
L = 3

itemEmbeds = th.nn.Embedding(len(itemEncId), D, padding_idx=0)

"""
    Get number of interest number using equation (9) in the paper
    @x: (batch_size, seq_len), input batch user history item seq
    @K: basic interest number

    @output: (batch_size, )
"""
def getK(x, K):
    return th.maximum(th.minimum(th.log2(x.count_nonzero(dim=1)), th.tensor([K])), th.tensor([1])).type(th.int8)

"""
    squash function using equation (7) in the paper
    @caps: (batch_size, k, dim), interest capsules
    
    @output: (batch_size, k, dim)
"""
def squash(caps):
    l2Norm = th.norm(caps, dim=2) # (batch_size, k)
    l2NormSqure = th.pow(l2Norm, 2)

    return (l2NormSqure / (1 + l2NormSqure)).view(bs, K, 1) * (caps / l2Norm.view(bs, K, 1))

# weights initialization, 
# init b, bji = b[j][i]
b = th.empty(K, L)
th.nn.init.normal_(b, mean=0.0, std=1.0)
# one S for all routing operations, first dim is for batch broadcasting
S = th.empty(D, D)
th.nn.init.normal_(S, mean=0.0, std=1.0)

his = th.tensor([[1, 2, 0], [3, 2, 0]])
tar = th.tensor([3, 1])
batch_labels = th.tensor([1, 0])

# B2I dynamic routing, input behaviors, output caps
bs = his.shape[0]
# k is fixed for batch forward, cannot find a way to use variant k with batch
#k = getK(his, K) 
I = itemEmbeds(his) # (batch_size, len, dim)
for i in range(R):
    w = th.softmax(b, dim=1) # (K, L)
    I = th.matmul(I, S) # (batch_size, len, dim), bilinear transform
    caps = squash(th.matmul(w, I)) # (batch_size, K, dim)
    _b = th.matmul(caps, th.transpose(I, 1, 2)) # (batch_size, K, L), _bji = _b[j][i]
    # sum over batch dim first, then add to b
    b += th.sum(_b, dim=0) # (K, L)

# label-aware attention, input caps and targets, output logits
tar = itemEmbeds(tar) # (batch_size, dim)
# in-batch negative sampling
"""
pos:
            caps                     y                  weights
    (batch_size, K, dim) * (batch_size, dim, 1) = (batch_size, K, 1)

            weights                caps
    (batch_size, 1, K) * (batch_size, K, dim) = (batch_size, dim)

neg:
            caps                     y                  weights
    (batch_size, K, dim) * (batch_size, dim, nNeg) = (batch_size, K, nNeg)

            weights                caps
    (batch_size, nNeg, K) * (batch_size, K, dim) = (batch_size, nNeg, dim)

"""
his = th.matmul(
    th.softmax(
        th.pow(th.transpose(th.matmul(caps, tar.view(bs, D, 1)), 1, 2), 2),
        dim=2
    ), 
    caps
).view(bs, D) # (batch_size, dim)

# pos logits
tmp = 0.01
his = his / th.norm(his, dim=1).view(bs, 1)
tar = tar / th.norm(tar, dim=1).view(bs, 1)
posLogits = th.sigmoid(th.sum(his * tar, dim=1) / tmp)

# neg logits
nNeg = 5
tarNeg = tar[th.multinomial(th.ones(bs), nNeg * bs, replacement=True)].view(bs, nNeg, D) # (batch_size, nNeg, D)
yNegT = th.transpose(tar[th.multinomial(th.ones(bs), nNeg * bs, replacement=True)].view(bs, nNeg, D), 1, 2) # (batch_size, D, nNeg)
hisNeg = th.matmul(
    th.softmax(
        th.pow(th.transpose(th.matmul(caps, yNegT), 1, 2), 2),
        dim=2
    ),  # (batch_size, nNeg, K)
    caps
) # (batch_size, nNeg, D)
# hisNeg[b][i].dot(tarNeg[b][i]) for all b, i
negLogits = th.sigmoid(th.sum(hisNeg * tarNeg, dim=2).view(bs * nNeg) / tmp)

logits = th.concat([posLogits, negLogits])
labels = th.concat([th.ones(bs, ), th.zeros(bs * nNeg)])

# loss
CELoss = th.nn.BCELoss()
loss = CELoss(logits, labels)


In [None]:
"""
wij * Se

wji is more convenient

[w00, w01, w02] * each sample seq -> cap0
[w10, w11, w12] * each sample seq -> cap1
[w20, w21, w22] * each sample seq -> cap2


[[w00, w01, w02]                            
 [w10, w11, w12]    *  each sample seq -> (k, dim)
 [w20, w21, w22]]
"""
w