In [1]:
import pandas as pd
import numpy as np
import torch as th
from torch.utils.data.dataloader import DataLoader

In [2]:
MIN_USER_FREQ = 20
MIN_ITEM_FREQ = 100

ratings = pd.read_csv("data/Books.csv", header=None)
ratings.columns = ['userId', 'itemId', 'rate', 'timestamp']

# item filtering
itemFreq = ratings.groupby(['itemId'])['itemId'].count()
validSet = set(itemFreq.loc[itemFreq >= MIN_ITEM_FREQ].index)
ratings = ratings.loc[ratings['itemId'].apply(lambda x: x in validSet), :]

# user filtering
userFreq = ratings.groupby(['userId'])['userId'].count()
validSet = set(userFreq.loc[userFreq >= MIN_USER_FREQ].index)
ratings = ratings.loc[ratings['userId'].apply(lambda x: x in validSet), :]


In [3]:
# encode his
ukv, ikv = list(enumerate(ratings['userId'].unique())), list(enumerate(ratings['itemId'].unique()))
userRawId = {encId: rawId for encId, rawId in ukv}
userEncId = {rawId: encId for encId, rawId in ukv}

# encode tar, id 0 is for padding, item encode id start from 1
itemRawId = {encId + 1: rawId for encId, rawId in ikv}
itemEncId = {rawId: encId + 1 for encId, rawId in ikv}

# 编码
ratings['userId'] = ratings['userId'].apply(lambda x: userEncId[x])
ratings['itemId'] = ratings['itemId'].apply(lambda x: itemEncId[x])

ratings.sort_values(by=['userId', 'timestamp'], inplace=True, ignore_index=True)

In [4]:
winSize = 20

def padOrCut(seq, l):
    if (len(seq) < l): return np.concatenate([seq, (l - len(seq)) * [0]])
    # return last len
    elif (len(seq) > l): return seq[len(seq) - l:]
    else: return seq

# split train and test users
import random
trainUsers, testUsers = set(), set()
for userId in range(len(userRawId)):
    if (random.random() <= 0.8): trainUsers.add(userId)
    else: testUsers.add(userId)

# generate user sample by sliding window
def genUserTrainSamples(userDf):
    userDf.reset_index(drop=True, inplace=True)
    his, tar = [], []
    for i in range(1, userDf.shape[0]): # enumerate y from 1
        # x = window [i - winSize, i - 1], y = item[i]
        his.append(padOrCut(userDf.iloc[max(0, i - winSize):i]['itemId'].values, winSize))
        tar.append(userDf.iloc[i]['itemId'])

    return np.stack(his), np.stack(tar)

def genUserTestSamples(userDf):
    userDf.reset_index(drop=True, inplace=True)
    idx = int(0.8 * userDf.shape[0])
    his = userDf['itemId'].iloc[:idx]
    tar = userDf['itemId'].iloc[idx:]

    return his, tar

boolIdx = ratings['userId'].apply(lambda x: x in trainUsers)
trainRatings = ratings.loc[boolIdx, :]
testRatings = ratings.loc[~boolIdx, :]

trainSamples = trainRatings.groupby('userId').apply(genUserTrainSamples)
trainHis = np.concatenate(trainSamples.apply(lambda x: x[0]).values)
trainTar = np.concatenate(trainSamples.apply(lambda x: x[1]).values)

testSamples = testRatings.groupby('userId').apply(genUserTestSamples)
testHis = testSamples.apply(lambda x: x[0]).values
testTar = testSamples.apply(lambda x: x[1]).values


In [14]:
trainHis = trainHis.astype(np.int32)
trainTar = trainTar.astype(np.int32)
testHis = testHis.astype(np.int32)
testTar = testTar.astype(np.int32)

In [15]:
import pickle

with open("data/trainHis.pkl", "wb") as f:
    f.write(pickle.dumps(trainHis))

with open("data/trainTar.pkl", "wb") as f:
    f.write(pickle.dumps(trainTar))

with open("data/testHis.pkl", "wb") as f:
    f.write(pickle.dumps(testHis))

with open("data/testTar.pkl", "wb") as f:
    f.write(pickle.dumps(testTar))

In [16]:
class Dataset:
    def __init__(self, his, tar):
        self.his = his
        self.tar = tar
        assert self.his.shape[0] == self.tar.shape[0]
    
    def __getitem__(self, i):
        return self.his[i], self.tar[i]
    
    def __len__(self):
        return self.his.shape[0]

trainData = DataLoader(
    Dataset(trainHis, trainTar),
    batch_size = 1028,
    shuffle=True
)

In [21]:
class MIND(th.nn.Module):
    def __init__(self, D, K, R, L, nNeg, embedNum):
        super(MIND, self).__init__()
        self.D = D
        self.K = K
        self.R = R
        self.L = L
        self.nNeg = nNeg
        # weights initialization
        self.itemEmbeds = th.nn.Embedding(embedNum, D, padding_idx=0)
        # matmul([batch_size, k, 1, dim], [k, dim, dim']) = [batch_size, k, 1, dim']
        self.dense1 = th.nn.Linear(D, 4 * D)
        self.dense2 = th.nn.Linear(4 * D, D)
        # one S for all routing operations, first dim is for batch broadcasting
        S = th.empty(D, D)
        th.nn.init.normal_(S, mean=0.0, std=1.0)
        self.S = th.nn.Parameter(S) # don't forget to make S as model parameter
        # fixed routing logits once initialized
        self.B = th.nn.init.normal(th.empty(K, L), mean=0.0, std=1.0)
        self.opt = th.optim.Adam(self.parameters(), lr=0.05)

    # output caps' length is in (0, 1)
    def squash(self, caps, bs):
        n = th.norm(caps, dim=2).view(bs, self.K, 1)
        nSquare = th.pow(n, 2)

        return (nSquare / ((1 + nSquare) * n + 1e-9)) * caps
    
    def B2IRouting(self, his, bs):
        """B2I dynamic routing, input behaviors, output caps
        """
        # init b, bji = b[j][i] rather than ij for matmul convinience
        # no grad for b: https://github.com/Ugenteraan/CapsNet-PyTorch/blob/master/CapsNet-PyTorch.ipynb
        b = self.B.detach()
        # except for first routing round, each sample's w is different, so need a dim for batch
        b = th.tile(b, (bs, 1, 1))

        his = self.itemEmbeds(his) # (batch_size, len, dim)
        his = th.matmul(his, self.S) # (batch_size, L, dim)
        
        for i in range(self.R):
            w = th.softmax(b, dim=2)
            if i < self.R - 1:
                with th.no_grad():
                    # weighted sum all i to each j
                    caps = th.matmul(w, his) # (bs, K, D)
                    caps = self.squash(caps, bs)
                    b = th.matmul(th.matmul(caps, self.S), th.transpose(his, 1, 2)).detach() # (bs, K, L)
            else:
                caps = th.matmul(w, his) # (bs, K, D)
                caps = self.squash(caps, bs)
                # skip routing logits update in last round

        # mlp
        caps = self.dense2(th.relu(self.dense1(caps))) # (bs, K, D)
        ## l2 norm
        #caps = caps / (th.norm(caps, dim=2).view(bs, self.K, 1) + 1e-9)
        
        return caps
    
    def labelAwareAttation(self, caps, tar, p=2):
        """label-aware attention, input caps and targets, output logits
            caps: (bs, K, D)
            tar: (bs, cnt, D)
            for postive tar, cnt = 1
            for negative tar, cnt = self.nNeg
        """
        tar = tar.transpose(1, 2) # (bs, D, cnt)
        w = th.softmax(
                # (bs, K, D) X (bs, D, cnt) -> (bs, K, cnt) -> (bs, cnt, K)
                th.pow(th.transpose(th.matmul(caps, tar), 1, 2), p),
                dim=2
            )
        w = w.unsqueeze(2) # (bs, cnt, K) -> (bs, cnt, 1, K)

        # (bs, cnt, 1, K) X (bs, 1, K, D) -> (bs, cnt, 1, D) -> (bs, cnt, D)
        caps = th.matmul(w, caps.unsqueeze(1)).squeeze(2)

        return caps

    def sampledSoftmax(self, caps, tar, bs, tmp=0.01):
        tarPos = self.itemEmbeds(tar) # (bs, D)
        capsPos = self.labelAwareAttation(caps, tarPos.unsqueeze(1)).squeeze(1) # (bs, D)
        # pos logits
        #his = his / (th.norm(his, dim=1).view(bs, 1) + 1e-9)
        #tar = tar / (th.norm(tar, dim=1).view(bs, 1) + 1e-9)
        # (bs, D) dot (bs, D) -> (bs, D) - sum > (bs, )
        posLogits = th.sigmoid(th.sum(capsPos * tarPos, dim=1) / tmp)

        # neg logits
        # in-batch negative sampling
        tarNeg = tarPos[th.multinomial(th.ones(bs), self.nNeg * bs, replacement=True)].view(bs, self.nNeg, self.D) # (batch_size, nNeg, D)
        capsNeg = self.labelAwareAttation(caps, tarNeg)
        # hisNeg[b][i].dot(tarNeg[b][i]) for all b, i
        negLogits = th.sigmoid(th.sum(capsNeg * tarNeg, dim=2).view(bs * self.nNeg) / tmp)

        logits = th.concat([posLogits, negLogits])
        labels = th.concat([th.ones(bs, ), th.zeros(bs * self.nNeg)])

        return logits, labels

In [22]:
NUM_EPOCHS = 5
model = MIND(D=8, K=3, R=3, L=winSize, nNeg=5, embedNum=len(itemEncId) + 1)
BCELoss = th.nn.BCELoss()
for epoch in range(NUM_EPOCHS):
    epochTotalLoss = 0
    for step, (his, tar) in enumerate(trainData):
        bs = his.shape[0]
        caps = model.B2IRouting(his, bs)
        logits, labels = model.sampledSoftmax(caps, tar, bs)

        loss = BCELoss(logits, labels)
        loss.backward()
        model.opt.step()
        model.opt.zero_grad()
        epochTotalLoss += loss
        if (step % 200 == 0):
            print('Epoch {:02d} | Step {:05d} | Loss {:.6f}'.format(
                epoch,
                step,
                epochTotalLoss / (step + 1),
            ))



Epoch 00 | Step 00000 | Loss 30.478069
Epoch 00 | Step 00200 | Loss 2.164661
Epoch 00 | Step 00400 | Loss 1.334919
Epoch 00 | Step 00600 | Loss 1.061007
Epoch 00 | Step 00800 | Loss 0.929948
Epoch 00 | Step 01000 | Loss 0.849167


KeyboardInterrupt: 

In [79]:
testData = DataLoader(
    Dataset(testHis, testTar),
    batch_size = 8,
    shuffle=True
)

with th.no_grad():
    ie = model.itemEmbeds.weight
    ie /= th.norm(ie, dim=1).view(ie.shape[0], 1) + 1e-9
    n, top = ie.shape[0], 30
    hit, total = 0, 0
    for his, tar in testData:
        bs = his.shape[0]
        caps = model.B2IRouting(his, bs)
        logits = th.matmul(caps, th.transpose(ie, 0, 1)).detach().numpy()

        # TODO: K个兴趣的所有logits和在一起取topN
        res = np.argpartition(logits, kth=n - top, axis=2)[:, :, -top:]

        for r, truth in zip(res, tar):
            # 合并K个兴趣的召回结果
            r = set(r.flatten())
            if (truth.item() in r): hit += 1
            total += 1

    print("precision@{}: {}".format(model.K * top, hit / total))

precision@90: 0.0058033789204125245


In [89]:
th.norm(ie, dim=1)


tensor([0., 1., 1.,  ..., 1., 1., 1.], grad_fn=<CopyBackwards>)

### 模型过程实验代码 

In [None]:
D = 2
K = 4
R = 1
L = 3

itemEmbeds = th.nn.Embedding(len(itemEncId), D, padding_idx=0)

"""
    Get number of interest number using equation (9) in the paper
    @x: (batch_size, seq_len), input batch user history item seq
    @K: basic interest number

    @output: (batch_size, )
"""
def getK(x, K):
    return th.maximum(th.minimum(th.log2(x.count_nonzero(dim=1)), th.tensor([K])), th.tensor([1])).type(th.int8)

"""
    squash function using equation (7) in the paper
    @caps: (batch_size, k, dim), interest capsules
    
    @output: (batch_size, k, dim)
"""
def squash(caps):
    l2Norm = th.norm(caps, dim=2) # (batch_size, k)
    l2NormSqure = th.pow(l2Norm, 2)

    return (l2NormSqure / (1 + l2NormSqure)).view(bs, K, 1) * (caps / l2Norm.view(bs, K, 1))

# weights initialization, 
# init b, bji = b[j][i]
b = th.empty(K, L)
th.nn.init.normal_(b, mean=0.0, std=1.0)
# one S for all routing operations, first dim is for batch broadcasting
S = th.empty(D, D)
th.nn.init.normal_(S, mean=0.0, std=1.0)

his = th.tensor([[1, 2, 0], [3, 2, 0]])
tar = th.tensor([3, 1])
batch_labels = th.tensor([1, 0])

# B2I dynamic routing, input behaviors, output caps
bs = his.shape[0]
# k is fixed for batch forward, cannot find a way to use variant k with batch
#k = getK(his, K) 
I = itemEmbeds(his) # (batch_size, len, dim)
for i in range(R):
    w = th.softmax(b, dim=1) # (K, L)
    I = th.matmul(I, S) # (batch_size, len, dim), bilinear transform
    caps = squash(th.matmul(w, I)) # (batch_size, K, dim)
    _b = th.matmul(caps, th.transpose(I, 1, 2)) # (batch_size, K, L), _bji = _b[j][i]
    # sum over batch dim first, then add to b
    b += th.sum(_b, dim=0) # (K, L)

# label-aware attention, input caps and targets, output logits
tar = itemEmbeds(tar) # (batch_size, dim)
# in-batch negative sampling
"""
pos:
            caps                     y                  weights
    (batch_size, K, dim) * (batch_size, dim, 1) = (batch_size, K, 1)

            weights                caps
    (batch_size, 1, K) * (batch_size, K, dim) = (batch_size, dim)

neg:
            caps                     y                  weights
    (batch_size, K, dim) * (batch_size, dim, nNeg) = (batch_size, K, nNeg)

            weights                caps
    (batch_size, nNeg, K) * (batch_size, K, dim) = (batch_size, nNeg, dim)

"""
his = th.matmul(
    th.softmax(
        th.pow(th.transpose(th.matmul(caps, tar.view(bs, D, 1)), 1, 2), 2),
        dim=2
    ), 
    caps
).view(bs, D) # (batch_size, dim)

# pos logits
tmp = 0.01
his = his / th.norm(his, dim=1).view(bs, 1)
tar = tar / th.norm(tar, dim=1).view(bs, 1)
posLogits = th.sigmoid(th.sum(his * tar, dim=1) / tmp)

# neg logits
nNeg = 5
tarNeg = tar[th.multinomial(th.ones(bs), nNeg * bs, replacement=True)].view(bs, nNeg, D) # (batch_size, nNeg, D)
yNegT = th.transpose(tar[th.multinomial(th.ones(bs), nNeg * bs, replacement=True)].view(bs, nNeg, D), 1, 2) # (batch_size, D, nNeg)
hisNeg = th.matmul(
    th.softmax(
        th.pow(th.transpose(th.matmul(caps, yNegT), 1, 2), 2),
        dim=2
    ),  # (batch_size, nNeg, K)
    caps
) # (batch_size, nNeg, D)
# hisNeg[b][i].dot(tarNeg[b][i]) for all b, i
negLogits = th.sigmoid(th.sum(hisNeg * tarNeg, dim=2).view(bs * nNeg) / tmp)

logits = th.concat([posLogits, negLogits])
labels = th.concat([th.ones(bs, ), th.zeros(bs * nNeg)])

# loss
CELoss = th.nn.BCELoss()
loss = CELoss(logits, labels)


In [None]:
"""
wij * Se

wji is more convenient

[w00, w01, w02] * each sample seq -> cap0
[w10, w11, w12] * each sample seq -> cap1
[w20, w21, w22] * each sample seq -> cap2


[[w00, w01, w02]                            
 [w10, w11, w12]    *  each sample seq -> (k, dim)
 [w20, w21, w22]]
"""
w