In [13]:
import json
import random
import itertools
import pandas as pd
import numpy as np
import torch as th


class Args:
    data_frac = 0.05
    min_user_freq = 10
    min_book_freq = 10
    max_user_freq = 200
    train_frac = 0.95
    his_len = 100
    n_neg = 10

    embed_dim = 4
    low_cap_dim = 4
    high_cap_dim = 4
    routing_rounds = 3

args = Args()

In [14]:
ratings = pd.read_csv("data/ratings_Books.csv", header=None, nrows=500000)
ratings.columns = ['userId', 'itemId', 'rate', 'timestamp']

# user filtering
userFreq = ratings.groupby(['userId'])['userId'].count()
validSet = set(userFreq.loc[userFreq >= 20].index)
ratings = ratings.loc[ratings['userId'].apply(lambda x: x in validSet), :]

# item filtering
movieFreq = ratings.groupby(['itemId'])['itemId'].count()
validSet = set(movieFreq.loc[movieFreq >= 10].index)
ratings = ratings.loc[ratings['itemId'].apply(lambda x: x in validSet), :]

In [15]:
# encode users
ukv, ikv = list(enumerate(ratings['userId'].unique())), list(enumerate(ratings['itemId'].unique()))
userRawId = {encId: rawId for encId, rawId in ukv}
userEncId = {rawId: encId for encId, rawId in ukv}

# encode items, id 0 is for padding, item encode id start from 1
itemRawId = {encId + 1: rawId for encId, rawId in ikv}
itemEncId = {rawId: encId + 1 for encId, rawId in ikv}

# 编码
ratings['userId'] = ratings['userId'].apply(lambda x: userEncId[x])
ratings['itemId'] = ratings['itemId'].apply(lambda x: itemEncId[x])

ratings.sort_values(by=['userId', 'timestamp'], inplace=True, ignore_index=True)

In [16]:
ratings

Unnamed: 0,userId,itemId,rate,timestamp
0,0,249,4.0,891734400
1,0,165,4.0,893376000
2,0,113,5.0,928022400
3,0,319,5.0,949363200
4,0,439,3.0,1014163200
...,...,...,...,...
9509,646,385,5.0,1224892800
9510,646,386,5.0,1257120000
9511,647,385,4.0,1295136000
9512,647,475,5.0,1296259200


In [17]:
def padOrCut(seq, l):
    if (len(seq) < l): return np.concatenate([seq, (l - len(seq)) * [0]])
    elif (len(seq) > l): return seq[:l]
    else: return seq

# generate user sample by sliding window
def genUserSamples(df):
    winSize = 15
    df.reset_index(drop=True, inplace=True)
    X, Y = [], []
    for i in range(1, df.shape[0]): # enumerate y
        # x = window [i - winSize, i - 1], y = item[i]
        X.append(padOrCut(df.iloc[max(0, i - winSize):i]['itemId'].values, winSize))
        Y.append(df.iloc[i]['itemId'])
    # split train and test as 9:1
    i = int(len(X) * 0.9)
    trainX, testX = X[:i], X[i:]
    trainY, testY = Y[:i], Y[i:]

    return (np.stack(trainX), np.stack(trainY)), (np.stack(testX), np.stack(testY))

ratings = ratings.groupby('userId').filter(lambda x: x.shape[0] >= 10).groupby('userId').apply(genUserSamples)
trainX = np.concatenate(ratings.apply(lambda x: x[0][0]).values)
trainY = np.concatenate(ratings.apply(lambda x: x[0][1]).values)
testX = np.concatenate(ratings.apply(lambda x: x[1][0]).values)
testY = np.concatenate(ratings.apply(lambda x: x[1][1]).values)

In [18]:
trainX

array([[249,   0,   0, ...,   0,   0,   0],
       [249, 165,   0, ...,   0,   0,   0],
       [249, 165, 113, ...,   0,   0,   0],
       ...,
       [381, 353, 246, ...,   0,   0,   0],
       [381, 353, 246, ...,   0,   0,   0],
       [381, 353, 246, ...,   0,   0,   0]], dtype=int64)

In [4]:
class Dataset:
    def __init__(self, samples):
        self.samples = samples
        self.users, self.histories, self.tars, self.labels, self.cap_nums = [], [], [], [], []
        for sample in samples:
            self.users.append(sample['user'])
            self.histories.append(sample['his'])
            self.tars.append(sample['tar'])
            self.labels.append(sample['label'])
            self.cap_nums.append(sample['cap_num'])
        self.users = th.tensor(self.users, dtype=th.int32)
        self.histories = th.tensor(self.histories, dtype=th.int32)
        self.tars = th.tensor(self.tars, dtype=th.int32)
        self.labels = th.tensor(self.labels, dtype=th.float)
        self.cap_nums = th.tensor(self.cap_nums, dtype=th.int32)
    
    def __getitem__(self, idx):
        return self.users[idx], self.histories[idx], self.tars[idx], self.labels[idx], self.cap_nums[idx]
    
    def __len__(self):
        return len(self.samples)

test_dataset = Dataset(test_samples)

test_dataloader = \
    th.utils.data.DataLoader(test_dataset, batch_size=2, shuffle=True)

for batch in test_dataloader:
    print(batch)
    break

[tensor([ 991, 1419], dtype=torch.int32), tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0, 10670,
           720,  8137,  6122,  3848, 11125,  9861,  7772,  9289,  8462,  8916,
          6065,  1100,  8791,  9993,  6612,  3766,  2207,   985,  1204,  9675,
         10062,  2676,   998, 12731,  3525,  4816,  9201,  2549,  7251,  2996,
          2709, 11376, 12791,  9222,  7696,  7779,  1578,  3996,  4603,  2121],
        [ 8748,  8361,  5610,  8590,  2146,  1289,  4157,   846,  6493, 11486,
          7967, 12198,  3308,  3705,  3538, 11498, 10359,  5002,  5921,  9384,
         

In [None]:
class Capsule(th.nn.Module):
    def __init__(self, ):
        pass


class MIND(th.nn.Module):
    def __init__(self, args, n_users, n_books):
        self.embed_dim = args.embed_dim
        self.n_caps_high = args.k
        self.n_caps_low = args.his_len
        # trainable weights
        self.user_embeds = th.nn.Embedding(n_users, args.dim)
        self.book_embeds = th.nn.Embedding(n_books, args.dim)
        self.S = th.nn.Linear(args.low_cap_dim, args.high_cap_dim)
    
    @staticmethod
    def squash(x):
        """
            @x: (batch_size, d)
        """
        l2_norm = th.linalg.norm(x, dim = 1, ord=2) # (batch_size, )
        l2_norm_squared = th.pow(l2_norm, 2) # (batch_size, )
        scale = l2_norm_squared / (1 + l2_norm_squared) / l2_norm # (batch_size, )

        return th.multiply(x, th.unsqueeze(scale, dim=1)) # (batch_size, d)
        

    def forward(self, batch):
        user_id, history, tars, labels, num_caps = batch[0], batch[1], batch[2], batch[3], batch[4]
        his_embeds = self.book_embeds(history) # (batch_size, his_len, dim)]



In [71]:
x = th.tensor([[1, 2, 3, 4], [1.1, 2.2, 3.3, 4.4]], dtype=th.float32)


tensor([0.1767, 0.1615])


tensor([[0.1767, 0.3534, 0.5301, 0.7067],
        [0.1777, 0.3554, 0.5330, 0.7107]])

In [57]:
print(x)
print(l2_norm)
th.div(x, l2_norm)

tensor([[1., 2., 3., 4.],
        [1., 2., 3., 4.]])
tensor([5.4772, 5.4772])


RuntimeError: The size of tensor a (4) must match the size of tensor b (2) at non-singleton dimension 1

In [62]:
b = th.tensor([ 0.8032,  0.2930, -0.8113, -0.2308])
print(b)
print(b / b)

tensor([ 0.8032,  0.2930, -0.8113, -0.2308])
tensor([1., 1., 1., 1.])


In [72]:
import math

x = [1.1, 2.2, 3.3, 4.4]

l2 = math.sqrt(sum([xx * xx for xx in x]))
l2_squred = l2 * l2


scale = l2_squred / (1 + l2_squred) / l2

[scale * xx for xx in x]

[0.17767943554457133,
 0.35535887108914266,
 0.533038306633714,
 0.7107177421782853]

In [None]:
5.477225575051661