In [1]:
# Colab-only setup: mount Google Drive and make the project folder the working
# directory so the relative paths used later in the notebook resolve.
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Jester-jokes')
# Last expression of the cell: displays the directory we ended up in.
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Jester-jokes'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from Dataset import DataSet

In [3]:
class BPR(nn.Module, DataSet):
    """Bayesian Personalized Ranking (BPR) matrix factorisation with an optional
    data-selection scorer.

    Two training modes are provided:

    * ``fit_ori`` - plain BPR: SGD on ``-sum(log sigmoid(w_u . (h_i - h_j)))``.
    * ``fit_dds`` - BPR where every training triple is weighted by a second
      "scorer" factorisation (``Wsc``, ``Hsc``). The scorer is updated with the
      per-row inner product of the training and dev gradients as reward.

    ``preprocess`` must be called before either ``fit_*`` method because it
    populates ``rawdf``, ``uid`` and ``iid``.
    """

    def __init__(self):
        super(BPR, self).__init__()
        self.rawdf = None         # ratings table, filled by preprocess()

        self.W = None             # user matrix
        self.H = None             # item matrix
        self.Wsc = None           # scorer: user factors
        self.Hsc = None           # scorer: item factors

        self.uid = None           # array of user indices (row positions of rawdf)
        self.iid = None           # array of item indices (column positions of rawdf)

        self.user_items = {}      # user -> list of training items
        self.dev_user_items = {}  # user -> list of dev items

        self.rating_exp = None        # per-user softmax denominator (shifted by the row max)
        self.rating_exp_mul_H = None  # per-user sum of exp(score) * item scorer embedding

    def preprocess(self, train_size=0.7, test_size=0.1, data_path="./FINAL jester 2006-15.xlsx"):
        """Load the ratings workbook and split each user's rated items three ways.

        Args:
            train_size: Fraction of the data intended for training.
            test_size: Fraction of the data intended for testing.
            data_path: Excel file to read. Defaults to the path that was
                previously hard-coded.

        Returns:
            ``(train_set, val_set, test_set)`` as produced by
            ``self.split_data_randomly`` (not defined in this class; presumably
            inherited from ``DataSet`` - confirm). The validation share is
            whatever remains: ``1 - train_size - test_size``.
        """
        self.rawdf = pd.read_excel(data_path, header=None, usecols="B:EU", names=list(range(150)))
        # Entries equal to 99 become NaN, so the dropna() below treats them as unrated.
        self.rawdf = self.rawdf[self.rawdf != 99]

        self.uid = np.arange(self.rawdf.shape[0])
        self.iid = np.arange(self.rawdf.shape[1])

        # For every user (row): the column labels that hold a rating.
        data = self.rawdf.apply(lambda row: row.dropna().index.tolist(), axis=1).tolist()

        train_val_set, test_set = self.split_data_randomly(data, test_ratio=test_size)
        # Ratio is relative to what is left after the test split.
        train_set, val_set = self.split_data_randomly(train_val_set, test_ratio=1 - train_size / (1 - test_size))

        return train_set, val_set, test_set

    def generate_train_batch(self, batch, sets):
        """Sample ``batch`` (user, positive item, negative item) triples.

        Args:
            batch: Number of triples to draw.
            sets: Mapping ``user -> sequence of positive items``.

        Returns:
            Array of shape ``(batch, 3)`` with columns ``u, i, j``.

        Raises:
            ValueError: If no user has both a positive and a negative item.

        Users without any positive item, or whose positives cover every item,
        cannot yield a triple. They are skipped instead of raising from
        ``randint(0, 0)`` or spinning forever in the rejection loop. This
        assumes every positive item is a member of ``self.iid``.
        """
        n_users = len(self.uid)
        n_items = len(self.iid)

        def positives_if_usable(user):
            # Returns the user's positive set, or None when no triple can be drawn.
            lookup = set(sets[user])
            return lookup if 0 < len(lookup) < n_items else None

        fallback_users = None  # built lazily, only if an unusable user is ever drawn
        train = []
        for _ in range(batch):
            u = self.uid[np.random.randint(0, n_users)]
            positive_lookup = positives_if_usable(u)
            if positive_lookup is None:
                if fallback_users is None:
                    fallback_users = [c for c in self.uid if positives_if_usable(c) is not None]
                    if not fallback_users:
                        raise ValueError("no user has both a positive and a negative item to sample")
                u = fallback_users[np.random.randint(0, len(fallback_users))]
                positive_lookup = set(sets[u])

            positives = sets[u]
            i = positives[np.random.randint(0, len(positives))]
            # Rejection-sample a negative item; set membership keeps each check O(1).
            j = self.iid[np.random.randint(0, n_items)]
            while j in positive_lookup:
                j = self.iid[np.random.randint(0, n_items)]
            train.append([u, i, j])
        return np.asarray(train)

    def forward(self, uids, iids, device):
        """Scorer probability of item ``iids[b]`` among user ``uids[b]``'s training items.

        For each batch row this is a softmax, over the items in
        ``self.user_items[u]``, of the scorer dot products ``Wsc[u] . Hsc[item]``.

        Side effects: ``self.rating_exp`` and ``self.rating_exp_mul_H`` are
        re-created (zeros) and filled for the users in ``uids``. Both are shifted
        by the same per-row maximum, so only their ratio is meaningful.

        Args:
            uids: User indices of the batch.
            iids: Item index per batch row; expected to belong to that user's
                training items.
            device: Device on which the per-user buffers are allocated.

        Returns:
            1-D tensor with one probability per batch row.
        """
        self.rating_exp = torch.zeros(len(self.uid)).to(device)
        self.rating_exp_mul_H = torch.zeros([len(self.uid), self.H.shape[1]]).to(device)

        # Look up every user's item list and pad the item embeddings to a common length.
        emb_idxs = [self.user_items[uid] for uid in uids]
        item_emb = nn.utils.rnn.pad_sequence([self.Hsc[emb_idx] for emb_idx in emb_idxs], batch_first=True)
        user_emb = self.Wsc[uids][:, None, :]

        # Mask of real (non-padded) positions, derived from the list lengths so that a
        # genuine score of exactly 0 is not mistaken for padding.
        lengths = torch.tensor([len(emb_idx) for emb_idx in emb_idxs], device=item_emb.device)
        positions = torch.arange(item_emb.shape[1], device=item_emb.device)
        valid = positions[None, :] < lengths[:, None]

        # Scores of every (user, item) pair inside the batch.
        user_item_sc = torch.sum(item_emb * user_emb, dim=-1)
        # Row maximum over real entries only; padded zeros must not take part, otherwise
        # rows whose real scores are all very negative would underflow after the shift.
        max_sc_per_row = user_item_sc.masked_fill(~valid, float("-inf")).max(dim=1).values

        # Exponentiate the shifted scores; the mask keeps padded positions at 0.
        user_item_exp_sc = torch.exp(user_item_sc - max_sc_per_row[:, None]) * valid.to(user_item_sc.dtype)

        # Softmax numerator-weighted embedding sum and denominator (broadcast, no copies).
        self.rating_exp_mul_H[uids] = torch.sum(user_item_exp_sc.unsqueeze(2) * item_emb, dim=1)
        self.rating_exp[uids] = torch.sum(user_item_exp_sc, dim=1)

        # Softmax probability of item i among the user's items.
        return torch.exp(torch.sum(self.Wsc[uids] * self.Hsc[iids], dim=1) - max_sc_per_row) / self.rating_exp[uids]

    def fit_dds(self, df, dev, k, stepsize=0.1, max_iter=10, batch=10000, dev_batch=5000, score_stepsize=0.1):
        """Train BPR with scorer-weighted triples (data selection).

        W, H and the scorer factors are re-initialised on every call.

        Args:
            df: Per-user training item lists, aligned with ``self.uid``.
            dev: Per-user dev item lists, aligned with ``self.uid``.
            k: Latent dimension.
            stepsize: SGD learning rate for W and H.
            max_iter: Number of iterations (one batch each).
            batch: Training triples per iteration.
            dev_batch: Dev triples per iteration.
            score_stepsize: Step size of the scorer update.
        """
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        # Initialise W, H.
        self.W = nn.Parameter(torch.rand(len(self.uid), k).to(device) * 0.01)
        self.H = nn.Parameter(torch.rand(len(self.iid), k).to(device) * 0.01)

        # Initialise the scorer (plain tensors, updated manually below).
        self.Wsc = torch.rand(len(self.uid), k).to(device) * 0.01
        self.Hsc = torch.rand(len(self.iid), k).to(device) * 0.01
        # Dictionaries: user u -> the items u interacted with.
        self.user_items = dict(zip(self.uid, df))
        self.dev_user_items = dict(zip(self.uid, dev))

        # Optimiser of the main model.
        optimizer = optim.SGD([self.W, self.H], lr=stepsize)
        for x in range(max_iter):
            # Training batch of (u, i, j) triples.
            uij = self.generate_train_batch(batch, self.user_items)
            u = uij[:, 0]
            i = uij[:, 1]
            j = uij[:, 2]

            # Main-model update, each triple weighted by its scorer probability.
            score_prob = self.forward(u, i, device)
            optimizer.zero_grad()
            # logsigmoid is the numerically stable form of log(sigmoid(x)).
            log_lik = nn.functional.logsigmoid(torch.sum(self.W[u] * (self.H[i] - self.H[j]), dim=1))
            score_loss = -torch.sum(score_prob * log_lik)
            bpr_loss = -torch.mean(log_lik)  # reported only
            score_loss.backward()
            optimizer.step()

            # Gradients of W, H on the training batch.
            W_grad_sum = self.W.grad.clone()
            H_grad_sum = self.H.grad.clone()

            # Gradients of the log scorer probability w.r.t. Wsc and Hsc.
            log_prob_Wsc_grad = torch.zeros((len(self.uid), k)).to(device)
            log_prob_Hsc_grad = torch.zeros((len(self.iid), k)).to(device)
            log_prob_Wsc_grad[u] = self.Hsc[i] - self.rating_exp_mul_H[u] / self.rating_exp[u].unsqueeze(1)
            log_prob_Hsc_grad[i] = self.Wsc[u] * (1 - score_prob).unsqueeze(1)

            # Dev batch of (u, i, j) triples.
            uij = self.generate_train_batch(dev_batch, self.dev_user_items)
            u = uij[:, 0]
            i = uij[:, 1]
            j = uij[:, 2]

            # Gradients of W, H on the dev batch (no optimiser step is taken).
            optimizer.zero_grad()
            dev_loss = -torch.sum(nn.functional.logsigmoid(torch.sum(self.W[u] * (self.H[i] - self.H[j]), dim=1)))
            dev_loss.backward()
            W_grad_dev_sum = self.W.grad.clone()
            H_grad_dev_sum = self.H.grad.clone()

            # Reward: row-wise inner product of training and dev gradients.
            r_W = torch.sum(W_grad_sum * W_grad_dev_sum, dim=1).unsqueeze(1)
            r_H = torch.sum(H_grad_sum * H_grad_dev_sum, dim=1).unsqueeze(1)

            # Scorer update.
            self.Wsc += score_stepsize * r_W * log_prob_Wsc_grad
            self.Hsc += score_stepsize * r_H * log_prob_Hsc_grad

            if (x + 1) % 10 == 0:
                print(f"Iteration: {x+1}, BPR loss: {bpr_loss.item()}")

    def fit_ori(self, df, k, stepsize=0.05, max_iter=10, batch=10000):
        """Train plain BPR with SGD. W and H are re-initialised on every call.

        Args:
            df: Per-user training item lists, aligned with ``self.uid``.
            k: Latent dimension.
            stepsize: SGD learning rate.
            max_iter: Number of iterations (one batch each).
            batch: Triples per iteration.
        """
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        # Initialise W, H.
        self.W = nn.Parameter(torch.rand(len(self.uid), k).to(device) * 0.01)
        self.H = nn.Parameter(torch.rand(len(self.iid), k).to(device) * 0.01)

        # Dictionary: user u -> the items u interacted with.
        self.user_items = dict(zip(self.uid, df))

        # Optimiser of the main model.
        optimizer = optim.SGD([self.W, self.H], lr=stepsize)
        for x in range(max_iter):
            # Training batch of (u, i, j) triples.
            uij = self.generate_train_batch(batch, self.user_items)
            u = uij[:, 0]
            i = uij[:, 1]
            j = uij[:, 2]

            optimizer.zero_grad()
            # logsigmoid is the numerically stable form of log(sigmoid(x)).
            loss = -torch.sum(nn.functional.logsigmoid(torch.sum(self.W[u] * (self.H[i] - self.H[j]), dim=1)))
            loss.backward()
            optimizer.step()

            if (x + 1) % 10 == 0:
                print(f"Iteration: {x+1}, BPR loss: {loss.item() / batch}")

    def _predict(self, uid, items, n):
        """Return the top-``n`` of ``items`` for user ``uid`` by model score.

        Args:
            uid: User index.
            items: NumPy integer array of candidate item indices.
            n: Maximum number of results; capped at ``len(items)``.

        Returns:
            List of ``(item, score)`` pairs in descending score order.
        """
        scores = torch.mv(self.H[items], self.W[uid])
        n = min(n, scores.shape[0])
        top_N_val, top_N_idx = torch.topk(scores, k=n)
        # Convert the index tensor to a NumPy array explicitly rather than relying on
        # NumPy's implicit handling of tensor indices.
        return list(zip(items[top_N_idx.cpu().numpy()], top_N_val.cpu()))

    def NDCG(self, uid, test, n):
        """NDCG@n for one user, computed over that user's test items only.

        DCG uses the model's ordering of the test items with the true ratings as
        gains; iDCG uses the same ratings sorted in descending order.

        Returns 0 when the user has no test items or when iDCG is 0.
        """
        # Explicit integer dtype: an empty list would otherwise become a float array,
        # which is not a valid index.
        test_user = np.asarray(test[uid], dtype=np.int64)
        if test_user.size == 0:
            return 0
        rating = self._predict(uid, test_user, n)
        irating = sorted(self.rawdf.iloc[uid, test_user].to_list(), reverse=True)

        dcg = 0
        idcg = 0
        n = min(n, len(irating))
        for rank in range(n):
            r = self.rawdf.iloc[uid, rating[rank][0]]
            discount = math.log(rank + 2, 2)
            dcg += 1.0 * (2**r - 1) / discount
            idcg += 1.0 * (2**irating[rank] - 1) / discount
        if idcg == 0:
            return 0
        return dcg / idcg

    def performance(self, test, n):
        """Compute recall@n, precision@n and mean NDCG@n over all users.

        For each user the top-``n`` items not seen in training are predicted; a
        prediction counts as a hit when it appears in the user's ``test`` list.
        The precision denominator grows by ``n`` per user regardless of how many
        predictions were actually available.

        Returns:
            ``(recall, precision, ndcg)``. A ratio whose denominator is 0 is
            reported as 0.0.
        """
        hit = 0
        n_recall = 0
        n_precision = 0
        ndcg = 0
        for user in self.uid:
            # Items the user has not tried in the training set.
            unknown_items = np.setdiff1d(self.iid, self.user_items[user])
            # Items the user actually tried in the test set.
            known_items = test[user]
            known_lookup = set(known_items)

            # Predict the top-n unknown items; hits are those present in the test set.
            ru = self._predict(user, unknown_items, n)

            hit += sum(1 for item, _ in ru if item in known_lookup)
            n_recall += len(known_items)
            n_precision += n
            ndcg += self.NDCG(user, test, n)

        recall = hit / (1.0 * n_recall) if n_recall else 0.0
        precision = hit / (1.0 * n_precision) if n_precision else 0.0
        if len(self.uid):
            ndcg /= len(self.uid)
        return recall, precision, ndcg

In [4]:
# Build the model, then load the ratings and split them.
# NOTE(review): preprocess() returns (train_set, val_set, test_set) in that order,
# so `test1` receives the validation split and `dev1` receives the test split.
# The later cells evaluate on `test1` and use `dev1` as the dev set - confirm
# this naming is intentional.
model1 = BPR()
train1, test1, dev1 = model1.preprocess()

### Pure BPR

In [5]:
%%time
# Baseline: plain BPR, 500 SGD iterations with 20 latent factors
# (progress is printed every 10 iterations).
model1.fit_ori(train1, k = 20, max_iter = 500)

Iteration: 10, BPR loss: 0.610598876953125
Iteration: 20, BPR loss: 0.41485166015625
Iteration: 30, BPR loss: 0.3443927001953125
Iteration: 40, BPR loss: 0.3110560791015625
Iteration: 50, BPR loss: 0.29342548828125
Iteration: 60, BPR loss: 0.281359619140625
Iteration: 70, BPR loss: 0.27257861328125
Iteration: 80, BPR loss: 0.268765380859375
Iteration: 90, BPR loss: 0.263113916015625
Iteration: 100, BPR loss: 0.24789072265625
Iteration: 110, BPR loss: 0.2594363525390625
Iteration: 120, BPR loss: 0.262428759765625
Iteration: 130, BPR loss: 0.244652099609375
Iteration: 140, BPR loss: 0.25047158203125
Iteration: 150, BPR loss: 0.2465968505859375
Iteration: 160, BPR loss: 0.244673779296875
Iteration: 170, BPR loss: 0.235258984375
Iteration: 180, BPR loss: 0.231139013671875
Iteration: 190, BPR loss: 0.244002783203125
Iteration: 200, BPR loss: 0.243602099609375
Iteration: 210, BPR loss: 0.240953466796875
Iteration: 220, BPR loss: 0.2280675537109375
Iteration: 230, BPR loss: 0.2282104736328125

In [6]:
%%time
n = 10
rec, pre, ndcg = model1.performance(test1, n)
print(f'Precision@{n}: {pre}')
print(f'Recall@{n}: {rec}')
print(f'NDCG@{n}: {ndcg}')

Precision@10: 0.2720207631363264
Recall@10: 0.3780753604111028
NDCG@10: 0.6889992358777942
CPU times: user 1min 6s, sys: 410 ms, total: 1min 6s
Wall time: 1min 9s


### BPR + Data Selection

In [7]:
%%time
# BPR with data selection. fit_dds re-initialises W and H, so this trains from
# scratch on the same model object rather than continuing from the fit_ori run.
model1.fit_dds(train1, dev1, k = 20, max_iter = 500, score_stepsize=0.1, stepsize=1)

Iteration: 10, BPR loss: 0.47072479128837585
Iteration: 20, BPR loss: 0.36783748865127563
Iteration: 30, BPR loss: 0.31012463569641113
Iteration: 40, BPR loss: 0.29375311732292175
Iteration: 50, BPR loss: 0.27312150597572327
Iteration: 60, BPR loss: 0.2647259831428528
Iteration: 70, BPR loss: 0.2651131749153137
Iteration: 80, BPR loss: 0.2621009349822998
Iteration: 90, BPR loss: 0.2619037926197052
Iteration: 100, BPR loss: 0.2602253258228302
Iteration: 110, BPR loss: 0.25641772150993347
Iteration: 120, BPR loss: 0.23983149230480194
Iteration: 130, BPR loss: 0.2519305944442749
Iteration: 140, BPR loss: 0.25606396794319153
Iteration: 150, BPR loss: 0.2468380481004715
Iteration: 160, BPR loss: 0.2396354228258133
Iteration: 170, BPR loss: 0.23802855610847473
Iteration: 180, BPR loss: 0.24400494992733002
Iteration: 190, BPR loss: 0.24050316214561462
Iteration: 200, BPR loss: 0.2450360804796219
Iteration: 210, BPR loss: 0.23917467892169952
Iteration: 220, BPR loss: 0.226120263338089
Iteratio

In [8]:
%%time
n = 10
rec, pre, ndcg = model1.performance(test1, n)
print(f'Precision@{n}: {pre}')
print(f'Recall@{n}: {rec}')
print(f'NDCG@{n}: {ndcg}')

Precision@10: 0.27510791366906473
Recall@10: 0.38236611945776955
NDCG@10: 0.7010044041956096
CPU times: user 1min 6s, sys: 155 ms, total: 1min 6s
Wall time: 1min 7s
