In [9]:
# Colab setting
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/SDSC3002_Project/')
os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/SDSC3002_Project'

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [40]:
class BPR(nn.Module):
    def __init__(self):
        super(BPR, self).__init__()
        self.W = None             # user matrix
        self.H = None             # item matrix

        self.uid = None
        self.iid = None

        # 用户u对应他访问过的所有items集合
        self.train_user_items = None
        self.test_user_items = None

        # (u, i, rating) dataset
        self.train = None
        self.test = None

    def _split(self, df, ratio):
        train = pd.DataFrame(columns = df.columns, dtype=int)
        test = pd.DataFrame(columns = df.columns, dtype=int)
        for i in self.uid:
            train_1, test_1 = train_test_split(df[df.iloc[:, 0] == i], train_size = ratio, shuffle = True, random_state = 5)
            train = pd.concat([train, train_1])
            test = pd.concat([test, test_1])
        return train, test

    def preprocess(self, df, train_size=0.8):
        df = df.rename(columns = {df.columns[0]: 'ori_uid', df.columns[1]: 'ori_iid', df.columns[2]: 'rating'})
        df = df.groupby('ori_uid').filter(lambda x: x['ori_uid'].count()>=10)
        uid_map = pd.DataFrame({"ori_uid": np.asarray(list(set(df.iloc[:,0].values)))})
        uid_map["serial_uid"] = uid_map.index
        iid_map = pd.DataFrame({"ori_iid": np.asarray(list(set(df.iloc[:,1].values)))})
        iid_map["serial_iid"] = iid_map.index

        self.uid = uid_map["serial_uid"].values
        self.iid = iid_map["serial_iid"].values

        df = df.merge(uid_map, left_on = 'ori_uid', right_on = 'ori_uid', how="left")
        df = df.merge(iid_map, left_on = 'ori_iid', right_on = 'ori_iid', how="left")
        df = df[['serial_uid', 'serial_iid', 'rating']]

        train, test = self._split(df, train_size)

        self.train_user_items = train.groupby(train.columns[0])[train.columns[1]].apply(lambda x: list(x)).to_list()
        self.test_user_items = test.groupby(test.columns[0])[test.columns[1]].apply(lambda x: list(x)).to_list()

        self.train = train
        self.test = test

        count = self.train.groupby(['serial_uid', 'serial_iid']).size().reset_index(name='count')
        self.train = pd.merge(self.train.copy(), count, on=['serial_uid', 'serial_iid'], how='inner')


    def generate_train_batch(self, batch, sets,m):
        train_u = []
        train_i = []
        train_js = []
        for b in range(batch):
            u = self.uid[np.random.randint(0, len(self.uid))]
            i = sets[u][np.random.randint(0, len(sets[u]))]
            js = random.sample(list(set(self.iid) - set(sets[u])), m)

            train_u.append(u)
            train_i.append(i)
            train_js.append(js)
        return np.array(train_u), np.array(train_i), np.array(train_js)

    def fit(self, k, m=10, stepsize=0.05, max_iter=10, batch=10000, n=10):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(device)

        self.W = nn.Parameter(torch.rand(len(self.uid), k).to(device) * 0.01)    # 初始化 W，H
        self.H = nn.Parameter(torch.rand(len(self.iid), k).to(device) * 0.01)

        loss = 0
        criterion = torch.nn.BCELoss()
        optimizer = optim.Adam([self.W, self.H], lr=stepsize)     # 主模型优化器
        for x in range(max_iter):
            #取训练批次：uij三元组
            u, i, js = self.generate_train_batch(batch, self.train_user_items, m)
            # u = uij[:, 0].astype(int)
            # i = uij[:, 1].astype(int)
            # js = uij[:, 2]

            u_emb = self.W[u]
            i_emb = self.H[i]

            # TODO: optimize positve
            optimizer.zero_grad()
            pos_pred = torch.sigmoid(torch.sum(u_emb * i_emb,dim = 1))
#             pos_pred = (pos_pred >= 0.5).to(torch.int64)

            bceloss_pos = criterion(pos_pred, torch.FloatTensor([1]*batch).to(device))
            bceloss_pos.backward(retain_graph=True)
            optimizer.step()

            loss += bceloss_pos

            # TODO: optimize negative

            for i in range(m):
                optimizer.zero_grad()
                j_emb = self.H[js[:, i]]
                neg_pred = torch.sigmoid(torch.sum(u_emb * j_emb,dim = 1))
                # neg_pred = (neg_pred >= 0.5).to(torch.int64)
                bceloss_neg = criterion(neg_pred, torch.FloatTensor([0]*batch).to(device))
                bceloss_neg.backward(retain_graph=True)
                optimizer.step()

                loss += bceloss_neg

            loss = loss/(m+1)

            print(f"Train | {x+1}/{max_iter}, BPR loss: {loss / batch}")
            with torch.no_grad():
                rec, pre, ndcg = self.performance(n)
            print(f'Valid | {x+1}/{max_iter}, Pre@{n}: {pre}, Rec@{n}: {rec}, NDCG@{n}: {ndcg}')
            print('---------------------------------------------------')

    def _predict(self, uid, items, n):

        scores = torch.mv(self.H[items], self.W[uid])
        if n > scores.shape[0]:
            n = scores.shape[0]
        top_N_val, top_N_idx = torch.topk(scores, k=n)
        return list(zip(items[top_N_idx.cpu()], top_N_val.cpu()))

    def NDCG(self, uid, n):         # 用模型排序+真实分数计算 DCG, 重排后计算 iDCG
        # test 集中，uid 评过的 items
        test_user = self.test[self.test.iloc[:, 0] == uid]

        # 对这些 items 做 top-k
        rating = self._predict(uid, test_user.iloc[:, 1].values, n)

        # 排序真实评分
        irating =sorted(test_user.iloc[:, 2].values, reverse=True)
        irating = np.asarray(irating)

        if n > len(irating): n = len(irating)

        # 取出模型排序下 merge 到的真实分数
        rating_df = pd.DataFrame(rating, columns=['serial_iid', 'pred_rating'])
        merged_df = pd.merge(rating_df, test_user, on='serial_iid')
        r = np.array(merged_df['rating'])

        # 求 log 分母
        log = np.log(np.arange(2, n + 2))

        # 求 dcg 和 idcg
        dcg = np.log(2) * np.sum((2**r[:n] - 1) / log)
        idcg = np.log(2) * np.sum((2**irating[:n] - 1) / log)

        return dcg / idcg

    def performance(self, n):      # Output recall@n, precision@n, NDCG@n
        hit = 0
        n_recall = 0
        n_precision = 0
        ndcg = 0

        for i in self.uid:
            # Items that User i hasn't tried in training set
            unknown_items = np.setdiff1d(self.iid, self.train_user_items[i])
            # Items that User i actually tried in testing set
            known_items = self.test_user_items[i]

            #目标：预测 unknown items 中的top_N，若击中test中的items，则为有效预测
            ru = self._predict(i, unknown_items, n)

            hit += sum(1 for item, pui in ru if item in known_items)
            n_recall += len(known_items)
            n_precision += n
            ndcg += self.NDCG(i, n)

        recall = hit / (1.0 * n_recall)
        precision = hit / (1.0 * n_precision)
        ndcg /= len(self.uid)
        return recall, precision, ndcg

In [12]:
df1 = pd.read_csv("./ml-100k/u.data", sep="\t", names=['user id', 'item id', 'rating', 'timestamp'])
df2 = pd.read_csv("./ml-1m/ratings.dat", sep="::", names=['user id', 'item id', 'rating', 'timestamp'], engine='python')

### 100K

In [41]:
model1 = BPR()
model1.preprocess(df1)

In [60]:
%%time
model1.fit(k = 50, max_iter = 100, stepsize=1e-4, m=10)

Train | 1/100, BPR loss: 6.93640613462776e-05
Valid | 1/100, Pre@10: 0.1151643690349947, Rec@10: 0.05328492223149011, NDCG@10: 0.7244074890664074
---------------------------------------------------
Train | 2/100, BPR loss: 7.263950101332739e-05
Valid | 2/100, Pre@10: 0.13149522799575822, Rec@10: 0.060840979343506206, NDCG@10: 0.7302770170592982
---------------------------------------------------
Train | 3/100, BPR loss: 7.277988333953544e-05
Valid | 3/100, Pre@10: 0.11060445387062566, Rec@10: 0.05117511407683627, NDCG@10: 0.7206196409033162
---------------------------------------------------
Train | 4/100, BPR loss: 7.278125849552453e-05
Valid | 4/100, Pre@10: 0.07783669141039236, Rec@10: 0.0360139345468819, NDCG@10: 0.7096642750416402
---------------------------------------------------
Train | 5/100, BPR loss: 7.278017437784001e-05
Valid | 5/100, Pre@10: 0.0799575821845175, Rec@10: 0.03699524066532555, NDCG@10: 0.7093194396639823
---------------------------------------------------
Tra

KeyboardInterrupt: 

### 1M

In [46]:
model2 = BPR()
model2.preprocess(df2)

In [62]:
%%time
model2.fit(k = 50, max_iter = 100, stepsize=5e-4, m=10)

Train | 1/100, BPR loss: 6.935479905223474e-05
Valid | 1/100, Pre@10: 0.11827814569536424, Rec@10: 0.03528755106173839, NDCG@10: 0.7463039175082755
---------------------------------------------------
Train | 2/100, BPR loss: 7.562005339423195e-05
Valid | 2/100, Pre@10: 0.10225165562913907, Rec@10: 0.030506147166474852, NDCG@10: 0.7241160138369586
---------------------------------------------------
Train | 3/100, BPR loss: 7.617966184625402e-05
Valid | 3/100, Pre@10: 0.12377483443708609, Rec@10: 0.03692745405060978, NDCG@10: 0.7267446190120156
---------------------------------------------------
Train | 4/100, BPR loss: 7.621314580319449e-05
Valid | 4/100, Pre@10: 0.1579635761589404, Rec@10: 0.04712745306271641, NDCG@10: 0.7500836802670748
---------------------------------------------------
Train | 5/100, BPR loss: 7.61510236770846e-05
Valid | 5/100, Pre@10: 0.17009933774834438, Rec@10: 0.05074808225200172, NDCG@10: 0.7545036812478874
---------------------------------------------------
T