In [None]:
# Colab setting
# Mount Google Drive and make the project folder the working directory so the
# relative dataset paths used later in the notebook resolve.
import os
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded Drive location; adjust when running outside this project layout.
os.chdir('/content/drive/MyDrive/SDSC3002_Project/')
# Trailing expression: displays the resulting working directory as the cell output.
os.getcwd()

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [30]:
class BPR(nn.Module):
    """Matrix-factorisation recommender trained with the BPR pairwise loss.

    Typical use: call ``preprocess(df)`` on a ``(user, item, rating)`` frame to
    build serial ids and a per-user train/test split, then ``fit(k)`` to learn
    the user matrix ``W`` and item matrix ``H``.  ``performance(n)`` reports
    recall@n, precision@n and NDCG@n on the held-out split.
    """

    def __init__(self):
        super(BPR, self).__init__()
        self.W = None             # user matrix, created in fit()
        self.H = None             # item matrix, created in fit()

        self.uid = None           # serial user ids (0 .. n_users - 1)
        self.iid = None           # serial item ids (0 .. n_items - 1)

        # For each serial user id u, the list of items that user interacted with.
        self.train_user_items = None
        self.test_user_items = None

        # (u, i, rating) datasets.
        # NOTE(review): the attribute name ``train`` shadows nn.Module.train(), so
        # model.train() / model.eval() cannot be called on instances.  Kept
        # unchanged because external code reads ``model.train``.
        self.train = None
        self.test = None

    def _split(self, df, ratio):
        """Split ``df`` into train/test frames separately for every user.

        Args:
            df: frame whose first column holds the serial user id.
            ratio: ``train_size`` forwarded to ``train_test_split`` per user.

        Returns:
            ``(train, test)`` DataFrames with the same columns as ``df``.
        """
        user_col = df.iloc[:, 0]      # hoisted: identical for every iteration
        train_parts = []
        test_parts = []
        for user in self.uid:
            part_train, part_test = train_test_split(
                df[user_col == user], train_size=ratio, shuffle=True, random_state=5)
            train_parts.append(part_train)
            test_parts.append(part_test)

        if not train_parts:
            # No users at all: pd.concat([]) would raise, so return empty frames.
            return (pd.DataFrame(columns=df.columns, dtype=int),
                    pd.DataFrame(columns=df.columns, dtype=int))

        # Concatenate once instead of growing a frame inside the loop (quadratic).
        return pd.concat(train_parts), pd.concat(test_parts)

    def preprocess(self, df, train_size=0.8):
        """Re-index users/items, split per user and cache per-user item lists.

        The first three columns of ``df`` are interpreted as user id, item id
        and rating.  Users with fewer than 10 ratings are dropped.  Original
        ids are mapped to serial ids following the sorted order of the
        original ids, so the mapping is deterministic.

        The per-user lists are indexed by serial user id, which assumes every
        user ends up with at least one row in both the train and test split.

        Args:
            df: ratings frame with at least three columns.
            train_size: fraction of each user's ratings used for training.
        """
        df = df.rename(columns={df.columns[0]: 'ori_uid', df.columns[1]: 'ori_iid', df.columns[2]: 'rating'})

        # Keep only users with at least 10 ratings (vectorised form of groupby.filter).
        ratings_per_user = df.groupby('ori_uid')['ori_uid'].transform('count')
        df = df[ratings_per_user >= 10]

        # np.unique returns the sorted distinct ids plus, for each row, the
        # position of its id in that sorted array, i.e. the serial id.
        users, user_codes = np.unique(df['ori_uid'].values, return_inverse=True)
        items, item_codes = np.unique(df['ori_iid'].values, return_inverse=True)

        self.uid = np.arange(len(users))
        self.iid = np.arange(len(items))

        df = pd.DataFrame({
            'serial_uid': user_codes,
            'serial_iid': item_codes,
            'rating': df['rating'].values,
        })

        train, test = self._split(df, train_size)

        # groupby sorts by serial user id, so list position == serial user id.
        self.train_user_items = train.groupby(train.columns[0])[train.columns[1]].apply(lambda x: list(x)).to_list()
        self.test_user_items = test.groupby(test.columns[0])[test.columns[1]].apply(lambda x: list(x)).to_list()

        self.train = train
        self.test = test

    def generate_train_batch(self, batch, sets):
        """Sample ``batch`` random (u, i, j) triples.

        ``u`` is a random user, ``i`` an item from ``sets[u]`` and ``j`` an
        item that is not in ``sets[u]``.

        Args:
            batch: number of triples to draw.
            sets: per-user item collections, indexable by serial user id.

        Returns:
            int64 array of shape ``(batch, 3)``.

        Raises:
            ValueError: if a sampled user already has every item, since no
                negative item exists (previously this looped forever).
        """
        seen_cache = {}      # user -> set of positive items, for O(1) membership tests
        triples = []
        for _ in range(batch):
            u = self.uid[np.random.randint(0, len(self.uid))]
            seen = seen_cache.get(u)
            if seen is None:
                seen = set(sets[u])
                if len(seen) >= len(self.iid) and seen.issuperset(self.iid):
                    raise ValueError(f"user {u} has interacted with every item; cannot sample a negative item")
                seen_cache[u] = seen
            i = sets[u][np.random.randint(0, len(sets[u]))]
            j = self.iid[np.random.randint(0, len(self.iid))]
            while j in seen:
                j = self.iid[np.random.randint(0, len(self.iid))]
            triples.append([u, i, j])
        # reshape keeps a well-formed (0, 3) array when batch == 0
        return np.asarray(triples, dtype=np.int64).reshape(-1, 3)

    def fit(self, k, stepsize=0.05, max_iter=10, batch=10000, n=10):
        """Train ``W`` and ``H`` with Adam on randomly sampled triples.

        Each iteration draws one batch of (u, i, j) triples, takes one
        optimiser step on the summed BPR loss and prints the mean training
        loss followed by the held-out precision/recall/NDCG at ``n``.

        Args:
            k: number of latent factors.
            stepsize: Adam learning rate.
            max_iter: number of optimisation steps.
            batch: triples sampled per step.
            n: cut-off used for the evaluation metrics.
        """
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Initialise W and H with small uniform values directly on the target device.
        self.W = nn.Parameter(torch.rand(len(self.uid), k, device=device) * 0.01)
        self.H = nn.Parameter(torch.rand(len(self.iid), k, device=device) * 0.01)

        optimizer = optim.Adam([self.W, self.H], lr=stepsize)     # optimiser for the main model
        for x in range(max_iter):
            # Draw the training batch of (u, i, j) triples.
            uij = self.generate_train_batch(batch, self.train_user_items)
            triples = torch.as_tensor(uij, dtype=torch.long, device=device)

            u_emb = self.W[triples[:, 0]]
            i_emb = self.H[triples[:, 1]]
            j_emb = self.H[triples[:, 2]]

            optimizer.zero_grad()
            margin = torch.sum(u_emb * (i_emb - j_emb), dim=1)
            # logsigmoid is the numerically stable form of log(sigmoid(x));
            # the naive composition yields -inf once sigmoid underflows to 0.
            loss = -torch.sum(nn.functional.logsigmoid(margin))
            loss.backward()
            optimizer.step()

            print(f"Train | {x+1}/{max_iter}, BPR loss: {loss.item() / batch}")

            rec, pre, ndcg = self.performance(n)
            print(f'Valid | {x+1}/{max_iter}, Pre@{n}: {pre}, Rec@{n}: {rec}, NDCG@{n}: {ndcg}')
            print('---------------------------------------------------')

    def _predict(self, uid, items, n):
        """Return the top-``n`` of ``items`` for user ``uid`` by predicted score.

        Args:
            uid: serial user id.
            items: array-like of candidate serial item ids.
            n: number of items wanted; clipped to ``len(items)``.

        Returns:
            List of ``(item, score)`` pairs ordered by descending score, where
            ``score`` is a plain Python float.
        """
        items = np.asarray(items)
        # Inference only: skip autograd bookkeeping and return detached values.
        with torch.no_grad():
            item_idx = torch.as_tensor(items, dtype=torch.long, device=self.H.device)
            scores = torch.mv(self.H[item_idx], self.W[uid])
            top_k = min(n, scores.shape[0])
            top_val, top_idx = torch.topk(scores, k=top_k)
        chosen = items[top_idx.cpu().numpy()]
        return list(zip(chosen, top_val.cpu().tolist()))

    def NDCG(self, uid, n):
        """NDCG@n for one user, computed over that user's test items only.

        DCG uses the true ratings in the order produced by the model; iDCG
        uses the same ratings sorted in descending order.

        Args:
            uid: serial user id.
            n: cut-off; clipped to the number of test items of the user.

        Returns:
            ``dcg / idcg``, or ``0.0`` when the user has no test items, ``n``
            is not positive, or the ideal DCG is zero.
        """
        # Items rated by uid in the test set.
        test_user = self.test[self.test.iloc[:, 0] == uid]
        if n <= 0 or len(test_user) == 0:
            return 0.0

        # Model ranking (top-k) restricted to those items.
        ranked = self._predict(uid, test_user.iloc[:, 1].values, n)

        # True ratings in ideal (descending) order.
        ideal = np.sort(np.asarray(test_user.iloc[:, 2].values))[::-1]

        n = min(n, len(ideal))

        # Look up the true rating of each item, keeping the model's order
        # (an inner merge preserves the order of the left keys).
        rating_df = pd.DataFrame(ranked, columns=['serial_iid', 'pred_rating'])
        merged_df = pd.merge(rating_df, test_user, on='serial_iid')
        r = np.array(merged_df['rating'])

        # Logarithmic position discounts ln(2), ln(3), ...; the ln(2) factor
        # below converts them to log2.
        discounts = np.log(np.arange(2, n + 2))

        dcg = np.log(2) * np.sum((2**r[:n] - 1) / discounts)
        idcg = np.log(2) * np.sum((2**ideal[:n] - 1) / discounts)

        if idcg == 0:
            return 0.0
        return dcg / idcg

    def performance(self, n):      # Output recall@n, precision@n, NDCG@n
        """Evaluate the model on the held-out split.

        For every user the items absent from that user's training list are
        ranked and the top ``n`` are compared against the user's test items.

        Args:
            n: recommendation list length.

        Returns:
            ``(recall, precision, ndcg)``: recall and precision are pooled
            over all users, NDCG is the mean of the per-user values.  Each is
            ``0.0`` when its denominator is zero.
        """
        hit = 0
        n_recall = 0
        n_precision = 0
        ndcg = 0

        for user in self.uid:
            # Items that the user has not tried in the training set.
            unknown_items = np.setdiff1d(self.iid, self.train_user_items[user])
            # Items that the user actually tried in the testing set.
            known_items = self.test_user_items[user]
            known_lookup = set(known_items)

            # Goal: predict the top-n among the unknown items; a prediction
            # counts as a hit when it appears among the user's test items.
            recommended = self._predict(user, unknown_items, n)

            hit += sum(1 for item, _ in recommended if item in known_lookup)
            n_recall += len(known_items)
            n_precision += n
            ndcg += self.NDCG(user, n)

        recall = hit / (1.0 * n_recall) if n_recall else 0.0
        precision = hit / (1.0 * n_precision) if n_precision else 0.0
        ndcg = ndcg / len(self.uid) if len(self.uid) else 0.0
        return recall, precision, ndcg

In [4]:
# Load the two rating files into DataFrames with identical column names.
# df1: tab-separated file from the ml-100k folder.
df1 = pd.read_csv("./ml-100k/u.data", sep="\t", names=['user id', 'item id', 'rating', 'timestamp'])
# df2: "::"-separated file from the ml-1m folder; the multi-character
# separator is why the slower python parsing engine is selected.
df2 = pd.read_csv("./ml-1m/ratings.dat", sep="::", names=['user id', 'item id', 'rating', 'timestamp'], engine='python')

### 100K

In [31]:
# Build the model for the 100K data and prepare its serial ids and
# per-user train/test split (default train_size=0.8).
model1 = BPR()
model1.preprocess(df1)

In [5]:
%%time
# Train with 50 latent factors for 100 steps; stepsize, batch and n keep
# their defaults (0.05, 10000, 10). Metrics are printed after every step.
model1.fit(k = 50, max_iter = 100)

Train | 1/100, BPR loss: 0.69314052734375
Valid | 1/100, Pre@10: 0.018663838812301166, Recl@10: 0.008635493842304106, NDCG@10: 0.6945220327132166
---------------------------------------------------
Train | 2/100, BPR loss: 0.6847896484375
Valid | 2/100, Pre@10: 0.11717921527041357, Recl@10: 0.05421716304401158, NDCG@10: 0.7314359293011433
---------------------------------------------------
Train | 3/100, BPR loss: 0.614569580078125
Valid | 3/100, Pre@10: 0.13488865323435842, Recl@10: 0.06241106913301604, NDCG@10: 0.7322834425931577
---------------------------------------------------
Train | 4/100, BPR loss: 0.50909267578125
Valid | 4/100, Pre@10: 0.1354188759278897, Recl@10: 0.06265639566262696, NDCG@10: 0.7335292770190222
---------------------------------------------------
Train | 5/100, BPR loss: 0.400266357421875
Valid | 5/100, Pre@10: 0.1395546129374337, Recl@10: 0.06456994259359207, NDCG@10: 0.7364553319342508
---------------------------------------------------
Train | 6/100, BPR 

Train | 43/100, BPR loss: 0.13249813232421875
Valid | 43/100, Pre@10: 0.26871686108165427, Recl@10: 0.12433148520681027, NDCG@10: 0.7386429335652778
---------------------------------------------------
Train | 44/100, BPR loss: 0.12674990234375
Valid | 44/100, Pre@10: 0.2715800636267232, Recl@10: 0.1256562484667092, NDCG@10: 0.7396281238602823
---------------------------------------------------
Train | 45/100, BPR loss: 0.12901617431640625
Valid | 45/100, Pre@10: 0.2708377518557794, Recl@10: 0.1253127913252539, NDCG@10: 0.7397452604619902
---------------------------------------------------
Train | 46/100, BPR loss: 0.1233433349609375
Valid | 46/100, Pre@10: 0.2704135737009544, Recl@10: 0.12511653010156518, NDCG@10: 0.7376478393393293
---------------------------------------------------
Train | 47/100, BPR loss: 0.11816593017578125
Valid | 47/100, Pre@10: 0.27104984093319195, Recl@10: 0.12541092193709827, NDCG@10: 0.7378320018403769
---------------------------------------------------
Trai

Valid | 84/100, Pre@10: 0.25949098621420996, Recl@10: 0.12006280359158039, NDCG@10: 0.7424279925506984
---------------------------------------------------
Train | 85/100, BPR loss: 0.09132813720703126
Valid | 85/100, Pre@10: 0.25705196182396606, Recl@10: 0.1189343015553702, NDCG@10: 0.7424149198976535
---------------------------------------------------
Train | 86/100, BPR loss: 0.08782440185546875
Valid | 86/100, Pre@10: 0.25885471898197243, Recl@10: 0.1197684117560473, NDCG@10: 0.7433990462119722
---------------------------------------------------
Train | 87/100, BPR loss: 0.08135389404296875
Valid | 87/100, Pre@10: 0.2569459172852598, Recl@10: 0.11888523624944801, NDCG@10: 0.741515400371564
---------------------------------------------------
Train | 88/100, BPR loss: 0.08995355224609375
Valid | 88/100, Pre@10: 0.2598091198303287, Recl@10: 0.12020999950934694, NDCG@10: 0.7421071929620879
---------------------------------------------------
Train | 89/100, BPR loss: 0.0894281005859375
V

### 1M

In [6]:
# Build the model for the 1M data and prepare its serial ids and
# per-user train/test split (default train_size=0.8).
model2 = BPR()
model2.preprocess(df2)

In [7]:
%%time
# Same configuration as the 100K run except for a 10x smaller learning
# rate (0.005 instead of the default 0.05).
model2.fit(k = 50, max_iter = 100, stepsize=0.005)

Train | 1/100, BPR loss: 0.6931466796875
Valid | 1/100, Pre@10: 0.016109271523178807, Recl@10: 0.004806101229433295, NDCG@10: 0.6685183457893602
---------------------------------------------------
Train | 2/100, BPR loss: 0.68762822265625
Valid | 2/100, Pre@10: 0.06372516556291391, Recl@10: 0.019012007843873333, NDCG@10: 0.7081679037241245
---------------------------------------------------
Train | 3/100, BPR loss: 0.656017578125
Valid | 3/100, Pre@10: 0.10892384105960265, Recl@10: 0.03249675230055668, NDCG@10: 0.7281154896414167
---------------------------------------------------
Train | 4/100, BPR loss: 0.60083056640625
Valid | 4/100, Pre@10: 0.12693708609271523, Recl@10: 0.037870892215894215, NDCG@10: 0.7381618853583445
---------------------------------------------------
Train | 5/100, BPR loss: 0.52819755859375
Valid | 5/100, Pre@10: 0.13905629139072848, Recl@10: 0.041486581938345576, NDCG@10: 0.7456189506339896
---------------------------------------------------
Train | 6/100, BPR

Train | 43/100, BPR loss: 0.308310595703125
Valid | 43/100, Pre@10: 0.13208609271523178, Recl@10: 0.03940706640125265, NDCG@10: 0.7426009804482531
---------------------------------------------------
Train | 44/100, BPR loss: 0.318236962890625
Valid | 44/100, Pre@10: 0.13458609271523178, Recl@10: 0.04015292589317909, NDCG@10: 0.7427478563576005
---------------------------------------------------
Train | 45/100, BPR loss: 0.313182177734375
Valid | 45/100, Pre@10: 0.13605960264900663, Recl@10: 0.040592538441400636, NDCG@10: 0.7436070919590209
---------------------------------------------------
Train | 46/100, BPR loss: 0.306265625
Valid | 46/100, Pre@10: 0.13716887417218543, Recl@10: 0.04092348271927528, NDCG@10: 0.743810729570939
---------------------------------------------------
Train | 47/100, BPR loss: 0.3026563720703125
Valid | 47/100, Pre@10: 0.13844370860927152, Recl@10: 0.041303821665489425, NDCG@10: 0.7441601850494735
---------------------------------------------------
Train | 4

Train | 85/100, BPR loss: 0.222470654296875
Valid | 85/100, Pre@10: 0.1770860927152318, Recl@10: 0.0528325372559286, NDCG@10: 0.7315264421631569
---------------------------------------------------
Train | 86/100, BPR loss: 0.220691015625
Valid | 86/100, Pre@10: 0.17875827814569537, Recl@10: 0.05333142340615754, NDCG@10: 0.7319085590159774
---------------------------------------------------
Train | 87/100, BPR loss: 0.2126048828125
Valid | 87/100, Pre@10: 0.18082781456953642, Recl@10: 0.05394885676040128, NDCG@10: 0.7315499824171668
---------------------------------------------------
Train | 88/100, BPR loss: 0.224698095703125
Valid | 88/100, Pre@10: 0.18228476821192052, Recl@10: 0.054383529841788875, NDCG@10: 0.7315228056851971
---------------------------------------------------
Train | 89/100, BPR loss: 0.2110837158203125
Valid | 89/100, Pre@10: 0.18329470198675496, Recl@10: 0.05468483731865982, NDCG@10: 0.7314029500003244
---------------------------------------------------
Train | 90