In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [2]:
class BPR(object):
    """Bayesian Personalized Ranking (BPR) matrix factorisation trained with SGD.

    Input frames are addressed positionally: column 0 is the user id, column 1
    the item id and column 2 the rating (the rating is only read by ``NDCG``).

    Workflow: ``split`` builds the id vocabularies and a per-user train/test
    partition, ``fit`` learns the factor matrices ``W`` and ``H`` from the
    training frame, and ``predict`` / ``performance`` rank items with them.
    """

    def __init__(self):
        self.W = None             # user factor matrix (rows follow uid_dict indices)
        self.H = None             # item factor matrix (rows follow iid_dict indices)

        self.uid = None           # sorted unique user ids
        self.iid = None           # sorted unique item ids

        self.user_items = {}      # user id -> array of item ids the user has in the training frame

        self.uid_dict = None      # {original user id: row index in W}
        self.iid_dict = None      # {original item id: row index in H}
        self.uid_dict_rev = None  # {row index in W: original user id}
        self.iid_dict_rev = None  # {row index in H: original item id}

        self._train_users = None  # users usable for triple sampling; refreshed by fit()

    def split(self, df, ratio=0.8):
        """Build the id vocabularies from ``df`` and split it per user.

        Args:
            df: frame whose column 0 holds user ids and column 1 item ids.
            ratio: ``train_size`` handed to ``train_test_split`` for each user.

        Returns:
            ``(train, test)`` frames that keep the rows' original index.
            A user with a single interaction is placed entirely in ``train``,
            because a one-row group cannot be split.
        """
        self.uid = np.unique(df.iloc[:, 0].values)
        self.iid = np.unique(df.iloc[:, 1].values)

        self.uid_dict = {u: idx for idx, u in enumerate(self.uid)}
        self.iid_dict = {i: idx for idx, i in enumerate(self.iid)}
        self.uid_dict_rev = {idx: u for u, idx in self.uid_dict.items()}
        self.iid_dict_rev = {idx: i for i, idx in self.iid_dict.items()}
        self._train_users = None  # vocabularies changed; any cached sampling pool is stale

        # Collect the per-user pieces and concatenate once; concatenating inside
        # the loop copies the accumulated frame on every iteration.
        train_parts, test_parts = [], []
        if len(df):
            # sort=True visits users in ascending id order, one split call per user.
            for _, rows in df.groupby(df.iloc[:, 0].to_numpy(), sort=True):
                if len(rows) < 2:
                    train_parts.append(rows)
                    continue
                train_rows, test_rows = train_test_split(rows, train_size=ratio)
                train_parts.append(train_rows)
                test_parts.append(test_rows)

        empty = df.iloc[0:0]
        train = pd.concat(train_parts) if train_parts else empty.copy()
        test = pd.concat(test_parts) if test_parts else empty.copy()
        return train, test

    def _eligible_users(self):
        """Return the users that have at least one seen and one unseen item."""
        n_items = len(self.iid)
        users = np.asarray(self.uid)
        keep = [0 < np.unique(self.user_items.get(u, ())).size < n_items for u in users]
        return users[np.array(keep, dtype=bool)]

    @staticmethod
    def _one_minus_sigmoid(x):
        """Return 1 / (1 + e**x) without overflowing for large ``|x|``."""
        if x >= 0:
            z = math.exp(-x)  # underflows to 0.0 instead of raising
            return z / (1.0 + z)
        return 1.0 / (1.0 + math.exp(x))

    def generate_train_batch(self, df, batch):
        """Sample ``batch`` training triples ``[u, i, j]`` as serialised indices.

        ``u`` is a random user, ``i`` an item in ``user_items[u]`` and ``j`` an
        item outside it. ``df`` is accepted for interface compatibility and is
        not read.

        Raises:
            ValueError: if triples are requested but no user has both a seen
                and an unseen item (sampling could never succeed).
        """
        users = getattr(self, "_train_users", None)
        if users is None:
            users = self._eligible_users()
        if batch > 0 and len(users) == 0:
            raise ValueError("no user has both seen and unseen items; cannot sample triples")

        seen_cache = {}  # user id -> set of seen items, built on first use
        triples = []
        for _ in range(batch):
            u = np.random.choice(users, size=1)[0]
            i = np.random.choice(self.user_items[u], size=1)[0]
            seen = seen_cache.get(u)
            if seen is None:
                seen = seen_cache[u] = set(self.user_items[u])
            # Rejection-sample a negative item; terminates because eligible
            # users always have at least one unseen item.
            j = np.random.choice(self.iid, size=1)[0]
            while j in seen:
                j = np.random.choice(self.iid, size=1)[0]
            triples.append([self.uid_dict[u], self.iid_dict[i], self.iid_dict[j]])
        return triples

    def fit(self, df, k, stepsize=0.05, regulation_rate=0.0001, max_iter=50, batch=10000):
        """Learn ``W`` and ``H`` by stochastic gradient ascent on the BPR objective.

        Args:
            df: training frame (column 0 user id, column 1 item id).
            k: number of latent factors.
            stepsize: learning rate.
            regulation_rate: L2 regularisation strength.
            max_iter: number of sampled batches.
            batch: number of triples per batch.

        Raises:
            RuntimeError: if ``split`` has not populated the id vocabularies.
        """
        if self.uid is None or self.iid is None:
            raise RuntimeError("call split() before fit() so the id vocabularies exist")

        self.W = np.random.rand(len(self.uid), k) * 0.01      # initialise W and H
        self.H = np.random.rand(len(self.iid), k) * 0.01

        # Map every user to the items they have in the training frame.
        users_col = df.iloc[:, 0].to_numpy()
        items_col = df.iloc[:, 1].to_numpy()
        by_user = {u: grp.to_numpy() for u, grp in pd.Series(items_col).groupby(users_col)}
        no_items = np.empty(0, dtype=items_col.dtype)
        self.user_items = {u: by_user.get(u, no_items) for u in self.uid}
        self._train_users = self._eligible_users()

        for _ in range(max_iter):
            for u, i, j in self.generate_train_batch(df, batch):
                # Freeze the user vector so all three updates use the parameters
                # the gradient was evaluated at.
                wu = self.W[u].copy()
                diff = self.H[i] - self.H[j]
                xuij = np.dot(wu, diff)
                grad_scale = self._one_minus_sigmoid(xuij)   # d/dx ln(sigmoid(x))
                # Gradient ascent on ln(sigmoid(x)) - lambda * ||theta||^2: the
                # regulariser is subtracted so that it shrinks the weights.
                self.W[u] += stepsize * (grad_scale * diff - regulation_rate * wu)
                self.H[i] += stepsize * (grad_scale * wu - regulation_rate * self.H[i])
                self.H[j] += stepsize * (-grad_scale * wu - regulation_rate * self.H[j])

    def predict(self, user, n):      # Top-N recommendation
        """Return the ``n`` best ``(item id, score)`` pairs among items not in ``user_items[user]``."""
        items = np.asarray(self.iid)
        candidates = items[~np.isin(items, self.user_items[user])]
        return self._predict(user, candidates, n)

    def _predict(self, uid, items, n):
        """Score ``items`` for user ``uid`` and return the top ``n`` as ``(item id, score)``.

        Scores are dot products of the user's row of ``W`` with the items' rows
        of ``H``; the result is sorted by descending score.
        """
        if len(items) == 0:
            return []
        user = self.uid_dict[uid]
        rows = np.fromiter((self.iid_dict[it] for it in items), dtype=np.intp, count=len(items))
        scores = self.H[rows] @ self.W[user]   # one matrix-vector product for all items
        return sorted(zip(items, scores), key=lambda s: s[1], reverse=True)[:n]

    def NDCG(self, uid, test, n):
        """Return NDCG@n for one user.

        DCG uses the model's ranking of the user's test items with their true
        ratings as gains; IDCG uses the ratings sorted in descending order.
        Returns 0.0 when the user has no test rows or the ideal DCG is zero.
        """
        test_user = test[test.iloc[:, 0] == uid]
        if len(test_user) == 0:
            return 0.0
        items = test_user.iloc[:, 1].values
        ratings = test_user.iloc[:, 2].values

        ranked = self._predict(uid, items, n)
        ideal = sorted(ratings, reverse=True)
        n = min(n, len(ideal))

        # First occurrence wins, matching a positional lookup of the first matching row.
        true_rating = {}
        for item, r in zip(items, ratings):
            true_rating.setdefault(item, r)

        dcg = 0
        idcg = 0
        for pos in range(n):
            discount = math.log(pos + 2, 2)
            dcg += 1.0 * (2 ** true_rating[ranked[pos][0]] - 1) / discount
            idcg += 1.0 * (2 ** ideal[pos] - 1) / discount
        return dcg / idcg if idcg else 0.0

    def performance(self, test, n):      # Output recall@n, precision@n, NDCG@n
        """Evaluate top-``n`` recommendations against ``test``.

        For every user with at least one test row, the ``n`` best items not in
        the user's training items are recommended; a recommendation that is
        among the user's test items counts as a hit. Users without test rows
        are skipped.

        Returns:
            ``(recall, precision, ndcg)``; each is 0.0 when its denominator is zero.
        """
        # Group the test frame once instead of filtering it for every user.
        test_by_user = {}
        if len(test):
            test_by_user = {u: rows for u, rows in
                            test.groupby(test.iloc[:, 0].to_numpy(), sort=False)}

        hit = 0
        n_recall = 0
        n_precision = 0
        ndcg_sum = 0.0
        evaluated = 0
        for u in self.uid:
            test_u = test_by_user.get(u)
            if test_u is None:
                continue
            # Items the user has not interacted with in the training set.
            unknown_items = np.setdiff1d(self.iid, self.user_items[u])
            # Items the user actually interacted with in the test set.
            known_items = test_u.iloc[:, 1].values
            known_lookup = set(known_items)

            hit += sum(1 for item, _ in self._predict(u, unknown_items, n) if item in known_lookup)
            n_recall += len(known_items)
            n_precision += n
            ndcg_sum += self.NDCG(u, test_u, n)
            evaluated += 1

        recall = hit / (1.0 * n_recall) if n_recall else 0.0
        precision = hit / (1.0 * n_precision) if n_precision else 0.0
        ndcg = ndcg_sum / evaluated if evaluated else 0.0
        return recall, precision, ndcg

In [3]:
# Both rating files are read without a header row, so the column names are supplied here.
rating_columns = ['user id', 'item id', 'rating', 'timestamp']

# Tab-separated ratings (presumably MovieLens 100K, judging by the path).
df1 = pd.read_csv("./ml-100k/u.data", sep="\t", names=rating_columns)

# "::"-separated ratings (presumably MovieLens 1M); a multi-character separator
# needs the python parsing engine.
df2 = pd.read_csv(
    "./ml-1m/ratings.dat",
    sep="::",
    names=rating_columns,
    engine='python',
)

In [4]:
# Build the id vocabularies on model1 and split df1 per user (default ratio=0.8).
# The split is random, so the shapes printed below vary between runs unless a seed is set.
model1 = BPR()
train1, test1 = model1.split(df1)
print(train1.shape)
print(test1.shape)

(79619, 4)
(20381, 4)


In [5]:
# Train on the training split with 50 latent factors; other hyper-parameters keep their defaults.
model1.fit(train1, k = 50)

In [6]:
# Evaluate top-n recommendations of model1 against its held-out split.
# performance() returns (recall, precision, ndcg) in that order.
n = 10
rec, pre, ndcg = model1.performance(test1, n)
print(f'Precision@{n}: {pre}')
print(f'Recall@{n}: {rec}')
print(f'NDCG@{n}: {ndcg}')

Precision@10: 0.20965005302226936
Recall@10: 0.09700210980815466
NDCG@10: 0.7427278425337906


In [7]:
# Top-5 (item id, score) recommendations for user 1 among items absent from that user's training data.
model1.predict(1, 5)

[(50, 4.1779601320160555),
 (286, 3.9158527882569265),
 (288, 3.62384365113412),
 (294, 3.5630401003526204),
 (300, 3.3064255809675807)]

In [8]:
# Same per-user split as for model1, applied to the second ratings frame (df2).
model2 = BPR()
train2, test2 = model2.split(df2)
print(train2.shape)
print(test2.shape)

(797758, 4)
(202451, 4)


In [9]:
# Train on the df2 training split with 20 latent factors; other hyper-parameters keep their defaults.
model2.fit(train2, k = 20)

In [10]:
# Evaluate top-n recommendations of model2 against its held-out split.
# performance() returns (recall, precision, ndcg) in that order.
n = 10
rec, pre, ndcg = model2.performance(test2, n)
print(f'Precision@{n}: {pre}')
print(f'Recall@{n}: {rec}')
print(f'NDCG@{n}: {ndcg}')

Precision@10: 0.17943708609271522
Recall@10: 0.05353394154634949
NDCG@10: 0.7572818308040425
