In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

Dataset foramt：uid | movie id | rating | ...

Assumption:
1. rating is int and range 1-5

In [2]:
class ALS(object):
    def __init__(self):
        self.uid = None            # uid,iid without duplicates
        self.iid = None
        
        self.uid_dict = None      # serialize uid and iid
        self.iid_dict = None      #  {(original id in dataset): (serial_idx)}
        self.uid_dict_rev = None  # reverse key and value
        self.iid_dict_rev = None  #  {(serial_idx): (original id in dataset)}
        
        self.R = None             # Rating matrix
        self.user_matrix = None
        self.item_matrix = None
        self.shape = None         # (# of user, # of item)
        self.tuples = None        #  # of training set tuples
    
    def split(self, df, ratio = 0.8):
        self.uid = np.array(list(set(df.iloc[:,0].values)))
        self.iid = np.array(list(set(df.iloc[:,1].values)))
        self.uid.sort()
        self.iid.sort()
        
        self.shape = (self.uid.shape[0], self.iid.shape[0])
        
        self.uid_dict = dict(zip(self.uid, [i for i in range(self.shape[0])]))
        self.iid_dict = dict(zip(self.iid, [i for i in range(self.shape[1])]))
        self.uid_dict_rev = {v : k for k, v in self.uid_dict.items()}
        self.iid_dict_rev = {v : k for k, v in self.iid_dict.items()}
        
        train = pd.DataFrame(columns = df.columns, dtype=int)
        test = pd.DataFrame(columns = df.columns, dtype=int)
        for i in self.uid:
            train_1, test_1 = train_test_split(df[df.iloc[:, 0] == i], train_size = ratio)
            train = pd.concat([train, train_1])
            test = pd.concat([test, test_1])
        return train, test
    
    def preprocessing(self, df, k):
        self.tuples = df.shape[0]
        
        self.R = np.zeros((self.shape[0], self.shape[1]), dtype=int)
        for i in range(self.tuples):
            self.R[self.uid_dict[df.iloc[i, 0]], self.iid_dict[df.iloc[i, 1]]] = df.iloc[i, 2]
            
        self.user_matrix = np.random.rand(self.shape[0], k)*0.01
        self.item_matrix = np.random.rand(self.shape[1], k)*0.01
    
    def get_rmse(self):
        sum = 0
        for u in range(self.shape[0]):
            for i in range(self.shape[1]):
                if self.R[u, i] != 0:
                    sum += (self.R[u, i] - np.dot(self.user_matrix[u], self.item_matrix[i]))**2
        return (sum/self.tuples)**0.5
    
    def U_update(self, df):
        for i in range(self.shape[0]):
            # 取出用户 i 评价过的所有物品的索引
            items = [self.iid_dict[x] for x in df[df.iloc[:, 0]==self.uid_dict_rev[i]].iloc[:, 1].values]
            
            if len(items) == 0:
                continue
                
            # 取出用户 i 评价过的所有物品的评分
            r_u = df[df.iloc[:, 0]==self.uid_dict_rev[i]].iloc[:, 2].values
            
            # 取出 item matrix 中用户 i 评价过的所有物品的子矩阵
            I_u = self.item_matrix[items, :]

            A = np.dot(I_u.T, I_u)
            b = np.dot(I_u.T, r_u)
            self.user_matrix[i, :] = np.linalg.solve(A, b)
    
    def I_update(self, df):
        for j in range(self.shape[1]):
            # 取出评价物品 j 的所有用户的索引
            users = [self.uid_dict[x] for x in df[df.iloc[:, 1] == self.iid_dict_rev[j]].iloc[:, 0].values]
            
            if len(users) == 0:
                continue
                
            # 取出评价物品 j 的所有用户的评分
            r_j = df[df.iloc[:, 1] == self.iid_dict_rev[j]].iloc[:, 2].values
            
            # 取出 user matrix 中评价物品 j 的所有用户的子矩阵
            U_j = self.user_matrix[users, :]
            
            A = np.dot(U_j.T, U_j)
            b = np.dot(U_j.T, r_j)
            self.item_matrix[j, :] = np.linalg.solve(A, b)
    
    def fit(self, df, k, max_iter=50):
        self.preprocessing(df, k)
        
        prev_rmse = 0
        rmse = 0
        for ii in range(max_iter):
            if ii%2:
                # update user_matrix
                self.U_update(df)    
            else:
                # update item_matrix
                self.I_update(df)
                
            rmse = self.get_rmse()
            print("Iterations: %d, RMSE: %.6f" % (ii + 1, rmse))
            
            if abs(rmse - prev_rmse) <= 10**(-2):
                print('Converges to the defined limit.')
                return
            prev_rmse = rmse
        print('Reaches the max number of iteration.')
        
    def predict(self, user, n):      # Top-N recommendation
        user_dict = self.uid_dict[user]
        top_N = []
        
        for i in range(self.shape[1]):
            if self.R[user_dict, i] == 0:
                top_N.append((self.iid_dict_rev[i], np.dot(self.user_matrix[user_dict], self.item_matrix[i])))
                
        return sorted(top_N, key=lambda s: s[1], reverse=True)[:n]
    
    def _predict(self, test_user, n):
        top_N = []
        
        for i in range(test_user.shape[0]):
            user = self.uid_dict[test_user.iloc[i, 0]]
            item = self.iid_dict[test_user.iloc[i, 1]]
            top_N.append((test_user.iloc[i, 1], np.dot(self.user_matrix[user], self.item_matrix[item])))
                
        return sorted(top_N, key=lambda s: s[1], reverse=True)[:n]
    
    def NDCG(self, uid, test, n):         # 用模型排序+真实分数计算 DCG, 重排后计算 iDCG
        test_user = test[test.iloc[:, 0]==uid]
        rating = self._predict(test_user, n)
        irating =sorted(test_user.iloc[:, 2].values, reverse=True)
        dcg = 0
        idcg = 0
        if n > len(irating): n = len(irating)  
        for i in range(n):
            r = test_user[test_user.iloc[:, 1]==rating[i][0]].iloc[0, 2]
            dcg += 1.0*(2**r - 1)/math.log(i + 2, 2)
            idcg += 1.0*(2**irating[i] - 1)/math.log(i + 2, 2)
        return dcg/idcg
    
    def performance(self, test, n):      # Output recall@n, precision@n, NDCG@n
        hit = 0
        n_recall = 0
        n_precision = 0
        ndcg = 0
        for i in self.uid:
            test_rating = test[test.iloc[:, 0]==i]
            test_rating = test_rating[test_rating.iloc[:, 1] >= 3]
            #用户 i 喜爱的 item 集合，定义评分 >= 3 为喜爱
            tu = test_rating.iloc[:, 1].values
            #预测用户 i 的 top-N
            ru = self._predict(test[test.iloc[:, 0]==i], n)
            for item ,pui in ru:
                if item in tu:
                    hit += 1
            n_recall += len(tu)
            n_precision += n
            ndcg += self.NDCG(i, test, n)  
        recall = hit / (1.0 * n_recall)
        precision = hit / (1.0 * n_precision)
        ndcg /= self.shape[0]
        return recall, precision, ndcg

In [3]:
train_100k = pd.read_csv("./ml-100k/u.data", sep="\t", names=['user id', 'item id', 'rating', 'timestamp'])
train_1m = pd.read_csv("./ml-1m/ratings.dat", sep="::", names=['user id', 'item id', 'rating', 'timestamp'], engine='python')

In [4]:
model_100k = ALS()
train, test = model_100k.split(train_100k)
print(train.shape)
print(test.shape)

(79619, 4)
(20381, 4)


In [5]:
model_100k.fit(train, 40)

Iterations: 1, RMSE: 0.761503
Iterations: 2, RMSE: 0.562882
Iterations: 3, RMSE: 0.520485
Iterations: 4, RMSE: 0.494164
Iterations: 5, RMSE: 0.476349
Iterations: 6, RMSE: 0.461983
Iterations: 7, RMSE: 0.451246
Iterations: 8, RMSE: 0.441214
Iterations: 9, RMSE: 0.433021
Converges to the defined limit.


In [10]:
k = 10
rec, pre, ndcg = model_100k.performance(test, k)
print(f'Precision@{k}: {pre}')
print(f'Recall@{k}: {rec}')
print(f'NDCG@{k}: {ndcg}')

Precision@10: 0.8573700954400848
Recall@10: 0.39894404421198065
NDCG@10: 0.695507645646776


In [7]:
model_1m = ALS()
train1, test1 = model_1m.split(train_1m)
print(train1.shape)
print(test1.shape)

(797758, 4)
(202451, 4)


In [8]:
model_1m.fit(train1, 15)

Iterations: 1, RMSE: 1.071125
Iterations: 2, RMSE: 0.822210
Iterations: 3, RMSE: 0.800818
Iterations: 4, RMSE: 0.783098
Iterations: 5, RMSE: 0.772150
Iterations: 6, RMSE: 0.762730
Converges to the defined limit.


In [11]:
k = 10
rec, pre, ndcg = model_1m.performance(test1, k)
print(f'Precision@{k}: {pre}')
print(f'Recall@{k}: {rec}')
print(f'NDCG@{k}: {ndcg}')

Precision@10: 0.906523178807947
Recall@10: 0.2712164966837228
NDCG@10: 0.7590511233487202
