In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import math
import heapq
import warnings
from copy import deepcopy
# Silence every warning category for the rest of the session; note that this
# also hides NumPy runtime warnings (e.g. divide-by-zero / invalid value).
warnings.filterwarnings("ignore")

In [7]:
# Read the training ratings and keep the three columns as separate arrays.
df = pd.read_csv("train_set.csv")
user = df['userId'].values
movie = df['movieId'].values
score = df['rating'].values

# users maps userId -> {movieId: rating}; a later rating for the same
# (user, movie) pair overwrites the earlier one.
users = {}
for uid, mid, rating in zip(user, movie, score):
    users.setdefault(uid, {})[mid] = rating

In [13]:
class minhashUsers(object):
    """User-based collaborative filtering driven by MinHash-style signatures.

    Expected call order: ``getUtilityMat`` -> ``getHashFunc`` ->
    ``getSignatureMat`` -> ``getJaccardMat``; afterwards ``scoreMovies`` and
    ``recommend`` can be used.

    Users are addressed by their position in the sorted ``userList``
    (``userToNum`` maps id -> position). For ids that are exactly 1..N this
    position equals ``id - 1``.
    """

    def __init__(self, users, user, movie, score, avg):
        """Store the rating data and build the id <-> index lookups.

        Args:
            users: dict mapping userId -> {movieId: rating}.
            user: iterable of user ids (one entry per rating).
            movie: iterable of movie ids (one entry per rating).
            score: iterable of ratings (stored, not used by the methods here).
            avg: dict mapping movieId -> mean rating of that movie.
        """
        self.users = users
        self.user = user
        self.movie = movie
        self.score = score
        self.avg = avg
        # Sorted so that matrix positions are deterministic and, for ids 1..N,
        # coincide with the usual ``id - 1`` convention.
        self.userList = sorted(set(user))
        self.usersSum = len(self.userList)
        self.userToNum = {uid: idx for idx, uid in enumerate(self.userList)}
        self.moviesList = list(set(movie))
        self.moviesSum = len(self.moviesList)
        self.movieToNum = {mid: idx for idx, mid in enumerate(self.moviesList)}

    def _topK(self, row, k):
        """Return the positions of the ``k`` largest entries of ``row``.

        A non-positive ``k`` yields no positions (a plain ``[-k:]`` slice
        would return every position for ``k == 0``).
        """
        if k <= 0:
            return []
        return np.argsort(row)[-k:]

    def getUtilityMat(self):
        """Build ``self.utilityMat``, a binary (movies x users) matrix.

        A cell is 0 when the rating lies in [0.5, 2.5] (or the movie is
        unrated) and 1 for any other rating.
        """
        self.utilityMat = np.zeros((self.usersSum, self.moviesSum))  # utility matrix
        for uid in self.userList:
            row = self.userToNum[uid]
            for mid, rating in self.users[uid].items():
                self.utilityMat[row][self.movieToNum[mid]] = 0 if 0.5 <= rating <= 2.5 else 1
        self.utilityMat = self.utilityMat.T

    def getHashFunc(self, x):
        """Create ``x`` hash rows stored in ``self.hash`` (shape x by moviesSum).

        The i-th function maps row j to ``((i + 1) * j + 1) % moviesSum``.
        NOTE(review): this is a permutation of the rows only when ``i + 1``
        and ``moviesSum`` are coprime.
        """
        self.hashFuncSum = x
        length = self.moviesSum
        # One list per hash function, one entry per movie row.
        self.hash = np.array(
            [[((i + 1) * j + 1) % length for j in range(length)] for i in range(x)]
        )
        return None

    def getSignatureMat(self):
        """Build ``self.signatureMat`` (hash functions x users).

        Entry [j, k] is the smallest value of hash function j over the movie
        rows where user k has a 1 in the utility matrix; users without any 1
        keep the large sentinel value.
        """
        sentinel = 1000000  # initial "very large" value
        self.signatureMat = np.full(
            shape=(self.hashFuncSum, self.usersSum), fill_value=sentinel
        )
        liked = self.utilityMat == 1  # (movies, users): positions that may lower the signature
        for j in range(self.hashFuncSum):
            # Broadcast hash row j down every user column, blank out the
            # non-1 cells with the sentinel, then take the column-wise minimum.
            candidates = np.where(liked, self.hash[j][:, np.newaxis], sentinel)
            self.signatureMat[j] = candidates.min(axis=0, initial=sentinel)
        return None

    def calJaccard(self, a, b):
        """Jaccard similarity of the value sets of ``a`` and ``b``.

        Returns 0.0 when both inputs are empty.
        NOTE(review): the usual MinHash estimator compares signatures
        position by position; the set-based form is kept unchanged here.
        """
        setA, setB = set(a), set(b)
        union = setA | setB
        if not union:
            return 0.0
        return len(setA & setB) / len(union)

    def getJaccardMat(self):
        """Fill ``self.jaccardMat`` (users x users) from the signature columns.

        The matrix is symmetric and its diagonal stays 0.
        """
        self.jaccardMat = np.zeros((self.usersSum, self.usersSum))
        signatures = self.signatureMat.T  # one row per user
        for i in range(self.usersSum - 1):
            for j in range(i + 1, self.usersSum):
                similarity = self.calJaccard(signatures[i], signatures[j])
                self.jaccardMat[i][j] = self.jaccardMat[j][i] = similarity
        return None

    def scoreMovies(self, k, testUsers, testMovies):
        """Predict a rating for each (user, movie) pair.

        The prediction is the similarity-weighted mean over the ``k`` most
        similar users, using a neighbour's own rating when present and the
        movie's average from ``self.avg`` otherwise. When the similarities
        sum to zero the prediction is 0.

        Args:
            k: number of most similar users to use.
            testUsers: iterable of user ids (must be known users).
            testMovies: iterable of movie ids, aligned with ``testUsers``.

        Returns:
            numpy array with one prediction per pair.

        Raises:
            KeyError: for an unknown user id, or when a neighbour has not
                rated the movie and it is missing from ``self.avg``.
        """
        predict = []
        for uid, mid in zip(testUsers, testMovies):
            row = self.jaccardMat[self.userToNum[uid]]
            simSum, molecule = 0.0, 0.0
            for idx in self._topK(row, k):  # positions of the k most similar users
                sim = row[idx]
                simSum += sim
                ratings = self.users[self.userList[idx]]
                if mid in ratings:
                    molecule += sim * ratings[mid]
                else:
                    molecule += sim * self.avg[mid]
            if simSum != 0:
                predict.append(molecule / simSum)
            else:
                predict.append(0)
        return np.array(predict)

    def recommend(self, userId, k, n):
        """Recommend up to ``n`` movies for a single user.

        Every movie in ``self.avg`` is scored with the similarity-weighted
        mean of the neighbours' ratings (falling back to the movie average
        for neighbours who have not rated it). Movies already rated by
        ``userId`` are removed. If no neighbour has positive similarity all
        scores stay 0.

        Args:
            userId: id of the user to recommend for.
            k: number of most similar users to use.
            n: maximum number of recommendations.

        Returns:
            (allMovieScore, recommendList): the score dict for the unrated
            movies and the top ``n`` (movieId, score) pairs, best first.
        """
        row = self.jaccardMat[self.userToNum[userId]]
        allMovieScore = {mid: 0 for mid in self.avg.keys()}
        contributionSum = 0.0
        for idx in self._topK(row, k):  # positions of the k most similar users
            sim = row[idx]
            # Neighbours without positive similarity carry no information.
            if sim <= 0:
                continue
            # Start from the movie average for every movie ...
            for mid, mean in self.avg.items():
                allMovieScore[mid] += mean * sim
            # ... then swap in the neighbour's real rating where one exists.
            for mid, rating in self.users[self.userList[idx]].items():
                allMovieScore[mid] += rating * sim - self.avg[mid] * sim
            contributionSum += sim
        # Drop the movies the user has already rated.
        for mid in self.users[userId].keys():
            if mid in allMovieScore:
                del allMovieScore[mid]
        # Normalise by the total similarity to obtain the predicted ratings.
        if contributionSum != 0:
            for mid in allMovieScore.keys():
                allMovieScore[mid] /= contributionSum
        recommendList = sorted(allMovieScore.items(), key=lambda x: x[1], reverse=True)
        return allMovieScore, recommendList[:n]  # scores, recommendation list

In [4]:
avg = {}  # movieId -> mean rating over every user who rated the movie
scoreSum, contributorSum = {}, {}  # per-movie rating total / number of raters
# The loop names are deliberately different from the module-level `user`,
# `movie` and `score` arrays: reusing those names here would overwrite the
# arrays that are later passed to minhashUsers(...).
for ratedMovies in users.values():
    for movieId, rating in ratedMovies.items():
        scoreSum[movieId] = scoreSum.get(movieId, 0) + rating
        contributorSum[movieId] = contributorSum.get(movieId, 0) + 1
for movieId, total in scoreSum.items():
    avg[movieId] = total / contributorSum[movieId]

In [14]:
# No precomputed matrices available: build them from scratch
import joblib
hasher = minhashUsers(users=users, user=user, movie=movie, score=score, avg=avg)
hasher.getUtilityMat()    # binary movies-by-users matrix
hasher.getHashFunc(x=4)   # four hash functions, so four signature rows
hasher.getSignatureMat()  # needs the utility matrix and the hash table
hasher.getJaccardMat()    # pairwise user similarities from the signatures

In [15]:
# Test ratings: the predictions below are compared against `rating`.
testData = pd.read_csv("test_set.csv")
testUsers = testData['userId'].values
testMovies = testData['movieId'].values
rating = testData['rating'].values

In [16]:
# Predict each test rating from the 30 most similar users and print the
# sum of squared errors against the true ratings.
predict = hasher.scoreMovies(k=30, testUsers=testUsers, testMovies=testMovies)
print(np.square(predict - rating).sum())

80.33093218279444


In [17]:
# Up to 5 recommendations for user 4, using that user's 30 most similar users.
allMovieScore, recommendList = hasher.recommend(userId=4, k=30, n=5)
print(recommendList)

[(6598, 5.0), (3879, 5.0), (1859, 5.0), (4302, 5.0), (4731, 5.0)]


In [18]:
# Sweep the number of similar users from 10 to 100 in steps of 10 and print
# the sum of squared errors for each setting.
for num in range(10, 101, 10):
    predict = hasher.scoreMovies(k=num, testUsers=testUsers, testMovies=testMovies)
    SSE = np.square(predict - rating).sum()
    print(SSE)

81.00015236764898
80.07229285102497
80.33093218279444
80.86234447015973
80.6521307154941
80.33073089705636
80.31240126463544
80.17287280511215
80.31501152301841
80.36390345118966
