In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import math
import heapq
import warnings
from copy import deepcopy
warnings.filterwarnings("ignore")

In [2]:
# Load the movie catalogue and split every movie's genre string into a list
# of lower-cased genre names.
df1 = pd.read_csv("movies.csv")
movieId = df1['movieId'].values
# Build a fresh list rather than assigning into ``df1['genres'].values``:
# that array can be a view of the DataFrame's own buffer (and is read-only
# when pandas Copy-on-Write is active), so writing lists into it either
# silently rewrites ``df1`` or fails outright.
genres = [entry.lower().split('|') for entry in df1['genres']]
# Every distinct genre name that occurs anywhere in the catalogue.
genresSet = {category for categories in genres for category in categories}

In [3]:
# Read the training ratings and index them per user.
df = pd.read_csv("train_set.csv")
user = df['userId'].values
movie = df['movieId'].values
score = df['rating'].values

users = {}         # userId -> {movieId: rating}
moviesSet = set()  # every movieId that appears in the training ratings
for uid, mid, mark in zip(user, movie, score):
    moviesSet.add(mid)
    # Create the user's rating dict on first sight, then record the rating.
    users.setdefault(uid, {})[mid] = mark

In [4]:
class miniHashContent(object):
    """Content-based movie scorer built on min-hash signatures of genre sets.

    Typical call order: ``getFeaureMat`` -> ``getHashFunc`` ->
    ``getSignatureMat`` -> ``getJaccardMat`` (or assign ``jaccardMat``
    directly) -> ``scoreMovies`` / ``scoreSpecific`` / ``recommend``.
    """

    # Sentinel written into the signature matrix before any hash value is seen.
    _UNSET_SIGNATURE = 1000000

    def __init__(self, movieId, genres, genresSet, users):
        """Store the catalogue and the known ratings.

        Args:
            movieId: sequence of movie ids.
            genres: sequence aligned with ``movieId``; element ``i`` is the
                iterable of genre names of movie ``i``.
            genresSet: iterable of all genre names; its iteration order
                fixes the feature numbering.
            users: mapping ``userId -> {movieId: rating}``.
        """
        self.movieId = movieId
        self.genres = genres  # per-movie list of genre names
        self.genresList = list(genresSet)
        self.users = users

    def getFeaureMat(self):
        """Build the 0/1 genre matrix and the id/feature lookup tables.

        Sets ``featureToNumber``, ``movieIdToNumber`` and ``utilityMat``
        (one row per genre, one column per movie).
        """
        # genre name -> feature index
        self.featureToNumber = {name: idx for idx, name in enumerate(self.genresList)}
        # movie id -> column index
        self.movieIdToNumber = {mid: idx for idx, mid in enumerate(self.movieId)}
        movieCount = len(self.movieId)
        self.utilityMat = np.zeros(shape=(movieCount, len(self.genresList)))
        # Iterate over this instance's movies (the original read the
        # notebook-level global ``movieId`` here).
        for row in range(movieCount):
            for category in self.genres[row]:
                self.utilityMat[row][self.featureToNumber[category]] = 1
        # Rows are features (genres), columns are the movies' genre sets.
        self.utilityMat = self.utilityMat.T
        return None

    def getHashFunc(self, x):
        """Create ``x`` hash tables over the feature indices.

        Table ``i`` maps feature index ``j`` to
        ``((i + 1) * j + 1) % len(genresList)``. The result is stored in
        ``self.hash`` with shape ``(x, len(genresList))``.
        """
        self.hashFuncSum = x
        length = len(self.genresList)
        multipliers = np.arange(1, x + 1).reshape(-1, 1)  # (i + 1) for each table
        featureIdx = np.arange(length).reshape(1, -1)     # j for each feature
        self.hash = (multipliers * featureIdx + 1) % length
        return None

    def getSignatureMat(self):
        """Compute the min-hash signature of every movie.

        Sets ``signatureMat`` (``hashFuncSum`` x number of movies) and ``v``,
        its transpose, so that ``v[k]`` is the signature of movie ``k``.
        Requires ``getFeaureMat`` and ``getHashFunc`` to have been called.
        """
        self.signatureMat = np.full(
            shape=(self.hashFuncSum, len(self.movieId)),
            fill_value=self._UNSET_SIGNATURE,
        )
        for featureRow in range(len(self.utilityMat)):
            # Movies that have this genre are the columns that may be lowered.
            cols = np.flatnonzero(self.utilityMat[featureRow] == 1)
            if cols.size == 0:
                continue
            hashed = self.hash[:, featureRow].reshape(-1, 1)
            # Keep, per hash function, the smallest hash value seen so far.
            self.signatureMat[:, cols] = np.minimum(self.signatureMat[:, cols], hashed)
        self.v = self.signatureMat.T.copy()
        return None

    def calJaccard(self, a, b):
        """Return ``|set(a) & set(b)| / |set(a) | set(b)|``.

        Returns ``0.0`` when both inputs are empty instead of dividing by zero.

        NOTE(review): when applied to signatures this compares the sets of
        values, not position-wise agreement as in the usual MinHash
        estimator. Kept as-is so stored matrices stay comparable -- confirm
        that this is intended.
        """
        left, right = set(a), set(b)
        union = left | right
        if not union:
            return 0.0
        return len(left & right) / len(union)

    def getJaccardMat(self):
        """Fill ``jaccardMat`` with the pairwise similarity of all signatures.

        The matrix is symmetric; the diagonal is left at zero.
        """
        total = len(self.movieId)
        self.jaccardMat = np.zeros((total, total))
        for i in range(total - 1):
            for j in range(i + 1, total):
                self.jaccardMat[i][j] = self.jaccardMat[j][i] = self.calJaccard(self.v[i], self.v[j])
        return None

    def _predictRatings(self, users, testUsers, testMovies, similarity):
        """Similarity-weighted average of each user's known ratings.

        ``similarity(ratedIdx, targetIdx)`` returns the similarity between
        two movies given their column indices. A pair whose similarities sum
        to zero (or less) is predicted as ``0``.
        """
        predict = []
        for user, movie in zip(testUsers, testMovies):
            target = self.movieIdToNumber[movie]
            simSum, molecule = 0.0, 0.0
            for mov, mark in users[user].items():
                sim = similarity(self.movieIdToNumber[mov], target)
                simSum += sim
                molecule += sim * mark
            predict.append(molecule / simSum if simSum > 0 else 0)
        return predict

    def scoreMovies(self, users, testUsers, testMovies)->list:
        """Predict ratings for (user, movie) pairs using ``self.jaccardMat``.

        Args:
            users: mapping ``userId -> {movieId: rating}`` of known ratings.
            testUsers, testMovies: aligned sequences of user and movie ids.

        Returns:
            One prediction per pair, in input order.
        """
        return self._predictRatings(
            users, testUsers, testMovies,
            lambda rated, target: self.jaccardMat[rated][target],
        )

    def scoreSpecific(self, users, testUsers, testMovies):
        """Like ``scoreMovies`` but computes each similarity on demand.

        Similarities come from ``calJaccard`` on the current signatures
        ``self.v``, so no precomputed ``jaccardMat`` is needed.
        """
        return self._predictRatings(
            users, testUsers, testMovies,
            lambda rated, target: self.calJaccard(self.v[rated], self.v[target]),
        )

    def recommend(self, peopleId, n):
        """Return up to ``n`` ``(movieId, score)`` pairs for one user.

        Only movies the user has not rated are scored, each with the
        similarity-weighted average of the user's ratings (``0`` when no
        rated movie has positive similarity). Results are sorted by score,
        highest first. Requires ``jaccardMat`` to be set.
        """
        rated = self.users[peopleId]
        finalScore = {}
        for candidate in self.movieId:
            if candidate in rated:
                continue
            # Unrated movie: estimate its score from the user's rated movies.
            row = self.jaccardMat[self.movieIdToNumber[candidate]]
            simSum, molecule = 0.0, 0.0
            for movie, score in rated.items():
                sim = row[self.movieIdToNumber[movie]]
                if sim > 0:
                    molecule += score * sim
                    simSum += sim
            finalScore[candidate] = molecule / simSum if simSum != 0 else 0
        ranked = sorted(finalScore.items(), key=lambda x: x[1], reverse=True)
        return ranked[:min(n, len(ranked))]

In [5]:
# Build the recommender: genre feature matrix, then 4 hash functions, then
# the per-movie signatures derived from them.
hasher = miniHashContent(movieId, genres, genresSet, users)
hasher.getFeaureMat()
hasher.getHashFunc(4)
hasher.getSignatureMat()

In [6]:
import joblib
# Load a previously saved movie-movie similarity matrix and attach it to the
# recommender (presumably produced with 4 hash functions, judging by the file
# name -- confirm).
# The path is assembled with os.path.join so it also resolves on systems that
# do not use "\" as the directory separator.
# NOTE: joblib.load unpickles the file; only load files you created yourself.
jaccardMat = joblib.load(os.path.join('JaccardMatrix', '1_mat_hash4'))
hasher.jaccardMat = jaccardMat

In [7]:
# Read the held-out ratings and pull the three columns out as arrays.
testData = pd.read_csv("test_set.csv")
testUsers = testData['userId'].values
testMovies = testData['movieId'].values
rating = testData['rating'].values

In [8]:
# Predict every test pair from the attached similarity matrix, then total the
# squared differences to the true ratings.
predict = hasher.scoreMovies(users, testUsers, testMovies)
squaredErrors = (np.array(predict) - rating) ** 2
SSE = squaredErrors.sum()

In [9]:
# Display the sum of squared errors computed for the test set.
SSE

65.9828575811148

In [13]:
# Sweep the number of hash functions (2, 4, ..., 18) and record the test SSE
# obtained with on-the-fly signature similarities for each setting.
hasher = miniHashContent(movieId, genres, genresSet, users)
hasher.getFeaureMat()

SSE_set = []
for num in range(2, 20, 2):
    # Regenerate the hash tables and signatures for this hash count.
    hasher.getHashFunc(num)
    hasher.getSignatureMat()
    predict = hasher.scoreSpecific(users, testUsers, testMovies)
    squaredErrors = (np.array(predict) - rating) ** 2
    SSE_set.append(squaredErrors.sum())

In [14]:
# Display the SSE recorded for each hash-function count in range(2, 20, 2).
SSE_set

[63.315543681475006,
 64.693131578845,
 66.55207838378904,
 65.57966841043574,
 65.61872688678098,
 65.68960588579287,
 66.01158800340758,
 66.52134546397016,
 66.63631574550122]

In [15]:
# Repeat the evaluation with a single hash function and print its SSE.
hasher.getHashFunc(1)
hasher.getSignatureMat()
predict = hasher.scoreSpecific(users, testUsers, testMovies)
squaredErrors = (np.array(predict) - rating) ** 2
print(squaredErrors.sum())

63.44754044520913


In [10]:
# `recommend` reads `hasher.jaccardMat`. The `hasher` object is re-created for
# the hash-count sweep earlier in the file and that new instance never gets a
# similarity matrix, so running the notebook top to bottom would fail here.
# Re-attach the matrix that was loaded from disk before asking for the top 5
# recommendations for user 29.
hasher.jaccardMat = jaccardMat
recommends = hasher.recommend(29, 5)
print(recommends)

[(88932, 3.491442542787286), (426, 3.4877398720682287), (880, 3.4877398720682287), (968, 3.4877398720682287), (1433, 3.4877398720682287)]
