In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import math
import heapq
import warnings
from copy import deepcopy
warnings.filterwarnings("ignore")

In [2]:
# Load the movie catalogue; later cells read its 'movieId' and 'genres' columns.
df1 = pd.read_csv("movies.csv")

In [3]:
class moviePreprocess(object):
    """Turn the pipe-separated ``genres`` column into TF-IDF vectors and similarities.

    Each movie is treated as a document whose terms are its lower-cased genres.
    Term frequency is binary (a genre is either present or not) and the IDF uses
    the smoothed form ``1 + ln((1 + N) / (1 + df))``.
    """

    def __init__(self, df):
        """Keep the frame and cache its 'genres' and 'movieId' columns as arrays.

        Args:
            df: DataFrame providing a 'movieId' and a 'genres' column.
        """
        self.df = df
        self.categories = self.df['genres'].values
        self.movieId = self.df['movieId'].values

    def calSum(self):
        """Collect per-movie genre lists and per-genre document frequencies.

        Sets three attributes:
            idToCategory: dict movieId -> list of lower-cased genre strings.
            categorySet: set of every genre seen.
            categoryAppearSum: dict genre -> number of movies carrying that genre.
        """
        self.idToCategory = {
            movie: genres.lower().split('|')
            for movie, genres in zip(self.movieId, self.categories)
        }
        self.categorySet = set()
        self.categoryAppearSum = {}
        for words in self.idToCategory.values():
            # Document frequency counts each movie at most once per genre, so a
            # genre repeated inside one row must not be counted twice.
            for word in dict.fromkeys(words):
                self.categorySet.add(word)
                self.categoryAppearSum[word] = self.categoryAppearSum.get(word, 0) + 1

    def getTfidfMat(self):
        """Build the L2-normalised TF-IDF matrix.

        Returns:
            float32 array of shape (number of rows in the frame, number of genres).
            Columns follow the alphabetically sorted genre names so the layout is
            reproducible between runs (set iteration order is not).
        """
        self.calSum()
        categoryList = sorted(self.categorySet)
        mapper = {word: col for col, word in enumerate(categoryList)}
        length = len(self.movieId)
        # smooth_idf, computed once per genre instead of once per matrix cell.
        idf = {
            word: 1 + math.log((1 + length) / (1 + count))
            for word, count in self.categoryAppearSum.items()
        }
        tfidfMat = np.zeros(shape=(length, len(categoryList)), dtype=np.float32)
        for row, movie in enumerate(self.movieId):
            for word in self.idToCategory[movie]:
                tfidfMat[row, mapper[word]] = idf[word]
        # L2 normalisation of every row; the guard keeps an all-zero row at zero
        # instead of dividing by zero.
        norms = np.linalg.norm(tfidfMat, axis=1, keepdims=True)
        norms[norms == 0] = 1
        tfidfMat /= norms
        return tfidfMat

    def getCosSimilarBetweenTwoMat(self, v1, v2):
        """Cosine similarity between every row of ``v1`` and every row of ``v2``.

        Args:
            v1: 2-D array-like, one vector per row.
            v2: 2-D array-like, one vector per row, same width as ``v1``.

        Returns:
            Array of shape (rows of v1, rows of v2). A zero-length vector yields
            similarity 0 rather than NaN.
        """
        v1 = np.asarray(v1)
        v2 = np.asarray(v2)
        norm1 = np.linalg.norm(v1, axis=1).reshape(-1, 1)
        norm2 = np.linalg.norm(v2, axis=1)
        # The numerator is already 0 for a zero vector, so dividing by 1 gives 0.
        norm1[norm1 == 0] = 1
        norm2[norm2 == 0] = 1
        return np.dot(v1, v2.T) / (norm1 * norm2)

    def getSimilarityMat(self):
        """Return the movie-by-movie cosine similarity of the TF-IDF vectors."""
        tfidfMat = self.getTfidfMat()
        return self.getCosSimilarBetweenTwoMat(tfidfMat, tfidfMat)


In [4]:
# Build the genre TF-IDF vectors for every movie and their pairwise cosine similarity.
processor = moviePreprocess(df1)
similarityMatrix = processor.getSimilarityMat()

In [5]:
# Read the training ratings and index them as {userId: {movieId: rating}}.
df2 = pd.read_csv("train_set.csv")
user, movie, score = (df2[column].values for column in ('userId', 'movieId', 'rating'))
users = {}
for i in range(len(user)):
    # setdefault creates the per-user dict on first sight, then the rating is stored.
    users.setdefault(user[i], {})[movie[i]] = score[i]
    

In [6]:
class recommandByContent(object):
    """Content-based rating predictor driven by a movie-to-movie similarity matrix.

    The predicted rating of a movie for a user is the similarity-weighted mean
    of the ratings that user already gave, using only strictly positive
    similarities. When no rated movie is positively similar the prediction is 0.
    """

    def __init__(self, movieSimMatrix, movieId, mapper, users, k):
        """Store the lookup structures used for scoring.

        Args:
            movieSimMatrix: 2-D similarity matrix, indexed by the positions in ``mapper``.
            movieId: iterable of every candidate movie id.
            mapper: dict movieId -> row/column position in ``movieSimMatrix``.
            users: dict userId -> {movieId: rating}.
            k: maximum number of recommendations to return (expected non-negative).
        """
        self.movieSimMatrix = movieSimMatrix
        self.movieId = movieId
        self.mapper = mapper  # movieId -> order in movieSimMatrix
        self.users = users
        self.k = k

    def recommandSingle(self, peopleId):
        """Predict a score for every movie the user has not rated yet.

        Args:
            peopleId: user id as it appears in the ratings data (a key of ``users``).

        Returns:
            dict movieId -> predicted score, covering only unrated movies.

        Raises:
            KeyError: if ``peopleId`` is not present in ``users``.
        """
        rated = self.users[peopleId]
        # Unrated movie -> predict a score; scoreMovie is the single scoring path.
        return {
            movie: self.scoreMovie(peopleId, movie)
            for movie in self.movieId
            if movie not in rated
        }

    def recommandSpecificPerson(self, userID):
        """Return all predictions for a user plus the k best ones.

        Returns:
            (predictScore, topK): the dict from ``recommandSingle`` and a list of
            at most ``k`` (movieId, score) pairs in descending score order. Ties
            keep the catalogue order, as a stable descending sort would.
        """
        predictScore = self.recommandSingle(userID)
        # nlargest equals sorted(..., reverse=True)[:k] without sorting everything.
        topK = heapq.nlargest(self.k, predictScore.items(), key=operator.itemgetter(1))
        return predictScore, topK

    def scoreMovie(self, userID, movieID):
        """Predict the rating ``userID`` would give ``movieID``.

        Returns:
            Similarity-weighted mean of the user's ratings, or 0 when no rated
            movie has positive similarity to ``movieID``.

        Raises:
            KeyError: if the user is unknown, or a movie id is missing from ``mapper``.
        """
        rated = self.users[userID]
        if not rated:
            # Nothing to average over; same result as the fallback at the end.
            return 0
        # Loop-invariant: fetch the target movie's similarity row once.
        simRow = self.movieSimMatrix[self.mapper[movieID]]
        simSum, molecule = 0.0, 0.0
        for movie, score in rated.items():
            sim = simRow[self.mapper[movie]]
            if sim > 0:
                molecule += score * sim
                simSum += sim
        if simSum != 0:
            return molecule / simSum
        return 0

In [7]:
# Map every movie id to its row position in the similarity matrix, then build
# the recommender that returns at most 5 suggestions per user.
movieId = df1['movieId'].values
mapper = {movie: position for position, movie in enumerate(movieId)}
recommander = recommandByContent(
    movieSimMatrix=similarityMatrix,
    movieId=movieId,
    mapper=mapper,
    users=users,
    k=5,
)

In [13]:
# Predict scores for every movie user 4 has not rated and keep the top-k list.
finalScore, recommandMovie = recommander.recommandSpecificPerson(4)

In [14]:
# Display the recommended (movieId, predicted score) pairs.
recommandMovie

[(746, 5.0), (1153, 5.0), (1154, 5.0), (2066, 5.0), (3292, 5.0)]

In [8]:
# Load the held-out ratings and pull out one array per column.
testData = pd.read_csv("test_set.csv")
testUsers = testData['userId'].values
testMovies = testData['movieId'].values
rating = testData['rating'].values

In [24]:
def testResult(testUsers, testMovies, recommander):
    res = []
    for user, movie in zip(testUsers, testMovies):
        res.append(recommander.scoreMovie(user, movie))
    return res

In [25]:
# Predict a rating for every (user, movie) pair of the test set.
predict = testResult(testUsers, testMovies, recommander)

In [27]:
# Sum of squared errors between the predicted and the held-out ratings.
SSE = np.square(np.array(predict) - rating).sum()
print(SSE)

67.11922898128542
