## 基于物品的协同过滤算法itemCF

统计用户行为数据，计算电影之间的相似度，从而向用户推荐与其喜爱的电影相似的电影

In [69]:
# coding:utf-8
import numpy as np
import pandas as pd
import random
import math
import operator
import time
from sklearn.utils import shuffle

### 读取数据

In [70]:
# Movie metadata: keep columns 0, 1 and 6 of the CSV and rename them.
movies = pd.read_csv(
    'data/v1_movie_info.csv',
    usecols=[0, 1, 6],
    header=0,
    names=['movieId', 'title', 'genres'],
)

# Training interactions: the first three columns (userId, movieId, rating).
df_train = pd.read_csv('data/v1_train.csv', usecols=[0, 1, 2])

### 设置参数

In [71]:
# Number of movies to recommend
reco_num = 5
# Number of most-similar items used in the weighted sum
sim_num = 10

### 功能函数

获取全局热门电影

In [72]:
def getHotItem(df_train, N=5):
    """
    Build the global "hot" list: the N most frequently rated movies.

    param:
        df_train: training set with 'movieId' and 'rating' columns  type:dataframe
        N: number of movies to return
    return:
        hot_rank: type:dict, key:item, value:score. The most-rated movie
                  scores 1.0 and every following rank scores 0.01 less.
    """
    # Number of non-null ratings per movie, most-rated first. A stable sort
    # makes the order of equally popular movies reproducible instead of
    # depending on the default (non-stable) quicksort.
    item_count = (
        df_train.groupby('movieId')['rating']
        .count()
        .sort_values(ascending=False, kind='mergesort')
    )

    # iloc is explicitly positional; a bare [0:N] on an integer-labelled
    # Series is easy to misread as a label lookup.
    top_items = item_count.iloc[:N].index

    return {item_id: 1 - 0.01 * r for r, item_id in enumerate(top_items)}

# Precompute the global hot list for the configured number of recommendations
hot_rank = getHotItem(df_train, reco_num)

# Notebook cell output: display the hot list
hot_rank

{50: 1.0, 100: 0.97, 181: 0.99, 258: 0.98, 294: 0.96}

生成用户-电影排列表

In [73]:
def userItemDict(data):
    """
    Group (user, item, rating) records into a nested per-user dictionary.

    param:
        data: iterable of [user, item, rating] triples
    return:
        user_item: type:dict, key:user, value:dict, key:item, value:rate
                   (a repeated (user, item) pair keeps the last rating seen)
    """
    user_item = {}
    for record in data:
        user, item, rate = record
        # setdefault creates the user's inner dict the first time it is seen
        user_item.setdefault(user, {})[item] = rate
    return user_item

# Build the user -> {item: rating} table from the training rows
user_item = userItemDict(df_train.values)

# Notebook cell output: inspect the entries stored for user 105
user_item[105]

{264.0: 2.0,
 270.0: 5.0,
 271.0: 2.0,
 288.0: 4.0,
 302.0: 5.0,
 313.0: 5.0,
 324.0: 4.0,
 327.0: 4.0,
 340.0: 3.0,
 343.0: 2.0,
 347.0: 3.0,
 690.0: 3.0,
 751.0: 2.0,
 880.0: 3.0}

计算电影相似度

In [74]:
def itemSimilarity(user_item):
    """
    Compute item-to-item similarity from co-occurrence in user histories.

    param:
        user_item: type:dict, key:user, value:dict, key:item, value:rate
    return:
        W: type:dict, key:item_i, value:dict, key:item_j, value:similarity
    """
    # co_weight[i][j]: accumulated weight of the users who have both i and j
    co_weight = {}
    # popularity[i]: number of users who have item i
    popularity = {}

    for items in user_item.values():
        for item_i in items:
            popularity[item_i] = popularity.get(item_i, 0) + 1
            for item_j in items:
                if item_i != item_j:
                    row = co_weight.setdefault(item_i, {})
                    # Each co-occurrence is damped by the size of the user's
                    # history, so very active users contribute less per pair
                    # (the plain variant would simply add 1 here).
                    row[item_j] = row.get(item_j, 0) + 1 / math.log(1 + len(items) * 1.0)

    # Normalise every accumulated weight by the geometric mean of the two
    # items' user counts.
    W = {}
    for item_i, related in co_weight.items():
        W[item_i] = {
            item_j: cij / math.sqrt(popularity[item_i] * popularity[item_j])
            for item_j, cij in related.items()
        }

    return W

# Build the item-to-item similarity dictionary
item_sim = itemSimilarity(user_item)

# Notebook cell output: the six items most similar to item 5
sorted(item_sim[5].items(), key=operator.itemgetter(1), reverse=True)[0:6]

[(672.0, 0.10234357345919975),
 (218.0, 0.09445000329581626),
 (447.0, 0.09218964096745205),
 (234.0, 0.09175163649384785),
 (559.0, 0.08846896278134712),
 (53.0, 0.084078336863534)]

推荐系统

In [75]:
def recommendation(user_item, user_id, W, hot_rank, K, R):
    """
    Produce the top-R recommendations for one user.

    param:
        user_item: type:dict, key:user, value:dict, key:item, value:rate
        user_id: id of the user to recommend for
        W: item similarity, type:dict, key:item_i, value:dict, key:item_j, value:similarity
        hot_rank: fallback for users missing from user_item, type:dict, key:item, value:score
        K: number of most-similar neighbours considered per watched item
        R: maximum number of recommendations returned
    return:
        rank_sorted: type:dict, key:user_id, value:list of (item, score)
                     tuples sorted by score descending, at most R long
    """
    by_score = operator.itemgetter(1)

    if user_id in user_item:
        watched = user_item[user_id]
        scores = {}
        for item_i, ri in watched.items():
            # A watched item without similarity data contributes nothing
            if item_i not in W:
                print('unvalid item_id(item_id not in W): ', item_i)
                continue

            neighbours = sorted(W[item_i].items(), key=by_score, reverse=True)
            for item_j, wj in neighbours[0:K]:
                # Only score items the user has not watched yet
                if item_j not in watched:
                    # score contribution = user's rating * item similarity
                    scores[item_j] = scores.get(item_j, 0) + ri * wj
    else:
        # Unknown user: fall back to the hot list
        print('user {} not in trainset, give hot rank list'.format(user_id))
        scores = hot_rank

    top = sorted(scores.items(), key=by_score, reverse=True)
    return {user_id: top[0:R]}

### 电影推荐

In [76]:
# Read the user ID. On Python 3 input() returns text while the keys of
# user_item are numeric, so convert before the lookup.
user_id = int(input('Please input your user ID:'))

recom_list = recommendation(user_item, user_id, item_sim, hot_rank, sim_num, reco_num)

# Print the recommended movies
print('\nRecommended movies：\n')

# recommendation() returns {user_id: [(movieId, score), ...]}; index by the
# user id directly, since dict views are not subscriptable on Python 3.
for ranknum, (idx, _score) in enumerate(recom_list[user_id], 1):
    titles = movies.loc[movies['movieId'] == idx, 'title'].values
    # Fall back to the raw id when the movie is missing from the metadata
    title = titles[0] if len(titles) else idx
    print("%d. %s" % (ranknum, title))
    print('-' * 50)


Please input your user ID:7

Recommended movies：

1. Raiders of the Lost Ark
--------------------------------------------------
2. Indiana Jones and the Last Crusade
--------------------------------------------------
3. E.T. the Extra-Terrestrial
--------------------------------------------------
4. True Lies
--------------------------------------------------
5. Jurassic Park
--------------------------------------------------
