## 基于用户的协同过滤算法UserCF

统计用户行为数据，计算用户之间的相似度，从而推荐相似用户观看的电影

In [1]:
# coding:utf-8
import numpy as np
import pandas as pd
import random
import math
import operator
import time
from sklearn.utils import shuffle

### 读取数据

In [3]:
# Movie metadata: read columns 0, 1 and 6 and label them movieId, title, genres.
movies = pd.read_csv(
    '/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_movie_info.csv',
    usecols=[0, 1, 6],
    header=0,
    names=['movieId', 'title', 'genres'])

# Training ratings: only the first three columns are loaded
# (presumably userId, movieId, rating -- confirm against the CSV header).
df_train = pd.read_csv(
    '/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_train.csv',
    usecols=[0, 1, 2])

### 设置参数

In [21]:
# How many of the most similar users contribute to the weighted score
sim_num = 10
# How many movies to put in the recommendation list
reco_num = 10

### 功能函数

获取全局热门电影

In [5]:
def getHotItem(df_train, N=5):
    """
    Rank the most frequently rated movies, for use as a global fallback list.

    param:
        df_train: training ratings, type: DataFrame with at least the
                  columns 'movieId' and 'rating'
        N: maximum number of movies to return
    return:
        hot_rank: type: dict, key: movieId, value: pseudo-score that is 1.0
                  for the most-rated movie and drops by 0.01 per rank
    """
    # Number of ratings per movie, most-rated first.
    item_count = df_train.groupby('movieId')['rating'].count().sort_values(ascending=False)

    # The index holds movieIds (integers), so a plain [0:N] slice is ambiguous
    # between "first N rows" and "labels 0..N"; .iloc states the positional
    # intent explicitly.
    top_items = item_count.iloc[:N].index

    return {item_id: 1 - 0.01 * r for r, item_id in enumerate(top_items)}

# Build the global hot list once; it is later passed to recommendation()
# as the fallback ranking.
hot_rank = getHotItem(df_train, reco_num)

# Bare expression: shows the value when run as a notebook cell.
hot_rank

{50: 1.0, 100: 0.97, 181: 0.99, 258: 0.98, 294: 0.96}

生成用户-电影排列表

In [13]:
def userItemDict(data):
    """
    Group rating records by user.

    param:
        data: iterable of (user, item, rating) records
    return:
        user_item: type: dict, key: user, value: dict, key: item, value: rating;
                   when a (user, item) pair repeats, the last rating wins
    """
    ratings_by_user = {}
    for uid, iid, score in data:
        # setdefault creates the per-user dict the first time a user is seen.
        ratings_by_user.setdefault(uid, {})[iid] = score
    return ratings_by_user

# Build the user -> {item: rating} table from the raw training rows.
user_item = userItemDict(df_train.values)

# Bare expression: shows user 105's entry when run as a notebook cell.
user_item[105]

{264.0: 2.0,
 270.0: 5.0,
 271.0: 2.0,
 288.0: 4.0,
 302.0: 5.0,
 313.0: 5.0,
 324.0: 4.0,
 327.0: 4.0,
 340.0: 3.0,
 343.0: 2.0,
 347.0: 3.0,
 690.0: 3.0,
 751.0: 2.0,
 880.0: 3.0}

生成电影-用户倒排表

In [14]:
def itemUserDict(data):
    """
    Build the inverted index from each item to the users who rated it.

    param:
        data: iterable of (user, item, rating) records
    return:
        item_user: type: dict, key: item, value: list of users in the order
                   their records appear in data (repeats are kept)
    """
    users_by_item = {}
    for uid, iid, _score in data:
        # The rating itself is not needed for the inverted index.
        users_by_item.setdefault(iid, []).append(uid)
    return users_by_item

# Build the item -> [users] inverted index from the raw training rows.
item_user = itemUserDict(df_train.values)

# Bare expression: shows the users recorded for item 103 when run as a notebook cell.
item_user[103]

[42.0, 280.0, 416.0, 463.0, 533.0, 606.0, 648.0, 881.0]

计算用户相似度

In [16]:
def userSimilarity(item_user):
    """
    Compute pairwise user similarity from the item -> users inverted index.

    Every movie shared by two users adds 1 / log(1 + audience size) to their
    co-occurrence weight, so widely watched movies contribute less. The
    weight is then divided by sqrt(N[u] * N[v]), where N[x] is the number of
    movie lists user x appears in.

    param:
        item_user: type: dict, key: item, value: list, [user1, user2, ...]
    return:
        W: type: dict, key: user_u, value: dict, key: user_v, value: similarity;
           only users sharing at least one movie with someone else appear
    """
    co_weight = {}  # co_weight[u][v]: popularity-discounted shared-movie weight
    watched = {}    # watched[u]: number of movie lists containing u

    for audience in item_user.values():
        for u in audience:
            watched[u] = watched.get(u, 0) + 1
            # Evaluated only when the audience is non-empty, so an empty list
            # never reaches log(1) == 0 in the denominator.
            contribution = 1 / math.log(1 + len(audience) * 1.0)
            for v in audience:
                if u != v:
                    row = co_weight.setdefault(u, {})
                    row[v] = row.get(v, 0) + contribution

    # Normalise each accumulated weight by the two users' activity levels.
    return {
        u: {v: cuv / math.sqrt(watched[u] * watched[v]) for v, cuv in row.items()}
        for u, row in co_weight.items()
    }

# Build the user-to-user similarity table from the item -> users inverted index.
user_sim = userSimilarity(item_user)

# Bare expression: the six users most similar to user 105, highest similarity first.
sorted(user_sim[105].items(), key=operator.itemgetter(1), reverse=True)[0:6]

[(820.0, 0.1159516986763142),
 (898.0, 0.11531419341733552),
 (408.0, 0.10811016351745759),
 (808.0, 0.10771981569969562),
 (414.0, 0.10191327262464009),
 (775.0, 0.09990418853272287)]

推荐系统

In [17]:
def recommendation(user_item, user_id, W, hot_rank, K, R):
    """
    Recommend movies to one user from the ratings of their most similar users.

    param:
        user_item: type: dict, key: user, value: dict, key: item, value: rating
        user_id: id of the user to recommend for
        W: user similarity table, type: dict, key: user_u, value: dict,
           key: user_v, value: similarity
        hot_rank: fallback ranking, type: dict, key: item, value: score
        K: number of most similar users to use
        R: maximum number of movies in the result
    return:
        rank_sorted: type: dict with the single key user_id, whose value is a
                     list of (item, score) tuples, highest score first, at
                     most R entries
    """
    # Cold start: a user without history or without any similar user gets the
    # hot list. The checks are mutually exclusive so that a user missing from
    # user_item never reaches the personalised branch (which needs their
    # history), and only one message is printed.
    if user_id not in user_item:
        print('unvalid user_id(user_id not in user_item): ', user_id)
        scores = hot_rank
    elif user_id not in W:
        print('unvalid user_id(user_id not in W): ', user_id)
        scores = hot_rank
    else:
        # Movies the user has already rated are never recommended back.
        watched = user_item[user_id]
        scores = {}

        # The K users most similar to user_id, most similar first.
        neighbours = sorted(W[user_id].items(), key=operator.itemgetter(1), reverse=True)[0:K]
        for v, wuv in neighbours:
            # A neighbour without recorded history simply contributes nothing.
            for item_i, ri in user_item.get(v, {}).items():
                if item_i in watched:
                    continue
                # score += user similarity * the neighbour's rating of the movie
                scores[item_i] = scores.get(item_i, 0) + wuv * ri

    top_items = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[0:R]
    return {user_id: top_items}

### 电影推荐

In [23]:
# Ask for the user ID. The tables are looked up with numeric ids, so the
# value is converted with int(): on Python 3 input() returns a string, which
# would never match and would always fall back to the hot list.
user_id = int(input('Please input your user ID:'))

recom_list = recommendation(user_item, user_id, user_sim, hot_rank, sim_num, reco_num)

# Print the recommended movies
print('\nRecommended movies：\n')

# recommendation() returns {user_id: [(movieId, score), ...]}; indexing by
# user_id works on both Python 2 and 3, unlike values()[0].
ranknum = 0
for idx, rate in recom_list[user_id]:
    ranknum += 1
    # Resolve the movie title from the metadata table.
    print("%d. %s" % (ranknum, movies[movies['movieId'] == idx]['title'].values[0]))
    print('-' * 50)


Please input your user ID:105

Recommended movies：

1. Air Force One
--------------------------------------------------
2. Contact
--------------------------------------------------
3. Apt Pupil
--------------------------------------------------
4. Good Will Hunting
--------------------------------------------------
5. Everyone Says I Love You
--------------------------------------------------
6. Liar Liar
--------------------------------------------------
7. In & Out
--------------------------------------------------
8. Conspiracy Theory
--------------------------------------------------
9. Amistad
--------------------------------------------------
10. Spawn
--------------------------------------------------
