## 基于电影风格偏好的推荐系统算法

该算法的核心思想：首先，找出用户最喜爱的电影风格；然后，找出各类风格中评分最高的电影集合；最后，将用户喜爱风格中评分最高的电影，按比例的数量推荐给用户。

In [1]:
# coding=utf-8
import sys
import pandas as pd
import operator
import time

### 读取数据

In [2]:
movies = pd.read_csv('/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_movie_info.csv',\
                        usecols=[0, 1, 6], header=0, names=['movieId', 'title', 'genres'])

df_train = pd.read_csv('/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_train.csv')  
# userId, movieId, rating, timestamp

### 设置参数

In [3]:
# 推荐电影数
reco_num = 10
# 电影信息表文件路径
movie_info_path = '/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_movie_info.csv'

### 功能函数

获取全局热门电影

In [4]:
def getHotItem(df_train, N=5):
    """
    param:
        df_train: 训练数据集 type:dataframe
        N: 推荐的电影数
    return: 
        hot_rank: 该用户的推荐热门电影列表 type:dict, key:user, value:dict, key:item, value:sim
    """

    item_count = df_train.groupby('movieId')['rating'].count().sort_values(ascending=False)

    hot_rank = {}

    r = 0
    for item_id in item_count[0:N].index:
        hot_rank[item_id] = 1 - 0.01 * r
        r += 1
    return hot_rank

hot_rank = getHotItem(df_train, reco_num)

hot_rank

{1: 0.94,
 50: 1.0,
 56: 0.91,
 100: 0.97,
 174: 0.92,
 181: 0.99,
 258: 0.98,
 288: 0.95,
 294: 0.96,
 300: 0.9299999999999999}

生成用户-电影排列表

In [5]:
def userItemDict(data):
    """
    param:
        data: lsit [user, item, rating]
    return:
        user_item: 用户-电影排列表 type:dict, key:user, value:dict, key:item, value:rate
    """
    user_item = {}
    for user, item, rate, timestamp in data:
        if user not in user_item:
            user_item[user] = {}
        user_item[user].update({item: rate})
    return user_item

# 生成user-tiem排列表
user_item = userItemDict(df_train.values)

user_item[105]

{264.0: 2.0,
 270.0: 5.0,
 271.0: 2.0,
 288.0: 4.0,
 302.0: 5.0,
 313.0: 5.0,
 324.0: 4.0,
 327.0: 4.0,
 340.0: 3.0,
 343.0: 2.0,
 347.0: 3.0,
 690.0: 3.0,
 751.0: 2.0,
 880.0: 3.0}

获得item的全局平均得分

In [6]:
def getItemAveScore(data):
    """
    param:
        data: type ndarray [[user, item, rating]]
    return:
        ave_score: item的全局平均得分字典 type dict, key:item, value: ratio
    """

    # 中间字典 key: item, value: list [rate_sum, rate_count]
    record_tmp = {}
    ave_score = {}

    for user, item, rate in data:
        if item not in record_tmp:
            record_tmp[item] = [0, 0]
        record_tmp[item][0] += rate
        record_tmp[item][1] += 1
    for item in record_tmp:
        # 保存item的平均得分
        ave_score[item] = round(record_tmp[item][0] / record_tmp[item][1], 3)
    return ave_score

ave_score = getItemAveScore(df_train[['userId', 'movieId', 'rating']].values)

规范化电影风格格式

In [7]:
def getGenre(genre_str):
    """
    param:
        genre_str: 字符串 如"Drama|Mystery|Romance|"
    return:
        genre_list: 风格列表 ['Drama', 'Mystery', 'Romance']
    """
    genre_list = []
    empty = 0
    # 当genres字段为空时，跳过
    if genre_str == "" or genre_str == "unknown":
        empty = 1
        return genre_list, empty
    # 字符串处理，按"|"分割字符串,list最后一个元素为空，所以去掉
    genre_list = genre_str.strip().split("|")

    return genre_list, empty

获得item-genre排列表和genre-item倒排表

In [8]:
def getItemgenre(movie_info_path, ave_score, K=50):
    """
    param:
        movie_info_path: 电影信息文件
        ave_score: item的全局平均得分字典 type dict, key:item, value: ratio
        K: genre_item排列表中记录前K个得分最高的item
    return:
        item_genre： item-genre排列表，记录每部电影的风格，以及风格的权重比， \
                     type dict, key: item, value:dict, key:genre, value:ratio
        genre_item： genre-item倒排表，记录每种风格中评分高的电影，\
                     type dict, key: genre, value:list [item1, item2, ...]
    """

    item_genre = {}
    genre_item = {}
    # 中间记录字典 key:genre, value: dict, key:item, value:ave_score
    record_tmp = {}
    # 读取电影信息
    movie_metadata = pd.read_csv(movie_info_path, usecols=[0, 6])

    for item, genres in movie_metadata.values:
        # 存储每部电影的风格
        print(item, genres)
        genre_list, f_void = getGenre(genres)
        if f_void:
            continue
        # 电影中每种风格的比重是=1/总的风格个数
        ratio = round(1.0 / len(genre_list), 3)

        if item not in item_genre:
            item_genre[item] = {}
        for genre in genre_list:
            item_genre[item][genre] = ratio

    # 遍历item-genre排列表，生成genre-item的中间表，记录每种风格下的电影item以及得分
    for item in item_genre:
        for genre in item_genre[item]:
            if genre not in record_tmp:
                record_tmp[genre] = {}
            # 赋值item的平均得分，如果在item_ave_score字典中没有该item，赋值为0
            record_tmp[genre][item] = ave_score.get(item, 0)

    # 遍历genre-item中间表，生成genre-item排列表，每种风格记录前K部评分最高电影
    for genre in record_tmp:
        if genre not in genre_item:
            genre_item[genre] = []
        for item, score in sorted(record_tmp[genre].iteritems(), key=operator.itemgetter(1), reverse=True)[:K]:
            genre_item[genre].append(item)
    return item_genre, genre_item

item_genre, genre_item = getItemgenre(movie_info_path, ave_score)

(1, 'Animation|Comedy|Family')
(2, 'Adventure|Action|Thriller')
(3, 'Crime|Comedy')
(4, 'Comedy|Thriller|Crime')
(5, 'Drama|Thriller')
(7, 'Science Fiction|Thriller|Mystery')
(8, 'Fantasy|Drama|Comedy|Family')
(9, 'Drama')
(10, 'Drama|War')
(13, 'Comedy|Romance')
(15, 'Music|Drama|Family')
(17, 'Horror|Action|Thriller|Crime')
(19, 'Drama|Comedy')
(20, 'Drama|Romance')
(21, 'Action|Comedy|Music|Family|Adventure')
(22, 'Action|Drama|History|War')
(23, 'Crime|Drama')
(24, 'Crime|Action|Comedy|Thriller')
(27, 'Action|Comedy|Crime|Thriller')
(28, 'Drama')
(29, 'Action|Crime|Fantasy')
(31, 'Action|Thriller|Drama')
(32, 'Documentary')
(33, 'Thriller|Action|Crime')
(36, 'Drama|Family|Romance')
(37, 'Horror|Thriller')
(39, 'Crime|Drama|Science Fiction|Thriller')
(40, 'Comedy')
(41, 'Comedy')
(42, 'Comedy')
(43, 'Drama|Thriller|Crime|Mystery|Romance')
(45, 'Comedy|Drama|Romance')
(46, 'Drama|Mystery')
(47, 'Comedy|Drama|History')
(48, 'Documentary')
(49, 'Comedy|Drama|Romance')
(50, 'Adventure|A

计算时间得分

In [9]:
def getTimeScore(timestamp):
    """
    param:
        timestamp: 时间戳
    return:
        时间权重得分
    """
    # 训练集中最近的日期，根据实际情况需要更改
    fix_time_stamp = 893286638
    total_sec = 24 * 60 * 60 * 100
    delta = (fix_time_stamp - timestamp) / total_sec
    # 返回时间得分，日期最近权重越大
    return round(1.0 / (1 + delta), 3)

获得用户画像，即用户最喜爱的电影类型

In [10]:
def getUserProfile(data, item_genre, K=3):
    """
    param:
        data: type ndarray [[user, item, rating]]
        item_genre: item-genre排列表，记录每部电影的风格，以及风格的权重比， \
                    type dict, key: item, value:dict, key:genre, value:ratio
        K: 用户最喜爱的电影类型个数
    return:
        user_profile: 记录用户喜爱的风格和兴趣度，type:dict, key:user, value:list [genre, score]
    """
    # 得分阈值，大于等于阈值设置为喜欢
    score_thr = 4.0
    # user-genre中间表，key:user, value:dict, key:genre, value:score
    record_tmp = {}
    user_profile = {}
    for user, item, rate, ts in data:
        # 小于阈值的数据忽略
        if rate < score_thr:
            continue
        # 如果item不在item-genre排列表中，也跳过
        if item not in item_genre:
            continue

        if user not in record_tmp:
            record_tmp[user] = {}
        for genre in item_genre[item]:
            if genre not in record_tmp[user]:
                # 初始化用户对每种风格的电影兴趣度
                record_tmp[user][genre] = 0
            # 用户风格兴趣度=电影评分*电影中该类型的权重比*时间权重
            record_tmp[user][genre] += rate * item_genre[item][genre] * getTimeScore(int(ts))

    for user in record_tmp:
        if user not in user_profile:
            user_profile[user] = []
        total_score = 0
        for genre, score in sorted(record_tmp[user].iteritems(), key=operator.itemgetter(1), reverse=True)[:K]:
            user_profile[user].append((genre, score))
            total_score += score
        # 遍历用户喜欢的风格
        for index in range(len(user_profile[user])):
            # 将每种风格的得分归一化
            user_profile[user][index] = (user_profile[user][index][0], \
                                         round(user_profile[user][index][1] / total_score, 3))
    return user_profile

user_profile = getUserProfile(df_train.values, item_genre)

user_profile[7]

[('Drama', 0.583), ('Comedy', 0.229), ('Thriller', 0.188)]

推荐系统

In [11]:
def recommendation(genre_item, user_profile, user, user_item, hot_rank, R=30):
    """
    param:
        genre_item: genre-item排列表，记录每种风格中评分高的电影，\
                   type dict, key: genre, value:list [item1, item2, ...]
        user_profile: 记录用户喜爱的风格和兴趣度，type:dict, key:user, value:list [genre, score]
        user: 用户id
        user_item: 用户-电影排列表 type:dict, key=user, value=dict, key=item, value=rate
        hot_rank: 热门电影列表, type:dict, key:user, value:dict, key:item, value:sim
        R: 推荐列表中电影个数
    return:
        recom_result: 推荐列表 type:dict, key:user, value:list [item1, item2, ...]
    """

    # 用户已观看的电影集合
    watched_item_list = user_item[user]

    recom_result = {}
    if user not in recom_result:
        recom_result[user] = []
    # 如果没有该用户的用户画像，推荐热门电影
    if user not in user_profile:
        recom_result[user] = hot_rank.keys()
    else:
        for genre, ratio in user_profile[user]:
            # 按照用户对电影风格的喜爱比重计算该风格推荐个数，向上取整
            num = int(R * ratio) + 1
            if genre not in genre_item:
                continue
            candidate = [item for item in genre_item[genre] if item not in watched_item_list]
            recom_list = candidate[:num]
            recom_result[user].extend(recom_list)
    return recom_result

### 电影推荐

In [12]:
# 输入用户ID
user_id = input('Please input your user ID:')

recom_list = recommendation(genre_item, user_profile, user_id, user_item, hot_rank, reco_num)

# 输出推荐电影
print('\nRecommended movies：\n')

ranknum = 0
for idx in recom_list.values()[0]:
    ranknum += 1
    print("%d. %s" %(ranknum, movies[movies['movieId'] == idx]['title'].values[0]))
    print '-'*50


Please input your user ID:7

Recommended movies：

1. They Made Me a Criminal
--------------------------------------------------
2. Prefontaine
--------------------------------------------------
3. Someone Else's America
--------------------------------------------------
4. Pather Panchali
--------------------------------------------------
5. Faust
--------------------------------------------------
6. Some Mother's Son
--------------------------------------------------
7. Santa with Muscles
--------------------------------------------------
8. Shall We Dance?
--------------------------------------------------
9. Stonewall
--------------------------------------------------
10. They Made Me a Criminal
--------------------------------------------------
11. North by Northwest
--------------------------------------------------
