## 基于电影简介相似度的推荐算法

将电影的简介通过 TF-IDF 转成向量，然后计算向量间的余弦相似度，从而找出相似电影进行推荐。

In [1]:
# coding=utf-8
import sys
import pandas as pd
import numpy as np
import operator
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### 读取数据

In [6]:
# Movie metadata: keep file columns 0, 1 and 3 and label them movieId / title /
# overview.  header=0 together with names= replaces the file's own header row.
movies = pd.read_csv(
    '/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_movie_info.csv',
    usecols=[0, 1, 3],
    header=0,
    names=['movieId', 'title', 'overview']
)

# Training ratings: the first three columns are userId, movieId, rating.
df_train = pd.read_csv(
    '/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_train.csv',
    usecols=[0, 1, 2]
)


### 设置参数

In [3]:
# Number of movies to recommend (also the length of the hot-movie fallback list)
reco_num = 10
# Number of most-similar movies fetched per watched movie for the weighted sum
sim_num = 10

### 功能函数

获取全局热门电影

In [4]:
def getHotItem(df_train, N=5):
    """
    Build the global "hot" list: the N movies with the most ratings.

    param:
        df_train: training ratings, type:dataframe with 'movieId' and 'rating' columns
        N: number of movies to keep
    return:
        hot_rank: type:dict, key:movieId, value:popularity score.
                  The most-rated movie scores 1.0 and every following
                  position scores 0.01 less.
    """
    # Count ratings per movie and keep the N most-rated ones, best first.
    popularity = df_train.groupby('movieId')['rating'].count()
    most_rated = popularity.sort_values(ascending=False).head(N)

    # The score only encodes the position in the popularity ordering.
    return {
        item_id: 1 - 0.01 * position
        for position, item_id in enumerate(most_rated.index)
    }

# Build the global popularity list once; recommendation() falls back to it
# for users that do not appear in the training set.
hot_rank = getHotItem(df_train, reco_num)

# Notebook display of the resulting {movieId: score} dict
hot_rank

{1: 0.94,
 50: 1.0,
 56: 0.91,
 100: 0.97,
 174: 0.92,
 181: 0.99,
 258: 0.98,
 288: 0.95,
 294: 0.96,
 300: 0.9299999999999999}

生成用户-电影排列表

In [5]:
def userItemDict(data):
    """
    Group rating records by user.

    param:
        data: iterable of [user, item, rating] records
    return:
        user_item: type:dict, key:user, value:dict, key:item, value:rating.
                   When a (user, item) pair occurs more than once the last
                   rating wins.
    """
    user_item = {}
    for user, item, rate in data:
        # Create the user's inner dict on first sight, then record the rating.
        user_item.setdefault(user, {})[item] = rate
    return user_item

# Build the user -> {item: rating} lookup from the training ratings.
# NOTE: df_train.values is a single array, so the ids come out as floats
# (see the 264.0-style keys in the displayed output).
user_item = userItemDict(df_train.values)

# Notebook display: the movies rated by user 105
user_item[105]

{264.0: 2.0,
 270.0: 5.0,
 271.0: 2.0,
 288.0: 4.0,
 302.0: 5.0,
 313.0: 5.0,
 324.0: 4.0,
 327.0: 4.0,
 340.0: 3.0,
 343.0: 2.0,
 347.0: 3.0,
 690.0: 3.0,
 751.0: 2.0,
 880.0: 3.0}

计算电影相似度

In [13]:
def overviewSimReco(movie_id, df_train, movie_info, n):
    """
    Return the n movies whose overview text is most similar to movie_id's.

    Candidates are the rows of movie_info whose movieId also occurs in
    df_train.  Overviews are vectorised with TF-IDF (built-in English stop
    words removed) and compared by cosine similarity.

    param:
        movie_id: ID of the query movie
        df_train: training ratings, type:dataframe; only 'movieId' is read
        movie_info: movie metadata, type:dataframe with 'movieId' and 'overview' columns
        n: number of similar movies to return
    return:
        reco_list: type:dict, key:movieId, value:similarity score.
                   The query movie itself is never included.  Empty when
                   movie_id has no metadata row among the training movies.

    NOTE: the TF-IDF model is refit on every call, so callers looping over
    many movies pay the fitting cost each time.
    """
    # Restrict the metadata to movies seen in training, indexed by movieId.
    train_movie_set = df_train['movieId'].unique()
    movie_metadata = movie_info[movie_info['movieId'].isin(train_movie_set)].set_index('movieId')

    # Row position of the query movie; without metadata there is nothing to compare.
    positions = np.flatnonzero(movie_metadata.index == movie_id)
    if positions.size == 0:
        return {}
    index = positions[0]

    # Missing overviews become empty documents instead of being dropped, so
    # that row i of the TF-IDF matrix always corresponds to row i of
    # movie_metadata (dropping rows would shift every later position).
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movie_metadata['overview'].fillna(''))

    # Similarity of the query movie to every candidate in one vectorised call.
    sim_movie = cosine_similarity(tfidf_matrix[index], tfidf_matrix).ravel()

    # Rank by descending similarity, drop the query movie, keep the top n.
    order = np.argsort(sim_movie)[::-1]
    order = order[order != index][:n]

    # Map row positions back to movie IDs.
    similar_movies_id = movie_metadata.index[order]
    return dict(zip(similar_movies_id, sim_movie[order]))

推荐系统

In [14]:
def recommendation(df_train, movie_info, user_item, user_id, hot_rank, K, R):
    """
    Recommend up to R movies for one user.

    param:
        df_train: training ratings dataframe, passed through to overviewSimReco
        movie_info: movie metadata dataframe, passed through to overviewSimReco
        user_item: user-item dict from the training set
                   {user1: {item1: rate1, item2: rate2}, ...}
        user_id: ID of the user to recommend for
        hot_rank: hot-movie dict {item: score}, used when the user is unknown
        K: number of most-similar movies fetched per watched movie
        R: maximum number of movies in the recommendation list
    return:
        rank_sorted: type:dict, key:user_id, value:list of (item, score)
                     tuples sorted by descending score, at most R entries
    """
    if user_id not in user_item:
        # Unknown user: fall back to the global hot list.
        print('user {} not in trainset, give hot rank list'.format(user_id))
        scores = hot_rank
    else:
        seen = user_item[user_id]
        scores = {}
        for watched_id, rating in seen.items():
            neighbours = overviewSimReco(watched_id, df_train, movie_info, K)
            for candidate, similarity in neighbours.items():
                # Only movies the user has not rated yet are candidates.
                if candidate not in seen:
                    # Accumulate rating * similarity over all watched movies.
                    scores[candidate] = scores.get(candidate, 0) + rating * similarity

    best_first = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return {user_id: best_first[0:R]}

### 电影推荐

In [15]:
# Read the user ID.  Users are keyed by number in user_item, so the entered
# value is converted with int(); on Python 3 input() returns a string that
# would otherwise never match a known user.
user_id = int(input('Please input your user ID:'))

recom_list = recommendation(df_train, movies, user_item, user_id, hot_rank, sim_num, reco_num)
# Print the recommended movies
print('\nRecommended movies：\n')

# recom_list maps user_id to a list of (movieId, score) pairs, best first.
for ranknum, (idx, _) in enumerate(recom_list[user_id], 1):
    print("%d. %s" % (ranknum, movies[movies['movieId'] == idx]['title'].values[0]))
    print('-' * 50)


Please input your user ID:7

Recommended movies：

1. Young Frankenstein
--------------------------------------------------
2. Fluke
--------------------------------------------------
3. Jason's Lyric
--------------------------------------------------
4. Deceiver
--------------------------------------------------
5. Little Odessa
--------------------------------------------------
6. Basic Instinct
--------------------------------------------------
7. Faces
--------------------------------------------------
8. Foreign Student
--------------------------------------------------
9. Star Trek V: The Final Frontier
--------------------------------------------------
10. Mulholland Falls
--------------------------------------------------
