## 隐语义模型推荐算法LFM

利用用户行为数据，训练隐语义模型，分解出用户因子矩阵和电影因子矩阵，通过矩阵相乘获得用户电影得分，根据得分高低进行排序推荐。

In [1]:
# coding:utf-8
import pandas as pd
import numpy as np
import os
from surprise import Reader, Dataset
from surprise import NormalPredictor, BaselineOnly
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVD, SVDpp, NMF, model_selection
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import math
import time
import operator

### 读取数据

In [2]:
movies = pd.read_csv('/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_movie_info.csv',\
                        usecols=[0, 1, 6], header=0, names=['movieId', 'title', 'genres'])

df_train = pd.read_csv('/home/zwj/Desktop/recommend/movielens/moive_database/v1/v1_train.csv', \
                       usecols=[0, 1, 2])  # userId, movieId, rating

### 设置参数

In [3]:
# 推荐电影数
reco_num = 10
# 评分转成0/1分类的阈值
thres_rate = 4.0

### 功能函数

获取全局热门电影

In [4]:
def getHotItem(df_train, N=5):
    """
    param:
        df_train: 训练数据集 type:dataframe
        N: 推荐的电影数
    return: 
        hot_rank: 该用户的推荐热门电影列表 type:dict, key:user, value:dict, key:item, value:sim
    """

    item_count = df_train.groupby('movieId')['rating'].count().sort_values(ascending=False)

    hot_rank = {}

    r = 0
    for item_id in item_count[0:N].index:
        hot_rank[item_id] = 1 - 0.01 * r
        r += 1
    return hot_rank

hot_rank = getHotItem(df_train, reco_num)

hot_rank

{1: 0.94,
 50: 1.0,
 56: 0.91,
 100: 0.97,
 174: 0.92,
 181: 0.99,
 258: 0.98,
 288: 0.95,
 294: 0.96,
 300: 0.9299999999999999}

生成用户-电影排列表

In [5]:
def userItemDict(data):
    """
    param:
        data: lsit [user, item, rating]
    return:
        user_item: 用户-电影排列表 type:dict, key:user, value:dict, key:item, value:rate
    """
    user_item = {}
    for user, item, rate in data:
        if user not in user_item:
            user_item[user] = {}
        user_item[user].update({item: rate})
    return user_item

# 生成user-tiem排列表
user_item = userItemDict(df_train.values)

user_item[105]

{264.0: 2.0,
 270.0: 5.0,
 271.0: 2.0,
 288.0: 4.0,
 302.0: 5.0,
 313.0: 5.0,
 324.0: 4.0,
 327.0: 4.0,
 340.0: 3.0,
 343.0: 2.0,
 347.0: 3.0,
 690.0: 3.0,
 751.0: 2.0,
 880.0: 3.0}

数据采样，调整正负样本比例1:1

In [10]:
def sampleData(data, thres_rate):
    """
    param:
        data: 二维矩阵 [item, user, rate]
        thres_rate: 评分转成0/1分类的阈值
    return:
        train_data 调整正负样本数之后的训练数据 type:list [(user, item, class)]
    """
    # 定义数据集
    train_data = []
    # 正样本字典 key:user value:tuple (item, rate)
    pos_dict = {}
    # 负样本字典 key:user value:tuple (item, rate)
    neg_dict = {}

    for user, item, rate in data:
        if user not in pos_dict:
            pos_dict[user] = []
        if user not in neg_dict:
            neg_dict[user] = []
        if rate >= thres_rate:
            pos_dict[user].append((item, rate))
        else:
            neg_dict[user].append((item, rate))
    for user in pos_dict:
        # 获取每个用户的正负样本数目，去原本正样本或者负样本的最小值，样本多余的截取
        data_num = min(len(pos_dict.get(user, [])), len(neg_dict.get(user, [])))
        if data_num > 0:
            # 按分值从大到小排序，保留data_num个样本数
            sorted_pos_list = sorted(pos_dict[user], key=lambda element: element[1], reverse=True)[:data_num]
            train_data += [(user, item, 1) for item, rate in sorted_pos_list]
            sorted_neg_list = sorted(neg_dict[user], key=lambda element: element[1], reverse=True)[:data_num]
            train_data += [(user, item, 0) for item, rate in sorted_neg_list]

    return train_data

训练模型

In [11]:
def trainModel(df_train):
    """
    param：
        df_train: 训练数据dataframe格式 包含字段 ('userId', 'movieId', 'rating')
    return:
        algo: 训练好的模型
    """
    # 读取数据
    reader = Reader()
    algo = SVD()
    data = Dataset.load_from_df(df_train[['userId', 'movieId', 'rating']], reader)
    ###################### train ######################
    # 训练模型
    # 方式 1: 交叉验证
    # (算法, 数据, loss计算方式， CV=交叉验证次数
    model_selection.cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

    # 方式 2: 没有交叉验证
    # trainset = data.build_full_trainset()
    # algo.fit(trainset)
    ###################################################

    # 返回训练好的模型
    return algo

In [12]:
# 未调整正负样本数
df_train['rating'] = df_train['rating'].apply(lambda x: 1.0 if x>=thres_rate else 0.0)
algo = trainModel(df_train)

# 调整正负样本数 1：1
# train_data = sampleData(df_train.values, thres_rate)
# df_sample = pd.DataFrame(train_data, columns=['userId', 'movieId', 'rating'])
# algo = trainModel(df_sample)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
MAE (testset)     0.4518  0.4568  0.4535  0.4540  0.0021  
RMSE (testset)    0.6714  0.6751  0.6724  0.6730  0.0015  
Fit time          7.38    9.64    4.71    7.24    2.01    
Test time         0.61    0.46    0.48    0.51    0.07    


推荐系统

In [13]:
def recommendation(model, user_item, user_id, item_set, hot_rank, R):
    """
    param：
        model: 训练的模型
        user_item: 训练集中user-item字典 {user1 : {item1 : rate1, item2 : rate2}, ...}}
        user_id：推荐的用户id
        item_set: 训练集中的电影集合
        hot_rank: 热门电影列表
        R：推荐列表中电影个数
    return：
        rank_sorted：该用户的推荐电影列表 type:dict, key:user, value:dict, key:item, value:score
    """
    # 存储用户推荐电影
    rank = {}
    # 开辟用户空子字典 ('rank: ', {user_id: {}})
    rank.setdefault(user_id, {})

    # 如果该用户不在训练集中，则推荐热门电影
    if user_id not in user_item:
        print('user {} not in trainset, give hot rank list'.format(user_id))
        rank[user_id] = hot_rank
    else:
        # 用户已观看的电影集合
        item_watched_list = user_item[user_id]

        for item_id in item_set:
            if item_id in item_watched_list:
                continue
            rank[user_id].setdefault(item_id, 0)

            # 将模型预测结果赋给rank[user_id][item_id]
            rank[user_id][item_id] = model.predict(user_id, item_id).est

    # 推荐列表按评分由高到低排序
    rank_sorted = {}
    rank_sorted[user_id] = sorted(rank[user_id].items(), key=operator.itemgetter(1), reverse=True)[0:R]

    return rank_sorted

### 电影推荐

In [15]:
# 输入用户ID
user_id = input('Please input your user ID:')

recom_list = recommendation(algo, user_item, user_id, df_train['movieId'].unique(), hot_rank, reco_num)
# 输出推荐电影
print('\nRecommended movies：\n')

ranknum = 0
for idx, rate  in recom_list.values()[0]:
    ranknum += 1
    print("%d. %s" %(ranknum, movies[movies['movieId'] == idx]['title'].values[0]))
    print '-'*50


Please input your user ID:105

Recommended movies：

1. Toy Story
--------------------------------------------------
2. GoldenEye
--------------------------------------------------
3. Four Rooms
--------------------------------------------------
4. Get Shorty
--------------------------------------------------
5. Copycat
--------------------------------------------------
6. Twelve Monkeys
--------------------------------------------------
7. Babe
--------------------------------------------------
8. Dead Man Walking
--------------------------------------------------
9. Richard III
--------------------------------------------------
10. Mighty Aphrodite
--------------------------------------------------
