# MSD歌曲推荐

In [1]:
# coding: utf-8
# _*_ coding:utf-8 _*_

import pandas as pd
import numpy as np
import json

import pickle as pk
import scipy.io as sio

#距离
import scipy.spatial.distance as ssd

users_mu = pk.load(open('users_mu.pkl', 'rb'))

## 1.对给定用户推荐物品/计算打分

### 1.1 基于用户的协同过滤

In [2]:
### 预测用户uid对item iid的打分
### similarity_matrix为用户与用户之间的相似矩阵
def user_CF_pred(uid, iid, similarity_matrix):
    sim_accumulate = 0.0
    rat_acc = 0.0
    for user_id in item_users[iid]: #对item iid打过分的所有用户
        sim = similarity_matrix[user_id, uid]
        
        if sim != 0:
            rat_acc += sim * (user_item_scores[user_id, iid] - users_mu[user_id])
            sim_accumulate += np.abs(sim)
            
        if sim_accumulate != 0:
            score = users_mu[uid] + rat_acc / sim_accumulate
        else:
            score = users_mu[uid]
        return score

### 1.2 基于物品的协同过滤

In [3]:
### n_Knns最相似的物品的数目
def item_CF_pred(uid, iid, similarity_matrix, n_Knns):
    sim_accumulate = 0.0
    rat_acc = 0.0
    n_nn_items = 0
    
    #相似度排序
    cur_items_similarity = np.array(similarity_matrix[iid, :])
    cur_items_similarity = cur_items_similarity.flatten()
    sort_index = sorted(((e,i) for i,e in enumerate(list(cur_items_similarity))), reverse=True)
    
    for i in range(0,len(sort_index)):
        cur_item_index = sort_index[i][1]
        
        if n_nn_items >= n_Knns: #相似度items已经足够多
            break;
            
        if cur_item_index in user_items[uid]: #对用户打过分的item
            #计算当前用户打过分item与其他item之间的相似度
            sim = similarity_matrix[iid, cur_item_index]
            
            if sim != 0:
                rat_acc += sim * (user_item_scores[uid, cur_item_index])
                sim_accumulate += np.abs(sim)
                
            n_nn_items += 1
            
        if sim_accumulate != 0:
            score = rat_acc / sim_accumulate
        else:
            score = users_mu[uid]
            
        if score < 0:
            score = 0.0
            
        return score

### 1.3 基于SVD的协同过滤

In [4]:
def svd_CF_pred(uid, iid):  
    score = mu + bi[iid] + bu[uid] + np.sum(qi[iid]* pu[uid])  
        
    #将打分范围控制在1-5之间
    #if score>5:  
        #score = 5  
    #elif score<1:  
        #score = 1  
        
    return score  

## 2.从文件读入训练好的模型

In [5]:
#读取模型函数
def load_json(filepath):
    with open(filepath, 'r') as file:
        dict_ = json.load(file)
        
        mu = dict_['mu']
        K = dict_['K']
        
        bi = np.asarray(dict_['bi'])
        bu = np.asarray(dict_['bu'])
        
        qi = np.asarray(dict_['qi'])
        pu = np.asarray(dict_['pu'])

In [6]:
#用户和item的索引
users_index = pk.load(open("users_index.pkl", 'rb'))
items_index = pk.load(open("items_index.pkl", 'rb'))

n_users = len(users_index)
n_items = len(items_index)

#用户-物品关系矩阵R
user_item_scores = sio.mmread("user_item_scores").todense()

#倒排表
##每个用户播放的歌曲
user_items = pk.load(open("user_items.pkl", 'rb'))
##时间参加的用户
item_users = pk.load(open("item_users.pkl", 'rb'))

#所有用户之间的相似度
similarity_matrix_users = pk.load(open("users_similarity_played.pkl", 'rb'))

#所有item之间的相似度
similarity_matrix_items = pk.load(open("items_similarity_played.pkl", 'rb'))

#svd模型
load_json('svd_model.json')

with open('svd_model.json') as file:
        dict_ = json.load(file)
        
        mu = dict_['mu']
        K = dict_['K']
        
        bi = np.asarray(dict_['bi'])
        bu = np.asarray(dict_['bu'])
        
        qi = np.asarray(dict_['qi'])
        pu = np.asarray(dict_['pu'])

## 3.根据模型，预测用户对item的打分

In [21]:
#返回推荐items及其打分（DataFrame）

N_KNNS = 10

def recommend(user):
    cur_user_id = users_index[user]
    
    #训练集中该用户打过份的item
    cur_user_items = user_items[cur_user_id]
    
    #该用户对所有item 的打分
    user_items_scores = np.zeros(n_items)
    
    #预测打分
    for i in range(n_items):
        if i not in cur_user_items: #训练集中没打过分
            #user_items_scores[i] = user_CF_pred(cur_user_id, i , similarity_matrix_users)
            #user_items_scores[i] = item_CF_pred(cur_user_id, i , similarity_matrix_items, N_KNNS)
            user_items_scores[i] = svd_CF_pred(cur_user_id, i)
            
    #推荐
    sort_index = sorted(((e,i) for i,e in enumerate(list(user_items_scores))), reverse=True)
    
    #创建DataFrame存放结果
    columns = ['item_id', 'score']
    df = pd.DataFrame(columns=columns)
    
    #存入结果
    for i in range(0, len(sort_index)):
        cur_item_index = sort_index[i][1]
        cur_item = list(items_index.keys())[list(items_index.values()).index(cur_item_index)]
        
        if ~np.isnan(sort_index[i][0]) and cur_item_index not in cur_user_items:
            df.loc[len(df)] = [cur_item, sort_index[i][0]]
            
    return df

## 4.读取测试数据

In [8]:
dpath = './data/'
df_triplet_test = pd.read_csv(dpath + 'triplet_dataset_sub_test.csv')

## 5.测试并计算评价指标
PR、覆盖度、RMSE

In [22]:
#统计用户总数
unique_users_test = df_triplet_test['user'].unique()

#为每个用户推荐item的数目
N_RS_ITEMS = 10

#性能评价参数初始化，用户计算Percison和Recall
n_hits = 0
n_total_rec_items = 0
n_test_items = 0

#所有被推荐商品的集合（对不同用户），用于计算覆盖度
all_rec_items = set()

#残差平方和，用于计算RMSE
rss_test = 0.0

#对每个测试用户
for user in unique_users_test:
    #测试集中该用户打过分的歌曲（用于计算评价指标的真实值）
    if user not in users_index: #user在训练集中没出现过，新用户不能用协同过滤
        print(str(user) + ' is a new user.\n')
        continue
    
    #测试集真实值
    df_user_records_test = df_triplet_test[df_triplet_test.user == user]
    
    #对每个测试用户，计算该用户对训练集中未出现过的商品的打分，并基于该打分进行推荐
    #返回结果为DataFrame
    df_rec_items = recommend(user)
    for i in range(N_RS_ITEMS):
        item = df_rec_items.iloc[i]['item_id']
        
        if item in df_user_records_test['song'].values:
            n_hits += 1
        all_rec_items.add(item)
        
    #计算RMSE
    #对测试集中的每条记录，计算真实值与预测之间的RMSE
    for i in range(df_user_records_test.shape[0]):
        item = df_user_records_test.iloc[i]['song']
        score = df_user_records_test.iloc[i]['fractional_play_count']
        
        df1 = df_rec_items[df_rec_items.item_id == item]
        if(df1.shape[0] == 0): #item不在推荐列表中，可能是新item在训练集中没出现过
            ptint(str(item) + ' is a new item or user\n')
            continue
        pred_score = df1['score'].values[0]
        
        rss_test += (pred_score - score)**2
        
    #推荐的item宗叔
    n_total_rec_items += N_RS_ITEMS
    
    #真实ite的总数
    n_test_items += df_user_records_test.shape[0]
    
#PS & recall
precision = n_hits / (1.0*n_total_rec_items)
recall = n_hits / (1.0*n_test_items)

#覆盖度：推荐商品占总需要推荐商品的比例
coverage = len(all_rec_items) / (1.0 * n_items)

#打分的均方差
rmse = np.sqrt(rss_test / df_triplet_test.shape[0])

67c5b5b1982902d15badd8ce0c18b3278ec4bfc0 is a new user.

62420be0fd0df5ab0eb4cba35a4bc7cb3e3b506a is a new user.

3ab78e39bddeaeb789edad041fff03050077417c is a new user.



In [23]:
precision

0.018257261410788383

In [24]:
recall

0.01759765364618051

In [25]:
coverage

0.22625

In [26]:
rmse

4.0520204009149134