In [1]:
import pandas as pd
import numpy as np
import pickle

#稀疏矩阵，打分表
import scipy.io as sio
import os

#距离
import scipy.spatial.distance as ssd

In [2]:
def _load_pickle(path):
    """Load a single pickled object from ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Lookup tables for users and songs (raw identifier -> integer index)
users_index = _load_pickle("users_index.pkl")
songs_index = _load_pickle("songs_index.pkl")

# Number of users and songs in the training set
num_users = len(users_index)
num_songs = len(songs_index)

# Songs each user has listened to (keyed by user index)
user_song_dict = _load_pickle('user_song_dict.pkl')
# Users who have listened to each song (keyed by song index)
song_user_dict = _load_pickle('song_user_dict.pkl')

In [3]:
# Load the users' scores (the user/song interest matrix) from the Matrix Market
# file "user_song_interest". The trailing "#.todense()" is disabled, so the
# matrix is kept in sparse form.
# NOTE(review): later cells read `user_song_scores` and `user_mean_score`, but no
# visible cell defines them. Presumably they were derived from this matrix in
# cells that are no longer present (execution counts jump from In [4] to In [8]);
# confirm and restore those definitions so the notebook runs on a fresh kernel.
user_song_interest = sio.mmread("user_song_interest")#.todense()

In [4]:
# Show the matrix repr (shape, dtype, number of stored elements) as a sanity check.
user_song_interest

<790x800 sparse matrix of type '<class 'numpy.float64'>'
	with 20104 stored elements in COOrdinate format>

In [8]:
# Compute the similarity between two users
def users_similarity(uid1, uid2):
    """Return the mean-centred cosine similarity of two users over shared songs.

    Only songs present in both ``user_song_dict[uid1]`` and
    ``user_song_dict[uid2]`` are considered. For each of those songs the user's
    score (``user_song_scores[uid, song]``) minus that user's mean score
    (``user_mean_score[uid]``) is taken, and the result is
    ``1 - cosine_distance`` of the two centred score vectors.

    Parameters
    ----------
    uid1, uid2 : user indices usable as keys of ``user_song_dict`` and as
        row indices of ``user_song_scores``.

    Returns
    -------
    float
        0.0 when the users share no song, or when either centred vector is
        all zeros (every shared score equals that user's mean); otherwise the
        cosine similarity.
    """
    songs_of_uid2 = user_song_dict[uid2]
    # dict.fromkeys de-duplicates while keeping uid1's iteration order, which is
    # what the original dict-based collection of shared songs did.
    common_songs = [song for song in dict.fromkeys(user_song_dict[uid1])
                    if song in songs_of_uid2]

    if not common_songs:  # no song in common -> similarity is 0
        return 0.0

    # Scores of uid1 on the shared songs, centred on uid1's mean score
    s1 = np.array([user_song_scores[uid1, song] - user_mean_score[uid1]
                   for song in common_songs])
    # Scores of uid2 on the shared songs, centred on uid2's mean score
    s2 = np.array([user_song_scores[uid2, song] - user_mean_score[uid2]
                   for song in common_songs])

    # A zero vector has no direction: the cosine is undefined and scipy would
    # divide by zero (RuntimeWarning + NaN). Treat it as "no similarity".
    if not s1.any() or not s2.any():
        return 0.0

    similarity = 1 - ssd.cosine(s1, s2)

    # Defensive fallback for any remaining NaN (e.g. NaN scores in the input)
    if np.isnan(similarity):
        similarity = 0.0
    return similarity

In [10]:
# Compute the similarity between every pair of users.
# The matrix is symmetric, so each pair is computed once and mirrored.
users_similarity_matrix = np.matrix(np.zeros(shape=(num_users, num_users)), float)
for user_x in range(num_users):
    # A user is fully similar to themselves
    users_similarity_matrix[user_x, user_x] = 1.0

    # Progress indicator, printed every 100 users
    if(user_x % 100 == 0):
        print ("ui=%d " % (user_x))

    for user_y in range(user_x+1, num_users):
        pair_similarity = users_similarity(user_x, user_y)
        users_similarity_matrix[user_y, user_x] = pair_similarity
        users_similarity_matrix[user_x, user_y] = pair_similarity

# Persist the matrix; the context manager guarantees the file is flushed and closed
with open("users_similarity_matrix.pkl", 'wb') as matrix_file:
    pickle.dump(users_similarity_matrix, matrix_file)

ui=0 


  dist = 1.0 - uv / np.sqrt(uu * vv)


ui=100 
ui=200 
ui=300 
ui=400 
ui=500 
ui=600 
ui=700 


In [11]:
# Predict whether a user would listen to a song, expressed as a predicted score
def User_CF_pred(user, song):
    """Predict ``user``'s score for ``song`` from the users who heard it.

    Every user listed in ``song_user_dict[song]`` whose similarity to ``user``
    (from ``users_similarity_matrix``) is non-zero contributes its mean-centred
    score for the song, weighted by that similarity. The weighted sum is
    normalised by the sum of absolute similarities and added to ``user``'s own
    mean score.

    Returns ``user_mean_score[user]`` unchanged when no contributing user exists.
    """
    weighted_deviation = 0.0
    weight_total = 0.0

    for neighbour in song_user_dict[song]:  # every user who listened to the song
        # Similarity between that listener and the target user
        weight = users_similarity_matrix[neighbour, user]
        if weight == 0:
            continue

        # The listener's score for the song relative to their own average
        deviation = user_song_scores[neighbour, song] - user_mean_score[neighbour]
        weighted_deviation += weight * deviation
        weight_total += np.abs(weight)

    if weight_total == 0:
        # No similar users: fall back to the user's average score
        return user_mean_score[user]

    return user_mean_score[user] + weighted_deviation/weight_total

In [12]:
# Predict, for one user, how likely they are to listen to each unheard song
# user: raw user identifier (a key of users_index)
# return: DataFrame with columns ['song', 'score'], highest score first
def recommend(user):
    """Rank every song the user has not heard by its predicted score.

    Parameters
    ----------
    user : raw user identifier; must be a key of ``users_index``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['song', 'score']``: the song identifier (a key of
        ``songs_index``) and the score from ``User_CF_pred``. Rows are sorted
        by score in descending order (ties: higher song index first). Songs in
        ``user_song_dict`` for this user and songs with a NaN score are omitted.

    Raises
    ------
    KeyError
        If ``user`` is not in ``users_index``.
    """
    # Translate the user identifier into its integer index
    cur_user_id = users_index[user]

    # Songs this user listened to in the training set
    cur_user_songs = user_song_dict[cur_user_id]

    # Reverse lookup (song index -> song identifier), built once rather than
    # searched linearly for every song. setdefault keeps the first identifier
    # should two identifiers ever share an index.
    index_to_song = {}
    for song_key, song_idx in songs_index.items():
        index_to_song.setdefault(song_idx, song_key)

    # Predict a score for every song not yet heard; drop undefined (NaN) scores
    # before sorting because NaN does not order consistently.
    candidates = []
    for song in range(num_songs):
        if song in cur_user_songs:
            continue
        score = float(User_CF_pred(cur_user_id, song))
        if not np.isnan(score):
            candidates.append((score, song))

    # Best score first; equal scores fall back to the larger song index
    candidates.sort(reverse=True)

    # Build the frame in one go instead of appending row by row
    records = [(index_to_song[song], score) for score, song in candidates]
    return pd.DataFrame(records, columns=['song', 'score'])

In [13]:
# Read the test data
# NOTE(review): the displayed frame contains "Unnamed: 0.1" and "Unnamed: 0"
# columns, which look like index columns written out by earlier to_csv calls.
# Only the 'user' and 'song' columns are used below; confirm whether the extra
# columns can be dropped or the CSV re-exported with index=False.
test_data = pd.read_csv('test_data.csv')
test_data.head()

Unnamed: 0.1,Unnamed: 0,user,song,play_count,score
0,4680,c1912062175dc4b3ea5a3a0cdb963c704bb9c881,SOVDSJC12A58A7A271,45,1
1,5684,4bcc4cfd9acf7e19bbccd398f8503ba79fb66513,SOAUWYT12A81C206F1,1,1
2,28695,c1255748c06ee3f6440c51c439446886c7807095,SOWCBKV12AC90732A6,3,1
3,24268,19d2a7ccba17143f6b0425fc9c62a5f8df8455f7,SOTRQEJ12AF72A45D7,12,1
4,32824,66abc1ae25dca07b75109164f9dacfe33f9572ba,SONHWUN12AC468C014,1,1


In [15]:
# Evaluate the recommender on the test data
# All distinct users appearing in the test set
unique_users_test = test_data['user'].unique()

# Number of songs recommended to each user
n_rec_songs = 10

# Counters used to compute precision and recall
n_hits = 0
n_total_rec_songs = 0
n_test_songs = 0

# Every song recommended to any user (kept for computing coverage)
all_rec_songs = set()

# For each test user
for user in unique_users_test:
    # Users absent from the training set are new users; collaborative
    # filtering has nothing to base a recommendation on, so skip them.
    if user not in users_index:
        print(str(user) + ' is a new user.\n')
        continue

    # Songs this user actually listened to in the test set (the ground truth)
    user_records_test = test_data[test_data.user == user]
    test_songs = set(user_records_test['song'].values)

    # Score every song unseen in training and keep the top n_rec_songs.
    # head() tolerates users with fewer than n_rec_songs candidates, where
    # positional indexing would raise IndexError.
    rec_songs = recommend(user)
    top_songs = list(rec_songs['song'].head(n_rec_songs))

    n_hits += sum(1 for song in top_songs if song in test_songs)
    all_rec_songs.update(top_songs)

    # Total number of songs actually recommended
    n_total_rec_songs += len(top_songs)

    # Total number of ground-truth songs
    n_test_songs += user_records_test.shape[0]

# Precision & Recall (reported as 0.0 when there is nothing to divide by)
precision = n_hits / (1.0*n_total_rec_songs) if n_total_rec_songs else 0.0
recall = n_hits / (1.0*n_test_songs) if n_test_songs else 0.0
print('precision:{}, recall:{}'.format(precision, recall))

3b4bb393138bba331e3dde43dfdc05554f05a743 is a new user.

6b3d5eaba2e55699cb725d0c605c6eca1b302dfe is a new user.

c337bc0346e01c8b0105691b8f970b9a6960a572 is a new user.

a3a9329463c55f63876f84b0c47b4f90ca9db7bc is a new user.

bb0d8c40d8932c5960e8e4443aa391adb9dfe9da is a new user.

precision:0.008803301237964236, recall:0.008534471262835045
