In [1]:
import pandas as pd
import numpy as np
import pickle
import json  
from numpy.random import random

In [12]:
##读取数据
#用户和歌曲的索引
users_index = pickle.load(open("users_index.pkl", 'rb'))
songs_index = pickle.load(open("songs_index.pkl", 'rb'))

#训练集上用户的数目和歌曲的数目
num_users = len(users_index)
num_songs = len(songs_index)

#用户听过的歌曲
user_song_dict = pickle.load(open('user_song_dict.pkl', 'rb'))
#歌曲被听过的用户
song_user_dict = pickle.load(open('song_user_dict.pkl', 'rb'))

In [13]:
#读取训练数据
train_data = pd.read_csv('train_data.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,user,song,play_count,interest,score
0,22617,9c2dfee26bbdd4fb19e9800244bea6e7181caeae,SOLRGVL12A8C143BC3,10,1,1
1,10053,20ad98ab543da9ec41c6ac3b6354c5ab3ca6bc5e,SOWKQYL12AB0183B15,3,1,1
2,18172,a2679496cd0af9779a92a13ff7c6af5c81ea8c7b,SOILFUU12AB017C75F,4,1,1
3,21418,690ab317e29d08acb0a11e82eb0f83428cf812f9,SOPNLBX12A8C1377D4,2,1,1
4,9042,5339907639d69d89e0447aad8e0277d5e455dc76,SORPVUD12A67020454,5,1,1


In [14]:
#初始化模型参数
#隐含变量的维数
K = 40

#item和用户的偏置项
bi = np.zeros((num_songs,1))    
bu = np.zeros((num_users,1))   

#item和用户的隐含向量
qi =  np.zeros((num_songs,K))    
pu =  np.zeros((num_users,K))   


for uid in range(num_users):  #对每个用户
    pu[uid] = np.reshape(random((K,1))/10*(np.sqrt(K)),K)
       
for iid in range(num_songs):  #对每个item
    qi[iid] = np.reshape(random((K,1))/10*(np.sqrt(K)),K)

#所有用户的平均打分
mu = train_data['score'].mean()  #average rating

In [15]:
#预测用户对item的打分
def svd_pred(uid, iid):  
    score = mu + bi[iid] + bu[uid] + np.sum(qi[iid]* pu[uid])  
        
    #将打分范围控制在1-5之间
    if score>5:  
        score = 5  
    elif score<1:  
        score = 1  
        
    return score  

In [17]:
users_index['b1054cf00f4b3d2b7fc1548b19e423a50639c22f']

777

In [18]:
#模型训练
#gamma：为学习率
#Lambda：正则参数
#steps：迭代次数

steps=50
gamma=0.04
Lambda=0.15

#总的打分记录数目
n_records = train_data.shape[0]

for step in range(steps):  
    print ('The ' + str(step) + '-th  step is running' )
    rmse_sum=0.0 
            
    #将训练样本打散顺序
    kk = np.random.permutation(n_records)  
    for j in range(n_records):  
        #每次一个训练样本
        line = kk[j]  
        
        uid = users_index [train_data.iloc[line]['user']]
        iid = songs_index [train_data.iloc[line]['song']]
    
        rating  = train_data.iloc[line]['score']
                
        #预测残差
        eui = rating - svd_pred(uid, iid)  
        #残差平方和
        rmse_sum += eui**2  
                
        #随机梯度下降，更新
        bu[uid] += gamma * (eui - Lambda * bu[uid])  
        bi[iid] += gamma * (eui - Lambda * bi[iid]) 
                
        temp = qi[iid]  
        qi[iid] += gamma * (eui* pu[uid]- Lambda*qi[iid] )  
        pu[uid] += gamma * (eui* temp - Lambda*pu[uid])  
            
    #学习率递减
    gamma=gamma*0.93  
    print ("the rmse of this step on train data is ",np.sqrt(rmse_sum/n_records))  

The 0-th  step is running
the rmse of this step on train data is  [0.84969742]
The 1-th  step is running
the rmse of this step on train data is  [0.08276218]
The 2-th  step is running
the rmse of this step on train data is  [0.0453674]
The 3-th  step is running
the rmse of this step on train data is  [0.0350407]
The 4-th  step is running
the rmse of this step on train data is  [0.03225819]
The 5-th  step is running
the rmse of this step on train data is  [0.03133052]
The 6-th  step is running
the rmse of this step on train data is  [0.03112185]
The 7-th  step is running
the rmse of this step on train data is  [0.03080814]
The 8-th  step is running
the rmse of this step on train data is  [0.03087137]
The 9-th  step is running
the rmse of this step on train data is  [0.03071762]
The 10-th  step is running
the rmse of this step on train data is  [0.03065549]
The 11-th  step is running
the rmse of this step on train data is  [0.03058517]
The 12-th  step is running
the rmse of this step on 

In [20]:
#保存参数模型
# A method for saving object data to JSON file
def save_json(filepath):
    dict_ = {}
    dict_['mu'] = mu
    dict_['K'] = K
    
    dict_['bi'] = bi.tolist()
    dict_['bu'] = bu.tolist()
    
    dict_['qi'] = qi.tolist()
    dict_['pu'] = pu.tolist()

    # Creat json and save to file
    json_txt = json.dumps(dict_)
    with open(filepath, 'w') as file:
        file.write(json_txt)

In [21]:
# A method for loading data from JSON file
def load_json(filepath):
    with open(filepath, 'r') as file:
        dict_ = json.load(file)

        mu = dict_['mu']
        K = dict_['K']

        bi = np.asarray(dict_['bi'])
        bu = np.asarray(dict_['bu'])
    
        qi = np.asarray(dict_['qi'])
        pu = np.asarray(dict_['pu'])

In [22]:
save_json('svd_model.json')
load_json('svd_model.json')

In [23]:
#测试
def svd_CF_recommend(user):
    cur_user_id = users_index[user]
    
    #训练集中该用户打过分的item
    cur_user_items = user_song_dict[cur_user_id]

    #该用户对所有item的打分
    user_items_scores = np.zeros(num_songs)

    #预测打分
    for i in range(num_songs):  # all items 
        if i not in cur_user_items: #训练集中没打过分
            user_items_scores[i] = svd_pred(cur_user_id, i)  #预测打分
    
    #推荐
    #Sort the indices of user_item_scores based upon their value，Also maintain the corresponding score
    sort_index = sorted(((e,i) for i,e in enumerate(list(user_items_scores))), reverse=True)
    
    #Create a dataframe from the following
    columns = ['item', 'score']
    df = pd.DataFrame(columns=columns)
         
    #Fill the dataframe with top 20 (n_rec_items) item based recommendations
    #sort_index = sort_index[0:n_rec_items]
    #Fill the dataframe with all items based recommendations
    for i in range(0,len(sort_index)):
        cur_item_index = sort_index[i][1] 
        cur_item = list (songs_index.keys()) [list (songs_index.values()).index (cur_item_index)]
            
        if ~np.isnan(sort_index[i][0]) and cur_item_index not in cur_user_items:
            df.loc[len(df)]=[cur_item, sort_index[i][0]]
    
    return df

In [25]:
#读取测试数据
test_data = pd.read_csv('test_data.csv')
test_data.head()

Unnamed: 0.1,Unnamed: 0,user,song,play_count,interest,score
0,8044,597464c554c28f95c08553bc7336dfaae2b3abba,SOTEGWG12AB01897AC,3,1,1
1,17527,ab792cde4fb868b3cf232791f173f7fd5e2ee83d,SOYIJIL12A6701F1C1,8,1,1
2,11652,db6a78c78c9239aba33861dae7611a6893fb27d5,SOYRAHL12A6310D821,8,1,1
3,37192,73a9b86c602d2ab5eb151b77f58bb6e7315dd7b2,SOGTDJQ12A8C13324F,5,1,1
4,36746,7e543508a213f4f22e0cb54ecf2df9c370070a28,SOVRZIX12AAF3B2A32,1,0,1


In [26]:
#统计总的用户
unique_users_test = test_data['user'].unique()

#为每个用户推荐的item的数目
n_rec_items = 10

#性能评价参数初始化，用户计算Percison和Recall
n_hits = 0
n_total_rec_items = 0
n_test_items = 0

#所有被推荐商品的集合（对不同用户），用于计算覆盖度
all_rec_items = set()

#残差平方和，用与计算RMSE
rss_test = 0.0

#对每个测试用户
for user in unique_users_test:
    #测试集中该用户打过分的电影（用于计算评价指标的真实值）
    if user not in users_index:   #user在训练集中没有出现过，新用户不能用协同过滤
        print(str(user) + ' is a new user.\n')
        continue
   
    user_records_test= test_data[test_data.user == user]
    
    #对每个测试用户，计算该用户对训练集中未出现过的商品的打分，并基于该打分进行推荐（top n_rec_items）
    #返回结果为DataFrame
    rec_items = svd_CF_recommend(user)
    pre_num = min(len(rec_items, n_rec_items))
    for i in range(pre_num):
        item = rec_items.iloc[i]['item']
        
        if item in user_records_test['song'].values:
            n_hits += 1
        all_rec_items.add(item)
    
    #推荐的item总数
    n_total_rec_items += pre_num
    
    #真实item的总数
    n_test_items += user_records_test.shape[0]

#Precision & Recall
precision = n_hits / (1.0*n_total_rec_items)
recall = n_hits / (1.0*n_test_items)
print('precision:{}, recall:{}'.format(precision, recall))

NameError: name 'items_index' is not defined