# 读入数据

In [1]:
#导包
import pandas as pd
import numpy as np

In [2]:
# 读入数据
df = pd.read_csv("example.txt",header=None)

In [4]:
df.shape

(17, 3)

In [5]:
df.head()

Unnamed: 0,0,1,2
0,1,1,4
1,1,2,3
2,1,5,5
3,2,1,5
4,2,3,4


In [6]:
# 修改列名
df.columns=['用户id','物品id','喜好程度']

In [15]:
df

Unnamed: 0,用户id,物品id,喜好程度
0,1,1,4
1,1,2,3
2,1,5,5
3,2,1,5
4,2,3,4
5,2,5,4
6,3,1,4
7,3,3,5
8,3,4,3
9,3,5,4


In [8]:
#查看用户
df['用户id'].value_counts()

3    4
6    3
2    3
1    3
5    2
4    2
Name: 用户id, dtype: int64

In [11]:
#查看物品
set(df['物品id'])

{1, 2, 3, 4, 5, 6}

# 根据用户id和物品id构建rating矩阵

In [13]:
# 使用 pivot函数
# 用户id作为行索引, 物品id作为列索引, 统计对应用户和物品之间的相关 喜好程度值
df_pivot = df.pivot(index="用户id",columns="物品id",values="喜好程度")

In [14]:
df_pivot
# 有值的代表该用户对该物品有评分
# 没值的代表该用户没有对该物品评过分

物品id,1,2,3,4,5,6
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,3.0,,,5.0,
2,5.0,,4.0,,4.0,
3,4.0,,5.0,3.0,4.0,
4,,3.0,,,,5.0
5,,4.0,,,,4.0
6,,,2.0,4.0,,5.0


In [16]:
# 用户个数
df_pivot.shape[0]

6

In [17]:
# 物品个数
df_pivot.shape[1]

6

## 将数据中的空值填充为0

In [18]:
freq = df_pivot.fillna(0)

In [19]:
freq

物品id,1,2,3,4,5,6
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4.0,3.0,0.0,0.0,5.0,0.0
2,5.0,0.0,4.0,0.0,4.0,0.0
3,4.0,0.0,5.0,3.0,4.0,0.0
4,0.0,3.0,0.0,0.0,0.0,5.0
5,0.0,4.0,0.0,0.0,0.0,4.0
6,0.0,0.0,2.0,4.0,0.0,5.0


In [20]:
freq_matrix = freq.values
freq_matrix

array([[4., 3., 0., 0., 5., 0.],
       [5., 0., 4., 0., 4., 0.],
       [4., 0., 5., 3., 4., 0.],
       [0., 3., 0., 0., 0., 5.],
       [0., 4., 0., 0., 0., 4.],
       [0., 0., 2., 4., 0., 5.]])

In [21]:
np.mat(freq.values) #变成矩阵

matrix([[4., 3., 0., 0., 5., 0.],
        [5., 0., 4., 0., 4., 0.],
        [4., 0., 5., 3., 4., 0.],
        [0., 3., 0., 0., 0., 5.],
        [0., 4., 0., 0., 0., 4.],
        [0., 0., 2., 4., 0., 5.]])

## 使用余弦相似度计算相似度矩阵

在sklearn中有自带的余弦相似度计算函数

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# 使用余弦相似度函数计算
user_similar = cosine_similarity(freq_matrix)

In [25]:
user_similar

array([[1.        , 0.74926865, 0.62667956, 0.21828206, 0.3       ,
        0.        ],
       [0.74926865, 1.        , 0.91301651, 0.        , 0.        ,
        0.1579597 ],
       [0.62667956, 0.91301651, 1.        , 0.        , 0.        ,
        0.40368671],
       [0.21828206, 0.        , 0.        , 1.        , 0.9701425 ,
        0.63913749],
       [0.3       , 0.        , 0.        , 0.9701425 , 1.        ,
        0.52704628],
       [0.        , 0.1579597 , 0.40368671, 0.63913749, 0.52704628,
        1.        ]])

In [26]:
pd.DataFrame(user_similar)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.749269,0.62668,0.218282,0.3,0.0
1,0.749269,1.0,0.913017,0.0,0.0,0.15796
2,0.62668,0.913017,1.0,0.0,0.0,0.403687
3,0.218282,0.0,0.0,1.0,0.970143,0.639137
4,0.3,0.0,0.0,0.970143,1.0,0.527046
5,0.0,0.15796,0.403687,0.639137,0.527046,1.0


In [27]:
user_similar.shape

(6, 6)

## 构建物品推荐函数

假如说我们现在想给第2个用户推荐商品

我们可以用这种方法提取出第2个用户的评分向量

In [30]:
freq_matrix

array([[4., 3., 0., 0., 5., 0.],
       [5., 0., 4., 0., 4., 0.],
       [4., 0., 5., 3., 4., 0.],
       [0., 3., 0., 0., 0., 5.],
       [0., 4., 0., 0., 0., 4.],
       [0., 0., 2., 4., 0., 5.]])

In [29]:
freq_matrix[2,:]

array([4., 0., 5., 3., 4., 0.])

In [31]:
user_id_action = freq_matrix[2,:]      #用户id 对所有商品的行为评分  
user_id_action

array([4., 0., 5., 3., 4., 0.])

In [32]:
user_id_action.shape

(6,)

可以用这种方法提取出所有用户对第4个物品的的评分向量

In [34]:
freq_matrix[:,4]

array([5., 4., 4., 0., 0., 0.])

In [35]:
item_id_action = freq_matrix[:,4]      #物品id 得到的所有用户评分  
item_id_action

array([5., 4., 4., 0., 0., 0.])

In [36]:
item_id_action.shape

(6,)

### 假如说我们现在想要找出和该用户最相似的三个用户

那么应该从这个user_similar 矩阵中提取出三个最大的值,所对应的用户

假设k 等于3,则用下面方法可以提取出这三个用户的ID

In [37]:
k = 3

In [38]:
#初始化一个分数和权重
score = 0
weight = 0

In [39]:
user_id = 2
item_id = 4

In [42]:
user_similar

array([[1.        , 0.74926865, 0.62667956, 0.21828206, 0.3       ,
        0.        ],
       [0.74926865, 1.        , 0.91301651, 0.        , 0.        ,
        0.1579597 ],
       [0.62667956, 0.91301651, 1.        , 0.        , 0.        ,
        0.40368671],
       [0.21828206, 0.        , 0.        , 1.        , 0.9701425 ,
        0.63913749],
       [0.3       , 0.        , 0.        , 0.9701425 , 1.        ,
        0.52704628],
       [0.        , 0.1579597 , 0.40368671, 0.63913749, 0.52704628,
        1.        ]])

In [41]:
user_similar[user_id]

array([0.62667956, 0.91301651, 1.        , 0.        , 0.        ,
       0.40368671])

In [43]:
np.argsort(user_similar[user_id])

array([3, 4, 5, 0, 1, 2], dtype=int64)

In [47]:
#对一个序列进行反转
np.argsort(user_similar[user_id])[::-1][1:k+1]

array([1, 0, 5], dtype=int64)

In [48]:
similar_index = np.argsort(user_similar[user_id])[::-1][1:k+1]
similar_index

array([1, 0, 5], dtype=int64)

## 计算用户评分平均值

In [49]:
user_id_action #第2个用户的所有评分

array([4., 0., 5., 3., 4., 0.])

In [56]:
# 用户评价分数的平均值
user_id_action.sum()

16.0

In [51]:
user_id_action!=0

array([ True, False,  True,  True,  True, False])

In [52]:
# 提取出用户都有评价的分数数据
user_id_action[user_id_action!=0]

array([4., 5., 3., 4.])

In [53]:
# 计算一共有多少个
user_id_action[user_id_action!=0].size

4

In [54]:
# 用户评分的平均值
user_mean = user_id_action.sum() / user_id_action[user_id_action!=0].size
user_mean

4.0

说明这个第2个用户,对所有物品的平均评分是4.0分

In [57]:
for i in similar_index:
    print(i)

1
0
5


In [58]:
#初始化一个分数和权重
score = 0
weight = 0

In [60]:
user_similar[1]

array([0.74926865, 1.        , 0.91301651, 0.        , 0.        ,
       0.1579597 ])

In [62]:
item_id_action[1]

4.0

In [63]:
for i in similar_index:
    # 现在我们有了三个用户, 我们要计算的是, 这三个用户对物品的推荐分数 ,然后还要进行加权
    if item_id_action[i] != 0:
        # 如果等于0 说明什么呢, 说明当前这个用户没有对这个物品评分,也就没有必要计算了
        # 如果不等于０　，则计算这个用户的推荐值
        user_id_action_for_i = freq_matrix[i,:]
        # 计算该用户的评分平均值
        user_id_mean_for_i = np.sum(user_id_action_for_i)/user_id_action_for_i[user_id_action_for_i!=0].size
        # 计算推荐分数
        #  user_similar[i]  这个提取出这两个用户的相似度,作为权重
        score += user_similar[i]*(item_id_action[i]-user_id_mean_for_i)
        weight += abs(user_similar[i])  

In [64]:
score

array([ 0.75024378,  0.41593532,  0.32234072,  0.21828206,  0.3       ,
       -0.05265323])

In [65]:
weight

array([1.74926865, 1.74926865, 1.53969607, 0.21828206, 0.3       ,
       0.1579597 ])

In [66]:
item_id_action

array([5., 4., 4., 0., 0., 0.])

In [38]:
user_id_action

array([4., 0., 5., 3., 4., 0.])

## 基于用户协同过滤的函数

In [67]:
#构建一个基于用户和物品的推荐
def Recommendation(user_id,item_id,similar,k=3):
    """不减平均数的计算方法"""
    score = 0
    weight = 0
    user_id_action = freq_matrix[user_id,:]      #用户user_id 对所有商品的行为评分  
    item_id_action = freq_matrix[:,item_id]      #物品item_id 得到的所有用户评分  

    user_id_similar = similar[user_id,:]      #用户user_id 对所有用户的相似度    
    similar_index = np.argsort(user_id_similar)[-(k+1):-1]  #最相似的k个用户的index（除了自己）
    
    for j in similar_index :
        if item_id_action[j]!=0:
            user_id_j_action = freq_matrix[j,:]

            score += user_id_similar[j]*(item_id_action[j])
#             print(score)
            weight += abs(user_id_similar[j])
#             print(weight)

    if weight==0:  
        return 0
    else:
        return score/float(weight)

In [68]:
#构建一个基于用户和物品的推荐
def Recommendation_mean(user_id,item_id,similar,k=10):
    """减去平均数的计算方法"""
    score = 0
    weight = 0
    user_id_action = freq_matrix[user_id,:]      #用户user_id 对所有商品的行为评分  
    item_id_action = freq_matrix[:,item_id]      #物品item_id 得到的所有用户评分  

    user_id_similar = similar[user_id,:]      #用户user_id 对所有用户的相似度    
    similar_index = np.argsort(user_id_similar)[-(k+1):-1]  #最相似的k个用户的index（除了自己）
    user_id_i_mean = np.sum(user_id_action)/user_id_action[user_id_action!=0].size#
    for j in similar_index :
        if item_id_action[j]!=0:
            user_id_j_action = freq_matrix[j,:]
            user_id_j_mean = np.sum(user_id_j_action)/user_id_j_action[user_id_j_action!=0].size
            score += user_id_similar[j]*(item_id_action[j]-user_id_j_mean)
#             print(score)
            weight += abs(user_id_similar[j])
#             print(weight)

    if weight==0:  
        return 0
    else:
        return user_id_j_mean + score/float(weight)

In [71]:
np.zeros((6,6))[2,4]

0.0

In [72]:
#构建预测函数
def predict_mean(user_similar):
    """预测函数的功能: 传入相似度矩阵, 通过对每个用户和每个物品进行计算, 计算出一个推荐矩阵"""
    user_count = freq_matrix.shape[0]#用户数
    item_count = freq_matrix.shape[1]#商品数
    predic_matrix = np.zeros((user_count,item_count)) #初始化一个全零矩阵
    #print(user_count)
    for user_id in range(user_count):
        #print(user_id)
        for item_id in range(item_count):
            if freq_matrix[user_id,item_id] == 0:
                #print (user_id,item_id)
                predic_matrix[user_id,item_id] = Recommendation_mean(user_id,item_id,user_similar)
    return predic_matrix

In [73]:
user_prediction_matrix = predict_mean(user_similar)

In [75]:
pd.DataFrame(user_prediction_matrix)

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,4.60727,3.0,0.0,4.421165
1,0.0,3.0,0.0,3.196655,0.0,5.0
2,0.0,3.0,0.0,0.0,0.0,5.0
3,4.0,0.0,2.0,4.0,5.0,0.0
4,4.0,0.0,2.0,4.0,5.0,0.0
5,4.187496,3.451941,0.0,0.0,3.906252,0.0


## 构建最终推荐函数

In [76]:
def get_topk(group,n):
    # 返回排序后的前几个值
    return group.sort_values("推荐指数",ascending=False)[:n]

In [77]:
recommendation_df = pd.DataFrame(user_prediction_matrix,columns=freq.columns,index=freq.index)
recommendation_df

物品id,1,2,3,4,5,6
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,4.60727,3.0,0.0,4.421165
2,0.0,3.0,0.0,3.196655,0.0,5.0
3,0.0,3.0,0.0,0.0,0.0,5.0
4,4.0,0.0,2.0,4.0,5.0,0.0
5,4.0,0.0,2.0,4.0,5.0,0.0
6,4.187496,3.451941,0.0,0.0,3.906252,0.0


In [78]:
recommendation_df.stack()

用户id  物品id
1     1       0.000000
      2       0.000000
      3       4.607270
      4       3.000000
      5       0.000000
      6       4.421165
2     1       0.000000
      2       3.000000
      3       0.000000
      4       3.196655
      5       0.000000
      6       5.000000
3     1       0.000000
      2       3.000000
      3       0.000000
      4       0.000000
      5       0.000000
      6       5.000000
4     1       4.000000
      2       0.000000
      3       2.000000
      4       4.000000
      5       5.000000
      6       0.000000
5     1       4.000000
      2       0.000000
      3       2.000000
      4       4.000000
      5       5.000000
      6       0.000000
6     1       4.187496
      2       3.451941
      3       0.000000
      4       0.000000
      5       3.906252
      6       0.000000
dtype: float64

In [79]:
# 将数据进行转换
recommendation_df = recommendation_df.stack().reset_index()
recommendation_df

Unnamed: 0,用户id,物品id,0
0,1,1,0.0
1,1,2,0.0
2,1,3,4.60727
3,1,4,3.0
4,1,5,0.0
5,1,6,4.421165
6,2,1,0.0
7,2,2,3.0
8,2,3,0.0
9,2,4,3.196655


In [80]:
# 对列名进行修改
recommendation_df.rename(columns={0:"推荐指数"},inplace=True)

In [81]:
recommendation_df

Unnamed: 0,用户id,物品id,推荐指数
0,1,1,0.0
1,1,2,0.0
2,1,3,4.60727
3,1,4,3.0
4,1,5,0.0
5,1,6,4.421165
6,2,1,0.0
7,2,2,3.0
8,2,3,0.0
9,2,4,3.196655


In [82]:
# 根据用户ID列进行分组
grouped = recommendation_df.groupby("用户id")
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000B64CCC0>

In [83]:
def get_topk(group,n):
    # 返回排序后的前几个值
    return group.sort_values("推荐指数",ascending=False)[:n]

In [91]:
grouped.groups

{1: Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
 2: Int64Index([6, 7, 8, 9, 10, 11], dtype='int64'),
 3: Int64Index([12, 13, 14, 15, 16, 17], dtype='int64'),
 4: Int64Index([18, 19, 20, 21, 22, 23], dtype='int64'),
 5: Int64Index([24, 25, 26, 27, 28, 29], dtype='int64'),
 6: Int64Index([30, 31, 32, 33, 34, 35], dtype='int64')}

In [86]:
grouped.apply(get_topk,3)

Unnamed: 0_level_0,Unnamed: 1_level_0,用户id,物品id,推荐指数
用户id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,1,3,4.60727
1,5,1,6,4.421165
1,3,1,4,3.0
2,11,2,6,5.0
2,9,2,4,3.196655
2,7,2,2,3.0
3,17,3,6,5.0
3,13,3,2,3.0
3,12,3,1,0.0
4,22,4,5,5.0


In [92]:
def get_recommendation(user_prediction_matrix,n=5):
    # 将用户预测数据, 构建成一个DataFrame
    recommendation_df = pd.DataFrame(user_prediction_matrix,columns=freq.columns,index=freq.index)
    # 将数据进行转换
    recommendation_df = recommendation_df.stack().reset_index()
    # 对列名进行修改
    recommendation_df.rename(columns={0:"推荐指数"},inplace=True)
    # 根据用户ID列进行分组
    grouped = recommendation_df.groupby("用户id")
    # 得到分组后的前几个数据
    topk = grouped.apply(get_topk,n=n)
    
    # 删除掉用户ID列
    topk = topk.drop(["用户id"],axis=1)
    # 删除掉多余的索引
    topk.index = topk.index.droplevel(1)
    # 索引重排
    topk.reset_index(inplace=True)
    return topk

In [93]:
get_recommendation(user_prediction_matrix , 3)

Unnamed: 0,用户id,物品id,推荐指数
0,1,3,4.60727
1,1,6,4.421165
2,1,4,3.0
3,2,6,5.0
4,2,4,3.196655
5,2,2,3.0
6,3,6,5.0
7,3,2,3.0
8,3,1,0.0
9,4,5,5.0
