In [1]:
import pandas as pd
import numpy as np

# 加载数据

In [2]:
# Load the movie catalogue from movies.csv (path is relative to the working directory).
movies = pd.read_csv('movies.csv')

In [3]:
# Display the movies table (columns: movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# Load the user ratings from ratings.csv (path is relative to the working directory).
ratings = pd.read_csv('ratings.csv')

In [5]:
# Display the ratings table (columns: userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [6]:
# Print a one-line dataset summary: distinct users, distinct movies, number of rating rows.
# NOTE(review): the message contains a doubled "了"; it is kept verbatim so the output is unchanged.
print('{}用户对{}条电影信息进行了了{}条评价'.format(
    len(ratings.userId.unique()),
    len(movies.movieId.unique()),
    len(ratings),
))

610用户对9742条电影信息进行了了100836条评价


In [7]:
# Number of distinct users that appear in the ratings table.
n_users = len(ratings['userId'].unique())
# Number of distinct movies listed in the movies table.
n_movies = len(movies['movieId'].unique())

# 划分测试集和训练集

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rating rows for testing and keep 80% for training;
# the fixed random_state makes the split reproducible.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=21)

# 推荐算法引擎

In [10]:
# Allocate the user-item rating matrices: one row per user, one column per movie.
# Cells start at 0 and are only overwritten where a rating exists.
train_data_matrix = np.zeros((n_users, n_movies))
test_data_matrix = np.zeros_like(train_data_matrix)

# Map every movieId to a column index, following the row order of the movies table.
movieId_list = list(movies.movieId)
num_list = list(range(len(movieId_list)))
movieId_dict = {movie_id: column for column, movie_id in zip(num_list, movieId_list)}

In [11]:
# Copy every rating into its user-item matrix: training rows go into
# train_data_matrix, test rows into test_data_matrix.
# Row index is userId - 1.
# NOTE(review): this presumes user ids form the contiguous range 1..n_users - confirm before reusing on other data.
# Column index is looked up through movieId_dict.
for ratings_part, target_matrix in (
    (train_data, train_data_matrix),
    (test_data, test_data_matrix),
):
    for line in ratings_part.itertuples():
        target_matrix[line.userId - 1, movieId_dict[line.movieId]] = line.rating

In [12]:
# Inspect the training user-item matrix (rows: users, columns: movies).
train_data_matrix

array([[0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 2., 2., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Inspect the test user-item matrix (rows: users, columns: movies).
test_data_matrix

array([[4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [14]:
# Similarity computation
## Use sklearn's cosine_similarity to get the pairwise cosine similarity
## between the rows (users) of the training matrix.
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(train_data_matrix)
print("用户相似度矩阵：", user_similarity)

用户相似度矩阵： [[1.         0.01798129 0.02870629 ... 0.22040955 0.09322355 0.11946323]
 [0.01798129 1.         0.         ... 0.0319276  0.0355827  0.07442285]
 [0.02870629 0.         1.         ... 0.02512459 0.         0.03742814]
 ...
 [0.22040955 0.0319276  0.02512459 ... 1.         0.10489906 0.26486288]
 [0.09322355 0.0355827  0.         ... 0.10489906 1.         0.05240013]
 [0.11946323 0.07442285 0.03742814 ... 0.26486288 0.05240013 1.        ]]


In [15]:
# Return, for every user, the n most similar other users together with a weight.
# 'way1' is the simplest weighting scheme: similarity / sum of the selected similarities.
import heapq
def similar_n_user(user_similarity, n=1, weight='way1'):
    """Select each user's ``n`` nearest neighbours from a similarity matrix.

    Args:
        user_similarity: square 2-D matrix (e.g. a numpy array) where entry
            ``[i][j]`` is the similarity between user ``i`` and user ``j``.
            The matrix is only read; it is never modified.
        n: number of neighbours to keep for every user.
        weight: ``'way1'`` divides each selected similarity by the sum of the
            selected similarities so the weights add up to 1. Any other value
            keeps the raw similarity as the weight.

    Returns:
        A list with one dict per row of ``user_similarity``. Each dict maps a
        neighbour's row index to its weight, ordered from most to least
        similar.
    """
    n_user_similarity = []
    for i, row in enumerate(user_similarity):
        # Rank every *other* user. Skipping index i keeps a user from being
        # picked as its own neighbour without overwriting the diagonal of the
        # caller's matrix.
        candidates = (j for j in range(len(row)) if j != i)
        u_li = heapq.nlargest(n, candidates, key=row.__getitem__)

        normalise = False
        if weight == 'way1':
            # Weighting scheme 1: normalise by the sum of the kept similarities.
            count = sum(row[u] for u in u_li)
            # With no overlap at all the sum is 0; keep the raw (zero)
            # similarities instead of producing NaN through 0 / 0.
            normalise = count != 0

        if normalise:
            dic = {u: row[u] / count for u in u_li}
        else:
            dic = {u: row[u] for u in u_li}

        n_user_similarity.append(dic)
    return n_user_similarity

# Keep the 10 most similar users for every user, using the 'way1' (sum-normalised) weights.
n_user_similarity = similar_n_user(user_similarity,10, weight='way1')

In [16]:
# Inspect the per-user neighbour dictionaries (neighbour row index -> weight).
n_user_similarity

[{38: 0.09480729120710361,
  56: 0.09929465921467626,
  90: 0.104982690409632,
  265: 0.11339663514360451,
  287: 0.09496778454017026,
  329: 0.09607864840480679,
  367: 0.10581671391996149,
  468: 0.09715254253818725,
  560: 0.09613656699366763,
  596: 0.09736646762819022},
 {188: 0.10136947784948014,
  208: 0.08810976878543454,
  351: 0.09576157562171725,
  377: 0.11421148744185362,
  460: 0.10452270660512,
  514: 0.10253024794171009,
  522: 0.08536995440563382,
  527: 0.1005290983323243,
  536: 0.09939093302705751,
  600: 0.10820474998966866},
 {159: 0.08484557662226386,
  270: 0.1063202883433813,
  276: 0.08857808810824574,
  293: 0.08167270950247611,
  311: 0.10919070594461387,
  376: 0.10450261147440575,
  468: 0.09094119604977617,
  526: 0.1160154557381842,
  531: 0.11994156744805673,
  554: 0.09799180076859632},
 {134: 0.09255900344767794,
  155: 0.1004902394303254,
  194: 0.09243399394622015,
  215: 0.09397053891458744,
  220: 0.09409948383760165,
  274: 0.09544988910384532,
 

In [17]:
# Build the predicted-rating (recommendation) matrix for every user.
def recommend_info(data, n_user_similarity):
    """Predict ratings as a weighted sum of the neighbours' rating rows.

    Args:
        data: 2-D user-item rating matrix; row ``k`` holds the ratings of the
            user with row index ``k``.
        n_user_similarity: one dict per user mapping a neighbour's row index
            to its weight (the structure returned by ``similar_n_user``).

    Returns:
        A float array of shape ``(len(n_user_similarity), data.shape[1])``.
        Row ``i`` is the sum over user ``i``'s neighbours of
        ``weight * neighbour_ratings``.
    """
    data = np.asarray(data)
    # Size the output from the arguments instead of notebook-level globals so
    # the function works on any matrix it is handed.
    pred_matrix = np.zeros((len(n_user_similarity), data.shape[1]))
    for i, neighbours in enumerate(n_user_similarity):
        for key, value in neighbours.items():
            pred_matrix[i] += data[key] * value
    return pred_matrix

# Predict ratings for every user from the training matrix and the neighbour weights.
pred_matrix = recommend_info(train_data_matrix, n_user_similarity)

In [18]:
# Show the predicted-rating (recommendation) matrix.
pred_matrix

array([[1.91697779, 0.71859981, 1.3005053 , ..., 0.        , 0.        ,
        0.        ],
       [1.19809344, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.08566186, 1.04846226, 0.57163171, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [2.47391585, 1.4616235 , 1.18568157, ..., 0.        , 0.        ,
        0.        ],
       [1.08490208, 0.69820234, 0.3777942 , ..., 0.        , 0.        ,
        0.        ],
       [2.52823377, 2.615884  , 0.7118556 , ..., 0.        , 0.        ,
        0.        ]])

In [19]:
# Ten highest predicted ratings for the first user (row 0 of pred_matrix).
# The number printed is the matrix column index, i.e. the position assigned by
# movieId_dict, not the movieId itself.
first_user_scores = pred_matrix[0]
top_columns = heapq.nlargest(10, range(len(first_user_scores)), key=first_user_scores.__getitem__)
for i in top_columns:
    print("第{}部，预测评分{}".format(i, first_user_scores[i]))

第899部，预测评分4.2145126865140154
第911部，预测评分4.173255405325228
第900部，预测评分4.083266557026686
第97部，预测评分4.071751507813002
第1503部，预测评分3.928285088163575
第46部，预测评分3.621403218854795
第1445部，预测评分3.535394348861431
第902部，预测评分3.511942293745491
第898部，预测评分3.318541591898933
第1431部，预测评分3.317869509466974


In [20]:
# Predicted rating for the first user (row 0) at column index 899.
pred_matrix[0][899]

4.2145126865140154

In [21]:
# Predicted rating for the first user (row 0) at column index 899.
# NOTE(review): this cell repeats the previous cell's lookup verbatim and can probably be removed.
pred_matrix[0][899]

4.2145126865140154