In [6]:
import pandas as pd

# Read the '::'-delimited ratings file; column names are supplied explicitly.
# The multi-character separator requires the python parsing engine.
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Read the '::'-delimited movies file, decoded as latin-1.
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'title', 'genres'],
    encoding='latin-1',
)

In [7]:
# Pivot the long ratings table into a user x movie rating matrix.
# (user, movie) pairs without a rating come out as NaN and are replaced by 0,
# so 0 means "this user has not rated this movie".
user_item_matrix = (
    ratings
    .pivot_table(values='rating', index='user_id', columns='movie_id')
    .fillna(0)
)
user_item_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the rating matrix,
# wrapped in a DataFrame labelled by user_id on both axes.
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)
user_similarity_df

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.096382,0.120610,0.132455,0.090158,0.179222,0.059678,0.138241,0.226148,0.255288,...,0.170588,0.082006,0.069807,0.033663,0.114877,0.186329,0.135979,0.000000,0.174604,0.133590
2,0.096382,1.000000,0.151479,0.171176,0.114394,0.100865,0.305787,0.203337,0.190198,0.226861,...,0.112503,0.091222,0.268565,0.014286,0.183384,0.228241,0.206274,0.066118,0.066457,0.218276
3,0.120610,0.151479,1.000000,0.151227,0.062907,0.074603,0.138332,0.077656,0.126457,0.213655,...,0.092960,0.125864,0.161507,0.000000,0.097308,0.143264,0.107744,0.120234,0.094675,0.133144
4,0.132455,0.171176,0.151227,1.000000,0.045094,0.013529,0.130339,0.100856,0.093651,0.120738,...,0.163629,0.093041,0.382803,0.000000,0.082097,0.170583,0.127464,0.062907,0.064634,0.137968
5,0.090158,0.114394,0.062907,0.045094,1.000000,0.047449,0.126257,0.220817,0.261330,0.117052,...,0.100652,0.035732,0.061806,0.054151,0.179083,0.293365,0.172686,0.020459,0.027689,0.241437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.186329,0.228241,0.143264,0.170583,0.293365,0.093583,0.122441,0.227400,0.239607,0.338072,...,0.131294,0.209843,0.186426,0.103431,0.267405,1.000000,0.341462,0.124174,0.219115,0.411891
6037,0.135979,0.206274,0.107744,0.127464,0.172686,0.065788,0.111673,0.144395,0.225055,0.246902,...,0.142309,0.276134,0.129985,0.118749,0.141676,0.341462,1.000000,0.049015,0.252146,0.428240
6038,0.000000,0.066118,0.120234,0.062907,0.020459,0.065711,0.000000,0.019242,0.093470,0.113789,...,0.108837,0.106897,0.040689,0.000000,0.063967,0.124174,0.049015,1.000000,0.161714,0.099300
6039,0.174604,0.066457,0.094675,0.064634,0.027689,0.167303,0.014977,0.044660,0.046434,0.296776,...,0.118776,0.250994,0.053750,0.102168,0.068399,0.219115,0.252146,0.161714,1.000000,0.228332


In [9]:
def usercf_recommend(user_id, top_n=10, k=20, rating_matrix=None, similarity_df=None):
    """Recommend unrated items to a user with user-based collaborative filtering.

    The score of an item is the similarity-weighted sum of the k nearest
    neighbours' ratings divided by the sum of those neighbours' similarities.
    Entries equal to 0 in the rating matrix are treated as "not rated", so a
    neighbour who has not rated an item contributes 0 to that item's numerator.

    Parameters
    ----------
    user_id : label
        Target user; must be a label of both ``similarity_df`` and
        ``rating_matrix``.
    top_n : int, default 10
        Maximum number of items to return.
    k : int, default 20
        Number of most-similar users to use as neighbours.
    rating_matrix : pandas.DataFrame, optional
        User x item matrix with 0 for unrated items. Defaults to the notebook
        global ``user_item_matrix``.
    similarity_df : pandas.DataFrame, optional
        Square user x user similarity matrix. Defaults to the notebook global
        ``user_similarity_df``.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by item id, sorted in descending order, for at
        most ``top_n`` items the target user has not rated. Empty when the
        selected neighbours have zero total similarity.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from the similarity or rating matrix.
    """
    # Passing the matrices explicitly avoids depending on whichever cell last
    # rebound the global names.
    if rating_matrix is None:
        rating_matrix = user_item_matrix
    if similarity_df is None:
        similarity_df = user_similarity_df

    # Remove the target user by label, not by position: another user whose
    # similarity ties with the self-similarity could otherwise sort first, and
    # the target would then be kept as their own neighbour.
    similar_users = (
        similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(k)
    )

    # No usable neighbours: return nothing instead of a 0/0 = NaN ranking.
    total_similarity = similar_users.sum()
    if total_similarity == 0:
        return pd.Series(dtype=float)

    # Ratings given by the neighbours, one row per neighbour.
    similar_users_ratings = rating_matrix.loc[similar_users.index]

    # Weight every neighbour's row by that neighbour's similarity.
    weighted_ratings = similar_users_ratings.mul(similar_users, axis=0)

    # Normalise the weighted sums by the total neighbour similarity.
    predicted_ratings = weighted_ratings.sum(axis=0) / total_similarity

    # Keep only the items the target user has not rated yet.
    user_rated_items = rating_matrix.loc[user_id]
    predicted_ratings = predicted_ratings[user_rated_items == 0]

    # Highest predicted scores first.
    return predicted_ratings.sort_values(ascending=False).head(top_n)

In [10]:
# Generate the top-10 recommendations for user 1
user_id = 1
recommendations = usercf_recommend(user_id, top_n=10)

# Print the title of every recommended movie, best score first
print("为用户", user_id, "推荐的电影：")
for movie_id in recommendations.index:
    # First title whose movie_id matches the recommended item
    print(movies.loc[movies['movie_id'] == movie_id, 'title'].iloc[0])

为用户 1 推荐的电影：
Little Mermaid, The (1989)
Jungle Book, The (1967)
Sleeping Beauty (1959)
Lion King, The (1994)
Fantasia (1940)
101 Dalmatians (1961)
Peter Pan (1953)
Star Wars: Episode VI - Return of the Jedi (1983)
Jurassic Park (1993)
Lady and the Tramp (1955)


In [8]:
# Imported in this cell so it runs on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out a random 20% of the individual rating records as the test set.
# NOTE: the split is over rating rows, not per user, so one user's ratings may
# end up in both the training and the test set.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# User x movie matrix built from the training ratings only.
# Missing (user, movie) pairs are filled with 0, meaning "not rated".
train_matrix = (
    train_data
    .pivot_table(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

# User-user cosine similarity computed from the training matrix.
# This rebinds user_similarity / user_similarity_df, so any code that reads
# those globals after this cell sees the training-only similarities.
user_similarity = cosine_similarity(train_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=train_matrix.index, columns=train_matrix.index)

def usercf_recommend(user_id, top_n=10, k=20, rating_matrix=None, similarity_df=None):
    """Recommend unrated items to a user from the training data (user-based CF).

    The score of an item is the similarity-weighted sum of the k nearest
    neighbours' ratings divided by the sum of those neighbours' similarities.
    Entries equal to 0 in the rating matrix are treated as "not rated".

    Parameters
    ----------
    user_id : label
        Target user; must be a label of both ``similarity_df`` and
        ``rating_matrix``.
    top_n : int, default 10
        Maximum number of items to return.
    k : int, default 20
        Number of most-similar users to use as neighbours.
    rating_matrix : pandas.DataFrame, optional
        User x item matrix with 0 for unrated items. Defaults to the notebook
        global ``train_matrix``.
    similarity_df : pandas.DataFrame, optional
        Square user x user similarity matrix. Defaults to the notebook global
        ``user_similarity_df``.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by item id, sorted in descending order, for at
        most ``top_n`` items the target user has not rated. Empty when the
        selected neighbours have zero total similarity.

    Raises
    ------
    KeyError
        If ``user_id`` is missing from the similarity or rating matrix.
    """
    if rating_matrix is None:
        rating_matrix = train_matrix
    if similarity_df is None:
        similarity_df = user_similarity_df

    # Remove the target user by label, not by position: a neighbour whose
    # similarity ties with the self-similarity could otherwise sort first, and
    # the target would then be kept as their own neighbour.
    similar_users = (
        similarity_df[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(k)
    )

    # No usable neighbours: return nothing instead of a 0/0 = NaN ranking.
    total_similarity = similar_users.sum()
    if total_similarity == 0:
        return pd.Series(dtype=float)

    similar_ratings = rating_matrix.loc[similar_users.index]
    weighted_ratings = similar_ratings.mul(similar_users, axis=0)
    predicted_ratings = weighted_ratings.sum(axis=0) / total_similarity

    # Keep only items the target user has not rated (filter applied once).
    user_rated = rating_matrix.loc[user_id]
    predicted_ratings = predicted_ratings[user_rated == 0]
    return predicted_ratings.sort_values(ascending=False).head(top_n)

# Imported in this cell so it runs on a fresh kernel.
from sklearn.metrics import f1_score, precision_score, recall_score

# Pooled binary labels over all evaluated (user, item) pairs:
# y_true[i] is 1 when the user liked the item in the test set,
# y_pred[i] is 1 when the item was in the user's recommendation list.
y_true = []
y_pred = []

# Items each user "truly likes": test-set ratings of 4 or higher.
test_likes = test_data[test_data['rating'] >= 4]
user_true_items = test_likes.groupby('user_id')['movie_id'].apply(set)

# One pass per user that has at least one liked item in the test set.
for user_id, true_liked in user_true_items.items():
    if user_id not in train_matrix.index:
        continue  # user has no training ratings, so nothing to base neighbours on

    # A set gives O(1) membership tests in the inner loop, and the separate
    # name avoids rebinding `recommendations` from the earlier cell.
    try:
        recommended_items = set(usercf_recommend(user_id).index)
    except KeyError:
        continue  # best effort: skip users the recommender cannot look up

    # Every item that was either recommended or truly liked yields one label pair.
    for item in recommended_items | true_liked:
        y_true.append(1 if item in true_liked else 0)
        y_pred.append(1 if item in recommended_items else 0)

# Metrics over the pooled pairs
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1-score:", f1_score(y_true, y_pred))

Precision: 0.2568068068068068
Recall: 0.1336638821833591
F1-score: 0.17581752350058824
