In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

### 构建数据集合

In [40]:
# 模拟数据集
data = {
    "email": ["user1", "user2", "user3", "user4", "user5"],
    "collections": [
        [1, 3, 5, 7, 8, 9, 12, 71, 87],  # 用户1收藏的项目ID
        [2, 3, 4, 6 ,7 ,8, 11, 72, 99],  # 用户2收藏的项目ID
        [1, 2, 5, 4, 9, 12, 99],  # 用户3收藏的项目ID
        [3, 4, 9, 10, 11, 87],   # 用户4收藏的项目ID
        [1, 3, 7, 9 ,13, 99] # 用户5收藏的项目ID
    ]
}

# 将数据转换为DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,email,collections
0,user1,"[1, 3, 5, 7, 8, 9, 12, 71, 87]"
1,user2,"[2, 3, 4, 6, 7, 8, 11, 72, 99]"
2,user3,"[1, 2, 5, 4, 9, 12, 99]"
3,user4,"[3, 4, 9, 10, 11, 87]"
4,user5,"[1, 3, 7, 9, 13, 99]"


### 构建用户－项目矩阵

In [30]:
# 构建用户-项目矩阵
def build_user_item_matrix(df):
    all_items = set(item for sublist in df['collections'] for item in sublist)
    user_item_matrix = {}
    for _, row in df.iterrows():
        email = row['email']
        collections = set(row['collections'])
        user_vector = {item: 1 if item in collections else 0 for item in all_items}
        user_item_matrix[email] = user_vector
    return pd.DataFrame(user_item_matrix).T

user_item_matrix = build_user_item_matrix(df)
print("用户-项目矩阵：")
print(user_item_matrix)

用户-项目矩阵：
       1   2   3   4   5   6   7   8   9   71  11  12  72  10  13  99  87
user1   1   0   1   0   1   0   1   1   1   1   0   1   0   0   0   0   1
user2   0   1   1   1   0   1   1   1   0   0   1   0   1   0   0   1   0
user3   1   1   0   1   1   0   0   0   1   0   0   1   0   0   0   1   0
user4   0   0   1   1   0   0   0   0   1   0   1   0   0   1   0   0   1
user5   1   0   1   0   0   0   1   0   1   0   0   0   0   0   1   1   0


### 基于用户协同过滤的推荐算法

In [37]:
# 基于用户的协同过滤推荐算法
def recommend_items(user_id, user_item_matrix, top_n=5, k_neighbors=3):
    # Step 1: 计算目标用户与其他用户之间的相似度
    target_user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
    similarities = {}
    for other_user in user_item_matrix.index:
        if other_user != user_id:
            other_user_vector = user_item_matrix.loc[other_user].values.reshape(1, -1)
            similarity = cosine_similarity(target_user_vector, other_user_vector)[0][0]
            similarities[other_user] = similarity

    # Step 2: 找到最相似的K个用户
    similar_users = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:k_neighbors]

    # Step 3: 收集相似用户收藏但目标用户未收藏的项目
    recommendations = defaultdict(float)
    target_user_items = set(user_item_matrix.columns[user_item_matrix.loc[user_id] == 1])
    for other_user, similarity_score in similar_users:
        other_user_items = set(user_item_matrix.columns[user_item_matrix.loc[other_user] == 1])
        for item in other_user_items - target_user_items:
            recommendations[item] += similarity_score

    # Step 4: 按权重排序，返回前N个推荐项目
    return sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:top_n]

# 示例：为目标用户生成推荐
target_user = "user5"
recommendations = recommend_items(target_user, user_item_matrix, top_n=10)
print(f"\n为用户 {target_user} 推荐的项目：")
for item, score in recommendations:
    print(f"项目 {item} (推荐分数: {score:.2f})")


为用户 user5 推荐的项目：
项目 5 (推荐分数: 1.01)
项目 12 (推荐分数: 1.01)
项目 8 (推荐分数: 0.95)
项目 2 (推荐分数: 0.87)
项目 4 (推荐分数: 0.87)
项目 71 (推荐分数: 0.54)
项目 87 (推荐分数: 0.54)
项目 6 (推荐分数: 0.41)
项目 72 (推荐分数: 0.41)
项目 11 (推荐分数: 0.41)


### 性能评估

In [36]:
# 性能评估（可选）
def evaluate_recommendation(df, user_item_matrix, top_n=5, k_neighbors=3):
    precision_scores = []
    recall_scores = []

    for _, row in df.iterrows():
        user_id = row['email']
        true_items = set(row['collections'])
        recommended_items = set(item for item, _ in recommend_items(user_id, user_item_matrix, top_n, k_neighbors))

        # 计算准确率和召回率
        if len(recommended_items) > 0:
            precision = len(true_items & recommended_items) / len(recommended_items)
            recall = len(true_items & recommended_items) / len(true_items) if len(true_items) > 0 else 0
            precision_scores.append(precision)
            recall_scores.append(recall)

    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    return avg_precision, avg_recall

avg_precision, avg_recall = evaluate_recommendation(df, user_item_matrix, top_n=3)
print(f"\n平均准确率: {avg_precision:.2f}, 平均召回率: {avg_recall:.2f}")


平均准确率: 0.00, 平均召回率: 0.00
