基于用户的协同过滤

In [1]:



import numpy as np
import pandas as pd
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 1. 加载MovieLens 100K数据
def load_data(file_path):
    """Read a tab-separated ratings file and return its rating triples.

    The file is parsed without a header row; its four columns are named
    ``user_id``, ``item_id``, ``rating`` and ``timestamp`` (the layout of
    the MovieLens ``u.data`` file).

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the
            ratings file.

    Returns:
        A DataFrame holding only the ``user_id``, ``item_id`` and ``rating``
        columns, in that order.
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    ratings = pd.read_csv(file_path, sep='\t', names=columns)
    # The timestamp is not used by the recommender, so it is dropped here.
    return ratings[columns[:3]]

# 2. 创建用户-物品评分矩阵
def create_user_item_matrix(data):
    """Build a dense user-by-item rating matrix from a ratings table.

    Row ``u - 1`` / column ``i - 1`` holds the rating user ``u`` gave item
    ``i`` (IDs are treated as 1-based); cells without a rating stay 0. If
    the same (user, item) pair appears more than once, the last row wins.

    Args:
        data: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.

    Returns:
        A float ``numpy.ndarray`` of shape ``(num_users, num_items)``. An
        empty table yields a ``(0, 0)`` matrix.
    """
    if len(data) == 0:
        return np.zeros((0, 0))

    # Size each axis by the largest ID, not only by the number of distinct
    # IDs: with gaps in the ID space (e.g. a filtered dataset) the distinct
    # count is smaller than the largest index and the assignment below would
    # raise IndexError. Taking the max of both keeps the shape unchanged for
    # every input whose IDs are contiguous.
    num_users = max(data['user_id'].nunique(), int(data['user_id'].max()))
    num_items = max(data['item_id'].nunique(), int(data['item_id'].max()))
    user_item_matrix = np.zeros((num_users, num_items))

    # Explicit loop (not fancy-index assignment) so that duplicate pairs
    # deterministically resolve to the last occurrence.
    for user_id, item_id, rating in zip(data['user_id'], data['item_id'], data['rating']):
        user_item_matrix[user_id - 1, item_id - 1] = rating
    return user_item_matrix

# 3. 计算用户之间的余弦相似度
def cosine_similarity(user_ratings):
    """Compute pairwise cosine similarity between the rows of a rating matrix.

    Args:
        user_ratings: 2-D array-like, one row per user and one column per
            item.

    Returns:
        A ``(num_users, num_users)`` float array whose entry ``[i, j]`` is the
        cosine of the angle between rows ``i`` and ``j``. The diagonal is 0
        (a user is never their own neighbour), and any pair involving an
        all-zero row gets similarity 0.
    """
    ratings = np.asarray(user_ratings, dtype=float)
    num_users = ratings.shape[0]

    # Each row norm is computed once instead of once per user pair.
    norms = np.linalg.norm(ratings, axis=1)
    has_ratings = norms > 0
    comparable = has_ratings[:, None] & has_ratings[None, :]

    # One matrix product yields every pairwise dot product; the division is
    # only carried out where both norms are non-zero, all other cells keep
    # the initial 0.
    similarity_matrix = np.zeros((num_users, num_users))
    np.divide(
        ratings @ ratings.T,
        np.outer(norms, norms),
        out=similarity_matrix,
        where=comparable,
    )

    # Self-similarity is deliberately reported as 0, not 1.
    diagonal = np.arange(num_users)
    similarity_matrix[diagonal, diagonal] = 0.0
    return similarity_matrix

# 4. 基于用户相似度预测评分
def predict_ratings(user_ratings, similarity_matrix):
    """Fill in missing ratings with a similarity-weighted neighbour average.

    For every cell equal to 0 (treated as "not rated") the prediction is

        sum_k sim[i, k] * rating[k, j] / sum_k |sim[i, k]|

    taken over the other users ``k`` whose rating of item ``j`` is greater
    than 0. If that normaliser is not positive the prediction is 0. Cells
    that already hold a rating are copied through unchanged.

    Args:
        user_ratings: 2-D array-like of shape ``(num_users, num_items)``.
        similarity_matrix: 2-D array-like of shape
            ``(num_users, num_users)`` with user-user similarities.

    Returns:
        A float array with the same shape as ``user_ratings``.
    """
    ratings = np.asarray(user_ratings, dtype=float)
    similarity = np.asarray(similarity_matrix, dtype=float)

    # Only strictly positive ratings act as neighbour evidence. A user never
    # contributes to their own prediction: predictions are only used where
    # their own cell is 0, which this mask already excludes.
    rated = ratings > 0

    # Two matrix products replace the per-cell loop over all neighbours.
    weighted_sum = similarity @ np.where(rated, ratings, 0.0)
    similarity_sum = np.abs(similarity) @ rated.astype(float)

    # Divide only where there is neighbour weight; elsewhere keep 0.
    estimates = np.zeros(ratings.shape)
    np.divide(weighted_sum, similarity_sum, out=estimates, where=similarity_sum > 0)

    # Keep known ratings, use the estimate only for unrated cells.
    return np.where(ratings == 0, estimates, ratings)

# 5. 划分训练集和测试集
def train_test_split_matrix(ratings_matrix, test_size=0.25, random_state=None):
    """Hold out a fraction of each user's ratings as a test set.

    For every row, ``int(number_of_rated_items * test_size)`` of the cells
    greater than 0 are chosen at random without replacement. Those cells are
    zeroed in the training copy and copied into the test matrix.

    Args:
        ratings_matrix: 2-D ``numpy.ndarray`` of ratings, 0 meaning unrated.
        test_size: Fraction of each user's ratings to hold out.
        random_state: Optional seed for a private ``numpy.random.RandomState``
            so the split can be reproduced. When ``None`` (the default) the
            global ``numpy.random`` state is used, as before.

    Returns:
        A ``(train, test)`` tuple of arrays shaped like ``ratings_matrix``;
        each rating appears in exactly one of them. The input is not
        modified.
    """
    # The np.random module and a RandomState instance expose the same
    # ``choice`` API, so either can serve as the generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train = ratings_matrix.copy()
    test = np.zeros(ratings_matrix.shape)

    for user in range(ratings_matrix.shape[0]):
        rated_items = np.where(ratings_matrix[user] > 0)[0]
        if rated_items.size == 0:
            # Nothing to hold out for a user without ratings.
            continue
        test_ratings = rng.choice(
            rated_items,
            size=int(len(rated_items) * test_size),
            replace=False,
        )

        train[user, test_ratings] = 0
        test[user, test_ratings] = ratings_matrix[user, test_ratings]

    return train, test

# 6. 评估算法：计算 RMSE 和 MAE
def evaluate(predictions, ground_truth):
    """Score predictions against held-out ratings with RMSE and MAE.

    Only the cells where ``ground_truth`` is greater than 0 are compared;
    every other cell is ignored.

    Args:
        predictions: Array of predicted ratings.
        ground_truth: Array of the same shape holding the held-out ratings,
            with 0 where there is nothing to compare.

    Returns:
        A ``(rmse, mae)`` tuple.
    """
    held_out = ground_truth > 0
    y_pred = predictions[held_out]
    y_true = ground_truth[held_out]

    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, mae

# Main flow: load ratings, hold out a test split, fit user-based CF on the
# training part and report RMSE / MAE on the held-out ratings.
# Load the data
file_path = 'ml-100k/u.data'  # make sure this points to the actual location of the u.data file
data = load_data(file_path)

# Build the user-item rating matrix
user_item_matrix = create_user_item_matrix(data)

# Split into training and test sets.
# NOTE: the split is random and no seed is set here, so the reported metrics
# vary from run to run.
train_matrix, test_matrix = train_test_split_matrix(user_item_matrix, test_size=0.25)

# Compute the user-user similarity matrix (from the training ratings only,
# so held-out ratings do not influence the neighbours)
user_similarity = cosine_similarity(train_matrix)

# Build the predicted rating matrix
predicted_ratings = predict_ratings(train_matrix, user_similarity)

# Evaluate the algorithm on the held-out ratings
rmse, mae = evaluate(predicted_ratings, test_matrix)
print("RMSE:", rmse)
print("MAE:", mae)


RMSE: 1.027174610928663
MAE: 0.8169512090050841
