In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
ratings = pd.read_csv("../data/ml-1m/ratings.dat", 
                      sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

In [5]:
movies = pd.read_csv("../data/ml-1m/movies.dat", 
                     sep="::", engine="python",
                     names=["movieId", "title", "genres"],
                     encoding="latin-1")

In [6]:
#生成用户-物品评分矩阵

In [7]:
user_item = ratings.pivot_table(index = "userId", columns = "movieId",values = "rating")

In [8]:
print("User-item matrix shape:", user_item.shape)
user_item.head()

User-item matrix shape: (6040, 3706)


movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [None]:
# 计算用户相似度矩阵（cosine）

In [9]:
user_item_filled = user_item.fillna(0)

In [10]:
#使用sklearn的consine_similarity计算用户之间的相似度

In [14]:
user_sim = cosine_similarity(user_item_filled)

In [15]:
#转化为dataframe 行列都是userid

In [16]:
user_sim_df = pd.DataFrame(user_sim,index=user_item.index,columns=user_item.index)

In [18]:
print("User similarity matrix shape:", user_sim_df.shape)
user_sim_df.iloc[:5, :5] 

User similarity matrix shape: (6040, 6040)


userId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.096382,0.12061,0.132455,0.090158
2,0.096382,1.0,0.151479,0.171176,0.114394
3,0.12061,0.151479,1.0,0.151227,0.062907
4,0.132455,0.171176,0.151227,1.0,0.045094
5,0.090158,0.114394,0.062907,0.045094,1.0


In [None]:
#基于相似用户预测评分

In [19]:
def predict_user_based(user_id, item_id, k=20):
    """预测 user_id 对 item_id 的评分"""
    if item_id not in user_item.columns:
        return np.nan  # 电影不存在

    # 当前用户的相似度向量
    sims = user_sim_df.loc[user_id]

    # 找出对该电影有评分的其他用户
    rated_users = user_item[item_id].dropna().index

    # 按相似度排序，取前 k 个最相似的用户
    top_k_users = rated_users.intersection(sims.index)
    top_k_users = sims.loc[top_k_users].sort_values(ascending=False).head(k).index

    # 计算加权平均
    num, den = 0.0, 0.0
    for u in top_k_users:
        r = user_item.loc[u, item_id]
        if not np.isnan(r):
            sim = sims[u]
            num += sim * r
            den += abs(sim)
    if den == 0:
        return np.nan
    return num / den

In [20]:
pred = predict_user_based(1, 1, k=20)
print("Predicted rating for user 1 on movie 1:", pred)

Predicted rating for user 1 on movie 1: 4.734528614570308


In [None]:
# 推荐 + 测试

In [30]:
def topn_recommend(user_id, N = 5, K = 20):
    seen = set(ratings.loc[ratings.userId == user_id, "movieId"])
    preds = []

    for mid in user_item.columns:
        if mid in seen:
            continue
        pred = predict_user_based(user_id, mid, K)
        if not np.isnan(pred):
            preds.append((mid,pred))

    preds.sort(key=lambda x: x[1], reverse = True)
    return preds[:N]

In [31]:
recs = topn_recommend(1, N=5)
print("Top-5 Recommendations for user 1:")
for mid, score in recs:
    title = movies.loc[movies.movieId == mid, "title"].values[0]
    print(f"{title:<50} Predicted rating: {score:.2f}")

Top-5 Recommendations for user 1:
Gate of Heavenly Peace, The (1995)                 Predicted rating: 5.00
Schlafes Bruder (Brother of Sleep) (1995)          Predicted rating: 5.00
Follow the Bitch (1998)                            Predicted rating: 5.00
Ulysses (Ulisse) (1954)                            Predicted rating: 5.00
Song of Freedom (1936)                             Predicted rating: 5.00


In [None]:
# 评估模型

In [32]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

# 选取少部分有评分的数据进行预测评估
test = ratings.sample(5000, random_state=42)

preds = []
truth = []

for _, row in test.iterrows():
    user, movie, true_rating = row["userId"], row["movieId"], row["rating"]
    pred = predict_user_based(user, movie, k=20)
    if not np.isnan(pred):
        preds.append(pred)
        truth.append(true_rating)

rmse = math.sqrt(mean_squared_error(truth, preds))
mae = mean_absolute_error(truth, preds)

print(f"Valid predictions: {len(preds)}")
print(f"RMSE: {rmse:.4f}    MAE: {mae:.4f}")

Valid predictions: 5000
RMSE: 0.8373    MAE: 0.6664
