In [1]:
#准备数据与相似度矩阵

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
import pandas as pd

In [37]:
import numpy as np

In [38]:
# Load the ml-1m ratings file. `names=` supplies the column labels, and the
# multi-character "::" separator is why the (slower) python parsing engine is
# requested explicitly.
ratings = pd.read_csv("../data/ml-1m/ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

In [39]:
# User x movie rating matrix: one row per userId, one column per movieId.
# No fill value is given, so (user, movie) pairs without a rating stay NaN.
ui = ratings.pivot_table(index = "userId", columns = "movieId", values = "rating")

In [40]:
# Dense NumPy copy with missing ratings replaced by 0, used as input to the
# similarity computation.
ui_filled = ui.fillna(0).values

In [41]:
# movieId labels in the column order of `ui` (position i <-> column i).
# NOTE: a later cell rebinds `movie_ids` to the training-split columns.
movie_ids = ui.columns.to_numpy()

In [42]:
# Item-item cosine similarity: transposing makes each movie a row vector of
# user ratings, so entry [i, j] compares movie column i with movie column j.
item_sim = cosine_similarity(ui_filled.T)

In [43]:
# Sanity check: the similarity matrix should be square (movies x movies).
print("相似度矩阵 shape:", item_sim.shape)

相似度矩阵 shape: (3706, 3706)


In [44]:
#预测用户u对物品i的评分

In [45]:
def predict_item_based(user_id, item_id, k=20):
    """Predict the rating ``user_id`` would give ``item_id`` (item-based CF).

    The estimate is the similarity-weighted average of the user's own ratings,
    restricted to the ``k`` rated movies most similar to ``item_id``.

    Relies on two notebook-level objects:
      * ``ui``       - user x movie rating frame, NaN where a user has not rated.
      * ``item_sim`` - square item-item similarity matrix whose row/column
                       order follows ``ui.columns``.

    Args:
        user_id: label expected in ``ui.index``.
        item_id: label expected in ``ui.columns``.
        k: maximum number of neighbour movies to average over.

    Returns:
        The predicted rating as a float, or ``np.nan`` when the user or item is
        unknown, the user has no ratings, ``k`` is not positive, or the
        selected similarities sum to zero.
    """
    if user_id not in ui.index or item_id not in ui.columns:
        return np.nan
    # `[-k:]` with k == 0 would select *every* neighbour, so reject it up front.
    if k <= 0:
        return np.nan
    user_ratings = ui.loc[user_id].dropna()
    if user_ratings.empty:
        return np.nan

    # Resolve positions from `ui.columns` itself instead of scanning the global
    # `movie_ids` array once per rated movie: this is a single hashed lookup,
    # and it cannot drift out of sync with `item_sim` when `movie_ids` is
    # rebound by a later cell.
    target_idx = ui.columns.get_loc(item_id)
    neighbor_idxs = ui.columns.get_indexer(user_ratings.index)
    sims = item_sim[target_idx, neighbor_idxs]
    vals = user_ratings.to_numpy()

    # argsort is ascending, so the last k positions are the most similar movies.
    topk = np.argsort(sims)[-k:]
    sims, vals = sims[topk], vals[topk]
    denom = np.abs(sims).sum()
    return np.dot(sims, vals) / denom if denom != 0 else np.nan

In [46]:
# 生成推荐结果

In [47]:
def topn_recommend(user_id, N=10, k=20):
    """Rank the movies ``user_id`` has not rated by predicted rating.

    Every id in the notebook-level ``movie_ids`` that does not appear among the
    user's rows in ``ratings`` is scored with ``predict_item_based``; movies
    whose prediction is NaN are left out.

    Args:
        user_id: user to recommend for.
        N: number of recommendations to return.
        k: neighbourhood size forwarded to ``predict_item_based``.

    Returns:
        A list of ``(movieId, predicted_rating)`` tuples, highest prediction
        first, truncated to ``N`` entries.
    """
    already_rated = set(ratings.loc[ratings.userId == user_id, "movieId"])
    scored = []
    for candidate in (m for m in movie_ids if m not in already_rated):
        score = predict_item_based(user_id, candidate, k)
        if np.isnan(score):
            continue
        scored.append((candidate, score))
    # Stable descending sort: ties keep their `movie_ids` order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:N]

# Example: the 5 highest-predicted unseen movies for user 1 (shown as the cell output).
topn_recommend(1, N=5)

[(3209, 5.000000000000001),
 (1915, 5.0),
 (2438, 4.747242914370367),
 (3890, 4.693091529724568),
 (2569, 4.5713625523374395)]

In [None]:
# 把推荐的电影id转化成电影名字

In [50]:
# Load movie metadata (id, title, genres) from the ml-1m movies file.
# The multi-character "::" separator is why the python engine is requested.
# The file is decoded as latin-1 - presumably because some titles are not
# valid UTF-8; TODO confirm against the dataset README.
movies = pd.read_csv(
    "../data/ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["movieId", "title", "genres"],
    encoding="latin-1"
)

In [51]:
def show_recommendations(user_id, N=5, k=20):
    """Return the top-``N`` recommendations for a user with titles and genres.

    Args:
        user_id: user to recommend for.
        N: number of recommendations requested from ``topn_recommend``.
        k: neighbourhood size forwarded to ``topn_recommend`` (defaults to the
            same value ``topn_recommend`` uses, so existing calls are unchanged).

    Returns:
        A DataFrame with columns ``title``, ``genres`` and ``pred_rating``, in
        the ranking order produced by ``topn_recommend``.
    """
    preds = topn_recommend(user_id, N=N, k=k)
    recs = pd.DataFrame(preds, columns=["movieId", "pred_rating"])
    # Left join: a recommended id with no row in `movies` is kept (with NaN
    # title/genres) instead of silently disappearing from the list.
    recs = recs.merge(
        movies[["movieId", "title", "genres"]], on="movieId", how="left"
    )
    return recs[["title", "genres", "pred_rating"]]

In [52]:
# Example: titled top-5 recommendations for user 1 (shown as the cell output).
show_recommendations(1, N=5)

Unnamed: 0,title,genres,pred_rating
0,"Loves of Carmen, The (1948)",Drama,5.0
1,Voyage to the Beginning of the World (1997),Drama,5.0
2,Outside Ozona (1998),Drama|Thriller,4.747243
3,Back Stage (2000),Documentary,4.693092
4,Among Giants (1998),Drama|Romance,4.571363


In [None]:
#做一个简单的评估

In [53]:
# 按用户切分

In [56]:
import numpy as np
import pandas as pd

def train_test_split_per_user(df, test_frac=0.2, seed=42):
    """Split a ratings frame into train/test parts, user by user.

    For each user with at least two ratings, ``max(1, int(n_ratings * test_frac))``
    of that user's rows are sampled without replacement into the test set and
    the rest go to train. A user with a single rating is kept entirely in train.

    Args:
        df: ratings frame with a ``userId`` column.
        test_frac: fraction of each user's ratings to hold out.
        seed: seed for the sampling; the same seed gives the same split.

    Returns:
        ``(train, test)`` DataFrames with fresh 0..n-1 indexes. Either part is
        an empty frame with ``df``'s columns when no rows were assigned to it.
    """
    # A private legacy generator draws the same samples as
    # `np.random.seed(seed)` + `np.random.choice`, but leaves the global NumPy
    # random state untouched.
    rng = np.random.RandomState(seed)
    train_parts, test_parts = [], []
    for _, group in df.groupby("userId"):
        if len(group) == 1:
            # A single rating cannot be split: keep it for training.
            train_parts.append(group)
            continue
        n_test = max(1, int(len(group) * test_frac))
        held_out = rng.choice(group.index, n_test, replace=False)
        test_parts.append(group.loc[held_out])
        train_parts.append(group.drop(held_out))

    def _combine(parts):
        # pd.concat raises on an empty list, so fall back to an empty frame.
        if not parts:
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat(parts).reset_index(drop=True)

    return _combine(train_parts), _combine(test_parts)

# Hold out 20% of each user's ratings for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split_per_user(ratings, test_frac=0.2, seed=42)
# Last expression of the cell: displays the (train, test) row counts.
len(train), len(test)

(802553, 197656)

In [57]:
# 用 train 构建矩阵 + 物品相似度 + 预测函数

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

# User-item rating matrix built from the training split only; cells without a
# rating are filled with 0.
ui_train = train.pivot_table(
    index="userId",
    columns="movieId",
    values="rating",
    fill_value=0,
)

# Column order of `ui_train` defines each movie's position in the similarity
# matrix. NOTE: this rebinds the notebook-level `movie_ids` defined earlier, so
# code that reads `movie_ids` now sees the training-split movies.
movie_ids = ui_train.columns.values
mid_to_idx = {movie: position for position, movie in enumerate(movie_ids)}

# Item-item cosine similarity over the movie column vectors, shape (M, M).
item_item_sim = cosine_similarity(ui_train.T)

def predict_item_based(user_id, item_id, k=20):
    """Predict a single (user, item) rating from the training matrix.

    The prediction is the similarity-weighted average of the user's ratings
    over the ``k`` rated movies most similar to ``item_id``. Reads the
    notebook-level ``ui_train``, ``mid_to_idx`` and ``item_item_sim``.

    Returns ``np.nan`` when the user or item is unknown, the user has no
    positive rating in ``ui_train``, or the selected similarities sum to zero.
    """
    if not (user_id in ui_train.index and item_id in mid_to_idx):
        return np.nan

    user_row = ui_train.loc[user_id].values
    # 0 is the fill value for "not rated", so only strictly positive cells count.
    has_rating = user_row > 0
    if not has_rating.any():
        return np.nan

    neighbour_sims = item_item_sim[mid_to_idx[item_id]][has_rating]
    neighbour_rates = user_row[has_rating]

    # Negating turns the ascending argsort into "most similar first".
    best = np.argsort(-neighbour_sims)[:k]
    weights = neighbour_sims[best]
    total_weight = np.abs(weights).sum()
    if total_weight == 0:
        return np.nan
    return (weights @ neighbour_rates[best]) / total_weight

In [59]:
# 评估（RMSE / MAE）+ 打印结果

In [60]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_model(test_df, k=20):
    """Score ``predict_item_based`` on held-out ratings with RMSE and MAE.

    Args:
        test_df: frame with ``userId``, ``movieId`` and ``rating`` columns.
        k: neighbourhood size forwarded to ``predict_item_based``.

    Returns:
        ``(rmse, mae, n_valid)`` where ``n_valid`` is the number of rows that
        received a non-NaN prediction. When no row could be predicted a
        message is printed and ``(None, None, 0)`` is returned.
    """
    preds, trues = [], []
    # Walk the three columns in lockstep instead of `iterrows()`: it avoids
    # building a Series per row and keeps the ids as integers even when another
    # column (e.g. a float `rating`) would make `iterrows()` upcast them.
    columns = (test_df["userId"], test_df["movieId"], test_df["rating"])
    for u, m, r in zip(*columns):
        p = predict_item_based(u, m, k)
        if not np.isnan(p):
            preds.append(p)
            trues.append(r)
    if not preds:
        print("没有有效预测（可能 test 中多数条目在 train 未出现）。")
        return None, None, 0
    rmse = np.sqrt(mean_squared_error(trues, preds))
    mae = mean_absolute_error(trues, preds)
    return rmse, mae, len(preds)

# Evaluate on the held-out split and report the error metrics.
rmse, mae, n = evaluate_model(test, k=20)
print(f"Valid predictions: {n}")
# evaluate_model returns (None, None, 0) when nothing could be scored;
# formatting None with ".4f" would raise TypeError, so only print the metrics
# when at least one prediction was made.
if n:
    print(f"RMSE: {rmse:.4f}    MAE: {mae:.4f}")

Valid predictions: 197622
RMSE: 0.9336    MAE: 0.7259
